resource: Use MD5 to identify image files

But only use a set of byte chunks spread around in the image file to calculate the fingerprint, which is much faster than reading the whole file:

```bash
BenchmarkMD5FromFileFast/full=false-4         	  300000	      4356 ns/op	     240 B/op	       5 allocs/op
BenchmarkMD5FromFileFast/full=true-4          	   30000	     42899 ns/op	   32944 B/op	       5 allocs/op
```

Fixes #4186
This commit is contained in:
Bjørn Erik Pedersen 2017-12-27 19:31:42 +01:00
parent 7e76a6fd3b
commit e50a8c7a14
7 changed files with 176 additions and 26 deletions

View file

@ -26,6 +26,8 @@ import (
"unicode"
"unicode/utf8"
"github.com/spf13/afero"
"github.com/jdkato/prose/transform"
bp "github.com/gohugoio/hugo/bufferpool"
@ -372,6 +374,57 @@ func MD5String(f string) string {
return hex.EncodeToString(h.Sum([]byte{}))
}
// MD5FromFileFast creates an MD5 hash from a sample of the given file. It reads
// at most maxChunks chunks of peekSize bytes each instead of the whole file, so
// don't use it if the files are very subtly different.
//
// The first chunk is read from the file's current position; every following
// chunk i is read from the absolute offset i*seek. Sampling stops at the first
// short read.
//
// It returns the hex encoded hash, or the first error from Seek or Read that is
// not an end-of-file condition. It will not close the file.
func MD5FromFileFast(f afero.File) (string, error) {
	const (
		// Do not change once set in stone!
		maxChunks = 8
		peekSize  = 64
		seek      = 2048
	)

	h := md5.New()
	buff := make([]byte, peekSize)

	for i := 0; i < maxChunks; i++ {
		if i > 0 {
			// Seek to a distinct offset per chunk. Seeking to the constant
			// offset `seek` from the start on every iteration would re-read
			// the very same bytes for chunks 2..maxChunks.
			if _, err := f.Seek(int64(i)*seek, io.SeekStart); err != nil {
				if err == io.EOF {
					break
				}
				return "", err
			}
		}

		_, err := io.ReadAtLeast(f, buff, peekSize)
		if err != nil {
			if err == io.EOF || err == io.ErrUnexpectedEOF {
				// Short read: the whole buffer is hashed, including any
				// bytes left over from the previous chunk (or zeros).
				h.Write(buff)
				break
			}
			return "", err
		}
		h.Write(buff)
	}

	// NOTE(review): this trailing write hashes the last buffer contents once
	// more. It looks redundant, but removing it would change the fingerprints
	// this function produces, so it is kept as is.
	h.Write(buff)

	return hex.EncodeToString(h.Sum(nil)), nil
}
// MD5FromFile creates an MD5 hash from everything that can be read from the
// given file, starting at its current position.
//
// It returns the hex encoded hash, or the error reported while reading.
// It will not close the file.
func MD5FromFile(f afero.File) (string, error) {
	h := md5.New()
	if _, err := io.Copy(h, f); err != nil {
		// Propagate the read failure; returning a nil error here would make
		// callers treat the empty string as a valid hash.
		return "", err
	}
	return hex.EncodeToString(h.Sum(nil)), nil
}
// IsWhitespace determines if the given rune is whitespace.
func IsWhitespace(r rune) bool {
return r == ' ' || r == '\t' || r == '\n' || r == '\r'

View file

@ -14,10 +14,12 @@
package helpers
import (
"fmt"
"reflect"
"strings"
"testing"
"github.com/spf13/afero"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
@ -270,3 +272,91 @@ func TestToLowerMap(t *testing.T) {
}
}
}
// TestFastMD5FromFile checks that MD5FromFileFast yields a stable fingerprint
// for a small file and different fingerprints for files with different
// content, and that it differs from what MD5FromFile returns afterwards.
func TestFastMD5FromFile(t *testing.T) {
	fs := afero.NewMemMapFs()

	// Fixture files, written in this order before anything is opened.
	fixtures := []struct {
		name    string
		content string
	}{
		{"small.txt", "abc"},
		{"small2.txt", "abd"},
		{"bigger.txt", strings.Repeat("a bc d e", 100)},
		{"bigger2.txt", strings.Repeat("c d e f g", 100)},
	}

	for _, fixture := range fixtures {
		if err := afero.WriteFile(fs, fixture.name, []byte(fixture.content), 0777); err != nil {
			t.Fatal(err)
		}
	}

	req := require.New(t)

	// mustOpen opens a fixture and aborts the test if that fails.
	mustOpen := func(name string) afero.File {
		file, err := fs.Open(name)
		req.NoError(err)
		return file
	}

	small1 := mustOpen("small.txt")
	small2 := mustOpen("small2.txt")
	big1 := mustOpen("bigger.txt")
	big2 := mustOpen("bigger2.txt")

	defer small1.Close()
	defer small2.Close()
	defer big1.Close()
	defer big2.Close()

	// The fingerprint of the first small file is pinned.
	hashSmall1, err := MD5FromFileFast(small1)
	req.NoError(err)
	req.Equal("308d8a1127b46524b51507424071c22c", hashSmall1)

	hashSmall2, err := MD5FromFileFast(small2)
	req.NoError(err)
	req.NotEqual(hashSmall1, hashSmall2)

	hashBig1, err := MD5FromFileFast(big1)
	req.NoError(err)
	req.NotEqual(hashSmall2, hashBig1)

	hashBig2, err := MD5FromFileFast(big2)
	req.NoError(err)
	req.NotEqual(hashBig1, hashBig2)

	// Same handle as above, now hashed with the non-sampling variant.
	hashFull, err := MD5FromFile(big2)
	req.NoError(err)
	req.NotEqual(hashBig2, hashFull)
}
// BenchmarkMD5FromFileFast compares MD5FromFileFast (full=false) with
// MD5FromFile (full=true) on a freshly written in-memory file. Writing and
// opening the file happen with the timer stopped; hashing and closing are
// timed.
func BenchmarkMD5FromFileFast(b *testing.B) {
	fs := afero.NewMemMapFs()

	for _, full := range []bool{false, true} {
		// Pick the function under test once per sub-benchmark.
		hash := MD5FromFileFast
		if full {
			hash = MD5FromFile
		}

		b.Run(fmt.Sprintf("full=%t", full), func(b *testing.B) {
			for n := 0; n < b.N; n++ {
				b.StopTimer()
				content := []byte(strings.Repeat("1234567890", 2000))
				if err := afero.WriteFile(fs, "file.txt", content, 0777); err != nil {
					b.Fatal(err)
				}
				file, err := fs.Open("file.txt")
				if err != nil {
					b.Fatal(err)
				}
				b.StartTimer()

				if _, err := hash(file); err != nil {
					b.Fatal(err)
				}
				file.Close()
			}
		})
	}
}