// BlockSplitter is the signature of a chunking strategy: it receives a byte
// slice and returns it divided into a sequence of byte slices. Rabin has
// this signature.
type BlockSplitter func([]byte) [][]byte

// Rabin cuts b into consecutive chunks using content-defined boundaries.
//
// A polynomial rolling hash (Rabin-Karp style, base 61, arithmetic modulo
// 2^64) is maintained over the trailing 48 bytes. A chunk ends just before
// byte i when the hash of the window ending at i has its low 13 bits clear
// and the chunk already holds at least 2048 bytes, or unconditionally once
// the chunk holds 16384 bytes. Because the hash depends only on the last 48
// bytes, inserting or deleting data early in the input does not move the
// hash-selected boundaries found later on.
//
// The returned chunks are sub-slices of b (nothing is copied) and, taken in
// order, cover b exactly. Every chunk is at most 16384 bytes long and every
// chunk except the last is at least 2048 bytes long. An input of 48 bytes or
// fewer, including an empty one, is returned unchanged as a single chunk.
//
// TODO: this should take a reader, not a byte array. what if we're splitting
// a 3TB file?
func Rabin(b []byte) [][]byte {
	const (
		windowSize   = 48        // bytes covered by the rolling hash
		minBlockSize = 2048      // no hash-selected cut before this many bytes
		maxBlockSize = 16 * 1024 // forced cut at this many bytes
		boundaryMask = 1<<13 - 1 // low 13 bits of the hash must be zero to cut
		prime        = 61        // base of the polynomial hash
	)

	// Smaller than a window? Nothing to split.
	if len(b) <= windowSize {
		return [][]byte{b}
	}

	// prime^windowSize (mod 2^64): the weight the oldest byte has acquired by
	// the time it must be removed from the hash.
	outWeight := uint64(1)
	for j := 0; j < windowSize; j++ {
		outWeight *= prime
	}

	var (
		out   [][]byte
		hash  uint64
		start int // index in b where the chunk being built begins
	)
	for i, c := range b {
		// Slide the window forward by one byte: shift in c, and once the
		// window is full, shift out the byte that just fell off its tail.
		// Unsigned overflow wraps, which is exactly arithmetic mod 2^64.
		hash = hash*prime + uint64(c)
		if i >= windowSize {
			hash -= uint64(b[i-windowSize]) * outWeight
		}

		size := i - start
		if size >= maxBlockSize || (size >= minBlockSize && hash&boundaryMask == 0) {
			// Cap the chunk's capacity so that an append on it by the caller
			// cannot overwrite the bytes of the following chunk.
			out = append(out, b[start:i:i])
			start = i
		}
	}

	// start is always a valid index here, so the tail is never empty.
	return append(out, b[start:len(b):len(b)])
}