From 1f309b72d01d15db9dc0156254322f61ed9a68f8 Mon Sep 17 00:00:00 2001
From: Jeromy <jeromyj@gmail.com>
Date: Sat, 30 Aug 2014 10:53:26 -0700
Subject: [PATCH] implement basic rabin fingerprint file splitting

---
 importer/split_test.go | 50 ++++++++++++++++++++++++++++++++++++++++
 importer/splitting.go  | 52 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+)
 create mode 100644 importer/split_test.go
 create mode 100644 importer/splitting.go

diff --git a/importer/split_test.go b/importer/split_test.go
new file mode 100644
index 000000000..d6b6cc743
--- /dev/null
+++ b/importer/split_test.go
@@ -0,0 +1,50 @@
+package importer
+
+import (
+	"testing"
+	"crypto/rand"
+	"bytes"
+)
+
+// TestDataSplitting runs Rabin over 16 MiB of random bytes and checks that the
+// chunks, taken in order, reproduce the input exactly; chunk stats are logged.
+func TestDataSplitting(t *testing.T) {
+	buf := make([]byte, 16*1024*1024)
+	if _, err := rand.Read(buf); err != nil {
+		t.Fatal(err)
+	}
+
+	split := Rabin(buf)
+	if len(split) == 1 {
+		t.Fatal("No split occurred!")
+	}
+
+	// Smallest and largest chunk seen, and how many chunks are exactly 16 KiB.
+	smallest, largest, capped := len(buf), 0, 0
+
+	n := 0
+	for _, b := range split {
+		// Check the bound first so an over-long result fails instead of panicking.
+		if n+len(b) > len(buf) || !bytes.Equal(b, buf[n:n+len(b)]) {
+			t.Fatal("Split lost data!")
+		}
+		n += len(b)
+
+		if len(b) < smallest {
+			smallest = len(b)
+		}
+		if len(b) > largest {
+			largest = len(b)
+		}
+		if len(b) == 16384 {
+			capped++
+		}
+	}
+
+	if n != len(buf) {
+		t.Fatal("missing some bytes!")
+	}
+	t.Log(len(split))
+	t.Log(smallest, largest, capped)
+}
+
diff --git a/importer/splitting.go b/importer/splitting.go
new file mode 100644
index 000000000..3b559edb9
--- /dev/null
+++ b/importer/splitting.go
@@ -0,0 +1,52 @@
+package importer
+
+type BlockSplitter func([]byte) [][]byte // signature of a splitting function such as Rabin: one byte slice in, a slice of byte slices out
+
+// Rabin splits b into content-defined chunks using a rolling polynomial hash
+// (Rabin-Karp style) over a sliding window of windowSize bytes. A chunk is cut
+// just before b[i] when the window ending at b[i] hashes to a value whose low
+// 13 bits are zero and the chunk already holds minChunk bytes, or as soon as
+// it reaches maxChunk bytes. Input no longer than one window is returned as a
+// single chunk. The chunks alias b and, in order, concatenate back to b.
+// A cut depends only on the bytes inside the window, so chunking can
+// resynchronise after an insertion or deletion earlier in the data.
+//
+// TODO: this should take a reader, not a byte array. what if we're splitting a 3TB file?
+func Rabin(b []byte) [][]byte {
+	const (
+		windowSize = 48        // bytes covered by the rolling hash
+		maxChunk   = 16 * 1024 // hard upper bound on chunk length
+		minChunk   = 2048      // shortest chunk a hash match may close
+		prime      = 61        // base of the polynomial hash
+		cutMask    = 1<<13 - 1 // selects the low 13 bits of the hash
+	)
+
+	// At most one window of data: nothing to roll over, return it whole.
+	if len(b) <= windowSize {
+		return [][]byte{b}
+	}
+
+	// Hash the first full window. dropFactor ends up as prime^windowSize, the
+	// weight the oldest byte carries after the hash is multiplied once more.
+	var hash uint64
+	dropFactor := uint64(1)
+	for _, c := range b[:windowSize] {
+		hash = hash*prime + uint64(c)
+		dropFactor *= prime
+	}
+
+	var out [][]byte
+	start := 0
+	for i := windowSize; i < len(b); i++ {
+		// Slide the window: take in b[i], drop b[i-windowSize]. Arithmetic is
+		// modulo 2^64, so the unsigned wrap-around is intentional.
+		hash = hash*prime + uint64(b[i]) - dropFactor*uint64(b[i-windowSize])
+
+		if size := i - start; size >= maxChunk || (size >= minChunk && hash&cutMask == 0) {
+			out = append(out, b[start:i])
+			start = i
+		}
+	}
+	// The tail is never empty here: start only ever takes values below len(b).
+	return append(out, b[start:])
+}
-- 
GitLab