Commit 1f309b72 authored by Jeromy

implement basic rabin fingerprint file splitting

parent fbd611f4
package importer
import (
"testing"
"crypto/rand"
"bytes"
)
// TestDataSplitting feeds 16 MiB of random bytes through Rabin and checks that
// more than one chunk comes back and that the chunks, concatenated in order,
// reproduce the input exactly. It also logs the chunk count, the smallest and
// largest chunk sizes, and how many chunks were exactly 16384 bytes long.
func TestDataSplitting(t *testing.T) {
	buf := make([]byte, 16*1024*1024)
	if _, err := rand.Read(buf); err != nil {
		t.Fatalf("reading random data: %v", err)
	}

	split := Rabin(buf)
	if len(split) == 1 {
		t.Fatal("No split occurred!")
	}

	smallest := 2 << 15 // 65536; starting value for the running minimum
	largest := 0
	mxcount := 0 // chunks whose length is exactly 16384
	n := 0       // offset into buf of the chunk being compared
	for _, b := range split {
		// Fail cleanly instead of panicking on an out-of-range slice if
		// the splitter ever returns more bytes than it was given.
		if n+len(b) > len(buf) {
			t.Fatal("Split produced more data than the input!")
		}
		if !bytes.Equal(b, buf[n:n+len(b)]) {
			t.Fatal("Split lost data!")
		}
		n += len(b)

		if len(b) < smallest {
			smallest = len(b)
		}
		if len(b) > largest {
			largest = len(b)
		}
		if len(b) == 16384 {
			mxcount++
		}
	}

	if n != len(buf) {
		t.Fatal("missing some bytes!")
	}

	t.Log(len(split))
	t.Log(smallest, largest, mxcount)
}
package importer
// BlockSplitter is the signature of a function that takes a byte slice and
// returns a slice of byte slices. Rabin in this package has this signature.
type BlockSplitter func([]byte) [][]byte
// Rabin splits b into consecutive chunks whose concatenation, in order, is
// exactly b. The returned chunks are sub-slices of b and share its backing
// array; nothing is copied.
//
// Inputs of 48 bytes or fewer (including empty or nil input) are returned as
// a single chunk. For longer inputs a polynomial hash (multiplier 61, with
// natural uint64 wrap-around) is accumulated over every byte seen so far, and
// a chunk is closed in front of byte i when either
//
//   - the current chunk has reached 16384 bytes, or
//   - the low 13 bits of the hash are zero and the current chunk holds at
//     least 2048 bytes.
//
// Whatever remains after the last boundary becomes the final chunk, which may
// be shorter than 2048 bytes.
//
// NOTE(review): the hash covers the whole prefix of b rather than a sliding
// window, so a boundary depends on every byte before it. Making it a true
// rolling hash would change all chunk boundaries and is left for a
// deliberate follow-up.
//
// TODO: this should take a reader, not a byte array. what if we're splitting a 3TB file?
func Rabin(b []byte) [][]byte {
	const (
		windowSize      = 48        // bytes hashed before any boundary is considered
		maxChunkSize    = 1024 * 16 // a chunk is cut unconditionally at this length
		minChunkSize    = 2048      // minimum length for a hash-selected cut
		prime           = uint64(61)
		boundaryModulus = 8192 // 2^13: cut when the low 13 bits of the hash are zero
	)

	// Smaller than a window? Get outa here!
	if len(b) <= windowSize {
		return [][]byte{b}
	}

	// Prime the hash with the first window; no boundary can fall inside it.
	var poly uint64
	for _, c := range b[:windowSize] {
		poly = poly*prime + uint64(c)
	}

	var out [][]byte
	start := 0 // index of the first byte of the chunk being built
	for i := windowSize; i < len(b); i++ {
		poly = poly*prime + uint64(b[i])

		size := i - start
		if size >= maxChunkSize || (size >= minChunkSize && poly%boundaryModulus == 0) {
			// push block
			out = append(out, b[start:i])
			start = i
		}
	}

	// start is at most len(b)-1 here, so the tail is never empty.
	return append(out, b[start:])
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment