rabin.go 1.92 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94
package chunk

import (
	"bufio"
	"bytes"
	"fmt"
	"io"
	"math"
)

type MaybeRabin struct {
	mask         int
	windowSize   int
	MinBlockSize int
	MaxBlockSize int
}

func NewMaybeRabin(avgBlkSize int) *MaybeRabin {
	blkbits := uint(math.Log2(float64(avgBlkSize)))
	rb := new(MaybeRabin)
	rb.mask = (1 << blkbits) - 1
	rb.windowSize = 16 // probably a good number...
	rb.MinBlockSize = avgBlkSize / 2
	rb.MaxBlockSize = (avgBlkSize / 2) * 3
	return rb
}

func (mr *MaybeRabin) Split(r io.Reader) chan []byte {
	out := make(chan []byte, 16)
	go func() {
		inbuf := bufio.NewReader(r)
		blkbuf := new(bytes.Buffer)

		// some bullshit numbers i made up
		a := 10         // honestly, no idea what this is
		MOD := 33554383 // randomly chosen (seriously)
		an := 1
		rollingHash := 0

		// Window is a circular buffer
		window := make([]byte, mr.windowSize)
		push := func(i int, val byte) (outval int) {
			outval = int(window[i%len(window)])
			window[i%len(window)] = val
			return
		}

		// Duplicate byte slice
		dup := func(b []byte) []byte {
			d := make([]byte, len(b))
			copy(d, b)
			return d
		}

		// Fill up the window
		i := 0
		for ; i < mr.windowSize; i++ {
			b, err := inbuf.ReadByte()
			if err != nil {
				fmt.Println(err)
				return
			}
			blkbuf.WriteByte(b)
			push(i, b)
			rollingHash = (rollingHash*a + int(b)) % MOD
			an = (an * a) % MOD
		}

		for ; true; i++ {
			b, err := inbuf.ReadByte()
			if err != nil {
				break
			}
			outval := push(i, b)
			blkbuf.WriteByte(b)
			rollingHash = (rollingHash*a + int(b) - an*outval) % MOD
			if (rollingHash&mr.mask == mr.mask && blkbuf.Len() > mr.MinBlockSize) ||
				blkbuf.Len() >= mr.MaxBlockSize {
				out <- dup(blkbuf.Bytes())
				blkbuf.Reset()
			}

			// Check if there are enough remaining
			peek, err := inbuf.Peek(mr.windowSize)
			if err != nil || len(peek) != mr.windowSize {
				break
			}
		}
		io.Copy(blkbuf, inbuf)
		out <- blkbuf.Bytes()
		close(out)
	}()
	return out
}