Commit 2c3f9f24 authored by Kristoffer Ström

Add hamming distance calculation to bloom filters

parent 24063341
......@@ -217,6 +217,11 @@
"ImportPath": "github.com/mtchavez/jenkins",
"Rev": "5a816af6ef21ef401bff5e4b7dd255d63400f497"
},
{
"ImportPath": "github.com/steakknife/hamming",
"Comment": "0.0.2-2-g9ad4a62",
"Rev": "9ad4a620e3d573267a083c892f2b42a39302153b"
},
{
"ImportPath": "github.com/syndtr/goleveldb/leveldb",
"Rev": "87e4e645d80ae9c537e8f2dee52b28036a5dd75e"
......
Copyright (c) 2014 Barry Allard
MIT license
package hamming
// SSE4.x PopCnt is 10x slower
// References: check out Hacker's Delight
// Bit masks and the byte-summing multiplier used by the branch-free
// population count in CountBitsUint64. Only m1, m2, m4 and h01 are referenced
// by the code visible here; the other constants are kept as declared.
const (
m1 uint64 = 0x5555555555555555 // binary: 0101... (low bit of every 2-bit pair)
m2 uint64 = 0x3333333333333333 // binary: 00110011... (low 2 bits of every 4-bit group)
m4 uint64 = 0x0f0f0f0f0f0f0f0f // binary: 4 zeros, 4 ones ... (low nibble of every byte)
m8 uint64 = 0x00ff00ff00ff00ff // binary: 8 zeros, 8 ones ...
m16 uint64 = 0x0000ffff0000ffff // binary: 16 zeros, 16 ones ...
m32 uint64 = 0x00000000ffffffff // binary: 32 zeros, 32 ones
hff uint64 = 0xffffffffffffffff // binary: all ones
h01 uint64 = 0x0101010101010101 // 0x01 in every byte: 256^0 + 256^1 + ... + 256^7
)
// table maps every possible byte value to the number of bits set in it;
// CountBitsByte indexes it directly with its argument.
var table = [256]byte{0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}
// Uint64 returns the Hamming distance between x and y, i.e. the number of bit
// positions at which the two 64-bit values differ.
func Uint64(x, y uint64) int {
	// XOR leaves a 1 exactly where the operands disagree.
	diff := x ^ y
	return CountBitsUint64(diff)
}
// Byte returns the Hamming distance between x and y, i.e. the number of bit
// positions at which the two bytes differ.
func Byte(x, y byte) int {
	// XOR leaves a 1 exactly where the operands disagree.
	diff := x ^ y
	return CountBitsByte(diff)
}
// CountBitsUint64 returns the number of bits set in x.
//
// It sums bits in parallel: 1-bit fields are folded into 2-bit counts, those
// into 4-bit counts, those into per-byte counts, and a final multiplication
// adds all byte counts into the most significant byte.
func CountBitsUint64(x uint64) int {
	// Each 2-bit field now holds the count of set bits it originally contained.
	pairs := x - ((x >> 1) & m1)

	// Each 4-bit field now holds the count for its four original bits.
	nibbles := (pairs & m2) + ((pairs >> 2) & m2)

	// Each byte now holds the count for its eight original bits.
	perByte := (nibbles + (nibbles >> 4)) & m4

	// Multiplying by h01 computes perByte + (perByte<<8) + (perByte<<16) + ...,
	// so the top byte ends up holding the sum of all eight byte counts.
	return int((perByte * h01) >> 56)
}
// CountBitsByte returns the number of bits set in x by looking the value up
// in the precomputed 256-entry table.
func CountBitsByte(x byte) int {
	ones := table[x]
	return int(ones)
}
package hamming
import (
"testing"
)
// testCountBitsUint64Case pairs a 64-bit input with the set-bit count that
// CountBitsUint64 is expected to return for it.
type testCountBitsUint64Case struct {
x uint64 // input value
n int // expected number of set bits in x
}
// testCountBitsByteCase pairs a byte input with the set-bit count that
// CountBitsByte is expected to return for it.
type testCountBitsByteCase struct {
x byte // input value
n int // expected number of set bits in x
}
// testCountBitsByteCases lists byte inputs with their expected set-bit counts.
// It is shared by the CountBitsByte test and benchmark.
var testCountBitsByteCases = []testCountBitsByteCase{
{0x00, 0},
{0x01, 1},
{0x02, 1},
{0x03, 2},
{0xaa, 4},
{0x55, 4},
{0x7f, 7},
{0xff, 8},
}
// testCountBitsUint64Cases lists 64-bit inputs with their expected set-bit
// counts, including values that cross the 32-bit boundary and the all-ones
// word. It is shared by the CountBitsUint64 test and benchmark.
var testCountBitsUint64Cases = []testCountBitsUint64Case{
{0x00, 0},
{0x01, 1},
{0x02, 1},
{0x03, 2},
{0xaa, 4},
{0x55, 4},
{0x7f, 7},
{0xff, 8},
{0xffff, 16},
{0xffffffff, 32},
{0x1ffffffff, 33},
{0x3ffffffff, 34},
{0x7ffffffff, 35},
{0xfffffffff, 36},
{0x3fffffffffffffff, 62},
{0x7fffffffffffffff, 63},
{0xffffffffffffffff, 64},
}
// TestCountBitByte checks CountBitsByte against every entry of
// testCountBitsByteCases, aborting on the first mismatch and logging each
// case that passes.
func TestCountBitByte(t *testing.T) {
	for _, tc := range testCountBitsByteCases {
		got := CountBitsByte(tc.x)
		if got != tc.n {
			// t.Fatal stops the test here, so the log below is only
			// reached for passing cases.
			t.Fatal("CountBitsByte(", tc.x, ") = ", got, " != ", tc.n)
		}
		t.Log("CountBitsByte(", tc.x, ") == ", tc.n)
	}
}
// TestCountBitUint64 checks CountBitsUint64 against every entry of
// testCountBitsUint64Cases, aborting on the first mismatch and logging each
// case that passes.
func TestCountBitUint64(t *testing.T) {
	for _, tc := range testCountBitsUint64Cases {
		got := CountBitsUint64(tc.x)
		if got != tc.n {
			// t.Fatal stops the test here, so the log below is only
			// reached for passing cases.
			t.Fatal("CountBitsUint64(", tc.x, ") = ", got, " != ", tc.n)
		}
		t.Log("CountBitsUint64(", tc.x, ") == ", tc.n)
	}
}
func BenchmarkCountBitsUint64(b *testing.B) {
j := 0
for i := 0; i < b.N; i++ {
CountBitsUint64(testCountBitsUint64Cases[j].x)
j++
if j == len(testCountBitsUint64Cases) {
j = 0
}
}
}
func BenchmarkCountBitsByte(b *testing.B) {
j := 0
for i := 0; i < b.N; i++ {
CountBitsByte(testCountBitsByteCases[j].x)
j++
if j == len(testCountBitsByteCases) {
j = 0
}
}
}
......@@ -6,6 +6,7 @@ import (
"errors"
// Non crypto hash, because speed
"github.com/ipfs/go-ipfs/Godeps/_workspace/src/github.com/mtchavez/jenkins"
"github.com/ipfs/go-ipfs/Godeps/_workspace/src/github.com/steakknife/hamming"
"hash"
)
......@@ -13,6 +14,7 @@ type Filter interface {
Add([]byte)
Find([]byte) bool
Merge(Filter) (Filter, error)
HammingDistance(Filter) (int, error)
}
func NewFilter(size int) Filter {
......@@ -100,3 +102,23 @@ func (f *filter) Merge(o Filter) (Filter, error) {
return nfilt, nil
}
// HammingDistance returns the number of bit positions at which f and o
// differ.
//
// o must be a *filter whose underlying byte storage has the same length as
// f's; otherwise 0 and an error are returned.
func (f *filter) HammingDistance(o Filter) (int, error) {
	other, ok := o.(*filter)
	switch {
	case !ok:
		return 0, errors.New("Unsupported filter type")
	case len(f.filter) != len(other.filter):
		return 0, errors.New("filter lengths must match!")
	}

	// Sum the per-byte distances over the whole filter.
	dist := 0
	for i, n := 0, len(f.filter); i < n; i++ {
		dist += hamming.Byte(f.filter[i], other.filter[i])
	}
	return dist, nil
}
......@@ -78,3 +78,17 @@ func TestMerge(t *testing.T) {
}
}
}
// TestHamming adds two entries to one of two filters created with the same
// size argument and checks that the filters then differ in exactly 6 bits.
// Nothing is added to the second filter.
func TestHamming(t *testing.T) {
	f1 := NewFilter(128)
	f2 := NewFilter(128)

	f1.Add([]byte("no collision"))
	f1.Add([]byte("collision? no!"))

	// Surface the error instead of discarding it: a failed call returns 0,
	// which would otherwise show up only as a confusing distance mismatch.
	dist, err := f1.HammingDistance(f2)
	if err != nil {
		t.Fatal(err)
	}
	if dist != 6 {
		t.Fatalf("Should have 6 bit difference, got %d", dist)
	}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment