Commit 096420cb authored by Juan Batiz-Benet's avatar Juan Batiz-Benet

Merge pull request #1021 from ipfs/bloom-filter-fix

Make bloom filters simpler
parents b041ccdc 3d8e96a2
......@@ -213,6 +213,10 @@
"ImportPath": "github.com/mitchellh/go-homedir",
"Rev": "7d2d8c8a4e078ce3c58736ab521a40b37a504c52"
},
{
"ImportPath": "github.com/mtchavez/jenkins",
"Rev": "5a816af6ef21ef401bff5e4b7dd255d63400f497"
},
{
"ImportPath": "github.com/syndtr/goleveldb/leveldb",
"Rev": "87e4e645d80ae9c537e8f2dee52b28036a5dd75e"
......@@ -221,6 +225,10 @@
"ImportPath": "github.com/syndtr/gosnappy/snappy",
"Rev": "156a073208e131d7d2e212cb749feae7c339e846"
},
{
"ImportPath": "github.com/whyrusleeping/go-metrics",
"Rev": "1cd8009604ec2238b5a71305a0ecd974066e0e16"
},
{
"ImportPath": "golang.org/x/crypto/blowfish",
"Rev": "b7d6bf2c61544745a02f83dec90393985fc3a065"
......@@ -233,10 +241,6 @@
"ImportPath": "golang.org/x/net/context",
"Rev": "7dbad50ab5b31073856416cdcfeb2796d682f844"
},
{
"ImportPath": "github.com/whyrusleeping/go-metrics",
"Rev": "1cd8009604ec2238b5a71305a0ecd974066e0e16"
},
{
"ImportPath": "gopkg.in/fsnotify.v1",
"Comment": "v1.2.0",
......
# Compiled Object files, Static and Dynamic libs (Shared Objects)
*.o
*.a
*.so
# Folders
_obj
_test
# Architecture specific extensions/prefixes
*.[568vq]
[568vq].out
*.cgo1.go
*.cgo2.c
_cgo_defun.c
_cgo_gotypes.go
_cgo_export.*
_testmain.go
*.exe
*.test
go:
- 1.1
- tip
install:
- go get github.com/onsi/ginkgo
- go get github.com/onsi/gomega
before_script: go test -i ./...
script: go test ./...
build:
go build jenkins.go
run:
go run jenkins.go
test:
go test -cover
default:
go run jenkins.go
Jenkins
=================
Golang Jenkins hash
[![Build Status](https://travis-ci.org/mtchavez/go-jenkins-hashes.png?branch=master)](https://travis-ci.org/mtchavez/go-jenkins-hashes)
## Install
`go get -u github.com/mtchavez/jenkins`
## Usage
Jenkins follows the [Hash32](http://golang.org/pkg/hash/#Hash32) interface from the Go standard library
```go
// Create a new hash
jenkhash := New()
// Write a string of bytes to hash
key := []byte("my-random-key")
length, err := jenkhash.Write(key)
// Get uint32 sum of hash
sum := jenkhash.Sum32()
// Append the current 4-byte sum (big-endian) to a byte slice
sumbytes := jenkhash.Sum(key)
```
## Testing
Uses [Ginkgo](http://onsi.github.io/ginkgo/) for testing.
Run via `make test` which will run `go test -cover`
## Documentation
Docs on [godoc](http://godoc.org/github.com/mtchavez/jenkins)
## License
Written by Chavez
Released under the MIT License: http://www.opensource.org/licenses/mit-license.php
package jenkins
import "hash"
// jenkhash is the running 32-bit state of a Jenkins one-at-a-time hash.
// The value itself is the digest; there is no separate buffer.
type jenkhash uint32

// New returns a hash.Hash32 whose state starts at zero.
func New() hash.Hash32 {
	return new(jenkhash)
}

// Write mixes every byte of key into the state and then applies the
// avalanche (finalisation) steps before storing the result.
//
// The finalisation runs at the end of every call, including calls with an
// empty key, so feeding the same bytes through several Write calls does not
// produce the same sum as a single Write of the concatenation.
//
// It always reports len(key) bytes written and a nil error.
func (j *jenkhash) Write(key []byte) (int, error) {
	h := uint32(*j)
	for i := 0; i < len(key); i++ {
		h += uint32(key[i])
		h += h << 10
		h ^= h >> 6
	}

	// Final avalanche.
	h += h << 3
	h ^= h >> 11
	h += h << 15

	*j = jenkhash(h)
	return len(key), nil
}

// Reset puts the state back to zero, the value New starts from.
func (j *jenkhash) Reset() {
	*j = jenkhash(0)
}

// Size reports the number of bytes Sum appends: 4.
func (j *jenkhash) Size() int {
	return 4
}

// BlockSize reports the hash's block size: 1 byte.
func (j *jenkhash) BlockSize() int {
	return 1
}

// Sum32 returns the current state as a uint32 without modifying it.
func (j *jenkhash) Sum32() uint32 {
	return uint32(*j)
}

// Sum appends the current 32-bit state to in, most significant byte first,
// and returns the extended slice. The state is left untouched.
func (j *jenkhash) Sum(in []byte) []byte {
	sum := j.Sum32()
	var tail [4]byte
	for i := range tail {
		tail[i] = byte(sum >> (24 - 8*uint(i)))
	}
	return append(in, tail[:]...)
}
package jenkins
import (
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
"testing"
)
// TestJenkins is the "go test" entry point for the spec suite: it registers
// Fail as the failure handler and then runs the specs under the suite name
// "Jenkins Suite".
func TestJenkins(t *testing.T) {
// The fail handler must be registered before the specs run.
RegisterFailHandler(Fail)
RunSpecs(t, "Jenkins Suite")
}
package jenkins
import (
. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
"hash"
)
// Spec tree for the jenkins hash. Assigning to the blank identifier runs
// Describe at package initialisation so the specs are registered.
var _ = Describe("Jenkins", func() {
// Shared fixtures, rebuilt before every spec by BeforeEach below.
var jhash hash.Hash32
var key []byte
BeforeEach(func() {
jhash = New()
key = []byte("Apple")
})
// New: the returned value is a *jenkhash whose sum starts at 0.
Describe("New", func() {
It("returns jenkhash", func() {
var h *jenkhash
Expect(jhash).To(BeAssignableToTypeOf(h))
})
It("initializes offset to 0", func() {
Expect(jhash.Sum32()).To(Equal(uint32(0)))
})
})
// Write: reports the number of bytes in key ("Apple" is 5) and a nil error.
Describe("Write", func() {
It("returns key length", func() {
length, _ := jhash.Write(key)
Expect(length).To(Equal(5))
})
It("has no error", func() {
_, err := jhash.Write(key)
Expect(err).To(BeNil())
})
})
// Reset: after a Write has made the sum non-zero, Reset returns it to 0.
Describe("Reset", func() {
It("sets back to 0", func() {
Expect(jhash.Sum32()).To(Equal(uint32(0)))
jhash.Write(key)
Expect(jhash.Sum32()).NotTo(Equal(uint32(0)))
jhash.Reset()
Expect(jhash.Sum32()).To(Equal(uint32(0)))
})
})
Describe("Size", func() {
It("is 4", func() {
Expect(jhash.Size()).To(Equal(4))
})
})
Describe("BlockSize", func() {
It("is 1", func() {
Expect(jhash.BlockSize()).To(Equal(1))
})
})
// Sum32: 0 before any Write; 884782484 after writing "Apple".
Describe("Sum32", func() {
It("defaults to 0", func() {
Expect(jhash.Sum32()).To(Equal(uint32(0)))
})
It("sums hash", func() {
jhash.Write(key)
Expect(jhash.Sum32()).To(Equal(uint32(884782484)))
})
})
// Sum: the result is the argument ("Apple" = 41 70 70 6c 65) followed by the
// four bytes of the current sum, most significant byte first.
Describe("Sum", func() {
It("default 0 hash byte returned", func() {
expected := []byte{0x41, 0x70, 0x70, 0x6c, 0x65, 0x0, 0x0, 0x0, 0x0}
Expect(jhash.Sum(key)).To(Equal(expected))
})
It("returns sum byte array", func() {
jhash.Write(key)
// 0x34bcb594 == 884782484, the Sum32 value asserted above.
expected := []byte{0x41, 0x70, 0x70, 0x6c, 0x65, 0x34, 0xbc, 0xb5, 0x94}
Expect(jhash.Sum(key)).To(Equal(expected))
})
})
})
......@@ -2,13 +2,11 @@
package bloom
import (
"encoding/binary"
"errors"
"fmt"
// Non crypto hash, because speed
"github.com/ipfs/go-ipfs/Godeps/_workspace/src/github.com/mtchavez/jenkins"
"hash"
"hash/adler32"
"hash/crc32"
"hash/fnv"
"math/big"
)
type Filter interface {
......@@ -17,61 +15,66 @@ type Filter interface {
Merge(Filter) (Filter, error)
}
// BasicFilter returns the Filter produced by NewFilter with a size argument
// of 2048 and three standard-library 32-bit hashes: Adler-32, FNV-1 and
// CRC-32 (IEEE polynomial).
func BasicFilter() Filter {
// Non crypto hashes, because speed
return NewFilter(2048, adler32.New(), fnv.New32(), crc32.NewIEEE())
}
func NewFilter(size int, hashes ...hash.Hash) Filter {
func NewFilter(size int) Filter {
return &filter{
hash: jenkins.New(),
filter: make([]byte, size),
hashes: hashes,
k: 3,
}
}
type filter struct {
filter []byte
hashes []hash.Hash
hash hash.Hash32
k int
}
// BasicFilter returns the Filter produced by NewFilter with a size argument
// of 2048.
func BasicFilter() Filter {
	const basicSize = 2048
	return NewFilter(basicSize)
}
func (f *filter) Add(k []byte) {
for _, h := range f.hashes {
i := bytesMod(h.Sum(k), int64(len(f.filter)*8))
f.setBit(i)
func (f *filter) Add(bytes []byte) {
for _, bit := range f.getBitIndicies(bytes) {
f.setBit(bit)
}
}
func (f *filter) Find(k []byte) bool {
for _, h := range f.hashes {
i := bytesMod(h.Sum(k), int64(len(f.filter)*8))
if !f.getBit(i) {
// getBitIndicies derives f.k bit positions for bytes, each in the range
// [0, len(f.filter)*8).
//
// The input is written to f.hash once. Each position is the current Sum32
// reduced modulo the number of bits in the filter, and that sum is then
// written back into the hash as 4 little-endian bytes to produce the next
// one. f.hash is reset before returning so the next call starts clean.
func (f *filter) getBitIndicies(bytes []byte) []uint32 {
	nbits := uint32(len(f.filter)) * 8
	positions := make([]uint32, f.k)
	var scratch [4]byte

	f.hash.Write(bytes)
	for i := range positions {
		sum := f.hash.Sum32()
		positions[i] = sum % nbits
		// Chain: feed the sum back in to obtain the next value.
		binary.LittleEndian.PutUint32(scratch[:], sum)
		f.hash.Write(scratch[:])
	}
	f.hash.Reset()
	return positions
}
func (f *filter) Find(bytes []byte) bool {
for _, bit := range f.getBitIndicies(bytes) {
if !f.getBit(bit) {
return false
}
}
return true
}
func (f *filter) setBit(i int64) {
fmt.Printf("setting bit %d\n", i)
func (f *filter) setBit(i uint32) {
f.filter[i/8] |= (1 << byte(i%8))
}
func (f *filter) getBit(i int64) bool {
fmt.Printf("getting bit %d\n", i)
func (f *filter) getBit(i uint32) bool {
return f.filter[i/8]&(1<<byte(i%8)) != 0
}
// bytesMod interprets b as a big-endian unsigned integer of arbitrary length
// and returns that value reduced modulo modulo.
func bytesMod(b []byte, modulo int64) int64 {
	value := new(big.Int).SetBytes(b)
	divisor := big.NewInt(modulo)
	return value.Mod(value, divisor).Int64()
}
func (f *filter) Merge(o Filter) (Filter, error) {
casfil, ok := o.(*filter)
if !ok {
......@@ -82,12 +85,15 @@ func (f *filter) Merge(o Filter) (Filter, error) {
return nil, errors.New("filter lengths must match!")
}
nfilt := new(filter)
// this bit is sketchy, need a way of comparing hash functions
nfilt.hashes = f.hashes
if casfil.k != f.k {
return nil, errors.New("filter k-values must match!")
}
nfilt := new(filter)
nfilt.hash = f.hash
nfilt.filter = make([]byte, len(f.filter))
nfilt.k = f.k
for i, v := range f.filter {
nfilt.filter[i] = v | casfil.filter[i]
}
......
package bloom
import "testing"
import (
"encoding/binary"
"fmt"
"testing"
)
func TestFilter(t *testing.T) {
f := BasicFilter()
f := NewFilter(128)
keys := [][]byte{
[]byte("hello"),
[]byte("fish"),
[]byte("ipfsrocks"),
[]byte("i want ipfs socks"),
}
f.Add(keys[0])
......@@ -21,10 +27,54 @@ func TestFilter(t *testing.T) {
}
f.Add(keys[2])
f.Add(keys[3])
for _, k := range keys {
if !f.Find(k) {
t.Fatal("Couldnt find one of three keys")
}
}
if f.Find([]byte("beep boop")) {
t.Fatal("Got false positive! Super unlikely!")
}
fmt.Println(f)
}
// TestMerge checks two properties of Filter.Merge: merging filters of
// different sizes must return an error, and merging two equally sized
// filters must yield a filter in which every key added to either input
// can be found.
func TestMerge(t *testing.T) {
	f1 := NewFilter(128)
	f2 := NewFilter(128)

	fbork := NewFilter(32)
	if _, err := f1.Merge(fbork); err == nil {
		t.Fatal("Merge should fail on filters with different lengths")
	}

	// Keys are the little-endian encodings of 0..19: the first ten go into
	// f1, the last ten into f2.
	b := make([]byte, 4)
	var i uint32
	for i = 0; i < 10; i++ {
		binary.LittleEndian.PutUint32(b, i)
		f1.Add(b)
	}

	for i = 10; i < 20; i++ {
		binary.LittleEndian.PutUint32(b, i)
		f2.Add(b)
	}

	// Check the error instead of discarding it: on failure merged would be
	// nil and the Find call below would panic rather than fail the test.
	merged, err := f1.Merge(f2)
	if err != nil {
		t.Fatalf("Merge of equally sized filters failed: %v", err)
	}

	for i = 0; i < 20; i++ {
		binary.LittleEndian.PutUint32(b, i)
		if !merged.Find(b) {
			t.Fatal("Could not find all keys in merged filter")
		}
	}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment