diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..4b86d7197f9a6f1a988c503defc8451392cd5e55 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,8 @@ +blank_issues_enabled: false +contact_links: + - name: Getting Help on IPFS + url: https://ipfs.io/help + about: All information about how and where to get help on IPFS. + - name: IPFS Official Forum + url: https://discuss.ipfs.io + about: Please post general questions, support requests, and discussions here. diff --git a/.github/ISSUE_TEMPLATE/open_an_issue.md b/.github/ISSUE_TEMPLATE/open_an_issue.md new file mode 100644 index 0000000000000000000000000000000000000000..4fcbd00aca0d513c2729d8df4afb5a01fdbe7d02 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/open_an_issue.md @@ -0,0 +1,19 @@ +--- +name: Open an issue +about: Only for actionable issues relevant to this repository. +title: '' +labels: need/triage +assignees: '' + +--- + diff --git a/.github/config.yml b/.github/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..ed26646a0f7cdda6cf10ede2b0b98cac89cf67b0 --- /dev/null +++ b/.github/config.yml @@ -0,0 +1,68 @@ +# Configuration for welcome - https://github.com/behaviorbot/welcome + +# Configuration for new-issue-welcome - https://github.com/behaviorbot/new-issue-welcome +# Comment to be posted to on first time issues +newIssueWelcomeComment: > + Thank you for submitting your first issue to this repository! A maintainer + will be here shortly to triage and review. + + In the meantime, please double-check that you have provided all the + necessary information to make this process easy! Any information that can + help save additional round trips is useful! We currently aim to give + initial feedback within **two business days**. If this does not happen, feel + free to leave a comment. 
+ + Please keep an eye on how this issue will be labeled, as labels give an + overview of priorities, assignments and additional actions requested by the + maintainers: + + - "Priority" labels will show how urgent this is for the team. + - "Status" labels will show if this is ready to be worked on, blocked, or in progress. + - "Need" labels will indicate if additional input or analysis is required. + + Finally, remember to use https://discuss.ipfs.io if you just need general + support. + +# Configuration for new-pr-welcome - https://github.com/behaviorbot/new-pr-welcome +# Comment to be posted on PRs from first time contributors in your repository +newPRWelcomeComment: > + Thank you for submitting this PR! + + A maintainer will be here shortly to review it. + + We are super grateful, but we are also overloaded! Help us by making sure + that: + + * The context for this PR is clear, with relevant discussion, decisions + and stakeholders linked/mentioned. + + * Your contribution itself is clear (code comments, self-review for the + rest) and in its best form. Follow the [code contribution + guidelines](https://github.com/ipfs/community/blob/master/CONTRIBUTING.md#code-contribution-guidelines) + if they apply. + + Getting other community members to do a review would be a great help too on + complex PRs (you can ask in the chats/forums). If you are unsure about + something, just leave us a comment. + + Next steps: + + * A maintainer will triage and assign priority to this PR, commenting on + any missing things and potentially assigning a reviewer for high + priority items. + + * The PR gets reviewed, discussed and approved as needed. + + * The PR is merged by maintainers when it has been approved and comments addressed. + + We currently aim to provide initial feedback/triaging within **two business + days**. Please keep an eye on any labelling actions, as these will indicate + priorities and status of your contribution. + + We are very grateful for your contribution! 
+ + +# Configuration for first-pr-merge - https://github.com/behaviorbot/first-pr-merge +# Comment to be posted to on pull requests merged by a first time user +# Currently disabled +#firstPRMergeComment: "" diff --git a/.gx/lastpubver b/.gx/lastpubver new file mode 100644 index 0000000000000000000000000000000000000000..16921034f48ba21e3cd73352f2c44b91ebaa34c3 --- /dev/null +++ b/.gx/lastpubver @@ -0,0 +1 @@ +0.1.2: QmWaLViWQF8jgyoLLqqcSrnp6dJpHESiJfzor1vrfDyTZf diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000000000000000000000000000000000000..4cfe98c2424d60b3a50d870dc0da5df2ee9243ba --- /dev/null +++ b/.travis.yml @@ -0,0 +1,32 @@ +os: + - linux + +language: go + +go: + - 1.11.x + +env: + global: + - GOTFLAGS="-race" + matrix: + - BUILD_DEPTYPE=gx + - BUILD_DEPTYPE=gomod + + +# disable travis install +install: + - true + +script: + - bash <(curl -s https://raw.githubusercontent.com/ipfs/ci-helpers/master/travis-ci/run-standard-tests.sh) + + +cache: + directories: + - $GOPATH/src/gx + - $GOPATH/pkg/mod + - $HOME/.cache/go-build + +notifications: + email: false diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..4b20050e84365ee2dd8e2db90e06d2fd02f46ac5 --- /dev/null +++ b/LICENSE @@ -0,0 +1,35 @@ +bbloom.go + +// The MIT License (MIT) +// Copyright (c) 2014 Andreas Briese, eduToolbox@Bri-C GmbH, Sarstedt + +// Permission is hereby granted, free of charge, to any person obtaining a copy of +// this software and associated documentation files (the "Software"), to deal in +// the Software without restriction, including without limitation the rights to +// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +// the Software, and to permit persons to whom the Software is furnished to do so, +// subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. 
+ +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +// IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +siphash.go + +// https://github.com/dchest/siphash +// +// Written in 2012 by Dmitry Chestnykh. +// +// To the extent possible under law, the author have dedicated all copyright +// and related and neighboring rights to this software to the public domain +// worldwide. This software is distributed without any warranty. +// http://creativecommons.org/publicdomain/zero/1.0/ +// +// Package siphash implements SipHash-2-4, a fast short-input PRF +// created by Jean-Philippe Aumasson and Daniel J. Bernstein. diff --git a/README.md b/README.md index a2512331caa8813a8a81e550d911d5c0cb8da03a..4ce909789fea573f317a696d401e8f780ba95b28 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,129 @@ -# bbloom +## bbloom: a bitset Bloom filter for go/golang +=== -binary bloom filter \ No newline at end of file +package implements a fast bloom filter with real 'bitset' and JSONMarshal/JSONUnmarshal to store/reload the Bloom filter. + +NOTE: the package uses unsafe.Pointer to set and read the bits from the bitset. If you're uncomfortable with using the unsafe package, please consider using my bloom filter package at github.com/AndreasBriese/bloom + +=== + +changelog 11/2015: new thread safe methods AddTS(), HasTS(), AddIfNotHasTS() following a suggestion from Srdjan Marinovic (github @a-little-srdjan), who used this to code a bloomfilter cache. 
+ +This bloom filter was developed to strengthen a website-log database and was tested and optimized for this log-entry mask: "2014/%02i/%02i %02i:%02i:%02i /info.html". +Nonetheless bbloom should work with any other form of entries. + +~~Hash function is a modified Berkeley DB sdbm hash (to optimize for smaller strings). sdbm http://www.cse.yorku.ca/~oz/hash.html~~ + +Found sipHash (SipHash-2-4, a fast short-input PRF created by Jean-Philippe Aumasson and Daniel J. Bernstein.) to be about as fast. sipHash had been ported by Dmitry Chestnykh to Go (github.com/dchest/siphash ) + +Minimum hashset size is: 512 bits ([8]uint64; will be set automatically). + +### install + +```sh +go get github.com/ipfs/bbloom +``` + +### test ++ change to folder ../bbloom ++ create wordlist in file "words.txt" (you might use `python permut.py`) ++ run 'go test -bench=.' within the folder + +```sh +go test -bench=. +``` + +~~If you've installed the GOCONVEY TDD-framework http://goconvey.co/ you can run the tests automatically.~~ + +using go's testing framework now (keep in mind that the op timing is related to 65536 operations of Add, Has, AddIfNotHas respectively) + +### usage + +after installation add + +```go +import ( + ... + "github.com/ipfs/bbloom" + ... + ) +``` + +at your header. 
In the program use + +```go +// create a bloom filter for 65536 items and 1 % wrong-positive ratio +// (New returns an error if the parameters are invalid) +bf, err := bbloom.New(float64(1<<16), float64(0.01)) + +// or +// create a bloom filter with 650000 bits for 65536 items and 7 locs per hash explicitly +// bf, err = bbloom.New(float64(650000), float64(7)) +// or +bf, err = bbloom.New(650000.0, 7.0) + +// add one item +bf.Add([]byte("butter")) + +// Number of elements added is exposed via ElementsAdded() +// Note: the element count will not be included in JSON export (for compatibility with older versions) +nOfElementsInFilter := bf.ElementsAdded() + +// check if item is in the filter +isIn := bf.Has([]byte("butter")) // should be true +isNotIn := bf.Has([]byte("Butter")) // should be false + +// 'add only if item is new' to the bloomfilter +added := bf.AddIfNotHas([]byte("butter")) // should be false because 'butter' is already in the set +added = bf.AddIfNotHas([]byte("buTTer")) // should be true because 'buTTer' is new + +// thread safe versions for concurrent use: AddTS, HasTS, AddIfNotHasTS +// add one item +bf.AddTS([]byte("peanutbutter")) +// check if item is in the filter +isIn = bf.HasTS([]byte("peanutbutter")) // should be true +isNotIn = bf.HasTS([]byte("peanutButter")) // should be false +// 'add only if item is new' to the bloomfilter +added = bf.AddIfNotHasTS([]byte("butter")) // should be false because 'butter' is already in the set +added = bf.AddIfNotHasTS([]byte("peanutbuTTer")) // should be true because 'peanutbuTTer' is new + +// convert to JSON ([]byte) +Json := bf.JSONMarshal() + +// bloomfilters Mutex is exposed for external un-/locking +// i.e. mutex lock while doing JSON conversion +bf.Mtx.Lock() +Json = bf.JSONMarshal() +bf.Mtx.Unlock() + +// restore a bloom filter from storage +bfNew, _ := bbloom.JSONUnmarshal(Json) + +isInNew := bfNew.Has([]byte("butter")) // should be true +isNotInNew := bfNew.Has([]byte("Butter")) // should be false + +``` + +to work with the bloom filter. + +### why 'fast'? 
+ +It's about 3 times faster than William Fitzgerald's bitset bloom filter https://github.com/willf/bloom . And it is about as fast as my []bool set variant for Bloom filters (see https://github.com/AndreasBriese/bloom ) but having an 8 times smaller memory footprint: + + + Bloom filter (filter size 524288, 7 hashlocs) + github.com/AndreasBriese/bbloom 'Add' 65536 items (10 repetitions): 6595800 ns (100 ns/op) + github.com/AndreasBriese/bbloom 'Has' 65536 items (10 repetitions): 5986600 ns (91 ns/op) + github.com/AndreasBriese/bloom 'Add' 65536 items (10 repetitions): 6304684 ns (96 ns/op) + github.com/AndreasBriese/bloom 'Has' 65536 items (10 repetitions): 6568663 ns (100 ns/op) + + github.com/willf/bloom 'Add' 65536 items (10 repetitions): 24367224 ns (371 ns/op) + github.com/willf/bloom 'Test' 65536 items (10 repetitions): 21881142 ns (333 ns/op) + github.com/dataence/bloom/standard 'Add' 65536 items (10 repetitions): 23041644 ns (351 ns/op) + github.com/dataence/bloom/standard 'Check' 65536 items (10 repetitions): 19153133 ns (292 ns/op) + github.com/cabello/bloom 'Add' 65536 items (10 repetitions): 131921507 ns (2012 ns/op) + github.com/cabello/bloom 'Contains' 65536 items (10 repetitions): 131108962 ns (2000 ns/op) + +(on MBPro15 OSX10.8.5 i7 4Core 2.4Ghz) + + +With 32bit bloom filters (bloom32) using modified sdbm, bloom32 does hashing with only 2 bit shifts, one xor and one subtraction per byte. sdbm is about as fast as fnv64a but gives fewer collisions with the dataset (see mask above). bloom.New(float64(10 * 1<<16),float64(7)) populated with 1<<16 random items from the dataset (see above) and tested against the rest results in less than 0.05% collisions. 
diff --git a/bbloom.go b/bbloom.go new file mode 100644 index 0000000000000000000000000000000000000000..36f12e0dea87922419481b97e37697607cedd5bb --- /dev/null +++ b/bbloom.go @@ -0,0 +1,326 @@ +// The MIT License (MIT) +// Copyright (c) 2014 Andreas Briese, eduToolbox@Bri-C GmbH, Sarstedt + +// Permission is hereby granted, free of charge, to any person obtaining a copy of +// this software and associated documentation files (the "Software"), to deal in +// the Software without restriction, including without limitation the rights to +// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of +// the Software, and to permit persons to whom the Software is furnished to do so, +// subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR +// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +// IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +package bbloom + +import ( + "encoding/binary" + "encoding/json" + "errors" + "log" + "math" + "math/bits" + "sync" +) + +func getSize(ui64 uint64) (size uint64, exponent uint64) { + if ui64 < uint64(512) { + ui64 = uint64(512) + } + size = uint64(1) + for size < ui64 { + size <<= 1 + exponent++ + } + return size, exponent +} + +func calcSizeByWrongPositives(numEntries, wrongs float64) (uint64, uint64) { + size := -1 * numEntries * math.Log(wrongs) / math.Pow(float64(0.69314718056), 2) + locs := math.Ceil(float64(0.69314718056) * size / numEntries) + return uint64(size), uint64(locs) +} + +var ErrUsage = errors.New("usage: New(float64(number_of_entries), float64(number_of_hashlocations)) i.e. New(float64(1000), float64(3)) or New(float64(number_of_entries), float64(ratio_of_false_positives)) i.e. New(float64(1000), float64(0.03))") +var ErrInvalidParms = errors.New("One of parameters was outside of allowed range") + +// New +// returns a new bloomfilter +func New(params ...float64) (bloomfilter *Bloom, err error) { + var entries, locs uint64 + if len(params) == 2 { + if params[0] < 0 || params[1] < 0 { + return nil, ErrInvalidParms + } + if params[1] < 1 { + entries, locs = calcSizeByWrongPositives(math.Max(params[0], 1), params[1]) + } else { + entries, locs = uint64(params[0]), uint64(params[1]) + } + } else { + return nil, ErrUsage + } + size, exponent := getSize(uint64(entries)) + bloomfilter = &Bloom{ + sizeExp: exponent, + size: size - 1, + setLocs: locs, + shift: 64 - exponent, + bitset: make([]uint64, size>>6), + } + return bloomfilter, nil +} + +// NewWithBoolset +// takes a []byte slice and number of locs per entry +// returns the bloomfilter with a bitset populated according to the input []byte +func NewWithBoolset(bs []byte, locs uint64) (bloomfilter *Bloom) { + bloomfilter, err := New(float64(len(bs)<<3), float64(locs)) + if err != nil { + panic(err) // Should never happen + } + for i := range bloomfilter.bitset { + bloomfilter.bitset[i] = 
binary.BigEndian.Uint64((bs)[i<<3:]) + } + return bloomfilter +} + +// bloomJSONImExport +// Im/Export structure used by JSONMarshal / JSONUnmarshal +type bloomJSONImExport struct { + FilterSet []byte + SetLocs uint64 +} + +// +// Bloom filter +type Bloom struct { + Mtx sync.RWMutex + bitset []uint64 + sizeExp uint64 + size uint64 + setLocs uint64 + shift uint64 + + content uint64 +} + +// ElementsAdded returns the number of elements added to the bloom filter. +func (bl *Bloom) ElementsAdded() uint64 { + return bl.content +} + +// <--- http://www.cse.yorku.ca/~oz/hash.html +// modified Berkeley DB Hash (32bit) +// hash is casted to l, h = 16bit fragments +// func (bl Bloom) absdbm(b *[]byte) (l, h uint64) { +// hash := uint64(len(*b)) +// for _, c := range *b { +// hash = uint64(c) + (hash << 6) + (hash << bl.sizeExp) - hash +// } +// h = hash >> bl.shift +// l = hash << bl.shift >> bl.shift +// return l, h +// } + +// Update: found sipHash of Jean-Philippe Aumasson & Daniel J. Bernstein to be even faster than absdbm() +// https://131002.net/siphash/ +// siphash was implemented for Go by Dmitry Chestnykh https://github.com/dchest/siphash + +// Add +// set the bit(s) for entry; Adds an entry to the Bloom filter +func (bl *Bloom) Add(entry []byte) { + bl.content++ + l, h := bl.sipHash(entry) + for i := uint64(0); i < (*bl).setLocs; i++ { + bl.set((h + i*l) & (*bl).size) + } +} + +// AddTS +// Thread safe: Mutex.Lock the bloomfilter for the time of processing the entry +func (bl *Bloom) AddTS(entry []byte) { + bl.Mtx.Lock() + bl.Add(entry) + bl.Mtx.Unlock() +} + +// Has +// check if bit(s) for entry is/are set +// returns true if the entry was added to the Bloom Filter +func (bl *Bloom) Has(entry []byte) bool { + l, h := bl.sipHash(entry) + res := true + for i := uint64(0); i < bl.setLocs; i++ { + res = res && bl.isSet((h+i*l)&bl.size) + // Branching here (early escape) is not worth it + // This is my conclusion from benchmarks + // (prevents loop unrolling) + // if 
!res { + // return false + // } + } + return res +} + +// HasTS +// Thread safe: Mutex.Lock the bloomfilter for the time of processing the entry +func (bl *Bloom) HasTS(entry []byte) bool { + bl.Mtx.RLock() + has := bl.Has(entry[:]) + bl.Mtx.RUnlock() + return has +} + +// AddIfNotHas +// Only Add entry if it's not present in the bloomfilter +// returns true if entry was added +// returns false if entry was allready registered in the bloomfilter +func (bl *Bloom) AddIfNotHas(entry []byte) (added bool) { + l, h := bl.sipHash(entry) + contained := true + for i := uint64(0); i < bl.setLocs; i++ { + prev := bl.getSet((h + i*l) & bl.size) + contained = contained && prev + } + if !contained { + bl.content++ + } + return !contained +} + +// AddIfNotHasTS +// Tread safe: Only Add entry if it's not present in the bloomfilter +// returns true if entry was added +// returns false if entry was allready registered in the bloomfilter +func (bl *Bloom) AddIfNotHasTS(entry []byte) (added bool) { + bl.Mtx.Lock() + added = bl.AddIfNotHas(entry[:]) + bl.Mtx.Unlock() + return added +} + +// Clear +// resets the Bloom filter +func (bl *Bloom) Clear() { + bs := bl.bitset // important performance optimization. + for i := range bs { + bs[i] = 0 + } + bl.content = 0 +} + +// ClearTS clears the bloom filter (thread safe). 
+func (bl *Bloom) ClearTS() { + bl.Mtx.Lock() + bl.Clear() + bl.Mtx.Unlock() +} + +func (bl *Bloom) set(idx uint64) { + bl.bitset[idx>>6] |= 1 << (idx % 64) +} + +func (bl *Bloom) getSet(idx uint64) bool { + cur := bl.bitset[idx>>6] + bit := uint64(1 << (idx % 64)) + bl.bitset[idx>>6] = cur | bit + return (cur & bit) > 0 +} + +func (bl *Bloom) isSet(idx uint64) bool { + return bl.bitset[idx>>6]&(1<<(idx%64)) > 0 +} + +func (bl *Bloom) marshal() bloomJSONImExport { + bloomImEx := bloomJSONImExport{} + bloomImEx.SetLocs = uint64(bl.setLocs) + bloomImEx.FilterSet = make([]byte, len(bl.bitset)<<3) + for i, w := range bl.bitset { + binary.BigEndian.PutUint64(bloomImEx.FilterSet[i<<3:], w) + } + return bloomImEx +} + +// JSONMarshal +// returns JSON-object (type bloomJSONImExport) as []byte +func (bl *Bloom) JSONMarshal() []byte { + data, err := json.Marshal(bl.marshal()) + if err != nil { + log.Fatal("json.Marshal failed: ", err) + } + return data +} + +// JSONMarshalTS is a thread-safe version of JSONMarshal +func (bl *Bloom) JSONMarshalTS() []byte { + bl.Mtx.RLock() + export := bl.marshal() + bl.Mtx.RUnlock() + data, err := json.Marshal(export) + if err != nil { + log.Fatal("json.Marshal failed: ", err) + } + return data +} + +// JSONUnmarshal +// takes JSON-Object (type bloomJSONImExport) as []bytes +// returns bloom32 / bloom64 object +func JSONUnmarshal(dbData []byte) (*Bloom, error) { + bloomImEx := bloomJSONImExport{} + err := json.Unmarshal(dbData, &bloomImEx) + if err != nil { + return nil, err + } + bf := NewWithBoolset(bloomImEx.FilterSet, bloomImEx.SetLocs) + return bf, nil +} + +// FillRatio returns the fraction of bits set. 
+func (bl *Bloom) FillRatio() float64 { + count := uint64(0) + for _, b := range bl.bitset { + count += uint64(bits.OnesCount64(b)) + } + return float64(count) / float64(bl.size+1) +} + +// FillRatioTS is a thread-save version of FillRatio +func (bl *Bloom) FillRatioTS() float64 { + bl.Mtx.RLock() + fr := bl.FillRatio() + bl.Mtx.RUnlock() + return fr +} + +// // alternative hashFn +// func (bl Bloom) fnv64a(b *[]byte) (l, h uint64) { +// h64 := fnv.New64a() +// h64.Write(*b) +// hash := h64.Sum64() +// h = hash >> 32 +// l = hash << 32 >> 32 +// return l, h +// } +// +// // <-- http://partow.net/programming/hashfunctions/index.html +// // citation: An algorithm proposed by Donald E. Knuth in The Art Of Computer Programming Volume 3, +// // under the topic of sorting and search chapter 6.4. +// // modified to fit with boolset-length +// func (bl Bloom) DEKHash(b *[]byte) (l, h uint64) { +// hash := uint64(len(*b)) +// for _, c := range *b { +// hash = ((hash << 5) ^ (hash >> bl.shift)) ^ uint64(c) +// } +// h = hash >> bl.shift +// l = hash << bl.sizeExp >> bl.sizeExp +// return l, h +// } diff --git a/bbloom_test.go b/bbloom_test.go new file mode 100644 index 0000000000000000000000000000000000000000..453e9fc98ed0ff0b0b8f697a60b29a85e51d245f --- /dev/null +++ b/bbloom_test.go @@ -0,0 +1,271 @@ +package bbloom + +import ( + "bufio" + "fmt" + "log" + "math" + "os" + "testing" +) + +var ( + wordlist1 [][]byte + n = 1 << 16 + bf Bloom +) + +func TestMain(m *testing.M) { + file, err := os.Open("words.txt") + if err != nil { + log.Fatal(err) + } + defer file.Close() + scanner := bufio.NewScanner(file) + wordlist1 = make([][]byte, n) + for i := range wordlist1 { + if scanner.Scan() { + wordlist1[i] = []byte(scanner.Text()) + } + } + if err := scanner.Err(); err != nil { + log.Fatal(err) + } + fmt.Println("\n###############\nbbloom_test.go") + fmt.Print("Benchmarks relate to 2**16 OP. 
--> output/65536 op/ns\n###############\n\n") + + os.Exit(m.Run()) + +} + +func TestM_NumberOfWrongs(t *testing.T) { + bf, err := New(float64(n*10), float64(7)) + if err != nil { + t.Fatal(err) + } + + cnt := 0 + for i := range wordlist1 { + if !bf.AddIfNotHas(wordlist1[i]) { + cnt++ + } + } + fmt.Printf("Bloomfilter New(7* 2**16, 7) (-> size=%v bit): \n Check for 'false positives': %v wrong positive 'Has' results on 2**16 entries => %v %%\n", len(bf.bitset)<<6, cnt, float64(cnt)/float64(n)) + +} + +func TestM_JSON(t *testing.T) { + const shallBe = int(1 << 16) + + bf, err := New(float64(n*10), float64(7)) + if err != nil { + t.Fatal(err) + } + + cnt := 0 + for i := range wordlist1 { + if !bf.AddIfNotHas(wordlist1[i]) { + cnt++ + } + } + + json := bf.JSONMarshal() + if err != nil { + t.Fatal(err) + } + + // create new bloomfilter from bloomfilter's JSON representation + bf2, err := JSONUnmarshal(json) + if err != nil { + t.Fatal(err) + } + + cnt2 := 0 + for i := range wordlist1 { + if !bf2.AddIfNotHas(wordlist1[i]) { + cnt2++ + } + } + + if cnt2 != shallBe { + t.Errorf("FAILED !AddIfNotHas = %v; want %v", cnt2, shallBe) + } +} +func TestFillRatio(t *testing.T) { + bf, err := New(float64(512), float64(7)) + if err != nil { + t.Fatal(err) + } + bf.Add([]byte("test")) + r := bf.FillRatio() + if math.Abs(r-float64(7)/float64(512)) > 0.001 { + t.Error("ratio doesn't work") + } +} + +func ExampleBloom_AddIfNotHas() { + bf, err := New(float64(512), float64(1)) + if err != nil { + panic(err) + } + + fmt.Printf("%v %v %v %v\n", bf.sizeExp, bf.size, bf.setLocs, bf.shift) + + bf.Add([]byte("Manfred")) + fmt.Println("bf.Add([]byte(\"Manfred\"))") + fmt.Printf("bf.Has([]byte(\"Manfred\")) -> %v - should be true\n", bf.Has([]byte("Manfred"))) + fmt.Printf("bf.Add([]byte(\"manfred\")) -> %v - should be false\n", bf.Has([]byte("manfred"))) + fmt.Printf("bf.AddIfNotHas([]byte(\"Manfred\")) -> %v - should be false\n", bf.AddIfNotHas([]byte("Manfred"))) + 
fmt.Printf("bf.AddIfNotHas([]byte(\"manfred\")) -> %v - should be true\n", bf.AddIfNotHas([]byte("manfred"))) + + bf.AddTS([]byte("Hans")) + fmt.Println("bf.AddTS([]byte(\"Hans\")") + fmt.Printf("bf.HasTS([]byte(\"Hans\")) -> %v - should be true\n", bf.HasTS([]byte("Hans"))) + fmt.Printf("bf.AddTS([]byte(\"hans\")) -> %v - should be false\n", bf.HasTS([]byte("hans"))) + fmt.Printf("bf.AddIfNotHasTS([]byte(\"Hans\")) -> %v - should be false\n", bf.AddIfNotHasTS([]byte("Hans"))) + fmt.Printf("bf.AddIfNotHasTS([]byte(\"hans\")) -> %v - should be true\n", bf.AddIfNotHasTS([]byte("hans"))) + + // Output: 9 511 1 55 + // bf.Add([]byte("Manfred")) + // bf.Has([]byte("Manfred")) -> true - should be true + // bf.Add([]byte("manfred")) -> false - should be false + // bf.AddIfNotHas([]byte("Manfred")) -> false - should be false + // bf.AddIfNotHas([]byte("manfred")) -> true - should be true + // bf.AddTS([]byte("Hans") + // bf.HasTS([]byte("Hans")) -> true - should be true + // bf.AddTS([]byte("hans")) -> false - should be false + // bf.AddIfNotHasTS([]byte("Hans")) -> false - should be false + // bf.AddIfNotHasTS([]byte("hans")) -> true - should be true +} + +func BenchmarkM_New(b *testing.B) { + for r := 0; r < b.N; r++ { + _, _ = New(float64(n*10), float64(7)) + } +} + +func BenchmarkM_Clear(b *testing.B) { + bf, err := New(float64(n*10), float64(7)) + if err != nil { + b.Fatal(err) + } + for i := range wordlist1 { + bf.Add(wordlist1[i]) + } + b.ResetTimer() + for r := 0; r < b.N; r++ { + bf.Clear() + } +} + +func BenchmarkM_Add(b *testing.B) { + bf, err := New(float64(n*10), float64(7)) + if err != nil { + b.Fatal(err) + } + b.ResetTimer() + for r := 0; r < b.N; r++ { + for i := range wordlist1 { + bf.Add(wordlist1[i]) + } + } + +} + +func BenchmarkM_Has(b *testing.B) { + b.ResetTimer() + for r := 0; r < b.N; r++ { + for i := range wordlist1 { + bf.Has(wordlist1[i]) + } + } + +} + +func BenchmarkM_AddIfNotHasFALSE(b *testing.B) { + bf, err := New(float64(n*10), 
float64(7)) + if err != nil { + b.Fatal(err) + } + for i := range wordlist1 { + bf.Has(wordlist1[i]) + } + b.ResetTimer() + for r := 0; r < b.N; r++ { + for i := range wordlist1 { + bf.AddIfNotHas(wordlist1[i]) + } + } +} + +func BenchmarkM_AddIfNotHasClearTRUE(b *testing.B) { + bf, err := New(float64(n*10), float64(7)) + if err != nil { + b.Fatal(err) + } + + b.ResetTimer() + for r := 0; r < b.N; r++ { + for i := range wordlist1 { + bf.AddIfNotHas(wordlist1[i]) + } + bf.Clear() + } +} + +func BenchmarkM_AddTS(b *testing.B) { + bf, err := New(float64(n*10), float64(7)) + if err != nil { + b.Fatal(err) + } + + b.ResetTimer() + for r := 0; r < b.N; r++ { + for i := range wordlist1 { + bf.AddTS(wordlist1[i]) + } + } + +} + +func BenchmarkM_HasTS(b *testing.B) { + b.ResetTimer() + for r := 0; r < b.N; r++ { + for i := range wordlist1 { + bf.HasTS(wordlist1[i]) + } + } + +} + +func BenchmarkM_AddIfNotHasTSFALSE(b *testing.B) { + bf, err := New(float64(n*10), float64(7)) + if err != nil { + b.Fatal(err) + } + for i := range wordlist1 { + bf.Has(wordlist1[i]) + } + b.ResetTimer() + for r := 0; r < b.N; r++ { + for i := range wordlist1 { + bf.AddIfNotHasTS(wordlist1[i]) + } + } +} + +func BenchmarkM_AddIfNotHasTSClearTRUE(b *testing.B) { + bf, err := New(float64(n*10), float64(7)) + if err != nil { + b.Fatal(err) + } + + b.ResetTimer() + for r := 0; r < b.N; r++ { + for i := range wordlist1 { + bf.AddIfNotHasTS(wordlist1[i]) + } + bf.Clear() + } + +} diff --git a/collision_test.go b/collision_test.go new file mode 100644 index 0000000000000000000000000000000000000000..865e65cdfb74e4b93a5b3b1c99f2d280bc4649aa --- /dev/null +++ b/collision_test.go @@ -0,0 +1,44 @@ +package bbloom + +import ( + "math/rand" + "testing" + "time" +) + +func TestCollisionRate(t *testing.T) { + rand.Seed(time.Now().UTC().UnixNano()) + N := 1 << 20 + M := N * 12 + K := 2 + + bl, err := New(float64(M), float64(K)) + if err != nil { + t.Fatal(err) + } + var buf [64]byte + for i := 0; i < N; i++ { + 
// sipHash computes a 64-bit SipHash-2-4 style hash of p and splits it into
// the two values the filter derives its bit positions from.
//
// No key is passed in: the four state words start from fixed constants (the
// comment on each line names the key half and constant it stands for).
//
// Structure, as written out below:
//   - every full 8-byte block of p is read little-endian and mixed in with
//     two rounds;
//   - the remaining 0-7 bytes, together with len(p) in the top byte, form
//     the last block, also mixed in with two rounds;
//   - finalization flips the low byte of v2 and runs four more rounds.
//
// All rounds are spelled out inline rather than factored into a helper.
//
// Returns:
//   - h: the hash shifted right by bl.shift
//   - l: the hash with its upper bl.shift bits cleared
func (bl *Bloom) sipHash(p []byte) (l, h uint64) {
	// Initialization.
	v0 := uint64(8317987320269560794) // k0 ^ 0x736f6d6570736575
	v1 := uint64(7237128889637516672) // k1 ^ 0x646f72616e646f6d
	v2 := uint64(7816392314733513934) // k0 ^ 0x6c7967656e657261
	v3 := uint64(8387220255325274014) // k1 ^ 0x7465646279746573
	// The input length goes into the top byte of the last block.
	t := uint64(len(p)) << 56

	// Compression: consume p eight bytes at a time.
	for len(p) >= 8 {

		// Little-endian load of the next 8 bytes.
		m := uint64(p[0]) | uint64(p[1])<<8 | uint64(p[2])<<16 | uint64(p[3])<<24 |
			uint64(p[4])<<32 | uint64(p[5])<<40 | uint64(p[6])<<48 | uint64(p[7])<<56

		v3 ^= m

		// Round 1.
		v0 += v1
		v1 = v1<<13 | v1>>51
		v1 ^= v0
		v0 = v0<<32 | v0>>32

		v2 += v3
		v3 = v3<<16 | v3>>48
		v3 ^= v2

		v0 += v3
		v3 = v3<<21 | v3>>43
		v3 ^= v0

		v2 += v1
		v1 = v1<<17 | v1>>47
		v1 ^= v2
		v2 = v2<<32 | v2>>32

		// Round 2.
		v0 += v1
		v1 = v1<<13 | v1>>51
		v1 ^= v0
		v0 = v0<<32 | v0>>32

		v2 += v3
		v3 = v3<<16 | v3>>48
		v3 ^= v2

		v0 += v3
		v3 = v3<<21 | v3>>43
		v3 ^= v0

		v2 += v1
		v1 = v1<<17 | v1>>47
		v1 ^= v2
		v2 = v2<<32 | v2>>32

		v0 ^= m
		p = p[8:]
	}

	// Compress last block: fold the 0-7 leftover bytes into t below the
	// length byte. Each case falls through so all lower bytes are added too.
	switch len(p) {
	case 7:
		t |= uint64(p[6]) << 48
		fallthrough
	case 6:
		t |= uint64(p[5]) << 40
		fallthrough
	case 5:
		t |= uint64(p[4]) << 32
		fallthrough
	case 4:
		t |= uint64(p[3]) << 24
		fallthrough
	case 3:
		t |= uint64(p[2]) << 16
		fallthrough
	case 2:
		t |= uint64(p[1]) << 8
		fallthrough
	case 1:
		t |= uint64(p[0])
	}

	v3 ^= t

	// Round 1.
	v0 += v1
	v1 = v1<<13 | v1>>51
	v1 ^= v0
	v0 = v0<<32 | v0>>32

	v2 += v3
	v3 = v3<<16 | v3>>48
	v3 ^= v2

	v0 += v3
	v3 = v3<<21 | v3>>43
	v3 ^= v0

	v2 += v1
	v1 = v1<<17 | v1>>47
	v1 ^= v2
	v2 = v2<<32 | v2>>32

	// Round 2.
	v0 += v1
	v1 = v1<<13 | v1>>51
	v1 ^= v0
	v0 = v0<<32 | v0>>32

	v2 += v3
	v3 = v3<<16 | v3>>48
	v3 ^= v2

	v0 += v3
	v3 = v3<<21 | v3>>43
	v3 ^= v0

	v2 += v1
	v1 = v1<<17 | v1>>47
	v1 ^= v2
	v2 = v2<<32 | v2>>32

	v0 ^= t

	// Finalization.
	v2 ^= 0xff

	// Round 1.
	v0 += v1
	v1 = v1<<13 | v1>>51
	v1 ^= v0
	v0 = v0<<32 | v0>>32

	v2 += v3
	v3 = v3<<16 | v3>>48
	v3 ^= v2

	v0 += v3
	v3 = v3<<21 | v3>>43
	v3 ^= v0

	v2 += v1
	v1 = v1<<17 | v1>>47
	v1 ^= v2
	v2 = v2<<32 | v2>>32

	// Round 2.
	v0 += v1
	v1 = v1<<13 | v1>>51
	v1 ^= v0
	v0 = v0<<32 | v0>>32

	v2 += v3
	v3 = v3<<16 | v3>>48
	v3 ^= v2

	v0 += v3
	v3 = v3<<21 | v3>>43
	v3 ^= v0

	v2 += v1
	v1 = v1<<17 | v1>>47
	v1 ^= v2
	v2 = v2<<32 | v2>>32

	// Round 3.
	v0 += v1
	v1 = v1<<13 | v1>>51
	v1 ^= v0
	v0 = v0<<32 | v0>>32

	v2 += v3
	v3 = v3<<16 | v3>>48
	v3 ^= v2

	v0 += v3
	v3 = v3<<21 | v3>>43
	v3 ^= v0

	v2 += v1
	v1 = v1<<17 | v1>>47
	v1 ^= v2
	v2 = v2<<32 | v2>>32

	// Round 4.
	v0 += v1
	v1 = v1<<13 | v1>>51
	v1 ^= v0
	v0 = v0<<32 | v0>>32

	v2 += v3
	v3 = v3<<16 | v3>>48
	v3 ^= v2

	v0 += v3
	v3 = v3<<21 | v3>>43
	v3 ^= v0

	v2 += v1
	v1 = v1<<17 | v1>>47
	v1 ^= v2
	v2 = v2<<32 | v2>>32

	// return v0 ^ v1 ^ v2 ^ v3

	// Fold the four state words into one 64-bit hash, then split it:
	// h takes the bits above position bl.shift, l keeps the bits below
	// position 64-bl.shift.
	hash := v0 ^ v1 ^ v2 ^ v3
	h = hash >> bl.shift
	l = hash << bl.shift >> bl.shift
	return l, h

}
00:00:20 /info.html +2014/01/01 00:00:21 /info.html +2014/01/01 00:00:22 /info.html +2014/01/01 00:00:23 /info.html +2014/01/01 00:00:24 /info.html +2014/01/01 00:00:25 /info.html +2014/01/01 00:00:26 /info.html +2014/01/01 00:00:27 /info.html +2014/01/01 00:00:28 /info.html +2014/01/01 00:00:29 /info.html +2014/01/01 00:00:30 /info.html +2014/01/01 00:00:31 /info.html +2014/01/01 00:00:32 /info.html +2014/01/01 00:00:33 /info.html +2014/01/01 00:00:34 /info.html +2014/01/01 00:00:35 /info.html +2014/01/01 00:00:36 /info.html +2014/01/01 00:00:37 /info.html +2014/01/01 00:00:38 /info.html +2014/01/01 00:00:39 /info.html +2014/01/01 00:00:40 /info.html +2014/01/01 00:00:41 /info.html +2014/01/01 00:00:42 /info.html +2014/01/01 00:00:43 /info.html +2014/01/01 00:00:44 /info.html +2014/01/01 00:00:45 /info.html +2014/01/01 00:00:46 /info.html +2014/01/01 00:00:47 /info.html +2014/01/01 00:00:48 /info.html +2014/01/01 00:00:49 /info.html +2014/01/01 00:00:50 /info.html +2014/01/01 00:00:51 /info.html +2014/01/01 00:00:52 /info.html +2014/01/01 00:00:53 /info.html +2014/01/01 00:00:54 /info.html +2014/01/01 00:00:55 /info.html +2014/01/01 00:00:56 /info.html +2014/01/01 00:00:57 /info.html +2014/01/01 00:00:58 /info.html +2014/01/01 00:00:59 /info.html +2014/01/01 00:01:00 /info.html +2014/01/01 00:01:01 /info.html +2014/01/01 00:01:02 /info.html +2014/01/01 00:01:03 /info.html +2014/01/01 00:01:04 /info.html +2014/01/01 00:01:05 /info.html +2014/01/01 00:01:06 /info.html +2014/01/01 00:01:07 /info.html +2014/01/01 00:01:08 /info.html +2014/01/01 00:01:09 /info.html +2014/01/01 00:01:10 /info.html +2014/01/01 00:01:11 /info.html +2014/01/01 00:01:12 /info.html +2014/01/01 00:01:13 /info.html +2014/01/01 00:01:14 /info.html +2014/01/01 00:01:15 /info.html +2014/01/01 00:01:16 /info.html +2014/01/01 00:01:17 /info.html +2014/01/01 00:01:18 /info.html +2014/01/01 00:01:19 /info.html +2014/01/01 00:01:20 /info.html +2014/01/01 00:01:21 /info.html +2014/01/01 00:01:22 
/info.html +2014/01/01 00:01:23 /info.html +2014/01/01 00:01:24 /info.html +2014/01/01 00:01:25 /info.html +2014/01/01 00:01:26 /info.html +2014/01/01 00:01:27 /info.html +2014/01/01 00:01:28 /info.html +2014/01/01 00:01:29 /info.html +2014/01/01 00:01:30 /info.html +2014/01/01 00:01:31 /info.html +2014/01/01 00:01:32 /info.html +2014/01/01 00:01:33 /info.html +2014/01/01 00:01:34 /info.html +2014/01/01 00:01:35 /info.html +2014/01/01 00:01:36 /info.html +2014/01/01 00:01:37 /info.html +2014/01/01 00:01:38 /info.html +2014/01/01 00:01:39 /info.html +2014/01/01 00:01:40 /info.html +2014/01/01 00:01:41 /info.html +2014/01/01 00:01:42 /info.html +2014/01/01 00:01:43 /info.html +2014/01/01 00:01:44 /info.html +2014/01/01 00:01:45 /info.html +2014/01/01 00:01:46 /info.html +2014/01/01 00:01:47 /info.html +2014/01/01 00:01:48 /info.html +2014/01/01 00:01:49 /info.html +2014/01/01 00:01:50 /info.html +2014/01/01 00:01:51 /info.html +2014/01/01 00:01:52 /info.html +2014/01/01 00:01:53 /info.html +2014/01/01 00:01:54 /info.html +2014/01/01 00:01:55 /info.html +2014/01/01 00:01:56 /info.html +2014/01/01 00:01:57 /info.html +2014/01/01 00:01:58 /info.html +2014/01/01 00:01:59 /info.html +2014/01/01 00:02:00 /info.html +2014/01/01 00:02:01 /info.html +2014/01/01 00:02:02 /info.html +2014/01/01 00:02:03 /info.html +2014/01/01 00:02:04 /info.html +2014/01/01 00:02:05 /info.html +2014/01/01 00:02:06 /info.html +2014/01/01 00:02:07 /info.html +2014/01/01 00:02:08 /info.html +2014/01/01 00:02:09 /info.html +2014/01/01 00:02:10 /info.html +2014/01/01 00:02:11 /info.html +2014/01/01 00:02:12 /info.html +2014/01/01 00:02:13 /info.html +2014/01/01 00:02:14 /info.html +2014/01/01 00:02:15 /info.html +2014/01/01 00:02:16 /info.html +2014/01/01 00:02:17 /info.html +2014/01/01 00:02:18 /info.html