Repository: steakknife/bloomfilter Branch: master Commit: 6819c0d2a570 Files: 20 Total size: 29.5 KB Directory structure: gitextract_ejmhslmv/ ├── .travis.yml ├── MIT-LICENSE.txt ├── README.md ├── binarymarshaler.go ├── binaryunmarshaler.go ├── bloomfilter.go ├── bloomfilter_test.go ├── conformance.go ├── debug.go ├── errors.go ├── fileio.go ├── fileio_test.go ├── gob.go ├── iscompatible.go ├── new.go ├── optimal.go ├── optimal_test.go ├── statistics.go ├── textmarshaler.go └── textunmarshaler.go ================================================ FILE CONTENTS ================================================ ================================================ FILE: .travis.yml ================================================ language: go dist: trusty sudo: false go: - "1.8.x" - "1.9.x" - "1.10.x" - master before_script: - "go get -u gopkg.in/alecthomas/gometalinter.v2" - "gometalinter.v2 --install" script: - "go test -v -cover -benchmem -bench=. $(go list ./... | grep -v /vendor/ | sed \"s&_${PWD}&.&\")" - "gometalinter.v2 --enable-all ./..." ================================================ FILE: MIT-LICENSE.txt ================================================ The MIT License (MIT) Copyright © 2014, 2015 Barry Allard Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ================================================ FILE: README.md ================================================ **Important**: Zeroth, [consider](https://bdupras.github.io/filter-tutorial/) if a [Cuckoo filter](https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf) could be [right for your use-case](https://github.com/seiflotfy/cuckoofilter). [![GoDoc](https://godoc.org/github.com/steakknife/bloomfilter?status.png)](https://godoc.org/github.com/steakknife/bloomfilter) [![travis](https://img.shields.io/travis/steakknife/bloomfilter.svg)](https://travis-ci.org/steakknife/bloomfilter) # Face-meltingly fast, thread-safe, marshalable, unionable, probability- and optimal-size-calculating Bloom filter in go Copyright © 2014-2016,2018 Barry Allard [MIT license](MIT-LICENSE.txt) ## WTF is a bloom filter **TL;DR: **Probabilistic, extra lookup table to track a set of elements kept elsewhere to reduce expensive, unnecessary set element retrieval and/or iterator operations **when an element is not present in the set.** It's a classic time-storage tradeoff algoritm. ### Properties #### [See wikipedia](https://en.wikipedia.org/wiki/Bloom_filter) for algorithm details |Impact|What|Description| |---|---|---| |Good|No false negatives|know for certain if a given element is definitely NOT in the set| |Bad|False positives|uncertain if a given element is in the set| |Bad|Theoretical potential for hash collisions|in very large systems and/or badly hash.Hash64-conforming implementations| |Bad|Add only|Cannot remove an element, it would destroy information about other elements| |Good|Constant storage|uses only a fixed amount of memory| ## Naming conventions (Similar to algorithm) |Variable/function|Description|Range| |---|---|---| |m/M()|number of bits in the bloom filter (memory representation is about m/8 bytes in size)|>=2| |n/N()|number of elements present|>=0| |k/K()|number of keys to use (keys are kept private to user code but are de/serialized to Marshal and file I/O)|>=0| |maxN|maximum capacity of intended structure|>0| |p|maximum allowed probability of collision (for computing m and k for optimal sizing)|>0..<1| - Memory representation should be exactly `24 + 8*(k + (m+63)/64) + unsafe.Sizeof(RWMutex)` bytes. - Serialized (`BinaryMarshaler`) representation should be exactly `72 + 8*(k + (m+63)/64)` bytes. (Disk format is less due to compression.) ## Binary serialization format All values in Little-endian format |Offset|Offset (Hex)|Length (bytes)|Name|Type| |---|---|---|---|---| |0|00|8|k|`uint64`| |8|08|8|n|`uint64`| |16|10|8|m|`uint64`| |24|18|k|(keys)|`[k]uint64`| |24+8*k|...|(m+63)/64|(bloom filter)|`[(m+63)/64]uint64`| |24+8\*k+8\*((m+63)/64)|...|48|(SHA384 of all previous fields, hashed in order)|`[48]byte`| - `bloomfilter.Filter` conforms to `encoding.BinaryMarshaler` and `encoding.BinaryUnmarshaler' ## Usage ```go import "github.com/steakknife/bloomfilter" const ( maxElements = 100000 probCollide = 0.0000001 ) bf, err := bloomfilter.NewOptimal(maxElements, probCollide) if err != nil { panic(err) } someValue := ... // must conform to hash.Hash64 bf.Add(someValue) if bf.Contains(someValue) { // probably true, could be false // whatever } anotherValue := ... // must also conform to hash.Hash64 if bf.Contains(anotherValue) { panic("This should never happen") } err := bf.WriteFile("1.bf.gz") // saves this BF to a file if err != nil { panic(err) } bf2, err := bloomfilter.ReadFile("1.bf.gz") // read the BF to another var if err != nil { panic(err) } ``` ## Design Where possible, branch-free operations are used to avoid deep pipeline / execution unit stalls on branch-misses. ## Get go get -u github.com/steakknife/bloomfilter # master is always stable ## Source - On the web: [https://github.com/steakknife/bloomfilter](https://github.com/steakknife/bloomfilter) - Git: `git clone https://github.com/steakknife/bloomfilter` ## Contact - [Feedback](mailto:barry.allard@gmail.com) - [Issues](https://github.com/steakknife/bloomfilter/issues) ## License [MIT license](MIT-LICENSE.txt) Copyright © 2014-2016 Barry Allard ================================================ FILE: binarymarshaler.go ================================================ // Package bloomfilter is face-meltingly fast, thread-safe, // marshalable, unionable, probability- and // optimal-size-calculating Bloom filter in go // // https://github.com/steakknife/bloomfilter // // Copyright © 2014, 2015, 2018 Barry Allard // // MIT license // package bloomfilter import ( "bytes" "crypto/sha512" "encoding/binary" ) // conforms to encoding.BinaryMarshaler // marshalled binary layout (Little Endian): // // k 1 uint64 // n 1 uint64 // m 1 uint64 // keys [k]uint64 // bits [(m+63)/64]uint64 // hash sha384 (384 bits == 48 bytes) // // size = (3 + k + (m+63)/64) * 8 bytes // func (f *Filter) marshal() (buf *bytes.Buffer, hash [sha512.Size384]byte, err error, ) { f.lock.RLock() defer f.lock.RUnlock() debug("write bf k=%d n=%d m=%d\n", f.K(), f.n, f.m) buf = new(bytes.Buffer) err = binary.Write(buf, binary.LittleEndian, f.K()) if err != nil { return nil, hash, err } err = binary.Write(buf, binary.LittleEndian, f.n) if err != nil { return nil, hash, err } err = binary.Write(buf, binary.LittleEndian, f.m) if err != nil { return nil, hash, err } err = binary.Write(buf, binary.LittleEndian, f.keys) if err != nil { return nil, hash, err } err = binary.Write(buf, binary.LittleEndian, f.bits) if err != nil { return nil, hash, err } hash = sha512.Sum384(buf.Bytes()) err = binary.Write(buf, binary.LittleEndian, hash) return buf, hash, err } // MarshalBinary converts a Filter into []bytes func (f *Filter) MarshalBinary() (data []byte, err error) { buf, hash, err := f.marshal() if err != nil { return nil, err } debug( "bloomfilter.MarshalBinary: Successfully wrote %d byte(s), sha384 %v", buf.Len(), hash, ) data = buf.Bytes() return data, nil } ================================================ FILE: binaryunmarshaler.go ================================================ // Package bloomfilter is face-meltingly fast, thread-safe, // marshalable, unionable, probability- and // optimal-size-calculating Bloom filter in go // // https://github.com/steakknife/bloomfilter // // Copyright © 2014, 2015, 2018 Barry Allard // // MIT license // package bloomfilter import ( "bytes" "crypto/hmac" "crypto/sha512" "encoding/binary" "io" ) func unmarshalBinaryHeader(r io.Reader) (k, n, m uint64, err error) { err = binary.Read(r, binary.LittleEndian, &k) if err != nil { return k, n, m, err } if k < KMin { return k, n, m, errK() } err = binary.Read(r, binary.LittleEndian, &n) if err != nil { return k, n, m, err } err = binary.Read(r, binary.LittleEndian, &m) if err != nil { return k, n, m, err } if m < MMin { return k, n, m, errM() } debug("read bf k=%d n=%d m=%d\n", k, n, m) return k, n, m, err } func unmarshalBinaryBits(r io.Reader, m uint64) (bits []uint64, err error) { bits, err = newBits(m) if err != nil { return bits, err } err = binary.Read(r, binary.LittleEndian, bits) return bits, err } func unmarshalBinaryKeys(r io.Reader, k uint64) (keys []uint64, err error) { keys = make([]uint64, k) err = binary.Read(r, binary.LittleEndian, keys) return keys, err } func checkBinaryHash(r io.Reader, data []byte) (err error) { expectedHash := make([]byte, sha512.Size384) err = binary.Read(r, binary.LittleEndian, expectedHash) if err != nil { return err } actualHash := sha512.Sum384(data[:len(data)-sha512.Size384]) if !hmac.Equal(expectedHash, actualHash[:]) { debug("bloomfilter.UnmarshalBinary() sha384 hash failed:"+ " actual %v expected %v", actualHash, expectedHash) return errHash() } debug("bloomfilter.UnmarshalBinary() successfully read"+ " %d byte(s), sha384 %v", len(data), actualHash) return nil } // UnmarshalBinary converts []bytes into a Filter // conforms to encoding.BinaryUnmarshaler func (f *Filter) UnmarshalBinary(data []byte) (err error) { f.lock.Lock() defer f.lock.Unlock() buf := bytes.NewBuffer(data) var k uint64 k, f.n, f.m, err = unmarshalBinaryHeader(buf) if err != nil { return err } f.keys, err = unmarshalBinaryKeys(buf, k) if err != nil { return err } f.bits, err = unmarshalBinaryBits(buf, f.m) if err != nil { return err } return checkBinaryHash(buf, data) } ================================================ FILE: bloomfilter.go ================================================ // Package bloomfilter is face-meltingly fast, thread-safe, // marshalable, unionable, probability- and // optimal-size-calculating Bloom filter in go // // https://github.com/steakknife/bloomfilter // // Copyright © 2014, 2015, 2018 Barry Allard // // MIT license // package bloomfilter import ( "hash" "sync" ) // Filter is an opaque Bloom filter type type Filter struct { lock sync.RWMutex bits []uint64 keys []uint64 m uint64 // number of bits the "bits" field should recognize n uint64 // number of inserted elements } // Hashable -> hashes func (f *Filter) hash(v hash.Hash64) []uint64 { rawHash := v.Sum64() n := len(f.keys) hashes := make([]uint64, n) for i := 0; i < n; i++ { hashes[i] = rawHash ^ f.keys[i] } return hashes } // M is the size of Bloom filter, in bits func (f *Filter) M() uint64 { return f.m } // K is the count of keys func (f *Filter) K() uint64 { return uint64(len(f.keys)) } // Add a hashable item, v, to the filter func (f *Filter) Add(v hash.Hash64) { f.lock.Lock() defer f.lock.Unlock() for _, i := range f.hash(v) { // f.setBit(i) i %= f.m f.bits[i>>6] |= 1 << uint(i&0x3f) } f.n++ } // Contains tests if f contains v // false: f definitely does not contain value v // true: f maybe contains value v func (f *Filter) Contains(v hash.Hash64) bool { f.lock.RLock() defer f.lock.RUnlock() r := uint64(1) for _, i := range f.hash(v) { // r |= f.getBit(k) i %= f.m r &= (f.bits[i>>6] >> uint(i&0x3f)) & 1 } return uint64ToBool(r) } // Copy f to a new Bloom filter func (f *Filter) Copy() (*Filter, error) { f.lock.RLock() defer f.lock.RUnlock() out, err := f.NewCompatible() if err != nil { return nil, err } copy(out.bits, f.bits) out.n = f.n return out, nil } // UnionInPlace merges Bloom filter f2 into f func (f *Filter) UnionInPlace(f2 *Filter) error { if !f.IsCompatible(f2) { return errIncompatibleBloomFilters() } f.lock.Lock() defer f.lock.Unlock() for i, bitword := range f2.bits { f.bits[i] |= bitword } return nil } // Union merges f2 and f2 into a new Filter out func (f *Filter) Union(f2 *Filter) (out *Filter, err error) { if !f.IsCompatible(f2) { return nil, errIncompatibleBloomFilters() } f.lock.RLock() defer f.lock.RUnlock() out, err = f.NewCompatible() if err != nil { return nil, err } for i, bitword := range f2.bits { out.bits[i] = f.bits[i] | bitword } return out, nil } ================================================ FILE: bloomfilter_test.go ================================================ // Package bloomfilter is face-meltingly fast, thread-safe, // marshalable, unionable, probability- and // optimal-size-calculating Bloom filter in go // // https://github.com/steakknife/bloomfilter // // Copyright © 2014, 2015, 2018 Barry Allard // // MIT license // package bloomfilter import ( "math/rand" "testing" ) // a read-only type that conforms to hash.Hash64, but only Sum64() works. // It is set by writing the underlying value. type hashableUint64 uint64 func (h hashableUint64) Write([]byte) (int, error) { panic("Unimplemented") } func (h hashableUint64) Sum([]byte) []byte { panic("Unimplemented") } func (h hashableUint64) Reset() { panic("Unimplemented") } func (h hashableUint64) BlockSize() int { panic("Unimplemented") } func (h hashableUint64) Size() int { panic("Unimplemented") } func (h hashableUint64) Sum64() uint64 { return uint64(h) } func hashableUint64Values() []hashableUint64 { return []hashableUint64{ 0, 7, 0x0c0ffee0, 0xdeadbeef, 0xffffffff, } } func hashableUint64NotValues() []hashableUint64 { return []hashableUint64{ 1, 5, 42, 0xa5a5a5a5, 0xfffffffe, } } func Test0(t *testing.T) { bf, _ := New(10000, 5) t.Log("Filled ratio before adds :", bf.PreciseFilledRatio()) for _, x := range hashableUint64Values() { bf.Add(x) } t.Log("Filled ratio after adds :", bf.PreciseFilledRatio()) // these may or may not be true for _, y := range hashableUint64Values() { if bf.Contains(y) { t.Log("value in set querties: may contain ", y) } else { t.Fatal("value in set queries: definitely does not contain ", y, ", but it should") } } // these must all be false for _, z := range hashableUint64NotValues() { if bf.Contains(z) { t.Log("value not in set queries: may or may not contain ", z) } else { t.Log("value not in set queries: definitely does not contain ", z, " which is correct") } } } func BenchmarkAddX10kX5(b *testing.B) { b.StopTimer() bf, _ := New(10000, 5) b.StartTimer() for i := 0; i < b.N; i++ { bf.Add(hashableUint64(rand.Uint32())) } } func BenchmarkContains1kX10kX5(b *testing.B) { b.StopTimer() bf, _ := New(10000, 5) for i := 0; i < 1000; i++ { bf.Add(hashableUint64(rand.Uint32())) } b.StartTimer() for i := 0; i < b.N; i++ { bf.Contains(hashableUint64(rand.Uint32())) } } func BenchmarkContains100kX10BX20(b *testing.B) { b.StopTimer() bf, _ := New(10*1000*1000*1000, 20) for i := 0; i < 100*1000; i++ { bf.Add(hashableUint64(rand.Uint32())) } b.StartTimer() for i := 0; i < b.N; i++ { bf.Contains(hashableUint64(rand.Uint32())) } } ================================================ FILE: conformance.go ================================================ // Package bloomfilter is face-meltingly fast, thread-safe, // marshalable, unionable, probability- and // optimal-size-calculating Bloom filter in go // // https://github.com/steakknife/bloomfilter // // Copyright © 2014, 2015, 2018 Barry Allard // // MIT license // package bloomfilter import ( "encoding" "encoding/gob" "io" ) // compile-time conformance tests var ( _ encoding.BinaryMarshaler = (*Filter)(nil) _ encoding.BinaryUnmarshaler = (*Filter)(nil) _ encoding.TextMarshaler = (*Filter)(nil) _ encoding.TextUnmarshaler = (*Filter)(nil) _ io.ReaderFrom = (*Filter)(nil) _ io.WriterTo = (*Filter)(nil) _ gob.GobDecoder = (*Filter)(nil) _ gob.GobEncoder = (*Filter)(nil) ) ================================================ FILE: debug.go ================================================ // Package bloomfilter is face-meltingly fast, thread-safe, // marshalable, unionable, probability- and // optimal-size-calculating Bloom filter in go // // https://github.com/steakknife/bloomfilter // // Copyright © 2014, 2015, 2018 Barry Allard // // MIT license // package bloomfilter import ( "log" "os" ) const debugVar = "GOLANG_STEAKKNIFE_BLOOMFILTER_DEBUG" // EnableDebugging permits debug() logging of details to stderr func EnableDebugging() { err := os.Setenv(debugVar, "1") if err != nil { panic("Unable to Setenv " + debugVar) } } func debugging() bool { return os.Getenv(debugVar) != "" } // debug printing when debugging() is true func debug(format string, a ...interface{}) { if debugging() { log.Printf(format, a...) } } ================================================ FILE: errors.go ================================================ // Package bloomfilter is face-meltingly fast, thread-safe, // marshalable, unionable, probability- and // optimal-size-calculating Bloom filter in go // // https://github.com/steakknife/bloomfilter // // Copyright © 2014, 2015, 2018 Barry Allard // // MIT license // package bloomfilter import "fmt" func errHash() error { return fmt.Errorf( "Hash mismatch, the Bloom filter is probably corrupt") } func errK() error { return fmt.Errorf( "keys must have length %d or greater", KMin) } func errM() error { return fmt.Errorf( "m (number of bits in the Bloom filter) must be >= %d", MMin) } func errUniqueKeys() error { return fmt.Errorf( "Bloom filter keys must be unique") } func errIncompatibleBloomFilters() error { return fmt.Errorf( "Cannot perform union on two incompatible Bloom filters") } ================================================ FILE: fileio.go ================================================ // Package bloomfilter is face-meltingly fast, thread-safe, // marshalable, unionable, probability- and // optimal-size-calculating Bloom filter in go // // https://github.com/steakknife/bloomfilter // // Copyright © 2014, 2015, 2018 Barry Allard // // MIT license // package bloomfilter import ( "compress/gzip" "io" "io/ioutil" "os" ) // ReadFrom r and overwrite f with new Bloom filter data func (f *Filter) ReadFrom(r io.Reader) (n int64, err error) { f2, n, err := ReadFrom(r) if err != nil { return -1, err } f.lock.Lock() defer f.lock.Unlock() f.m = f2.m f.n = f2.n f.bits = f2.bits f.keys = f2.keys return n, nil } // ReadFrom Reader r into a lossless-compressed Bloom filter f func ReadFrom(r io.Reader) (f *Filter, n int64, err error) { rawR, err := gzip.NewReader(r) if err != nil { return nil, -1, err } defer func() { err = rawR.Close() }() content, err := ioutil.ReadAll(rawR) if err != nil { return nil, -1, err } f = new(Filter) n = int64(len(content)) err = f.UnmarshalBinary(content) if err != nil { return nil, -1, err } return f, n, nil } // ReadFile from filename into a lossless-compressed Bloom Filter f // Suggested file extension: .bf.gz func ReadFile(filename string) (f *Filter, n int64, err error) { r, err := os.Open(filename) if err != nil { return nil, -1, err } defer func() { err = r.Close() }() return ReadFrom(r) } // WriteTo a Writer w from lossless-compressed Bloom Filter f func (f *Filter) WriteTo(w io.Writer) (n int64, err error) { f.lock.RLock() defer f.lock.RUnlock() rawW := gzip.NewWriter(w) defer func() { err = rawW.Close() }() content, err := f.MarshalBinary() if err != nil { return -1, err } intN, err := rawW.Write(content) n = int64(intN) return n, err } // WriteFile filename from a a lossless-compressed Bloom Filter f // Suggested file extension: .bf.gz func (f *Filter) WriteFile(filename string) (n int64, err error) { w, err := os.Create(filename) if err != nil { return -1, err } defer func() { err = w.Close() }() return f.WriteTo(w) } ================================================ FILE: fileio_test.go ================================================ // Package bloomfilter is face-meltingly fast, thread-safe, // marshalable, unionable, probability- and // optimal-size-calculating Bloom filter in go // // https://github.com/steakknife/bloomfilter // // Copyright © 2014, 2015, 2018 Barry Allard // // MIT license // package bloomfilter import ( "bytes" "testing" ) func TestWriteRead(t *testing.T) { // minimal filter f, _ := New(2, 1) v := hashableUint64(0) f.Add(v) var b bytes.Buffer _, err := f.WriteTo(&b) if err != nil { t.Error(err) } f2, _, err := ReadFrom(&b) if err != nil { t.Error(err) } if !f2.Contains(v) { t.Error("Filters not equal") } } ================================================ FILE: gob.go ================================================ // Package bloomfilter is face-meltingly fast, thread-safe, // marshalable, unionable, probability- and // optimal-size-calculating Bloom filter in go // // https://github.com/steakknife/bloomfilter // // Copyright © 2014, 2015, 2018 Barry Allard // // MIT license // package bloomfilter import _ "encoding/gob" // make sure gob is available // GobDecode conforms to interface gob.GobDecoder func (f *Filter) GobDecode(data []byte) error { return f.UnmarshalBinary(data) } // GobEncode conforms to interface gob.GobEncoder func (f *Filter) GobEncode() ([]byte, error) { return f.MarshalBinary() } ================================================ FILE: iscompatible.go ================================================ // Package bloomfilter is face-meltingly fast, thread-safe, // marshalable, unionable, probability- and // optimal-size-calculating Bloom filter in go // // https://github.com/steakknife/bloomfilter // // Copyright © 2014, 2015, 2018 Barry Allard // // MIT license // package bloomfilter import "unsafe" func uint64ToBool(x uint64) bool { return *(*bool)(unsafe.Pointer(&x)) // #nosec } // returns 0 if equal, does not compare len(b0) with len(b1) func noBranchCompareUint64s(b0, b1 []uint64) uint64 { r := uint64(0) for i, b0i := range b0 { r |= b0i ^ b1[i] } return r } // IsCompatible is true if f and f2 can be Union()ed together func (f *Filter) IsCompatible(f2 *Filter) bool { f.lock.RLock() defer f.lock.RUnlock() f.lock.RLock() defer f2.lock.RUnlock() // 0 is true, non-0 is false compat := f.M() ^ f2.M() compat |= f.K() ^ f2.K() compat |= noBranchCompareUint64s(f.keys, f2.keys) return uint64ToBool(^compat) } ================================================ FILE: new.go ================================================ // Package bloomfilter is face-meltingly fast, thread-safe, // marshalable, unionable, probability- and // optimal-size-calculating Bloom filter in go // // https://github.com/steakknife/bloomfilter // // Copyright © 2014, 2015, 2018 Barry Allard // // MIT license // package bloomfilter import ( "crypto/rand" "encoding/binary" "log" ) const ( // MMin is the minimum Bloom filter bits count MMin = 2 // KMin is the minimum number of keys KMin = 1 // Uint64Bytes is the number of bytes in type uint64 Uint64Bytes = 8 ) // New Filter with CSPRNG keys // // m is the size of the Bloom filter, in bits, >= 2 // // k is the number of random keys, >= 1 func New(m, k uint64) (*Filter, error) { return NewWithKeys(m, newRandKeys(k)) } func newRandKeys(k uint64) []uint64 { keys := make([]uint64, k) err := binary.Read(rand.Reader, binary.LittleEndian, keys) if err != nil { log.Panicf( "Cannot read %d bytes from CSRPNG crypto/rand.Read (err=%v)", Uint64Bytes, err, ) } return keys } // NewCompatible Filter compatible with f func (f *Filter) NewCompatible() (*Filter, error) { return NewWithKeys(f.m, f.keys) } // NewOptimal Bloom filter with random CSPRNG keys func NewOptimal(maxN uint64, p float64) (*Filter, error) { m := OptimalM(maxN, p) k := OptimalK(m, maxN) debug("New optimal bloom filter ::"+ " requested max elements (n):%d,"+ " probability of collision (p):%1.10f "+ "-> recommends -> bits (m): %d (%f GiB), "+ "number of keys (k): %d", maxN, p, m, float64(m)/(gigabitsPerGiB), k) return New(m, k) } // UniqueKeys is true if all keys are unique func UniqueKeys(keys []uint64) bool { for j := 0; j < len(keys)-1; j++ { elem := keys[j] for i := 1; i < j; i++ { if keys[i] == elem { return false } } } return true } // NewWithKeys creates a new Filter from user-supplied origKeys func NewWithKeys(m uint64, origKeys []uint64) (f *Filter, err error) { bits, err := newBits(m) if err != nil { return nil, err } keys, err := newKeysCopy(origKeys) if err != nil { return nil, err } return &Filter{ m: m, n: 0, bits: bits, keys: keys, }, nil } func newBits(m uint64) ([]uint64, error) { if m < MMin { return nil, errM() } return make([]uint64, (m+63)/64), nil } func newKeysBlank(k uint64) ([]uint64, error) { if k < KMin { return nil, errK() } return make([]uint64, k), nil } func newKeysCopy(origKeys []uint64) (keys []uint64, err error) { if !UniqueKeys(origKeys) { return nil, errUniqueKeys() } keys, err = newKeysBlank(uint64(len(origKeys))) if err != nil { return keys, err } copy(keys, origKeys) return keys, err } func newWithKeysAndBits(m uint64, keys []uint64, bits []uint64, n uint64) ( f *Filter, err error, ) { f, err = NewWithKeys(m, keys) if err != nil { return nil, err } copy(f.bits, bits) f.n = n return f, nil } ================================================ FILE: optimal.go ================================================ // Package bloomfilter is face-meltingly fast, thread-safe, // marshalable, unionable, probability- and // optimal-size-calculating Bloom filter in go // // https://github.com/steakknife/bloomfilter // // Copyright © 2014, 2015, 2018 Barry Allard // // MIT license // package bloomfilter import "math" const gigabitsPerGiB float64 = 8.0 * 1024 * 1024 * 1024 // OptimalK calculates the optimal k value for creating a new Bloom filter // maxn is the maximum anticipated number of elements func OptimalK(m, maxN uint64) uint64 { return uint64(math.Ceil(float64(m) * math.Ln2 / float64(maxN))) } // OptimalM calculates the optimal m value for creating a new Bloom filter // p is the desired false positive probability // optimal m = ceiling( - n * ln(p) / ln(2)**2 ) func OptimalM(maxN uint64, p float64) uint64 { return uint64(math.Ceil(-float64(maxN) * math.Log(p) / (math.Ln2 * math.Ln2))) } ================================================ FILE: optimal_test.go ================================================ package bloomfilter import ( "testing" ) func TestOptimal(t *testing.T) { tests := []struct { n uint64 p float64 k, m uint64 }{ { n: 1000, p: 0.01 / 100, k: 14, m: 19171, }, { n: 10000, p: 0.01 / 100, k: 14, m: 191702, }, { n: 10000, p: 0.01 / 100, k: 14, m: 191702, }, { n: 1000, p: 0.001 / 100, k: 17, m: 23963, }, } for _, test := range tests { m := OptimalM(test.n, test.p) k := OptimalK(m, test.n) if k != test.k || m != test.m { t.Errorf( "n=%d p=%f: expected (m=%d, k=%d), got (m=%d, k=%d)", test.n, test.p, test.m, test.k, m, k, ) } } } ================================================ FILE: statistics.go ================================================ // Package bloomfilter is face-meltingly fast, thread-safe, // marshalable, unionable, probability- and // optimal-size-calculating Bloom filter in go // // https://github.com/steakknife/bloomfilter // // Copyright © 2014, 2015, 2018 Barry Allard // // MIT license // package bloomfilter import ( "math" "github.com/steakknife/hamming" ) // PreciseFilledRatio is an exhaustive count # of 1's func (f *Filter) PreciseFilledRatio() float64 { f.lock.RLock() defer f.lock.RUnlock() return float64(hamming.CountBitsUint64s(f.bits)) / float64(f.M()) } // N is how many elements have been inserted // (actually, how many Add()s have been performed?) func (f *Filter) N() uint64 { f.lock.RLock() defer f.lock.RUnlock() return f.n } // FalsePosititveProbability is the upper-bound probability of false positives // (1 - exp(-k*(n+0.5)/(m-1))) ** k func (f *Filter) FalsePosititveProbability() float64 { k := float64(f.K()) n := float64(f.N()) m := float64(f.M()) return math.Pow(1.0-math.Exp(-k)*(n+0.5)/(m-1), k) } ================================================ FILE: textmarshaler.go ================================================ // Package bloomfilter is face-meltingly fast, thread-safe, // marshalable, unionable, probability- and // optimal-size-calculating Bloom filter in go // // https://github.com/steakknife/bloomfilter // // Copyright © 2014, 2015, 2018 Barry Allard // // MIT license // package bloomfilter import "fmt" // MarshalText conforms to encoding.TextMarshaler func (f *Filter) MarshalText() (text []byte, err error) { f.lock.RLock() defer f.lock.RUnlock() s := fmt.Sprintln("k") s += fmt.Sprintln(f.K()) s += fmt.Sprintln("n") s += fmt.Sprintln(f.n) s += fmt.Sprintln("m") s += fmt.Sprintln(f.m) s += fmt.Sprintln("keys") for key := range f.keys { s += fmt.Sprintf(keyFormat, key) + nl() } s += fmt.Sprintln("bits") for w := range f.bits { s += fmt.Sprintf(bitsFormat, w) + nl() } _, hash, err := f.marshal() if err != nil { return nil, err } s += fmt.Sprintln("sha384") for b := range hash { s += fmt.Sprintf("%02x", b) } s += nl() text = []byte(s) return text, nil } ================================================ FILE: textunmarshaler.go ================================================ // Package bloomfilter is face-meltingly fast, thread-safe, // marshalable, unionable, probability- and // optimal-size-calculating Bloom filter in go // // https://github.com/steakknife/bloomfilter // // Copyright © 2014, 2015, 2018 Barry Allard // // MIT license // package bloomfilter import ( "bytes" "crypto/hmac" "crypto/sha512" "fmt" "io" ) const ( keyFormat = "%016x" bitsFormat = "%016x" ) func nl() string { return fmt.Sprintln() } func unmarshalTextHeader(r io.Reader) (k, n, m uint64, err error) { format := "k" + nl() + "%d" + nl() format += "n" + nl() + "%d" + nl() format += "m" + nl() + "%d" + nl() format += "keys" + nl() _, err = fmt.Fscanf(r, format, k, n, m) return k, n, m, err } func unmarshalTextKeys(r io.Reader, keys []uint64) (err error) { for i := range keys { _, err = fmt.Fscanf(r, keyFormat, keys[i]) if err != nil { return err } } return nil } func unmarshalTextBits(r io.Reader, bits []uint64) (err error) { _, err = fmt.Fscanf(r, "bits") if err != nil { return err } for i := range bits { _, err = fmt.Fscanf(r, bitsFormat, bits[i]) if err != nil { return err } } return nil } func unmarshalAndCheckTextHash(r io.Reader, f *Filter) (err error) { _, err = fmt.Fscanf(r, "sha384") if err != nil { return err } actualHash := [sha512.Size384]byte{} for i := range actualHash { _, err = fmt.Fscanf(r, "%02x", actualHash[i]) if err != nil { return err } } _, expectedHash, err := f.marshal() if err != nil { return err } if !hmac.Equal(expectedHash[:], actualHash[:]) { return errHash() } return nil } // UnmarshalText conforms to TextUnmarshaler func UnmarshalText(text []byte) (f *Filter, err error) { r := bytes.NewBuffer(text) k, n, m, err := unmarshalTextHeader(r) if err != nil { return nil, err } keys, err := newKeysBlank(k) if err != nil { return nil, err } err = unmarshalTextKeys(r, keys) if err != nil { return nil, err } bits, err := newBits(m) if err != nil { return nil, err } err = unmarshalTextBits(r, bits) if err != nil { return nil, err } f, err = newWithKeysAndBits(m, keys, bits, n) if err != nil { return nil, err } err = unmarshalAndCheckTextHash(r, f) if err != nil { return nil, err } return f, nil } // UnmarshalText method overwrites f with data decoded from text func (f *Filter) UnmarshalText(text []byte) error { f.lock.Lock() defer f.lock.Unlock() f2, err := UnmarshalText(text) if err != nil { return err } f.m = f2.m f.n = f2.n copy(f.bits, f2.bits) copy(f.keys, f2.keys) return nil }