Repository: 10XGenomics/rust-boomphf Branch: master Commit: 4509a5bd8450 Files: 11 Total size: 76.1 KB Directory structure: gitextract_r2bq9488/ ├── .github/ │ └── workflows/ │ └── test.yml ├── .gitignore ├── .travis.yml ├── Cargo.toml ├── LICENSE ├── README.md ├── benches/ │ └── build.rs └── src/ ├── bitvector.rs ├── hashmap.rs ├── lib.rs └── par_iter.rs ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/test.yml ================================================ name: Run tests on: pull_request: push: branches: - master permissions: contents: read # Write permissions are required in order to produce annotations. checks: write jobs: test: runs-on: ubuntu-20.04 strategy: matrix: rust: - "1.60.0" - "1.65.0" steps: - uses: dtolnay/rust-toolchain@master with: toolchain: ${{ matrix.rust }} components: rustfmt, clippy - name: Checkout Git repository uses: actions/checkout@v3 - name: Cache dependencies uses: Swatinem/rust-cache@v2 - name: Check Rust formatting run: cargo fmt -- --check - name: lint with clippy uses: 10XGenomics/clippy-check@main with: token: ${{ secrets.GITHUB_TOKEN }} args: | --all-targets --all-features -- -D clippy::perf -D clippy::wildcard_imports -D clippy::redundant_closure_for_method_calls -D clippy::enum_glob_use -A clippy::float_cmp -F clippy::unused_io_amount -W future_incompatible -D nonstandard_style -D rust_2018_compatibility -D rust_2021_compatibility -D unused - name: Run Rust tests run: cargo test --release - name: Run Rust tests without parallelism run: cargo test --release --no-default-features ================================================ FILE: .gitignore ================================================ target/ **/*.rs.bk Cargo.lock .vscode/ ================================================ FILE: .travis.yml ================================================ language: rust # Cache cargo symbols for faster 
build cache: cargo env: global: - RUST_BACKTRACE=1 rust: - stable before_script: - export PATH=$HOME/.cargo/bin:$PATH - cargo install cargo-update || echo "cargo-update already installed" - cargo install cargo-travis || echo "cargo-travis already installed" - cargo install-update -a # update outdated cached binaries # the main build script: - cargo build - cargo test - cargo doc --no-deps after_success: # upload documentation to github.io (gh-pages branch) - cargo doc-upload ================================================ FILE: Cargo.toml ================================================ [package] name = "boomphf" version = "0.6.0" authors = ["Patrick Marks "] repository = "https://github.com/10XGenomics/rust-boomphf" homepage = "https://github.com/10XGenomics/rust-boomphf" license = "MIT" description = "Scalable and Efficient Minimal Perfect Hash Functions" documentation = "https://10xgenomics.github.io/rust-boomphf/master/boomphf/index.html" readme = "README.md" keywords = ["hashing", "minimal", "perfect"] edition = "2021" include = ["src/**/*", "benches/*", "LICENSE", "README.md"] [dependencies] serde = { version=">=1.0", optional = true, features=["derive"] } rayon = { version=">=1.0", optional = true } crossbeam-utils = { version=">=0.7.2, <0.9", optional = true } wyhash = ">=0.3, <=0.5" log = "0.4.*" [dev-dependencies] quickcheck = "1.0.2" bencher = ">=0.1" [[bench]] name = "build" harness = false [features] default = ["parallel"] parallel = ["rayon", "crossbeam-utils"] ================================================ FILE: LICENSE ================================================ The MIT License (MIT) Copyright (c) 2014-2017 10x Genomics, Inc. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

================================================
FILE: README.md
================================================
# Fast and Scalable Minimal Perfect Hash Functions in Rust

A Rust implementation of [**Fast and scalable minimal perfect hashing for massive key sets**](https://arxiv.org/abs/1702.03154).

The library generates a minimal perfect hash function (MPHF) for a collection of hashable objects. This algorithm generates MPHFs that consume ~3-6 bits/item. The memory consumption during construction is a small multiple (< 2x) of the size of the dataset and final size of the MPHF.

Note, minimal perfect hash functions only return a usable hash value for objects in the set used to create the MPHF. Hashing a new object will return an arbitrary hash value. If your use case may result in hashing new values, you will need an auxiliary scheme to detect this condition.
See [Docs](https://10xgenomics.github.io/rust-boomphf/)

Example usage:

```rust
use boomphf::*;

// sample set of objects
let possible_objects = vec![1, 10, 1000, 23, 457, 856, 845, 124, 912];
let n = possible_objects.len();

// generate a minimal perfect hash function of these items
let phf = Mphf::new(1.7, &possible_objects);

// Get hash value of all objects
let mut hashes = Vec::new();
for v in possible_objects {
    hashes.push(phf.hash(&v));
}
hashes.sort();

// Expected hash output is set of all integers from 0..n
let expected_hashes: Vec<u64> = (0 .. n as u64).collect();
assert!(hashes == expected_hashes)
```

Note: this crate carries its own bit-vector implementation to support rank-select queries and multi-threaded read-write access.

================================================
FILE: benches/build.rs
================================================
#[cfg(test)]
#[macro_use]
extern crate bencher;

use bencher::Bencher;

use boomphf::Mphf;

fn build1_ser(bench: &mut Bencher) {
    bench.iter(|| {
        let items: Vec = (0..1000000u64).map(|x| x * 2).collect();
        let _ = Mphf::new(2.0, &items);
    });
}

#[allow(dead_code)]
fn build1_par(bench: &mut Bencher) {
    #[cfg(feature = "parallel")]
    bench.iter(|| {
        let items: Vec = (0..1000000u64).map(|x| x * 2).collect();
        let _ = Mphf::new_parallel(2.0, &items, None);
    });
}

fn scan1_ser(bench: &mut Bencher) {
    let items: Vec = (0..1000000u64).map(|x| x * 2).collect();
    let phf = Mphf::new(2.0, &items);

    bench.iter(|| {
        for i in (0..1000000u64).map(|x| x * 2) {
            phf.hash(&i);
        }
    });
}

benchmark_group!(benches, build1_ser, build1_par, scan1_ser);
benchmark_main!(benches);

================================================
FILE: src/bitvector.rs
================================================
// Copyright (c) 2018 10x Genomics, Inc. All rights reserved.
//
// Note this code was copied from https://github.com/zhaihj/bitvector (MIT licensed),
// and modified to add rank/select operations, and to use atomic primitives to allow
// multi-threaded access. The original copyright license text is here:
//
// The MIT License (MIT)
//
// Copyright (c) 2016 Hongjie Zhai

//! ### BitVector Module
//!
//! BitVector uses one bit to represent a bool state.
//! BitVector is useful for programs that need fast set operations (intersection, union,
//! difference), because all these operations can be done with simple bitand, bitor, bitxor.
//!
//! ### Implementation Details
//!
//! BitVector is realized with a `Vec<u64>`. Each bit of a u64 represents whether an element exists.
//! BitVector always grows from the least significant bit upwards: if you add element `0` to an
//! empty bitvector, the `Vec<u64>` will change from `0x00` to `0x01`.
//!
//! Of course, if the real length of the set is not a multiple of 64,
//! the last word has `64 - capacity() % 64` unused bits.
//!
use std::fmt; #[cfg(feature = "parallel")] use std::sync::atomic::{AtomicU64, Ordering}; #[cfg(feature = "serde")] use serde::{self, Deserialize, Serialize}; #[cfg(feature = "parallel")] type Word = AtomicU64; #[cfg(not(feature = "parallel"))] type Word = u64; /// Bitvector #[derive(Debug)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct BitVector { bits: u64, #[cfg(feature = "parallel")] #[cfg_attr( feature = "serde", serde(serialize_with = "ser_atomic_vec", deserialize_with = "de_atomic_vec") )] vector: Box<[AtomicU64]>, #[cfg(not(feature = "parallel"))] vector: Box<[u64]>, } // Custom serializer #[cfg(all(feature = "serde", feature = "parallel"))] fn ser_atomic_vec(v: &[AtomicU64], serializer: S) -> Result where S: serde::Serializer, { use serde::ser::SerializeSeq; let mut seq = serializer.serialize_seq(Some(v.len()))?; for x in v { seq.serialize_element(&x.load(Ordering::SeqCst))?; } seq.end() } // Custom deserializer #[cfg(all(feature = "serde", feature = "parallel"))] pub fn de_atomic_vec<'de, D>(deserializer: D) -> Result, D::Error> where D: serde::Deserializer<'de>, { struct AtomicU64SeqVisitor; impl<'de> serde::de::Visitor<'de> for AtomicU64SeqVisitor { type Value = Box<[AtomicU64]>; fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { formatter.write_str("a 64bit unsigned integer") } fn visit_seq(self, mut access: S) -> Result where S: serde::de::SeqAccess<'de>, { let mut vec = Vec::::with_capacity(access.size_hint().unwrap_or(0)); while let Some(x) = access.next_element()? 
{ vec.push(AtomicU64::new(x)); } Ok(vec.into_boxed_slice()) } } let x = AtomicU64SeqVisitor; deserializer.deserialize_seq(x) } impl core::clone::Clone for BitVector { fn clone(&self) -> Self { Self { bits: self.bits, #[cfg(feature = "parallel")] vector: self .vector .iter() .map(|x| AtomicU64::new(x.load(Ordering::SeqCst))) .collect(), #[cfg(not(feature = "parallel"))] vector: self.vector.clone(), } } } impl fmt::Display for BitVector { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "[")?; write!( f, "{}", self.iter() .fold(String::new(), |x0, x| x0 + &format!("{}, ", x)) )?; write!(f, "]")?; Ok(()) } } impl PartialEq for BitVector { fn eq(&self, other: &BitVector) -> bool { self.eq_left(other, self.bits) } } impl BitVector { /// Build a new empty bitvector pub fn new(bits: u64) -> Self { let n = u64s(bits); let mut v: Vec = Vec::with_capacity(n as usize); for _ in 0..n { v.push(Word::default()); } BitVector { bits, vector: v.into_boxed_slice(), } } /// new bitvector contains all elements /// /// If `bits % 64 > 0`, the last u64 is guaranteed not to /// have any extra 1 bits. #[allow(dead_code)] pub fn ones(bits: u64) -> Self { let (word, offset) = word_offset(bits); let mut bvec: Vec = Vec::with_capacity((word + 1) as usize); for _ in 0..word { bvec.push(u64::max_value().into()); } let last_val = u64::max_value() >> (64 - offset); bvec.push(last_val.into()); BitVector { bits, vector: bvec.into_boxed_slice(), } } /// return if this set is empty /// /// if set does not contain any elements, return true; /// else return false. /// /// This method is averagely faster than `self.len() > 0`. 
#[allow(dead_code)]
pub fn is_empty(&self) -> bool {
    (0..self.vector.len()).all(|i| self.get_word(i) == 0)
}

/// Number of elements currently in the set (population count over all words).
pub fn len(&self) -> u64 {
    (0..self.vector.len())
        .map(|i| u64::from(self.get_word(i).count_ones()))
        .sum()
}

/*
/// Clear all elements from a bitvector
pub fn clear(&mut self) {
    for p in &mut self.vector {
        *p = 0;
    }
}
*/

/// Returns `true` when `bit` is a member of the set.
///
/// Insert, remove and contains do not do bound check.
#[inline]
pub fn contains(&self, bit: u64) -> bool {
    let (word, mask) = word_mask(bit);
    self.get_word(word) & mask != 0
}

/// Checks whether both sets agree on every index below `bit`:
///
/// self \cap {0, 1, ... , bit - 1} == other \cap {0, 1, ... ,bit - 1}
pub fn eq_left(&self, other: &BitVector, bit: u64) -> bool {
    if bit == 0 {
        return true;
    }
    let (word, offset) = word_offset(bit - 1);
    let last = word as usize;

    // Whole words are compared one by one. A slice comparison
    // (`self.vector[..last] == other.vector[..last]`) would be a one-liner,
    // but the slice `Eq` implementation has been reported to be extremely slow.
    let full_words_match = self
        .vector
        .iter()
        .zip(other.vector.iter())
        .take(last)
        .all(|(lhs, rhs)| {
            #[cfg(feature = "parallel")]
            return lhs.load(Ordering::Relaxed) == rhs.load(Ordering::Relaxed);
            #[cfg(not(feature = "parallel"))]
            return lhs == rhs;
        });
    if !full_words_match {
        return false;
    }

    // Shift away the bits above `offset` so that only the requested prefix
    // of the last word takes part in the comparison.
    let shift = 63 - offset;
    (self.get_word(last) << shift) == (other.get_word(last) << shift)
}

/// insert a new element to set
///
/// If value is inserted, return true,
/// if value already exists in set, return false.
///
/// Insert, remove and contains do not do bound check.
#[inline]
#[cfg(feature = "parallel")]
pub fn insert(&self, bit: u64) -> bool {
    let (word, mask) = word_mask(bit);
    let before = self.vector[word].fetch_or(mask, Ordering::Relaxed);
    before & mask == 0
}

#[inline]
#[cfg(not(feature = "parallel"))]
pub fn insert(&mut self, bit: u64) -> bool {
    let (word, mask) = word_mask(bit);
    let slot = &mut self.vector[word];
    let newly_set = *slot & mask == 0;
    *slot |= mask;
    newly_set
}

/// Non-atomic insert. Takes `&mut self`, so no atomic instruction is
/// needed and it may be faster than `insert()`.
///
/// Returns true if the value was inserted,
/// false if it was already present.
///
/// Insert, remove and contains do not do bound check.
#[inline]
pub fn insert_sync(&mut self, bit: u64) -> bool {
    let (word, mask) = word_mask(bit);
    #[cfg(feature = "parallel")]
    let slot = self.vector[word].get_mut();
    #[cfg(not(feature = "parallel"))]
    let slot = &mut self.vector[word];

    let newly_set = *slot & mask == 0;
    *slot |= mask;
    newly_set
}

/// Removes `bit` from the set.
///
/// Returns true if the value was present and has been removed,
/// false if it was not in the set.
///
/// Insert, remove and contains do not do bound check.
#[cfg(feature = "parallel")]
pub fn remove(&self, bit: u64) -> bool {
    let (word, mask) = word_mask(bit);
    let before = self.vector[word].fetch_and(!mask, Ordering::Relaxed);
    before & mask != 0
}

#[cfg(not(feature = "parallel"))]
pub fn remove(&mut self, bit: u64) -> bool {
    let (word, mask) = word_mask(bit);
    let slot = &mut self.vector[word];
    let was_set = *slot & mask != 0;
    *slot &= !mask;
    was_set
}

/// import elements from another bitvector
///
/// If any new value is inserted, return true,
/// else return false.
#[allow(dead_code)] #[cfg(feature = "parallel")] pub fn insert_all(&self, all: &BitVector) -> bool { assert!(self.vector.len() == all.vector.len()); let mut changed = false; for (i, j) in self.vector.iter().zip(all.vector.iter()) { let prev = i.fetch_or(j.load(Ordering::Relaxed), Ordering::Relaxed); if prev != i.load(Ordering::Relaxed) { changed = true; } } changed } #[allow(dead_code)] #[cfg(not(feature = "parallel"))] pub fn insert_all(&mut self, all: &BitVector) -> bool { assert!(self.vector.len() == all.vector.len()); let mut changed = false; for (i, j) in self.vector.iter_mut().zip(all.vector.iter()) { let prev = *i; *i |= *j; if prev != *i { changed = true; } } changed } /// the max number of elements can be inserted into set pub fn capacity(&self) -> u64 { self.bits } #[inline] pub fn get_word(&self, word: usize) -> u64 { #[cfg(feature = "parallel")] return self.vector[word].load(Ordering::Relaxed) as u64; #[cfg(not(feature = "parallel"))] return self.vector[word] as u64; } pub fn num_words(&self) -> usize { self.vector.len() } /// Return a iterator of the set element in the bitvector, pub fn iter(&self) -> BitVectorIter<'_> { BitVectorIter { iter: self.vector.iter(), current: 0, idx: 0, size: self.bits, } } } /// Iterator for BitVector pub struct BitVectorIter<'a> { iter: ::std::slice::Iter<'a, Word>, current: u64, idx: u64, size: u64, } impl<'a> Iterator for BitVectorIter<'a> { type Item = u64; fn next(&mut self) -> Option { if self.idx >= self.size { return None; } while self.current == 0 { self.current = if let Some(_i) = self.iter.next() { #[cfg(feature = "parallel")] let i = _i.load(Ordering::Relaxed); #[cfg(not(feature = "parallel"))] let i = *_i; if i == 0 { self.idx += 64; continue; } else { self.idx = u64s(self.idx) * 64; i } } else { return None; } } let offset = self.current.trailing_zeros() as u64; self.current >>= offset; self.current >>= 1; // shift otherwise overflows for 0b1000_0000_…_0000 self.idx += offset + 1; Some(self.idx - 1) } } fn 
u64s(elements: u64) -> u64 { (elements + 63) / 64 } fn word_offset(index: u64) -> (u64, u64) { (index / 64, index % 64) } #[inline] fn word_mask(index: u64) -> (usize, u64) { let word = (index / 64) as usize; let mask = 1 << (index % 64); (word, mask) } #[cfg(test)] mod tests { use super::*; #[test] fn union_two_vecs() { #[allow(unused_mut)] let mut vec1 = BitVector::new(65); #[allow(unused_mut)] let mut vec2 = BitVector::new(65); assert!(vec1.insert(3)); assert!(!vec1.insert(3)); assert!(vec2.insert(5)); assert!(vec2.insert(64)); assert!(vec1.insert_all(&vec2)); assert!(!vec1.insert_all(&vec2)); assert!(vec1.contains(3)); assert!(!vec1.contains(4)); assert!(vec1.contains(5)); assert!(!vec1.contains(63)); assert!(vec1.contains(64)); } #[test] fn bitvec_iter_works() { #[allow(unused_mut)] let mut bitvec = BitVector::new(100); bitvec.insert(1); bitvec.insert(10); bitvec.insert(19); bitvec.insert(62); bitvec.insert(63); bitvec.insert(64); bitvec.insert(65); bitvec.insert(66); bitvec.insert(99); assert_eq!( bitvec.iter().collect::>(), [1, 10, 19, 62, 63, 64, 65, 66, 99] ); } #[test] fn bitvec_iter_works_2() { #[allow(unused_mut)] let mut bitvec = BitVector::new(319); bitvec.insert(0); bitvec.insert(127); bitvec.insert(191); bitvec.insert(255); bitvec.insert(319); assert_eq!(bitvec.iter().collect::>(), [0, 127, 191, 255, 319]); } #[test] fn eq_left() { #[allow(unused_mut)] let mut bitvec = BitVector::new(50); for i in &[0, 1, 3, 5, 11, 12, 19, 23] { bitvec.insert(*i); } #[allow(unused_mut)] let mut bitvec2 = BitVector::new(50); for i in &[0, 1, 3, 5, 7, 11, 13, 17, 19, 23] { bitvec2.insert(*i); } assert!(bitvec.eq_left(&bitvec2, 1)); assert!(bitvec.eq_left(&bitvec2, 2)); assert!(bitvec.eq_left(&bitvec2, 3)); assert!(bitvec.eq_left(&bitvec2, 4)); assert!(bitvec.eq_left(&bitvec2, 5)); assert!(bitvec.eq_left(&bitvec2, 6)); assert!(bitvec.eq_left(&bitvec2, 7)); assert!(!bitvec.eq_left(&bitvec2, 8)); assert!(!bitvec.eq_left(&bitvec2, 9)); assert!(!bitvec.eq_left(&bitvec2, 
50)); } #[test] fn eq() { #[allow(unused_mut)] let mut bitvec = BitVector::new(50); for i in &[0, 1, 3, 5, 11, 12, 19, 23] { bitvec.insert(*i); } #[allow(unused_mut)] let mut bitvec2 = BitVector::new(50); for i in &[0, 1, 3, 5, 7, 11, 13, 17, 19, 23] { bitvec2.insert(*i); } #[allow(unused_mut)] let mut bitvec3 = BitVector::new(50); for i in &[0, 1, 3, 5, 11, 12, 19, 23] { bitvec3.insert(*i); } assert!(bitvec != bitvec2); assert!(bitvec == bitvec3); assert!(bitvec2 != bitvec3); } #[test] fn remove() { #[allow(unused_mut)] let mut bitvec = BitVector::new(50); for i in &[0, 1, 3, 5, 11, 12, 19, 23] { bitvec.insert(*i); } assert!(bitvec.contains(3)); bitvec.remove(3); assert!(!bitvec.contains(3)); assert_eq!( bitvec.iter().collect::>(), vec![0, 1, 5, 11, 12, 19, 23] ); } #[test] fn is_empty() { assert!(!BitVector::ones(60).is_empty()); assert!(!BitVector::ones(65).is_empty()); #[allow(unused_mut)] let mut bvec = BitVector::new(60); assert!(bvec.is_empty()); bvec.insert(5); assert!(!bvec.is_empty()); bvec.remove(5); assert!(bvec.is_empty()); #[allow(unused_mut)] let mut bvec = BitVector::ones(65); for i in 0..65 { bvec.remove(i); } assert!(bvec.is_empty()); } #[test] fn test_ones() { let bvec = BitVector::ones(60); for i in 0..60 { assert!(bvec.contains(i)); } assert_eq!(bvec.iter().collect::>(), (0..60).collect::>()); } #[test] fn len() { assert_eq!(BitVector::ones(60).len(), 60); assert_eq!(BitVector::ones(65).len(), 65); assert_eq!(BitVector::new(65).len(), 0); #[allow(unused_mut)] let mut bvec = BitVector::new(60); bvec.insert(5); assert_eq!(bvec.len(), 1); bvec.insert(6); assert_eq!(bvec.len(), 2); bvec.remove(5); assert_eq!(bvec.len(), 1); } } #[cfg(all(feature = "unstable", test))] mod bench { extern crate test; use self::test::Bencher; use super::*; use std::collections::{BTreeSet, HashSet}; #[bench] fn bench_bitset_operator(b: &mut Bencher) { b.iter(|| { #[allow(unused_mut)] let mut vec1 = BitVector::new(65); #[allow(unused_mut)] let mut vec2 = 
BitVector::new(65); for i in vec![0, 1, 2, 10, 15, 18, 25, 31, 40, 42, 60, 64] { vec1.insert(i); } for i in vec![3, 5, 7, 12, 13, 15, 21, 25, 30, 29, 42, 50, 61, 62, 63, 64] { vec2.insert(i); } vec1.intersection(&vec2); vec1.union(&vec2); vec1.difference(&vec2); }); } #[bench] fn bench_bitset_operator_inplace(b: &mut Bencher) { b.iter(|| { #[allow(unused_mut)] let mut vec1 = BitVector::new(65); #[allow(unused_mut)] let mut vec2 = BitVector::new(65); for i in vec![0, 1, 2, 10, 15, 18, 25, 31, 40, 42, 60, 64] { vec1.insert(i); } for i in vec![3, 5, 7, 12, 13, 15, 21, 25, 30, 29, 42, 50, 61, 62, 63, 64] { vec2.insert(i); } vec1.intersection_inplace(&vec2); vec1.union_inplace(&vec2); vec1.difference_inplace(&vec2); }); } #[bench] fn bench_hashset_operator(b: &mut Bencher) { b.iter(|| { #[allow(unused_mut)] let mut vec1 = HashSet::with_capacity(65); #[allow(unused_mut)] let mut vec2 = HashSet::with_capacity(65); for i in vec![0, 1, 2, 10, 15, 18, 25, 31, 40, 42, 60, 64] { vec1.insert(i); } for i in vec![3, 5, 7, 12, 13, 15, 21, 25, 30, 29, 42, 50, 61, 62, 63, 64] { vec2.insert(i); } vec1.intersection(&vec2).cloned().collect::>(); vec1.union(&vec2).cloned().collect::>(); vec1.difference(&vec2).cloned().collect::>(); }); } #[bench] fn bench_btreeset_operator(b: &mut Bencher) { b.iter(|| { #[allow(unused_mut)] let mut vec1 = BTreeSet::new(); #[allow(unused_mut)] let mut vec2 = BTreeSet::new(); for i in vec![0, 1, 2, 10, 15, 18, 25, 31, 40, 42, 60, 64] { vec1.insert(i); } for i in vec![3, 5, 7, 12, 13, 15, 21, 25, 30, 29, 42, 50, 61, 62, 63, 64] { vec2.insert(i); } vec1.intersection(&vec2).cloned().collect::>(); vec1.union(&vec2).cloned().collect::>(); vec1.difference(&vec2).cloned().collect::>(); }); } } ================================================ FILE: src/hashmap.rs ================================================ //! HashMap data structures, using MPHFs to encode the position of each key in a dense array. 
#[cfg(feature = "serde")] use serde::{self, Deserialize, Serialize}; use crate::Mphf; use std::borrow::Borrow; use std::fmt::Debug; use std::hash::Hash; use std::iter::ExactSizeIterator; /// A HashMap data structure where the mapping between keys and values is encoded in a Mphf. This lets us store the keys and values in dense /// arrays, with ~3 bits/item overhead in the Mphf. #[derive(Debug, Clone)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct BoomHashMap { mphf: Mphf, pub(crate) keys: Vec, pub(crate) values: Vec, } impl BoomHashMap where K: Hash + Debug + PartialEq, D: Debug, { fn create_map(mut keys: Vec, mut values: Vec, mphf: Mphf) -> BoomHashMap { // reorder the keys and values according to the Mphf for i in 0..keys.len() { loop { let kmer_slot = mphf.hash(&keys[i]) as usize; if i == kmer_slot { break; } keys.swap(i, kmer_slot); values.swap(i, kmer_slot); } } BoomHashMap { mphf, keys, values } } /// Create a new hash map from the parallel array `keys` and `values` pub fn new(keys: Vec, data: Vec) -> BoomHashMap { let mphf = Mphf::new(1.7, &keys); Self::create_map(keys, data, mphf) } /// Get the value associated with `key`. You must use a key that was supplied during the creation of the BoomHashMap. Querying for a new key will yield `Some` with a random value, or `None`. Querying with a valid key will always return `Some`. pub fn get(&self, kmer: &Q) -> Option<&D> where K: Borrow, Q: Hash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); match maybe_pos { Some(pos) => { let hashed_kmer = &self.keys[pos as usize]; if kmer == hashed_kmer.borrow() { Some(&self.values[pos as usize]) } else { None } } None => None, } } /// Mutably get the value associated with `key`. You must use a key that was supplied during the creation of the BoomHashMap. Querying for a new key will yield `Some` with a random value, or `None`. Querying with a valid key will always return `Some`. 
pub fn get_mut(&mut self, kmer: &Q) -> Option<&mut D> where K: Borrow, Q: Hash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); match maybe_pos { Some(pos) => { let hashed_kmer = &self.keys[pos as usize]; if kmer == hashed_kmer.borrow() { Some(&mut self.values[pos as usize]) } else { None } } None => None, } } /// Get the position in the Mphf of a key, if the key exists. pub fn get_key_id(&self, kmer: &Q) -> Option where K: Borrow, Q: Hash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); match maybe_pos { Some(pos) => { let hashed_kmer = &self.keys[pos as usize]; if kmer == hashed_kmer.borrow() { Some(pos as usize) } else { None } } None => None, } } /// Total number of key/value pairs pub fn len(&self) -> usize { self.keys.len() } pub fn is_empty(&self) -> bool { self.keys.is_empty() } pub fn get_key(&self, id: usize) -> Option<&K> { let max_key_id = self.len(); if id > max_key_id { None } else { Some(&self.keys[id]) } } pub fn iter(&self) -> BoomIterator { BoomIterator { hash: self, index: 0, } } } impl core::iter::FromIterator<(K, D)> for BoomHashMap where K: Hash + Debug + PartialEq, D: Debug, { fn from_iter>(iter: I) -> Self { let mut keys = Vec::new(); let mut values = Vec::new(); for (k, v) in iter { keys.push(k); values.push(v); } Self::new(keys, values) } } #[cfg(feature = "parallel")] pub trait ConstructibleKey: Hash + Debug + PartialEq + Send + Sync {} #[cfg(feature = "parallel")] impl ConstructibleKey for T where T: Hash + Debug + PartialEq + Send + Sync {} #[cfg(not(feature = "parallel"))] pub trait ConstructibleKey: Hash + Debug + PartialEq {} #[cfg(not(feature = "parallel"))] impl ConstructibleKey for T where T: Hash + Debug + PartialEq {} #[cfg(feature = "parallel")] impl BoomHashMap where K: Hash + Debug + PartialEq + Send + Sync, D: Debug, { /// Create a new hash map from the parallel array `keys` and `values`, using a parallelized method to construct the Mphf. 
pub fn new_parallel(keys: Vec<K>, data: Vec<D>) -> BoomHashMap<K, D> {
    let mphf = Mphf::new_parallel(1.7, &keys, None);
    Self::create_map(keys, data, mphf)
}
}

/// Iterate over key-value pairs in a BoomHashMap
pub struct BoomIterator<'a, K: Hash + 'a, D: 'a> {
    hash: &'a BoomHashMap<K, D>,
    /// Position of the next pair to yield.
    index: usize,
}

impl<'a, K: Hash, D> Iterator for BoomIterator<'a, K, D> {
    type Item = (&'a K, &'a D);

    fn next(&mut self) -> Option<Self::Item> {
        if self.index == self.hash.keys.len() {
            return None;
        }
        let elements = Some((&self.hash.keys[self.index], &self.hash.values[self.index]));
        self.index += 1;
        elements
    }

    /// Exact: the number of pairs not yet yielded.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = self.hash.keys.len() - self.index;
        (remaining, Some(remaining))
    }
}

impl<'a, K: Hash, D1> ExactSizeIterator for BoomIterator<'a, K, D1> {}

impl<'a, K: Hash, D> IntoIterator for &'a BoomHashMap<K, D> {
    type Item = (&'a K, &'a D);
    type IntoIter = BoomIterator<'a, K, D>;

    fn into_iter(self) -> BoomIterator<'a, K, D> {
        BoomIterator {
            hash: self,
            index: 0,
        }
    }
}

/// A HashMap data structure where the mapping between keys and 2 values is encoded in a Mphf. You should usually use `BoomHashMap` with a tuple/struct value type.
/// If the layout overhead of the struct / tuple must be avoided, this variant is an alternative.
/// This lets us store the keys and values in dense
/// arrays, with ~3 bits/item overhead in the Mphf.
#[derive(Debug, Clone)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct BoomHashMap2 { mphf: Mphf, keys: Vec, values: Vec, aux_values: Vec, } pub struct Boom2Iterator<'a, K: Hash + 'a, D1: 'a, D2: 'a> { hash: &'a BoomHashMap2, index: usize, } impl<'a, K: Hash, D1, D2> Iterator for Boom2Iterator<'a, K, D1, D2> { type Item = (&'a K, &'a D1, &'a D2); fn next(&mut self) -> Option { if self.index == self.hash.keys.len() { return None; } let elements = Some(( &self.hash.keys[self.index], &self.hash.values[self.index], &self.hash.aux_values[self.index], )); self.index += 1; elements } fn size_hint(&self) -> (usize, Option) { let remaining = self.hash.keys.len() - self.index; (remaining, Some(remaining)) } } impl<'a, K: Hash, D1, D2> ExactSizeIterator for Boom2Iterator<'a, K, D1, D2> {} impl<'a, K: Hash, D1, D2> IntoIterator for &'a BoomHashMap2 { type Item = (&'a K, &'a D1, &'a D2); type IntoIter = Boom2Iterator<'a, K, D1, D2>; fn into_iter(self) -> Boom2Iterator<'a, K, D1, D2> { Boom2Iterator { hash: self, index: 0, } } } impl BoomHashMap2 where K: Hash + Debug + PartialEq, D1: Debug, D2: Debug, { fn create_map( mut keys: Vec, mut values: Vec, mut aux_values: Vec, mphf: Mphf, ) -> BoomHashMap2 { // reorder the keys and values according to the Mphf for i in 0..keys.len() { loop { let kmer_slot = mphf.hash(&keys[i]) as usize; if i == kmer_slot { break; } keys.swap(i, kmer_slot); values.swap(i, kmer_slot); aux_values.swap(i, kmer_slot); } } BoomHashMap2 { mphf, keys, values, aux_values, } } /// Create a new hash map from the parallel arrays `keys` and `values`, and `aux_values` pub fn new(keys: Vec, values: Vec, aux_values: Vec) -> BoomHashMap2 { let mphf = Mphf::new(1.7, &keys); Self::create_map(keys, values, aux_values, mphf) } pub fn get(&self, kmer: &Q) -> Option<(&D1, &D2)> where K: Borrow, Q: Hash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); match maybe_pos { Some(pos) => { let hashed_kmer = &self.keys[pos as usize]; if kmer == 
hashed_kmer.borrow() { Some((&self.values[pos as usize], &self.aux_values[pos as usize])) } else { None } } None => None, } } pub fn get_mut(&mut self, kmer: &Q) -> Option<(&mut D1, &mut D2)> where K: Borrow, Q: Hash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); match maybe_pos { Some(pos) => { let hashed_kmer = &self.keys[pos as usize]; if kmer == hashed_kmer.borrow() { Some(( &mut self.values[pos as usize], &mut self.aux_values[pos as usize], )) } else { None } } None => None, } } pub fn get_key_id(&self, kmer: &Q) -> Option where K: Borrow, Q: Hash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); match maybe_pos { Some(pos) => { let hashed_kmer = &self.keys[pos as usize]; if kmer == hashed_kmer.borrow() { Some(pos as usize) } else { None } } None => None, } } pub fn len(&self) -> usize { self.keys.len() } pub fn is_empty(&self) -> bool { self.keys.is_empty() } // Return iterator over key-values pairs pub fn iter(&self) -> Boom2Iterator { Boom2Iterator { hash: self, index: 0, } } pub fn get_key(&self, id: usize) -> Option<&K> { let max_key_id = self.len(); if id > max_key_id { None } else { Some(&self.keys[id]) } } } impl core::iter::FromIterator<(K, D1, D2)> for BoomHashMap2 where K: Hash + Debug + PartialEq, D1: Debug, D2: Debug, { fn from_iter>(iter: I) -> Self { let mut keys = Vec::new(); let mut values1 = Vec::new(); let mut values2 = Vec::new(); for (k, v1, v2) in iter { keys.push(k); values1.push(v1); values2.push(v2); } Self::new(keys, values1, values2) } } #[cfg(feature = "parallel")] impl BoomHashMap2 where K: Hash + Debug + PartialEq + Send + Sync, D1: Debug, D2: Debug, { /// Create a new hash map from the parallel arrays `keys` and `values`, and `aux_values`, using a parallel algorithm to construct the Mphf. 
pub fn new_parallel(keys: Vec<K>, data: Vec<D1>, aux_data: Vec<D2>) -> BoomHashMap2<K, D1, D2> {
        let mphf = Mphf::new_parallel(1.7, &keys, None);
        Self::create_map(keys, data, aux_data, mphf)
    }
}

/// A HashMap data structure where the mapping between keys and values is encoded in a Mphf. *Keys are not stored* - this can greatly improve the memory consumption,
/// but can only be used if you can guarantee that you will only query for keys that were in the original set. Querying for a new key will silently return
/// either an arbitrary value or `None`.
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct NoKeyBoomHashMap<K, D1> {
    pub mphf: Mphf<K>,
    pub values: Vec<D1>,
}

impl<K, D1> core::iter::FromIterator<(K, D1)> for NoKeyBoomHashMap<K, D1>
where
    K: ConstructibleKey,
    D1: Debug,
{
    /// Collect `(key, value)` pairs into two parallel vectors and build the
    /// map, using the parallel constructor when the `parallel` feature is on.
    fn from_iter<I: IntoIterator<Item = (K, D1)>>(iter: I) -> Self {
        let (keys, values1): (Vec<K>, Vec<D1>) = iter.into_iter().unzip();

        #[cfg(feature = "parallel")]
        return Self::new_parallel(keys, values1);
        #[cfg(not(feature = "parallel"))]
        return Self::new(keys, values1);
    }
}

impl<K, D1> NoKeyBoomHashMap<K, D1>
where
    K: ConstructibleKey,
    D1: Debug,
{
    /// Permute `values` in place so that the value of each key sits at slot
    /// `mphf.hash(key)`. `keys` is permuted alongside to drive the swaps and
    /// is dropped afterwards.
    fn create_map(mut keys: Vec<K>, mut values: Vec<D1>, mphf: Mphf<K>) -> NoKeyBoomHashMap<K, D1> {
        for i in 0..keys.len() {
            // Swap the element at `i` towards its home slot until the element
            // that ends up at `i` belongs there.
            loop {
                let kmer_slot = mphf.hash(&keys[i]) as usize;
                if i == kmer_slot {
                    break;
                }
                keys.swap(i, kmer_slot);
                values.swap(i, kmer_slot);
            }
        }

        NoKeyBoomHashMap { mphf, values }
    }

    /// Create a new hash map from the parallel array `keys` and `values`
    /// serially using only this thread.
    pub fn new(keys: Vec<K>, data: Vec<D1>) -> NoKeyBoomHashMap<K, D1> {
        let mphf = Mphf::new(1.7, &keys);
        Self::create_map(keys, data, mphf)
    }

    /// Create a new hash map from the parallel array `keys` and `values`.
#[cfg(feature = "parallel")] pub fn new_parallel(keys: Vec, values: Vec) -> NoKeyBoomHashMap { let mphf = Mphf::new_parallel(1.7, &keys, None); Self::create_map(keys, values, mphf) } pub fn new_with_mphf(mphf: Mphf, values: Vec) -> NoKeyBoomHashMap { NoKeyBoomHashMap { mphf, values } } /// Get the value associated with `key`. You must use a key that was supplied during the creation of the BoomHashMap. Querying for a new key will yield `Some` with a random value, or `None`. Querying with a valid key will always return `Some`. pub fn get(&self, kmer: &Q) -> Option<&D1> where K: Borrow, Q: Hash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); match maybe_pos { Some(pos) => Some(&self.values[pos as usize]), _ => None, } } /// Mutably get the value associated with `key`. You must use a key that was supplied during the creation of the BoomHashMap. Querying for a new key will yield `Some` with a random value, or `None`. Querying with a valid key will always return `Some`. pub fn get_mut(&mut self, kmer: &Q) -> Option<&mut D1> where K: Borrow, Q: Hash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); match maybe_pos { Some(pos) => Some(&mut self.values[pos as usize]), _ => None, } } } /// A HashMap data structure where the mapping between keys and values is encoded in a Mphf. *Keys are not stored* - this can greatly improve the memory consumption, /// but can only be used if you can guarantee that you will only query for keys that were in the original set. Querying for a new key will return a random value, silently. 
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct NoKeyBoomHashMap2<K, D1, D2> {
    pub mphf: Mphf<K>,
    pub values: Vec<D1>,
    pub aux_values: Vec<D2>,
}

impl<K, D1, D2> core::iter::FromIterator<(K, D1, D2)> for NoKeyBoomHashMap2<K, D1, D2>
where
    K: ConstructibleKey,
    D1: Debug,
    D2: Debug,
{
    /// Collect `(key, value, aux_value)` triples into three parallel vectors
    /// and build the map, using the parallel constructor when the `parallel`
    /// feature is on.
    fn from_iter<I: IntoIterator<Item = (K, D1, D2)>>(iter: I) -> Self {
        let mut keys = Vec::new();
        let mut values1 = Vec::new();
        let mut values2 = Vec::new();

        for (k, v1, v2) in iter {
            keys.push(k);
            values1.push(v1);
            values2.push(v2);
        }

        #[cfg(feature = "parallel")]
        return Self::new_parallel(keys, values1, values2);
        #[cfg(not(feature = "parallel"))]
        return Self::new(keys, values1, values2);
    }
}

impl<K, D1, D2> NoKeyBoomHashMap2<K, D1, D2>
where
    K: ConstructibleKey,
    D1: Debug,
    D2: Debug,
{
    /// Permute `values` and `aux_values` in place so that the entries of each
    /// key sit at slot `mphf.hash(key)`. `keys` is permuted alongside to drive
    /// the swaps and is dropped afterwards.
    fn create_map(
        mphf: Mphf<K>,
        mut keys: Vec<K>,
        mut values: Vec<D1>,
        mut aux_values: Vec<D2>,
    ) -> Self {
        for i in 0..keys.len() {
            // Swap the element at `i` towards its home slot until the element
            // that ends up at `i` belongs there.
            loop {
                let kmer_slot = mphf.hash(&keys[i]) as usize;
                if i == kmer_slot {
                    break;
                }
                keys.swap(i, kmer_slot);
                values.swap(i, kmer_slot);
                aux_values.swap(i, kmer_slot);
            }
        }
        NoKeyBoomHashMap2 {
            mphf,
            values,
            aux_values,
        }
    }

    /// Create a new hash map from the parallel arrays `keys`, `values` and
    /// `aux_values`, building the Mphf serially.
    pub fn new(keys: Vec<K>, values: Vec<D1>, aux_values: Vec<D2>) -> NoKeyBoomHashMap2<K, D1, D2> {
        let mphf = Mphf::new(1.7, &keys);
        Self::create_map(mphf, keys, values, aux_values)
    }

    /// Same as `new`, but builds the Mphf with `Mphf::new_parallel`.
    #[cfg(feature = "parallel")]
    pub fn new_parallel(
        keys: Vec<K>,
        values: Vec<D1>,
        aux_values: Vec<D2>,
    ) -> NoKeyBoomHashMap2<K, D1, D2> {
        let mphf = Mphf::new_parallel(1.7, &keys, None);
        Self::create_map(mphf, keys, values, aux_values)
    }

    /// Assemble a map from an already-built `mphf` and value vectors.
    /// Nothing is reordered here, so both vectors are used exactly as given.
    pub fn new_with_mphf(
        mphf: Mphf<K>,
        values: Vec<D1>,
        aux_values: Vec<D2>,
    ) -> NoKeyBoomHashMap2<K, D1, D2> {
        NoKeyBoomHashMap2 {
            mphf,
            values,
            aux_values,
        }
    }

    /// Get the value associated with `key`. You must use a key that was supplied during the creation of the BoomHashMap. Querying for a new key will yield `Some` with a random value, or `None`. Querying with a valid key will always return `Some`.
pub fn get(&self, kmer: &Q) -> Option<(&D1, &D2)> where K: Borrow, Q: Hash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); maybe_pos.map(|pos| (&self.values[pos as usize], &self.aux_values[pos as usize])) } /// Mutably get the value associated with `key`. You must use a key that was supplied during the creation of the BoomHashMap. Querying for a new key will yield `Some` with a random value, or `None`. Querying with a valid key will always return `Some`. pub fn get_mut(&mut self, kmer: &Q) -> Option<(&mut D1, &mut D2)> where K: Borrow, Q: Hash + Eq, { let maybe_pos = self.mphf.try_hash(kmer); maybe_pos.map(|pos| { ( &mut self.values[pos as usize], &mut self.aux_values[pos as usize], ) }) } } ================================================ FILE: src/lib.rs ================================================ // Copyright (c) 2017 10X Genomics, Inc. All rights reserved. // Copyright (c) 2015 Guillaume Rizk // Some portions of this code are derived from https://github.com/rizkg/BBHash (MIT license) //! ### boomphf - Fast and scalable minimal perfect hashing for massive key sets //! A Rust implementation of the BBHash method for constructing minimal perfect hash functions, //! as described in "Fast and scalable minimal perfect hashing for massive key sets" //! [https://arxiv.org/abs/1702.03154](https://arxiv.org/abs/1702.03154). The library generates //! a minimal perfect hash function (MPHF) for a collection of hashable objects. Note: minimal //! perfect hash functions can only be used with the set of objects used when hash function //! was created. Hashing a new object will return an arbitrary hash value. If your use case //! may result in hashing new values, you will need an auxiliary scheme to detect this condition. //! //! ``` //! use boomphf::*; //! // Generate MPHF //! let possible_objects = vec![1, 10, 1000, 23, 457, 856, 845, 124, 912]; //! let n = possible_objects.len(); //! let phf = Mphf::new(1.7, &possible_objects); //! // Get hash value of all objects //! 
let mut hashes = Vec::new(); //! for v in possible_objects { //! hashes.push(phf.hash(&v)); //! } //! hashes.sort(); //! //! // Expected hash output is set of all integers from 0..n //! let expected_hashes: Vec = (0 .. n as u64).collect(); //! assert!(hashes == expected_hashes) //! ``` #[cfg(feature = "parallel")] use rayon::prelude::*; mod bitvector; pub mod hashmap; #[cfg(feature = "parallel")] mod par_iter; use bitvector::BitVector; use log::error; use std::borrow::Borrow; use std::fmt::Debug; use std::hash::Hash; use std::hash::Hasher; use std::marker::PhantomData; #[cfg(feature = "parallel")] use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; #[cfg(feature = "parallel")] use std::sync::{Arc, Mutex}; #[cfg(feature = "serde")] use serde::{self, Deserialize, Serialize}; #[inline] fn fold(v: u64) -> u32 { ((v & 0xFFFFFFFF) as u32) ^ ((v >> 32) as u32) } #[inline] fn hash_with_seed(iter: u64, v: &T) -> u64 { let mut state = wyhash::WyHash::with_seed(1 << (iter + iter)); v.hash(&mut state); state.finish() } #[inline] fn hash_with_seed32(iter: u64, v: &T) -> u32 { fold(hash_with_seed(iter, v)) } #[inline] fn fastmod(hash: u32, n: u32) -> u64 { ((hash as u64) * (n as u64)) >> 32 } #[inline] fn hashmod(iter: u64, v: &T, n: u64) -> u64 { // when n < 2^32, use the fast alternative to modulo described here: // https://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ if n < (1 << 32) { let h = hash_with_seed32(iter, v); fastmod(h, n as u32) as u64 } else { let h = hash_with_seed(iter, v); h % (n as u64) } } /// A minimal perfect hash function over a set of objects of type `T`. #[derive(Clone, Debug)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct Mphf { bitvecs: Box<[(BitVector, Box<[u64]>)]>, phantom: PhantomData, } const MAX_ITERS: u64 = 100; impl<'a, T: 'a + Hash + Debug> Mphf { /// Constructs an MPHF from a (possibly lazy) iterator over iterators. 
/// This allows construction of very large MPHFs without holding all the keys /// in memory simultaneously. /// `objects` is an `IntoInterator` yielding a stream of `IntoIterator`s that must not contain any duplicate items. /// `objects` must be able to be iterated over multiple times and yield the same stream of items each time. /// `gamma` controls the tradeoff between the construction-time and run-time speed, /// and the size of the datastructure representing the hash function. See the paper for details. /// `n` is the total number of items that will be produced by iterating over all the input iterators. /// NOTE: the inner iterator `N::IntoIter` should override `nth` if there's an efficient way to skip /// over items when iterating. This is important because later iterations of the MPHF construction algorithm /// skip most of the items. pub fn from_chunked_iterator(gamma: f64, objects: &'a I, n: u64) -> Mphf where &'a I: IntoIterator, N: IntoIterator + Send, ::IntoIter: ExactSizeIterator, <&'a I as IntoIterator>::IntoIter: Send, I: Sync, { let mut iter = 0; let mut bitvecs = Vec::new(); #[allow(unused_mut)] let mut done_keys = BitVector::new(std::cmp::max(255, n)); assert!(gamma > 1.01); loop { if iter > MAX_ITERS { error!("ran out of key space. items: {:?}", done_keys.len()); panic!("counldn't find unique hashes"); } let keys_remaining = if iter == 0 { n } else { n - (done_keys.len() as u64) }; let size = std::cmp::max(255, (gamma * keys_remaining as f64) as u64); let mut a = BitVector::new(size); let mut collide = BitVector::new(size); let seed = iter; let mut offset = 0u64; for object in objects { let mut object_iter = object.into_iter(); // Note: we will use Iterator::nth() to advance the iterator if // we've skipped over some items. 
let mut object_pos = 0; let len = object_iter.len() as u64; for object_index in 0..len { let index = offset + object_index; if !done_keys.contains(index) { let key = match object_iter.nth((object_index - object_pos) as usize) { None => panic!("ERROR: max number of items overflowed"), Some(key) => key, }; object_pos = object_index + 1; let idx = hashmod(seed, &key, size); if collide.contains(idx) { continue; } let a_was_set = !a.insert_sync(idx); if a_was_set { collide.insert_sync(idx); } } } // end-window for offset += len; } // end-objects for let mut offset = 0u64; for object in objects { let mut object_iter = object.into_iter(); // Note: we will use Iterator::nth() to advance the iterator if // we've skipped over some items. let mut object_pos = 0; let len = object_iter.len() as u64; for object_index in 0..len { let index = offset + object_index; if !done_keys.contains(index) { // This will fast-forward the iterator over unneeded items. let key = match object_iter.nth((object_index - object_pos) as usize) { None => panic!("ERROR: max number of items overflowed"), Some(key) => key, }; object_pos = object_index + 1; let idx = hashmod(seed, &key, size); if collide.contains(idx) { a.remove(idx); } else { done_keys.insert(index); } } } // end-window for offset += len; } // end- objects for bitvecs.push(a); if done_keys.len() as u64 == n { break; } iter += 1; } Mphf { bitvecs: Self::compute_ranks(bitvecs), phantom: PhantomData, } } } impl Mphf { /// Generate a minimal perfect hash function for the set of `objects`. /// `objects` must not contain any duplicate items. /// `gamma` controls the tradeoff between the construction-time and run-time speed, /// and the size of the datastructure representing the hash function. See the paper for details. /// `max_iters` - None to never stop trying to find a perfect hash (safe if no duplicates). 
pub fn new(gamma: f64, objects: &[T]) -> Mphf { assert!(gamma > 1.01); let mut bitvecs = Vec::new(); let mut iter = 0; let mut cx = Context::new( std::cmp::max(255, (gamma * objects.len() as f64) as u64), iter, ); objects.iter().for_each(|v| cx.find_collisions_sync(v)); let mut redo_keys = objects .iter() .filter_map(|v| cx.filter(v)) .collect::>(); bitvecs.push(cx.a); iter += 1; while !redo_keys.is_empty() { let mut cx = Context::new( std::cmp::max(255, (gamma * redo_keys.len() as f64) as u64), iter, ); redo_keys.iter().for_each(|&v| cx.find_collisions_sync(v)); redo_keys = redo_keys.into_iter().filter_map(|v| cx.filter(v)).collect(); bitvecs.push(cx.a); iter += 1; if iter > MAX_ITERS { error!("ran out of key space. items: {:?}", redo_keys); panic!("counldn't find unique hashes"); } } Mphf { bitvecs: Self::compute_ranks(bitvecs), phantom: PhantomData, } } fn compute_ranks(bvs: Vec) -> Box<[(BitVector, Box<[u64]>)]> { let mut ranks = Vec::new(); let mut pop = 0_u64; for bv in bvs { let mut rank: Vec = Vec::new(); for i in 0..bv.num_words() { let v = bv.get_word(i); if i % 8 == 0 { rank.push(pop) } pop += v.count_ones() as u64; } ranks.push((bv, rank.into_boxed_slice())) } ranks.into_boxed_slice() } #[inline] fn get_rank(&self, hash: u64, i: usize) -> u64 { let idx = hash as usize; let (bv, ranks) = self.bitvecs.get(i).expect("that level doesn't exist"); // Last pre-computed rank let mut rank = ranks[idx / 512]; // Add rank of intervening words for j in (idx / 64) & !7..idx / 64 { rank += bv.get_word(j).count_ones() as u64; } // Add rank of final word up to hash let final_word = bv.get_word(idx / 64); if idx % 64 > 0 { rank += (final_word << (64 - (idx % 64))).count_ones() as u64; } rank } /// Compute the hash value of `item`. This method should only be used /// with items known to be in construction set. Use `try_hash` if you cannot /// guarantee that `item` was in the construction set. If `item` was not present /// in the construction set this function may panic. 
pub fn hash(&self, item: &T) -> u64 { for i in 0..self.bitvecs.len() { let (bv, _) = &self.bitvecs[i]; let hash = hashmod(i as u64, item, bv.capacity() as u64); if bv.contains(hash) { return self.get_rank(hash, i); } } unreachable!("must find a hash value"); } /// Compute the hash value of `item`. If `item` was not present /// in the set of objects used to construct the hash function, the return /// value will an arbitrary value Some(x), or None. pub fn try_hash(&self, item: &Q) -> Option where T: Borrow, Q: ?Sized + Hash, { for i in 0..self.bitvecs.len() { let (bv, _) = &(self.bitvecs)[i]; let hash = hashmod(i as u64, item, bv.capacity() as u64); if bv.contains(hash) { return Some(self.get_rank(hash, i)); } } None } } #[cfg(feature = "parallel")] impl Mphf { /// Same as `new`, but parallelizes work on the rayon default Rayon threadpool. /// Configure the number of threads on that threadpool to control CPU usage. #[cfg(feature = "parallel")] pub fn new_parallel(gamma: f64, objects: &[T], starting_seed: Option) -> Mphf { assert!(gamma > 1.01); let mut bitvecs = Vec::new(); let mut iter = 0; let cx = Context::new( std::cmp::max(255, (gamma * objects.len() as f64) as u64), starting_seed.unwrap_or(0) + iter, ); objects.into_par_iter().for_each(|v| cx.find_collisions(v)); let mut redo_keys = objects .into_par_iter() .filter_map(|v| cx.filter(v)) .collect::>(); bitvecs.push(cx.a); iter += 1; while !redo_keys.is_empty() { let cx = Context::new( std::cmp::max(255, (gamma * redo_keys.len() as f64) as u64), starting_seed.unwrap_or(0) + iter, ); (&redo_keys) .into_par_iter() .for_each(|&v| cx.find_collisions(v)); redo_keys = (&redo_keys) .into_par_iter() .filter_map(|&v| cx.filter(v)) .collect(); bitvecs.push(cx.a); iter += 1; if iter > MAX_ITERS { println!("ran out of key space. 
items: {:?}", redo_keys); panic!("counldn't find unique hashes"); } } Mphf { bitvecs: Self::compute_ranks(bitvecs), phantom: PhantomData, } } } struct Context { size: u64, seed: u64, a: BitVector, collide: BitVector, } impl Context { fn new(size: u64, seed: u64) -> Self { Self { size: size as u64, seed, a: BitVector::new(size), collide: BitVector::new(size), } } #[cfg(feature = "parallel")] fn find_collisions(&self, v: &T) { let idx = hashmod(self.seed, v, self.size); if !self.collide.contains(idx) && !self.a.insert(idx) { self.collide.insert(idx); } } fn find_collisions_sync(&mut self, v: &T) { let idx = hashmod(self.seed, v, self.size); if !self.collide.contains(idx) && !self.a.insert_sync(idx) { self.collide.insert_sync(idx); } } #[cfg(feature = "parallel")] fn filter<'t, T: Hash>(&self, v: &'t T) -> Option<&'t T> { let idx = hashmod(self.seed, v, self.size); if self.collide.contains(idx) { self.a.remove(idx); Some(v) } else { None } } #[cfg(not(feature = "parallel"))] fn filter<'t, T: Hash>(&mut self, v: &'t T) -> Option<&'t T> { let idx = hashmod(self.seed, v, self.size); if self.collide.contains(idx) { self.a.remove(idx); Some(v) } else { None } } } #[cfg(feature = "parallel")] struct Queue<'a, I: 'a, T> where &'a I: IntoIterator, <&'a I as IntoIterator>::Item: IntoIterator, { keys_object: &'a I, queue: <&'a I as IntoIterator>::IntoIter, num_keys: u64, last_key_index: u64, job_id: u8, phantom_t: PhantomData, } #[cfg(feature = "parallel")] impl<'a, I: 'a, N1, N2, T> Queue<'a, I, T> where &'a I: IntoIterator, N2: Iterator + ExactSizeIterator, N1: IntoIterator + Clone, { fn new(keys_object: &'a I, num_keys: u64) -> Queue<'a, I, T> { Queue { keys_object, queue: keys_object.into_iter(), num_keys, last_key_index: 0, job_id: 0, phantom_t: PhantomData, } } fn next(&mut self, done_keys_count: &AtomicU64) -> Option<(N2, u8, u64, u64)> { if self.last_key_index == self.num_keys { loop { let done_count = done_keys_count.load(Ordering::SeqCst); if self.num_keys == 
done_count { self.queue = self.keys_object.into_iter(); done_keys_count.store(0, Ordering::SeqCst); self.last_key_index = 0; self.job_id += 1; break; } } } if self.job_id > 1 { return None; } let node = self.queue.next().unwrap(); let node_keys_start = self.last_key_index; let num_keys = node.clone().into_iter().len() as u64; self.last_key_index += num_keys; Some((node.into_iter(), self.job_id, node_keys_start, num_keys)) } } #[cfg(feature = "parallel")] impl<'a, T: 'a + Hash + Debug + Send + Sync> Mphf { /// Same as to `from_chunked_iterator` but parallelizes work over `num_threads` threads. #[cfg(feature = "parallel")] pub fn from_chunked_iterator_parallel( gamma: f64, objects: &'a I, max_iters: Option, n: u64, num_threads: usize, ) -> Mphf where &'a I: IntoIterator, N: IntoIterator + Send + Clone, ::IntoIter: ExactSizeIterator, <&'a I as IntoIterator>::IntoIter: Send, I: Sync, { // TODO CONSTANT, might have to change // Allowing atmost 381Mb for buffer const MAX_BUFFER_SIZE: u64 = 50000000; const ONE_PERCENT_KEYS: f32 = 0.01; let min_buffer_keys_threshold: u64 = (ONE_PERCENT_KEYS * n as f32) as u64; let mut iter: u64 = 0; let mut bitvecs = Vec::::new(); assert!(gamma > 1.01); let global = Arc::new(GlobalContext { done_keys: BitVector::new(std::cmp::max(255, n)), buffered_keys: Mutex::new(Vec::new()), buffer_keys: AtomicBool::new(false), }); loop { if max_iters.is_some() && iter > max_iters.unwrap() { error!("ran out of key space. 
items: {:?}", global.done_keys.len()); panic!("counldn't find unique hashes"); } let keys_remaining = if iter == 0 { n } else { n - global.done_keys.len() }; if keys_remaining == 0 { break; } if keys_remaining < MAX_BUFFER_SIZE && keys_remaining < min_buffer_keys_threshold { global.buffer_keys.store(true, Ordering::SeqCst); } let size = std::cmp::max(255, (gamma * keys_remaining as f64) as u64); let cx = Arc::new(IterContext { done_keys_count: AtomicU64::new(0), work_queue: Mutex::new(Queue::new(objects, n)), collide: BitVector::new(size), a: BitVector::new(size), }); crossbeam_utils::thread::scope(|scope| { for _ in 0..num_threads { let global = global.clone(); let cx = cx.clone(); scope.spawn(move |_| { loop { let (mut node, job_id, offset, num_keys) = match cx.work_queue.lock().unwrap().next(&cx.done_keys_count) { None => break, Some(val) => val, }; let mut node_pos = 0; for index in 0..num_keys { let key_index = offset + index; if global.done_keys.contains(key_index) { continue; } let key = node.nth((index - node_pos) as usize).unwrap(); node_pos = index + 1; let idx = hashmod(iter, &key, size); let collision = cx.collide.contains(idx); if job_id == 0 { if !collision && !cx.a.insert(idx) { cx.collide.insert(idx); } } else if collision { cx.a.remove(idx); if global.buffer_keys.load(Ordering::SeqCst) { global.buffered_keys.lock().unwrap().push(key); } } else { global.done_keys.insert(key_index); } } cx.done_keys_count.fetch_add(num_keys, Ordering::SeqCst); } //end-loop }); //end-scope } //end-threads-for }) .unwrap(); //end-crossbeam match Arc::try_unwrap(cx) { Ok(cx) => bitvecs.push(cx.a), Err(_) => unreachable!(), } iter += 1; if global.buffer_keys.load(Ordering::SeqCst) { break; } } //end-loop let buffered_keys_vec = global.buffered_keys.lock().unwrap(); if buffered_keys_vec.len() > 1 { let mut buffered_mphf = Mphf::new_parallel(1.7, &buffered_keys_vec, Some(iter)); for i in 0..buffered_mphf.bitvecs.len() { let buff_vec = std::mem::replace(&mut 
buffered_mphf.bitvecs[i].0, BitVector::new(0)); bitvecs.push(buff_vec); } } Mphf { bitvecs: Self::compute_ranks(bitvecs), phantom: PhantomData, } } } #[cfg(feature = "parallel")] struct IterContext<'a, I: 'a, N1, N2, T> where &'a I: IntoIterator, N2: Iterator + ExactSizeIterator, N1: IntoIterator + Clone, { done_keys_count: AtomicU64, work_queue: Mutex>, collide: BitVector, a: BitVector, } #[cfg(feature = "parallel")] struct GlobalContext { done_keys: BitVector, buffered_keys: Mutex>, buffer_keys: AtomicBool, } #[cfg(test)] #[macro_use] extern crate quickcheck; #[cfg(test)] mod tests { use super::*; use std::collections::HashSet; use std::iter::FromIterator; /// Check that a Minimal perfect hash function (MPHF) is generated for the set xs fn check_mphf(xs: HashSet) -> bool where T: Sync + Hash + PartialEq + Eq + Debug + Send, { let xsv: Vec = xs.into_iter().collect(); // test single-shot data input check_mphf_serial(&xsv) && check_mphf_parallel(&xsv) } /// Check that a Minimal perfect hash function (MPHF) is generated for the set xs fn check_mphf_serial(xsv: &[T]) -> bool where T: Hash + PartialEq + Eq + Debug, { // Generate the MPHF let phf = Mphf::new(1.7, xsv); // Hash all the elements of xs let mut hashes: Vec = xsv.iter().map(|v| phf.hash(v)).collect(); hashes.sort_unstable(); // Hashes must equal 0 .. n let gt: Vec = (0..xsv.len() as u64).collect(); hashes == gt } /// Check that a Minimal perfect hash function (MPHF) is generated for the set xs #[cfg(feature = "parallel")] fn check_mphf_parallel(xsv: &[T]) -> bool where T: Sync + Hash + PartialEq + Eq + Debug + Send, { // Generate the MPHF let phf = Mphf::new_parallel(1.7, xsv, None); // Hash all the elements of xs let mut hashes: Vec = xsv.iter().map(|v| phf.hash(v)).collect(); hashes.sort_unstable(); // Hashes must equal 0 .. 
n let gt: Vec = (0..xsv.len() as u64).collect(); hashes == gt } #[cfg(not(feature = "parallel"))] fn check_mphf_parallel(_xsv: &[T]) -> bool where T: Hash + PartialEq + Eq + Debug, { true } fn check_chunked_mphf(values: Vec>, total: u64) -> bool where T: Sync + Hash + PartialEq + Eq + Debug + Send, { let phf = Mphf::from_chunked_iterator(1.7, &values, total); // Hash all the elements of xs let mut hashes: Vec = values .iter() .flat_map(|x| x.iter().map(|v| phf.hash(&v))) .collect(); hashes.sort_unstable(); // Hashes must equal 0 .. n let gt: Vec = (0..total as u64).collect(); hashes == gt } #[cfg(feature = "parallel")] fn check_chunked_mphf_parallel(values: Vec>, total: u64) -> bool where T: Sync + Hash + PartialEq + Eq + Debug + Send, { let phf = Mphf::from_chunked_iterator_parallel(1.7, &values, None, total, 2); // Hash all the elements of xs let mut hashes: Vec = values .iter() .flat_map(|x| x.iter().map(|v| phf.hash(&v))) .collect(); hashes.sort_unstable(); // Hashes must equal 0 .. n let gt: Vec = (0..total as u64).collect(); hashes == gt } #[cfg(not(feature = "parallel"))] fn check_chunked_mphf_parallel(_values: Vec>, _total: u64) -> bool where T: Sync + Hash + PartialEq + Eq + Debug + Send, { true } // this does not work under WASI. #[test] #[cfg(feature = "parallel")] fn check_crossbeam_scope() { crossbeam_utils::thread::scope(|scope| { let mut handles = vec![]; for i in 0..2 { let h = scope.spawn(move |_| i * i); handles.push(h); } for (i, h) in handles.into_iter().enumerate() { assert_eq!(i * i, h.join().unwrap()); } }) .unwrap() } quickcheck! 
{ fn check_int_slices(v: HashSet, lens: Vec) -> bool { let mut lens = lens; let items: Vec = v.iter().cloned().collect(); if lens.is_empty() || lens.iter().all(|x| *x == 0) { lens.clear(); lens.push(items.len()) } let mut slices: Vec> = Vec::new(); let mut total = 0_usize; for slc_len in lens { let end = std::cmp::min(items.len(), total.saturating_add(slc_len)); let slc = Vec::from(&items[total..end]); slices.push(slc); total = end; if total == items.len() { break; } } check_chunked_mphf(slices.clone(), total as u64) && check_chunked_mphf_parallel(slices, total as u64) } } quickcheck! { fn check_string(v: HashSet>) -> bool { check_mphf(v) } } quickcheck! { fn check_u32(v: HashSet) -> bool { check_mphf(v) } } quickcheck! { fn check_isize(v: HashSet) -> bool { check_mphf(v) } } quickcheck! { fn check_u64(v: HashSet) -> bool { check_mphf(v) } } quickcheck! { fn check_vec_u8(v: HashSet>) -> bool { check_mphf(v) } } #[test] fn from_ints_serial() { let items = (0..1000000).map(|x| x * 2); assert!(check_mphf(HashSet::from_iter(items))); } } ================================================ FILE: src/par_iter.rs ================================================ use std::hash::Hash; use crate::hashmap::BoomHashMap; use rayon::iter::plumbing::{bridge, Consumer, Producer, ProducerCallback, UnindexedConsumer}; use rayon::iter::{IndexedParallelIterator, IntoParallelIterator, ParallelIterator}; impl<'data, K, V> IntoParallelIterator for &'data BoomHashMap where K: Hash + Sync + 'data, V: Sync + 'data, { type Item = (&'data K, &'data V); type Iter = Iter<'data, K, V>; fn into_par_iter(self) -> Self::Iter { Iter { keys: &self.keys, values: &self.values, } } } /// Parallel iterator over immutable items in a slice #[derive(Debug)] pub struct Iter<'data, K, V> { keys: &'data [K], values: &'data [V], } impl<'data, K, V> ParallelIterator for Iter<'data, K, V> where K: Sync + 'data, V: Sync + 'data, { type Item = (&'data K, &'data V); fn drive_unindexed(self, consumer: C) -> C::Result 
where C: UnindexedConsumer, { bridge(self, consumer) } fn opt_len(&self) -> Option { Some(self.len()) } } impl<'data, K, V> IndexedParallelIterator for Iter<'data, K, V> where K: Sync + 'data, V: Sync + 'data, { fn drive(self, consumer: C) -> C::Result where C: Consumer, { bridge(self, consumer) } fn len(&self) -> usize { self.keys.len() } fn with_producer(self, callback: CB) -> CB::Output where CB: ProducerCallback, { callback.callback(IterProducer { keys: self.keys, values: self.values, }) } } struct IterProducer<'data, K, V> { keys: &'data [K], values: &'data [V], } impl<'data, K, V> Producer for IterProducer<'data, K, V> where K: Sync + 'data, V: Sync + 'data, { type Item = (&'data K, &'data V); type IntoIter = KeyValIter<'data, K, V>; fn into_iter(self) -> Self::IntoIter { KeyValIter { keys: self.keys, values: self.values, } } fn split_at(self, index: usize) -> (Self, Self) { let (left_keys, right_keys) = self.keys.split_at(index); let (left_vals, right_vals) = self.values.split_at(index); ( IterProducer { keys: left_keys, values: left_vals, }, IterProducer { keys: right_keys, values: right_vals, }, ) } } struct KeyValIter<'data, K, V> { keys: &'data [K], values: &'data [V], } impl<'data, K, V> Iterator for KeyValIter<'data, K, V> { type Item = (&'data K, &'data V); fn next(&mut self) -> Option { if self.keys.is_empty() { return None; } let item = (&self.keys[0], &self.values[0]); self.keys = &self.keys[1..]; self.values = &self.values[1..]; Some(item) } fn size_hint(&self) -> (usize, Option) { (self.keys.len(), Some(self.keys.len())) } } impl<'data, K, V> ExactSizeIterator for KeyValIter<'data, K, V> {} impl<'data, K, V> DoubleEndedIterator for KeyValIter<'data, K, V> { fn next_back(&mut self) -> Option { if self.keys.is_empty() { return None; } let len = self.keys.len(); let item = (&self.keys[len - 1], &self.values[len - 1]); self.keys = &self.keys[..len - 1]; self.values = &self.values[..len - 1]; Some(item) } }