Repository: ZJULearning/efanna Branch: master Commit: a65bb84e5cd5 Files: 28 Total size: 176.6 KB Directory structure: gitextract_2l4uult3/ ├── LICENSE ├── Makefile ├── Makefile.debug ├── Makefile.silent ├── README.md ├── algorithm/ │ ├── base_index.hpp │ ├── hashing_index.hpp │ ├── init_indices.hpp │ └── kdtreeub_index.hpp ├── efanna.hpp ├── general/ │ ├── distance.hpp │ ├── matrix.hpp │ └── params.hpp ├── matlab/ │ ├── .gitignore │ ├── README.md │ ├── efanna.m │ ├── findex.cc │ ├── fvecs_read.m │ ├── handle_wrapper.hpp │ └── samples/ │ ├── example_buildall.m │ ├── example_buildgraph.m │ ├── example_buildtree.m │ └── example_search.m └── samples/ ├── efanna_index_buildall.cc ├── efanna_index_buildgraph.cc ├── efanna_index_buildtrees.cc ├── efanna_search.cc └── evaluate.cc ================================================ FILE CONTENTS ================================================ ================================================ FILE: LICENSE ================================================ BSD License ----------- Copyright (c) 2016 Cong Fu, Deng Cai (http://wiki.zjulearning.org:8081/wiki/Main_Page) All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ================================================ FILE: Makefile ================================================ GXX=g++ -std=c++11 #OPTM=-O3 -msse2 -msse4 -fopenmp OPTM=-O3 -march=native -fopenmp CPFLAGS=$(OPTM) -Wall -DINFO LDFLAGS=$(OPTM) -Wall -lboost_timer -lboost_chrono -lboost_system -DINFO INCLUDES=-I./ -I./algorithm -I./general SAMPLES=$(patsubst %.cc, %, $(wildcard samples/*.cc samples_hashing/*.cc)) SAMPLE_OBJS=$(foreach sample, $(SAMPLES), $(sample).o) HEADERS=$(wildcard ./*.hpp ./*/*.hpp) #EFNN is currently header only, so only samples will be compiled #SHARED_LIB=libefnn.so #OBJS=src/efnn.o all: $(SHARED_LIB) $(SAMPLES) #$(SHARED_LIB): $(OBJS) # $(GXX) $(LDFLAGS) $(LIBS) $(OBJS) -shared -o $(SHARED_LIB) $(SAMPLES): %: %.o $(GXX) $^ -o $@ $(LDFLAGS) $(LIBS) %.o: %.cpp $(HEADERS) $(GXX) $(CPFLAGS) $(INCLUDES) -c $*.cpp -o $@ %.o: %.cc $(HEADERS) $(GXX) $(CPFLAGS) $(INCLUDES) -c $*.cc -o $@ clean: rm -rf $(OBJS) rm -rf $(SHARED_LIB) rm -rf $(SAMPLES) rm -rf $(SAMPLE_OBJS) ================================================ FILE: Makefile.debug ================================================ GXX=g++ -std=c++11 OPTM=-O2 -msse4 CPFLAGS=$(OPTM) -Wall -Werror -g LDFLAGS=$(OPTM) -Wall INCLUDES=-I./ -I./algorithm -I./general SAMPLES=$(patsubst %.cc, %, $(wildcard samples/*.cc samples_hashing/*.cc)) SAMPLE_OBJS=$(foreach sample, $(SAMPLES), $(sample).o) HEADERS=$(wildcard ./*.hpp ./*/*.hpp) #EFNN is currently header only, so only samples will be compiled 
#SHARED_LIB=libefnn.so #OBJS=src/efnn.o all: $(SHARED_LIB) $(SAMPLES) #$(SHARED_LIB): $(OBJS) # $(GXX) $(LDFLAGS) $(LIBS) $(OBJS) -shared -o $(SHARED_LIB) $(SAMPLES): %: %.o $(GXX) $(LDFLAGS) $(LIBS) $^ -o $@ %.o: %.cpp $(HEADERS) $(GXX) $(CPFLAGS) $(INCLUDES) -c $*.cpp -o $@ %.o: %.cc $(HEADERS) $(GXX) $(CPFLAGS) $(INCLUDES) -c $*.cc -o $@ clean: rm -rf $(OBJS) rm -rf $(SHARED_LIB) rm -rf $(SAMPLES) rm -rf $(SAMPLE_OBJS) ================================================ FILE: Makefile.silent ================================================ GXX=g++ -std=c++11 #OPTM=-O3 -msse2 -msse4 -fopenmp OPTM=-O3 -march=native -fopenmp CPFLAGS=$(OPTM) -Wall LDFLAGS=$(OPTM) -Wall -lboost_timer -lboost_system INCLUDES=-I./ -I./algorithm -I./general SAMPLES=$(patsubst %.cc, %, $(wildcard samples/*.cc samples_hashing/*.cc)) SAMPLE_OBJS=$(foreach sample, $(SAMPLES), $(sample).o) HEADERS=$(wildcard ./*.hpp ./*/*.hpp) #EFNN is currently header only, so only samples will be compiled #SHARED_LIB=libefnn.so #OBJS=src/efnn.o all: $(SHARED_LIB) $(SAMPLES) #$(SHARED_LIB): $(OBJS) # $(GXX) $(LDFLAGS) $(LIBS) $(OBJS) -shared -o $(SHARED_LIB) $(SAMPLES): %: %.o $(GXX) $(LDFLAGS) $(LIBS) $^ -o $@ %.o: %.cpp $(HEADERS) $(GXX) $(CPFLAGS) $(INCLUDES) -c $*.cpp -o $@ %.o: %.cc $(HEADERS) $(GXX) $(CPFLAGS) $(INCLUDES) -c $*.cc -o $@ clean: rm -rf $(OBJS) rm -rf $(SHARED_LIB) rm -rf $(SAMPLES) rm -rf $(SAMPLE_OBJS) ================================================ FILE: README.md ================================================ EFANNA: an Extremely Fast Approximate Nearest Neighbor search Algorithm framework based on kNN graph ============ EFANNA is a ***flexible*** and ***efficient*** library for approximate nearest neighbor search (ANN search) on large scale data. It implements the algorithms of our paper [EFANNA : Extremely Fast Approximate Nearest Neighbor Search Algorithm Based on kNN Graph](http://arxiv.org/abs/1609.07228). 
EFANNA provides fast solutions on both ***approximate nearest neighbor graph construction*** and ***ANN search*** problems. EFANNA is also flexible to adopt all kinds of hierarchical structure for initialization, such as random projection tree, hierarchical clustering tree, [multi-table hashing](https://github.com/fc731097343/efanna/tree/master/samples_hashing) and so on. What's new ------- + **Please see our more advanced search algorithm [NSG](https://github.com/ZJULearning/nsg)** Jan 13, 2018 + **The [paper](http://arxiv.org/abs/1609.07228) updated significantly.** Dec 6, 2016 + **Algorithm improved and AVX instructions supported.** Nov 30, 2016 + **Parallelism with OpenMP.** Sep 26, 2016 Benchmark data set ------- * [SIFT1M and GIST1M](http://corpus-texmex.irisa.fr/) ANN search performance ------ The performance was tested without parallelism. ![SIFT1nn](http://www.cad.zju.edu.cn/home/dengcai/Data/Hashing/SIFT_1nn.png) ![SIFT100nn](http://www.cad.zju.edu.cn/home/dengcai/Data/Hashing/SIFT_100nn.png) ![GIST1nn](http://www.cad.zju.edu.cn/home/dengcai/Data/Hashing/GIST_1nn.png) ![GIST100nn](http://www.cad.zju.edu.cn/home/dengcai/Data/Hashing/GIST_100nn.png) Compared Algorithms: * [kGraph](http://www.kgraph.org) * [flann](http://www.cs.ubc.ca/research/flann/) * [IEH](http://ieeexplore.ieee.org/document/6734715/) : Fast and accurate hashing via iterative nearest neighbors expansion * [GNNS](https://webdocs.cs.ualberta.ca/~abbasiya/gnns.pdf) : Fast Approximate Nearest-Neighbor Search with k-Nearest Neighbor Graph kNN Graph Construction Performance ------ The performance was tested without parallelism. 
![SIFT1nnGraph](http://www.cad.zju.edu.cn/home/dengcai/Data/Hashing/SIFT_graph.png) ![SIFT100nnGraph](http://www.cad.zju.edu.cn/home/dengcai/Data/Hashing/GIST_graph.png) Compared Algorithms: * [Kgraph](http://www.kgraph.org) (same as NN-descent) * [NN-expansion](https://webdocs.cs.ualberta.ca/~abbasiya/gnns.pdf) (same as GNNS) * [SGraph](http://ieeexplore.ieee.org/document/6247790/) : Scalable k-NN graph construction for visual descriptors * [FastKNN](http://link.springer.com/chapter/10.1007/978-3-642-40991-2_42) : Fast kNN Graph Construction with Locality Sensitive Hashing * [LargeVis](http://dl.acm.org/citation.cfm?id=2883041) : Visualizing Large-scale and High-dimensional Data How To Compile ------- Go to the root directory of EFANNA and make. cd efanna/ make How To Use ------ EFANNA uses a composite index to carry out ANN search, which includes an approximate kNN graph and a number of tree structures. They can be built by this library as a whole or separately. You may build the kNN graph separately for other uses, such as other graph-based machine learning algorithms. Below are some demos. 
* kNN graph building : cd efanna/samples/ ./efanna_index_buildgraph sift_base.fvecs sift.graph 8 8 8 30 25 10 10 Meaning of the parameters (from left to right): sift_base.fvecs -- database points sift.graph -- graph built by EFANNA 8 -- number of trees used to build the graph (larger is more accurate but slower) 8 -- conquer-to-depth (smaller is more accurate but slower) 8 -- number of iterations to build the graph 30 -- L (larger is more accurate but slower, no smaller than K) 25 -- check (larger is more accurate but slower, no smaller than K) 10 -- K, for KNN graph 10 -- S (larger is more accurate but slower) * tree building : cd efanna/samples/ ./efanna_index_buildtrees sift_base.fvecs sift.trees 32 Meaning of the parameters (from left to right): sift_base.fvecs -- database points sift.trees -- truncated KD-trees built by EFANNA 32 -- number of trees to build * index building at one time: cd efanna/samples/ ./efanna_index_buildall sift_base.fvecs sift.graph sift.trees 32 8 8 200 200 100 10 8 Meaning of the parameters (from left to right): sift_base.fvecs -- database points sift.trees -- truncated KD-trees built by EFANNA sift.graph -- approximate KNN graph built by EFANNA 32 -- number of trees in total for building index 8 -- conquer-to-depth 8 -- iteration number 200 -- L (larger is more accurate but slower, no smaller than K) 200 -- check (larger is more accurate but slower, no smaller than K) 100 -- K, for KNN graph 10 -- S (larger is more accurate but slower) 8 -- 8 out of 32 trees are used for building graph * ANN search cd efanna/samples/ ./efanna_search sift_base.fvecs sift.trees sift.graph sift_query.fvecs sift.results 16 4 1200 200 10 Meaning of the parameters (from left to right): sift_base.fvecs -- database points sift.trees -- prebuilt truncated KD-trees used for search sift.graph -- prebuilt kNN graph sift_query.fvecs -- sift query points sift.results -- path to save ANN search results of given query 16 -- number of trees to use (no greater than the number 
of prebuilt trees) 4 -- number of epochs 1200 -- pool size factor (larger is more accurate but slower, usually 6~10 times larger than extend factor) 200 -- extend factor (larger is more accurate but slower) 10 -- required number of returned neighbors (i.e. k of k-NN) * Evaluation cd efanna/samples/ ./evaluate sift.results sift_groundtruth.ivecs 10 Meaning of the parameters (from left to right): sift.results -- search results file sift_groundtruth.ivecs -- ground truth file 10 -- evaluate the 10NN accuracy (only the first 10 points returned by the algorithm are examined, counting how many of them are among the true 10 nearest neighbors of the query) See our paper or user manual for more details about the parameters and interfaces. Output format ------ The file formats of the approximate kNN graph and of the ANN search results are the same. Suppose the database has N points, numbered from 0 to N-1. You want to build an approximate kNN graph. The graph can be regarded as an N * k matrix. The saved kNN graph binary file saves the matrix by row. The first 4 bytes of each row save the int value of k, the next 4 bytes save the value of M and the next 4 bytes save the float value of the norm of the point. Then follow k*4 bytes, saving the indices of the k nearest neighbors of the respective point. The N rows are saved contiguously without separating characters. Similarly, suppose the query data has n points, numbered 0 to n-1. You want EFANNA to return k nearest neighbors for each query. The result file will save n rows like the graph file. It saves the returned indices row by row. Each row starts with 4 bytes recording the value of k, followed by k*4 bytes recording the neighbors' indices. Input of EFANNA ------ Because there is no unified format for input data, users may need to write their own input function to read their data. You may imitate the input function in our sample code (samples/efanna\_index\_buildgraph.cc) to load the data into our matrix. 
To use SIMD instruction optimization, you should pay attention to the data alignment problem of SSE / AVX instruction. Compare with EFANNA without parallelism and SSE/AVX instructions ------ To disable the parallelism, there is no need to modify the code. Simply export OMP_NUM_THREADS=1 before you run the code. Then the code will only use one thread. This is a very convenient way to control the number of threads used. To disable SSE/AVX instructions, you need to modify samples/xxxx.cc, find the line FIndex index(dataset, new L2DistanceAVX(), efanna::KDTreeUbIndexParams(true, trees ,mlevel ,epochs,checkK,L, kNN, trees, S)); Change **L2DistanceAVX** to **L2Distance** and build the project. Now the SSE/AVX instructions are disabled. If you want to try SSE instead of AVX, try **L2DistanceSSE** Parameters to get the Fig. 4/5 (10-NN approximate graph construction) in our paper ------ SIFT1M: ./efanna_index_buildgraph sift_base.fvecs sift.graph 8 8 0 20 10 10 10 ./efanna_index_buildgraph sift_base.fvecs sift.graph 8 8 1 20 10 10 10 ./efanna_index_buildgraph sift_base.fvecs sift.graph 8 8 2 20 10 10 10 ./efanna_index_buildgraph sift_base.fvecs sift.graph 8 8 3 20 10 10 10 ./efanna_index_buildgraph sift_base.fvecs sift.graph 8 8 5 20 10 10 10 ./efanna_index_buildgraph sift_base.fvecs sift.graph 8 8 6 20 20 10 10 ./efanna_index_buildgraph sift_base.fvecs sift.graph 8 8 6 20 30 10 10 GIST1M: ./efanna_index_buildgraph gist_base.fvecs gist.graph 8 8 2 30 30 10 10 ./efanna_index_buildgraph gist_base.fvecs gist.graph 8 8 3 30 30 10 10 ./efanna_index_buildgraph gist_base.fvecs gist.graph 8 8 4 30 30 10 10 ./efanna_index_buildgraph gist_base.fvecs gist.graph 8 8 5 30 30 10 10 ./efanna_index_buildgraph gist_base.fvecs gist.graph 8 8 6 30 30 10 10 ./efanna_index_buildgraph gist_base.fvecs gist.graph 8 8 7 30 30 10 10 ./efanna_index_buildgraph gist_base.fvecs gist.graph 8 8 10 30 40 10 10 Acknowledgment ------ Our code framework imitates [Flann](http://www.cs.ubc.ca/research/flann/) 
to make it scalable, and the implemnetation of NN-descent is taken from [Kgraph](http://www.kgraph.org). They proposed the NN-descent algorithm. Many thanks to them for inspiration. What to do ------- * Add more initial algorithm choice ================================================ FILE: algorithm/base_index.hpp ================================================ #ifndef EFANNA_BASE_INDEX_H_ #define EFANNA_BASE_INDEX_H_ #include "general/params.hpp" #include "general/distance.hpp" #include "general/matrix.hpp" #include #include #include #include #include #include #include "boost/smart_ptr/detail/spinlock.hpp" #include //#define BATCH_SIZE 200 namespace efanna{ typedef boost::detail::spinlock Lock; typedef std::lock_guard LockGuard; struct Point { unsigned id; float dist; bool flag; Point () {} Point (unsigned i, float d, bool f = true): id(i), dist(d), flag(f) { } bool operator < (const Point &n) const{ return this->dist < n.dist; } }; typedef std::vector Points; static inline unsigned InsertIntoKnn (Point *addr, unsigned K, Point nn) { // find the location to insert unsigned j; unsigned i = K; while (i > 0) { j = i - 1; if (addr[j].dist <= nn.dist) break; i = j; } // check for equal ID unsigned l = i; while (l > 0) { j = l - 1; if (addr[j].dist < nn.dist) break; if (addr[j].id == nn.id) return K + 1; l = j; } // i <= K-1 j = K; while (j > i) { addr[j] = addr[j-1]; --j; } addr[i] = nn; return i; } struct Neighbor { std::shared_ptr lock; float radius; float radiusM; Points pool; unsigned L; unsigned Range; bool found; std::vector nn_old; std::vector nn_new; std::vector rnn_old; std::vector rnn_new; Neighbor() : lock(std::make_shared()) { } unsigned insert (unsigned id, float dist) { if (dist > radius) return pool.size(); LockGuard guard(*lock); unsigned l = InsertIntoKnn(&pool[0], L, Point(id, dist, true)); if (l <= L) { if (L + 1 < pool.size()) { ++L; } else { radius = pool[L-1].dist; } } return l; } template void join (C callback) const { for (unsigned const i: 
nn_new) { for (unsigned const j: nn_new) { if (i < j) { callback(i, j); } } for (unsigned j: nn_old) { callback(i, j); } } } }; template class InitIndex{ public: InitIndex(const Matrix& features, const Distance* d, const IndexParams& params): features_(features), distance_(d), params_(params) { } virtual ~InitIndex() {}; virtual void buildTrees(){} virtual void buildIndex() { buildIndexImpl(); } virtual void buildIndexImpl() = 0; virtual void loadIndex(char* filename) = 0; virtual void saveIndex(char* filename) = 0; virtual void loadTrees(char* filename) = 0; virtual void saveTrees(char* filename) = 0; virtual void loadGraph(char* filename) = 0; virtual void saveGraph(char* filename) = 0; virtual void outputVisitBucketNum() = 0; void saveResults(char* filename){ std::ofstream out(filename,std::ios::binary); std::vector>::iterator i; //std::cout<::iterator j; int dim = i->size(); //std::cout<begin(); j != i->end(); j++){ int id = *j; out.write((char*)&id, sizeof(int)); } } out.close(); } SearchParams SP; void setSearchParams(int epochs, int init_num, int extend_to,int search_trees, int search_lv, int search_method){ SP.search_epoches = epochs; SP.search_init_num = init_num; if(extend_to>init_num) SP.extend_to = init_num; else SP.extend_to = extend_to; SP.search_depth = search_lv; SP.tree_num = search_trees; SP.search_method = search_method; } void nnExpansion_kgraph(size_t K, const DataType* qNow, std::vector& pool, std::vector& results){ unsigned int base_n = features_.get_rows(); boost::dynamic_bitset<> tbflag(base_n, false); boost::dynamic_bitset<> newflag(base_n, false); std::vector knn(K + SP.extend_to +1); int remainder = SP.search_init_num % SP.extend_to; int nSeg = SP.search_init_num / SP.extend_to; //clock_t s,f; int Iter = nSeg; if (remainder > 0) Iter++; int Jter = SP.extend_to; for(int i = 0; i 0) && (i == Iter-1)) Jter=remainder; unsigned int L = 0; for(int j=0; j compare(qNow, features_.get_row(knn[k].id), features_.get_cols()); newflag.set(knn[k].id); 
} std::sort(knn.begin(), knn.begin() + L); //s = clock(); unsigned int k = 0; while (k < L) { unsigned int nk = L; if (newflag.test(knn[k].id)){ newflag.reset(knn[k].id); tbflag.set(knn[k].id); typename CandidateHeap::reverse_iterator neighbor = knn_graph[knn[k].id].rbegin(); for(size_t nnk = 0;nnk < params_.K && neighbor != knn_graph[knn[k].id].rend(); neighbor++, nnk++){ if(tbflag.test(neighbor->row_id))continue; tbflag.set(neighbor->row_id); newflag.set(neighbor->row_id); float dist = distance_->compare(qNow, features_.get_row(neighbor->row_id), features_.get_cols()); Point nn(neighbor->row_id, dist); unsigned int r = InsertIntoKnn(&knn[0], L, nn); if ( (r <= L) && (L + 1 < knn.size())) ++L; if (r < nk) nk = r; } } if (nk <= k) k = nk; else ++k; } //f = clock(); //sum = sum + f-s; if (L > K) L = K; if (results.empty()) { results.reserve(K + 1); results.resize(L + 1); std::copy(knn.begin(), knn.begin() + L, results.begin()); } else { for (unsigned int l = 0; l < L; ++l) { unsigned r = InsertIntoKnn(&results[0], results.size() - 1, knn[l]); if (r < results.size() /* inserted */ && results.size() < (K + 1)) { results.resize(results.size() + 1); } } } } results.pop_back(); } void nnExpansion(size_t K, const DataType* qNow, std::vector& pool, std::vector& res){ unsigned int base_n = features_.get_rows(); boost::dynamic_bitset<> tbflag(base_n, false); boost::dynamic_bitset<> newflag(base_n, false); CandidateHeap Candidates; int remainder = SP.search_init_num % SP.extend_to; int nSeg = SP.search_init_num / SP.extend_to; int segIter = nSeg; if (remainder > 0) segIter++; int Jter = SP.extend_to; CandidateHeap Results; for(int seg = 0; seg 0) && (seg == segIter-1)) Jter=remainder; for(int j=0; j =base_n) std::cout << "query:" << cur << " Init "<< nn << std::endl; if(!tbflag.test(nn)){ newflag.set(nn); Candidate c(nn, distance_->compare(qNow, features_.get_row(nn), features_.get_cols())); Candidates.insert(c); } } std::vector ids; int iter=0; while(iter++ < 
SP.search_epoches){ //the heap is max heap ids.clear(); typename CandidateHeap::reverse_iterator it = Candidates.rbegin(); for(unsigned j = 0; j < SP.extend_to && it != Candidates.rend(); j++,it++){ // if(it->row_id>=base_n) std::cout<<"query:"<< cur<<" Judge node "<row_id<row_id)){ newflag.reset(it->row_id); typename CandidateHeap::reverse_iterator neighbor = knn_graph[it->row_id].rbegin(); for(; neighbor != knn_graph[it->row_id].rend(); neighbor++){ // if(neighbor->row_id>=base_n) std::cout<<"query:"<< cur<<" Judge neighbor "<row_id<row_id))continue; tbflag.set(neighbor->row_id); ids.push_back(neighbor->row_id); } } } for(size_t j = 0; j < ids.size(); j++){ Candidate c(ids[j], distance_->compare(qNow, features_.get_row(ids[j]), features_.get_cols()) ); Candidates.insert(c); newflag.set(ids[j]); if(Candidates.size() > (unsigned int)SP.extend_to)Candidates.erase(Candidates.begin()); } } typename CandidateHeap::reverse_iterator it = Candidates.rbegin(); for(unsigned int j = 0; j < K && it != Candidates.rend(); j++,it++){ Results.insert(*it); if(Results.size() > K)Results.erase(Results.begin()); } } typename CandidateHeap::reverse_iterator it = Results.rbegin(); for(unsigned int j = 0; j < K && it != Candidates.rend(); j++,it++){ res.push_back(it->row_id); } } virtual void knnSearch(int K, const Matrix& query){ getNeighbors(K,query); } virtual void getNeighbors(size_t K, const Matrix& query) = 0; virtual void initGraph() = 0; //std::vector Range; std::vector > graph; std::vector nhoods; void join(){ size_t dim = features_.get_cols(); size_t cc = 0; #pragma omp parallel for default(shared) schedule(dynamic, 100) reduction(+:cc) for(size_t i = 0; i < nhoods.size(); i++){ size_t uu = 0; nhoods[i].found = false; /* for(size_t newi = 0; newi < nhoods[i].nn_new.size(); newi++){ for(size_t newj = newi+1; newj < nhoods[i].nn_new.size(); newj++){ unsigned a = nhoods[i].nn_new[newi]; unsigned b = nhoods[i].nn_new[newj]; DataType dist = distance_->compare( features_.get_row(a), 
features_.get_row(b), dim); unsigned r = nhoods[a].insert(b,dist); if(r < params_.Check_K){uu += 2;} nhoods[b].insert(a,dist); } for(size_t oldj = 0; oldj < nhoods[i].nn_old.size(); oldj++){ unsigned a = nhoods[i].nn_new[newi]; unsigned b = nhoods[i].nn_old[oldj]; DataType dist = distance_->compare( features_.get_row(a), features_.get_row(b), dim); unsigned r = nhoods[a].insert(b,dist); if(r < params_.Check_K){uu += 2;} nhoods[b].insert(a,dist); } } */ nhoods[i].join([&](unsigned i, unsigned j) { DataType dist = distance_->compare( features_.get_row(i), features_.get_row(j), dim); ++cc; unsigned r; r = nhoods[i].insert(j, dist); if (r < params_.Check_K) ++uu; nhoods[j].insert(i, dist); if (r < params_.Check_K) ++uu; }); nhoods[i].found = uu > 0; } } void update (int paramL) { for (size_t i = 0; i < nhoods.size(); i++) { nhoods[i].nn_new.clear(); nhoods[i].nn_old.clear(); nhoods[i].rnn_new.clear(); nhoods[i].rnn_old.clear(); nhoods[i].radius = nhoods[i].pool.back().dist; } //find longest new #pragma omp parallel for for(size_t i = 0; i < nhoods.size(); i++){ if(nhoods[i].found){ unsigned maxl = nhoods[i].Range + params_.S < nhoods[i].L ? 
nhoods[i].Range + params_.S : nhoods[i].L; unsigned c = 0; unsigned l = 0; while ((l < maxl) && (c < params_.S)) { if (nhoods[i].pool[l].flag) ++c; ++l; } nhoods[i].Range = l; } nhoods[i].radiusM = nhoods[i].pool[nhoods[i].Range-1].dist; } #pragma omp parallel for for (unsigned n = 0; n < nhoods.size(); ++n) { Neighbor &nhood = nhoods[n]; std::vector &nn_new = nhood.nn_new; std::vector &nn_old = nhood.nn_old; for (unsigned l = 0; l < nhood.Range; ++l) { Point &nn = nhood.pool[l]; Neighbor &nhood_o = nhoods[nn.id]; // nhood on the other side of the edge if (nn.flag) { nn_new.push_back(nn.id); if (nn.dist > nhood_o.radiusM) { LockGuard guard(*nhood_o.lock); nhood_o.rnn_new.push_back(n); } nn.flag = false; } else { nn_old.push_back(nn.id); if (nn.dist > nhood_o.radiusM) { LockGuard guard(*nhood_o.lock); nhood_o.rnn_old.push_back(n); } } } } for (unsigned i = 0; i < nhoods.size(); ++i) { std::vector &nn_new = nhoods[i].nn_new; std::vector &nn_old = nhoods[i].nn_old; std::vector &rnn_new = nhoods[i].rnn_new; std::vector &rnn_old = nhoods[i].rnn_old; if (paramL && (rnn_new.size() > (unsigned int)paramL)) { random_shuffle(rnn_new.begin(), rnn_new.end()); rnn_new.resize(paramL); } nn_new.insert(nn_new.end(), rnn_new.begin(), rnn_new.end()); if (paramL && (rnn_old.size() > (unsigned int)paramL)) { random_shuffle(rnn_old.begin(), rnn_old.end()); rnn_old.resize(paramL); } nn_old.insert(nn_old.end(), rnn_old.begin(), rnn_old.end()); } } void refineGraph(){ std::cout << " refineGraph" << std::endl; int iter = 0; clock_t s,f; s = clock();unsigned int l=100; while(iter++ < params_.build_epoches){ join();//std::cout<<"after join"< c(nhoods[i].pool[j].id,nhoods[i].pool[j].dist); can.insert(c); } while(can.size()compare(features_.get_row(i), features_.get_row(id),features_.get_cols()); Candidate c(id, dist); can.insert(c); } knn_graph.push_back(can); } */ g.resize(nhoods.size()); M.resize(nhoods.size()); gs.resize(nhoods.size()); for(unsigned i = 0; i < nhoods.size();i++){ M[i] = 
nhoods[i].Range; g[i].resize(nhoods[i].pool.size()); std::copy(nhoods[i].pool.begin(), nhoods[i].pool.end(), g[i].begin()); gs[i].resize(params_.K); for(unsigned j = 0; j < params_.K;j++) gs[i][j] = g[i][j].id; } } void calculate_norm(){ unsigned N = features_.get_rows(); unsigned D = features_.get_cols(); norms.resize(N); #pragma omp parallel for for (unsigned n = 0; n < N; ++n) { norms[n] = distance_->norm(features_.get_row(n),D); } } typedef std::set, std::greater> > CandidateHeap; typedef std::vector IndexVec; size_t getGraphSize(){return gs.size();} std::vector getGraphRow(unsigned row_id){ std::vector row; if(gs.size() > row_id){ for(unsigned i = 0; i < gs[row_id].size(); i++)row.push_back(gs[row_id][i]); } return row; } protected: const Matrix features_; const Distance* distance_; const IndexParams params_; std::vector > knn_table_gt; std::vector > g; std::vector > gs; std::vector M; //std::vector> knn_graph; std::vector knn_graph; std::vector norms; std::vector > nn_results; DataType* Radius; }; #define USING_BASECLASS_SYMBOLS \ using InitIndex::distance_;\ using InitIndex::params_;\ using InitIndex::features_;\ using InitIndex::buildIndex;\ using InitIndex::knn_table_gt;\ using InitIndex::nn_results;\ using InitIndex::saveResults;\ using InitIndex::knn_graph;\ using InitIndex::refineGraph;\ using InitIndex::nhoods;\ using InitIndex::SP;\ using InitIndex::nnExpansion;\ using InitIndex::nnExpansion_kgraph;\ using InitIndex::g;\ using InitIndex::gs;\ using InitIndex::M;\ using InitIndex::norms; } #endif ================================================ FILE: algorithm/hashing_index.hpp ================================================ #ifndef EFANNA_HASHING_INDEX_H_ #define EFANNA_HASHING_INDEX_H_ #include "algorithm/base_index.hpp" #include #include //for Debug #include #include #include #include #include #include #include //#define MAX_RADIUS 6 namespace efanna{ struct HASHINGIndexParams : public IndexParams { HASHINGIndexParams(int codelen, int TableNum,int 
UpperBits, int HashRadius, char*& BaseCodeFile, char*& QueryCodeFile, int codelenShift = 0) { init_index_type = HASHING; ValueType len; len.int_val = codelen; extra_params.insert(std::make_pair("codelen",len)); ValueType nTab; nTab.int_val = TableNum; extra_params.insert(std::make_pair("tablenum",nTab)); ValueType upb; upb.int_val = UpperBits; extra_params.insert(std::make_pair("upbits",upb)); ValueType radius; radius.int_val = HashRadius; extra_params.insert(std::make_pair("radius",radius)); ValueType bcf; bcf.str_pt = BaseCodeFile; extra_params.insert(std::make_pair("bcfile",bcf)); ValueType qcf; qcf.str_pt = QueryCodeFile; extra_params.insert(std::make_pair("qcfile",qcf)); ValueType lenShift; lenShift.int_val = codelenShift; extra_params.insert(std::make_pair("lenshift",lenShift)); } }; template class HASHINGIndex : public InitIndex { public: typedef InitIndex BaseClass; typedef std::vector Codes; typedef std::unordered_map > HashBucket; typedef std::vector HashTable; typedef std::vector Codes64; typedef std::unordered_map > HashBucket64; typedef std::vector HashTable64; HASHINGIndex(const Matrix& dataset, const Distance* d, const IndexParams& params = HASHINGIndexParams(0,NULL,NULL)) : BaseClass(dataset,d,params) { std::cout<<"HASHING initial, max code length : 64" <second).int_val; std::cout << "use "<second).int_val; int actuallen = codelength - codelengthshift; if(actuallen > 0){ std::cout << "Actually use "<< actuallen<< " bit code"<< std::endl; }else{ std::cout << "lenShift error: could not be larger than the code length! "<< std::endl; } } else{ codelengthshift = 0; } it = params_.extra_params.find("tablenum"); if(it != params_.extra_params.end()){ tablenum = (it->second).int_val; std::cout << "use "<second).int_val; std::cout << "use upper "<= codelength-codelengthshift){ std::cout << "upbits should be smaller than the actual codelength!" 
<< std::endl; return; } int actuallen = codelength - codelengthshift; it = params_.extra_params.find("radius"); if(it != params_.extra_params.end()){ radius = (it->second).int_val; if(actuallen<=32){ if(radius > 13){ std::cout << "radius greater than 13 not supported yet!" << std::endl; radius = 13; } }else if(actuallen<=36){ if(radius > 11){ std::cout << "radius greater than 11 not supported yet!" << std::endl; radius = 11; } }else if(actuallen<=40){ if(radius > 10){ std::cout << "radius greater than 10 not supported yet!" << std::endl; radius = 10; } }else if(actuallen<=48){ if(radius > 9){ std::cout << "radius greater than 9 not supported yet!" << std::endl; radius = 9; } }else if(actuallen<=60){ if(radius > 8){ std::cout << "radius greater than 8 not supported yet!" << std::endl; radius = 8; } }else{ //actuallen<=64 if(radius > 7){ std::cout << "radius greater than 7 not supported yet!" << std::endl; radius = 7; } } std::cout << "search hamming radius "<second).str_pt; std::string str(fpath); std::cout << "Loading base code from " << str << std::endl; if (codelength <= 32 ){ LoadCode32(fpath, BaseCode); }else if(codelength <= 64 ){ LoadCode64(fpath, BaseCode64); }else{ std::cout<<"code length not supported yet!"<second).str_pt; std::string str(fpath); std::cout << "Loading query code from " << str << std::endl; if (codelength <= 32 ){ LoadCode32(fpath, QueryCode); }else if(codelength <= 64 ){ LoadCode64(fpath, QueryCode64); }else{ std::cout<<"code length not supported yet!"<& baseAll){ if (tablenum < 1){ std::cout<<"Total hash table num error! "<> codeFile; ss.clear(); std::ifstream in(codeFile.c_str(), std::ios::binary); if(!in.is_open()){std::cout<<"open file " << filename <<" error"<< std::endl;return;} int codeNum; in.read((char*)&codeNum,4); if (codeNum != 1){ std::cout<<"Codefile "<< j << " error!"<> codelengthshift; if (codetmp > maxValue){ std::cout<<"codetmp: "<< codetmp <& baseAll){ if (tablenum < 1){ std::cout<<"Total hash table num error! 
"<> codeFile; ss.clear(); std::ifstream in(codeFile.c_str(), std::ios::binary); if(!in.is_open()){std::cout<<"open file " << filename <<" error"<< std::endl;return;} int codeNum; in.read((char*)&codeNum,4); if (codeNum != 1){ std::cout<<"Codefile "<< j << " error!"<> codelengthshift; if (codetmp > maxValue){ std::cout<<"codetmp: "<< codetmp <& baseAll ,std::vector& tbAll){ for(size_t h=0; h < baseAll.size(); h++){ Codes& base = baseAll[h]; HashTable tb; for(int i = 0; i < (1 << upbits); i++){ HashBucket emptyBucket; tb.push_back(emptyBucket); } for(size_t i = 0; i < base.size(); i ++){ unsigned int idx1 = base[i] >> lowbits; unsigned int idx2 = base[i] - (idx1 << lowbits); if(tb[idx1].find(idx2) != tb[idx1].end()){ tb[idx1][idx2].push_back(i); }else{ std::vector v; v.push_back(i); tb[idx1].insert(make_pair(idx2,v)); } } tbAll.push_back(tb); } } void generateMask32(){ //i = 0 means the origin code HammingBallMask.push_back(0); HammingRadius.push_back(HammingBallMask.size()); if(radius>0){ //radius 1 for(int i = 0; i < codelength; i++){ unsigned int mask = 1 << i; HammingBallMask.push_back(mask); } HammingRadius.push_back(HammingBallMask.size()); } if(radius>1){ //radius 2 for(int i = 0; i < codelength; i++){ for(int j = i+1; j < codelength; j++){ unsigned int mask = (1<2){ //radius 3 for(int i = 0; i < codelength; i++){ for(int j = i+1; j < codelength; j++){ for(int k = j+1; k < codelength; k++){ unsigned int mask = (1<3){ //radius 4 for(int i = 0; i < codelength; i++){ for(int j = i+1; j < codelength; j++){ for(int k = j+1; k < codelength; k++){ for(int a = k+1; a < codelength; a++){ unsigned int mask = (1<4){ //radius 5 for(int i = 0; i < codelength; i++){ for(int j = i+1; j < codelength; j++){ for(int k = j+1; k < codelength; k++){ for(int a = k+1; a < codelength; a++){ for(int b = a+1; b < codelength; b++){ unsigned int mask = (1<5){ //radius 6 for(int i = 0; i < codelength; i++){ for(int j = i+1; j < codelength; j++){ for(int k = j+1; k < codelength; k++){ 
for(int a = k+1; a < codelength; a++){ for(int b = a+1; b < codelength; b++){ for(int c = b+1; c < codelength; c++){ unsigned int mask = (1<6){ //radius 7 for(int i = 0; i < codelength; i++){ for(int j = i+1; j < codelength; j++){ for(int k = j+1; k < codelength; k++){ for(int a = k+1; a < codelength; a++){ for(int b = a+1; b < codelength; b++){ for(int c = b+1; c < codelength; c++){ for(int d = c+1; d < codelength; d++){ unsigned int mask = (1<7){ //radius 8 for(int i = 0; i < codelength; i++){ for(int j = i+1; j < codelength; j++){ for(int k = j+1; k < codelength; k++){ for(int a = k+1; a < codelength; a++){ for(int b = a+1; b < codelength; b++){ for(int c = b+1; c < codelength; c++){ for(int d = c+1; d < codelength; d++){ for(int e = d+1; e < codelength; e++){ unsigned int mask = (1<8){ //radius 9 for(int i = 0; i < codelength; i++){ for(int j = i+1; j < codelength; j++){ for(int k = j+1; k < codelength; k++){ for(int a = k+1; a < codelength; a++){ for(int b = a+1; b < codelength; b++){ for(int c = b+1; c < codelength; c++){ for(int d = c+1; d < codelength; d++){ for(int e = d+1; e < codelength; e++){ for(int f = e+1; f < codelength; f++){ unsigned int mask = (1<9){ //radius 10 for(int i = 0; i < codelength; i++){ for(int j = i+1; j < codelength; j++){ for(int k = j+1; k < codelength; k++){ for(int a = k+1; a < codelength; a++){ for(int b = a+1; b < codelength; b++){ for(int c = b+1; c < codelength; c++){ for(int d = c+1; d < codelength; d++){ for(int e = d+1; e < codelength; e++){ for(int f = e+1; f < codelength; f++){ for(int g = f+1; g < codelength; g++){ unsigned int mask = (1<10){ //radius 11 for(int i = 0; i < codelength; i++){ for(int j = i+1; j < codelength; j++){ for(int k = j+1; k < codelength; k++){ for(int a = k+1; a < codelength; a++){ for(int b = a+1; b < codelength; b++){ for(int c = b+1; c < codelength; c++){ for(int d = c+1; d < codelength; d++){ for(int e = d+1; e < codelength; e++){ for(int f = e+1; f < codelength; f++){ for(int g = f+1; g < 
codelength; g++){ for(int h = g+1; h < codelength; h++){ unsigned int mask = (1<11){ //radius 12 for(int i = 0; i < codelength; i++){ for(int j = i+1; j < codelength; j++){ for(int k = j+1; k < codelength; k++){ for(int a = k+1; a < codelength; a++){ for(int b = a+1; b < codelength; b++){ for(int c = b+1; c < codelength; c++){ for(int d = c+1; d < codelength; d++){ for(int e = d+1; e < codelength; e++){ for(int f = e+1; f < codelength; f++){ for(int g = f+1; g < codelength; g++){ for(int h = g+1; h < codelength; h++){ for(int l = h+1; h < codelength; l++){ unsigned int mask = (1<12){ //radius 13 for(int i = 0; i < codelength; i++){ for(int j = i+1; j < codelength; j++){ for(int k = j+1; k < codelength; k++){ for(int a = k+1; a < codelength; a++){ for(int b = a+1; b < codelength; b++){ for(int c = b+1; c < codelength; c++){ for(int d = c+1; d < codelength; d++){ for(int e = d+1; e < codelength; e++){ for(int f = e+1; f < codelength; f++){ for(int g = f+1; g < codelength; g++){ for(int h = g+1; h < codelength; h++){ for(int l = h+1; h < codelength; l++){ for(int m = l+1; m < codelength; m++){ unsigned int mask = (1<& baseAll ,std::vector& tbAll){ for(size_t h=0; h < baseAll.size(); h++){ Codes64& base = baseAll[h]; HashTable64 tb; for(int i = 0; i < (1 << upbits); i++){ HashBucket64 emptyBucket; tb.push_back(emptyBucket); } for(size_t i = 0; i < base.size(); i ++){ unsigned int idx1 = base[i] >> lowbits; unsigned long idx2 = base[i] - ((unsigned long)idx1 << lowbits); if(tb[idx1].find(idx2) != tb[idx1].end()){ tb[idx1][idx2].push_back(i); }else{ std::vector v; v.push_back(i); tb[idx1].insert(make_pair(idx2,v)); } } tbAll.push_back(tb); } } void generateMask64(){ //i = 0 means the origin code HammingBallMask64.push_back(0); HammingRadius.push_back(HammingBallMask64.size()); unsigned long One = 1; if(radius>0){ //radius 1 for(int i = 0; i < codelength; i++){ unsigned long mask = One << i; HammingBallMask64.push_back(mask); } 
HammingRadius.push_back(HammingBallMask64.size()); } if(radius>1){ //radius 2 for(int i = 0; i < codelength; i++){ for(int j = i+1; j < codelength; j++){ unsigned long mask = (One<2){ //radius 3 for(int i = 0; i < codelength; i++){ for(int j = i+1; j < codelength; j++){ for(int k = j+1; k < codelength; k++){ unsigned long mask = (One<3){ //radius 4 for(int i = 0; i < codelength; i++){ for(int j = i+1; j < codelength; j++){ for(int k = j+1; k < codelength; k++){ for(int a = k+1; a < codelength; a++){ unsigned long mask = (One<4){ //radius 5 for(int i = 0; i < codelength; i++){ for(int j = i+1; j < codelength; j++){ for(int k = j+1; k < codelength; k++){ for(int a = k+1; a < codelength; a++){ for(int b = a+1; b < codelength; b++){ unsigned long mask = (One<5){ //radius 6 for(int i = 0; i < codelength; i++){ for(int j = i+1; j < codelength; j++){ for(int k = j+1; k < codelength; k++){ for(int a = k+1; a < codelength; a++){ for(int b = a+1; b < codelength; b++){ for(int c = b+1; c < codelength; c++){ unsigned long mask = (One<6){ //radius 7 for(int i = 0; i < codelength; i++){ for(int j = i+1; j < codelength; j++){ for(int k = j+1; k < codelength; k++){ for(int a = k+1; a < codelength; a++){ for(int b = a+1; b < codelength; b++){ for(int c = b+1; c < codelength; c++){ for(int d = c+1; d < codelength; d++){ unsigned long mask = (One<7){ //radius 8 for(int i = 0; i < codelength; i++){ for(int j = i+1; j < codelength; j++){ for(int k = j+1; k < codelength; k++){ for(int a = k+1; a < codelength; a++){ for(int b = a+1; b < codelength; b++){ for(int c = b+1; c < codelength; c++){ for(int d = c+1; d < codelength; d++){ for(int e = d+1; e < codelength; e++){ unsigned long mask = (One<8){ //radius 9 for(int i = 0; i < codelength; i++){ for(int j = i+1; j < codelength; j++){ for(int k = j+1; k < codelength; k++){ for(int a = k+1; a < codelength; a++){ for(int b = a+1; b < codelength; b++){ for(int c = b+1; c < codelength; c++){ for(int d = c+1; d < codelength; d++){ for(int e = 
d+1; e < codelength; e++){ for(int f = e+1; f < codelength; f++){ unsigned long mask = (One<9){ //radius 10 for(int i = 0; i < codelength; i++){ for(int j = i+1; j < codelength; j++){ for(int k = j+1; k < codelength; k++){ for(int a = k+1; a < codelength; a++){ for(int b = a+1; b < codelength; b++){ for(int c = b+1; c < codelength; c++){ for(int d = c+1; d < codelength; d++){ for(int e = d+1; e < codelength; e++){ for(int f = e+1; f < codelength; f++){ for(int g = f+1; g < codelength; g++){ unsigned long mask = (One<10){ //radius 11 for(int i = 0; i < codelength; i++){ for(int j = i+1; j < codelength; j++){ for(int k = j+1; k < codelength; k++){ for(int a = k+1; a < codelength; a++){ for(int b = a+1; b < codelength; b++){ for(int c = b+1; c < codelength; c++){ for(int d = c+1; d < codelength; d++){ for(int e = d+1; e < codelength; e++){ for(int f = e+1; f < codelength; f++){ for(int g = f+1; g < codelength; g++){ for(int h = g+1; h < codelength; h++){ unsigned long mask = (One<& query){ if(gs.size() != features_.get_rows()){ if (codelength <= 32 ){ getNeighbors32(K,query); }else if(codelength <= 64 ){ getNeighbors64(K,query); }else{ std::cout<<"code length not supported yet!"<& query){ int lowbits = codelength - upbits; unsigned int MaxCheck=HammingRadius[radius]; std::cout<<"maxcheck : "< tbflag(features_.get_rows(), false); nn_results.clear(); VisitBucketNum.clear(); VisitBucketNum.resize(radius+2); for(size_t cur = 0; cur < query.get_rows(); cur++){ std::vector pool(SP.search_init_num); unsigned int p = 0; tbflag.reset(); unsigned int j = 0; for(; j < MaxCheck; j++){ for(unsigned int h=0; h < QueryCode.size(); h++){ unsigned int searchcode = QueryCode[h][cur] ^ HammingBallMask[j]; unsigned int idx1 = searchcode >> lowbits; unsigned int idx2 = searchcode - (idx1 << lowbits); HashBucket::iterator bucket= htb[h][idx1].find(idx2); if(bucket != htb[h][idx1].end()){ std::vector vp = bucket->second; for(size_t k = 0; k < vp.size() && p < (unsigned 
int)SP.search_init_num; k++){ if(tbflag.test(vp[k]))continue; tbflag.set(vp[k]); pool[p++]=(vp[k]); } if(p >= (unsigned int)SP.search_init_num) break; } if(p >= (unsigned int)SP.search_init_num) break; } if(p >= (unsigned int)SP.search_init_num) break; } if(p < (unsigned int)SP.search_init_num){ VisitBucketNum[radius+1]++; }else{ for(int r=0;r<=radius;r++){ if(j<=HammingRadius[r]){ VisitBucketNum[r]++; break; } } } if (p> result; for(unsigned int i=0; icompare(query.get_row(cur), features_.get_row(pool[i]), features_.get_cols()),pool[i])); } std::partial_sort(result.begin(), result.begin() + K, result.end()); std::vector res; for(unsigned int j = 0; j < K; j++) res.push_back(result[j].second); nn_results.push_back(res); } //std::cout<<"bad query number: " << VisitBucketNum[radius+1] << std::endl; } void getNeighbors64(size_t K, const Matrix& query){ int lowbits = codelength - upbits; unsigned int MaxCheck=HammingRadius[radius]; std::cout<<"maxcheck : "< tbflag(features_.get_rows(), false); nn_results.clear(); VisitBucketNum.clear(); VisitBucketNum.resize(radius+2); for(size_t cur = 0; cur < query.get_rows(); cur++){ std::vector pool(SP.search_init_num); unsigned int p = 0; tbflag.reset(); unsigned int j = 0; for(; j < MaxCheck; j++){ for(unsigned int h=0; h < QueryCode64.size(); h++){ unsigned long searchcode = QueryCode64[h][cur] ^ HammingBallMask64[j]; unsigned int idx1 = searchcode >> lowbits; unsigned long idx2 = searchcode - (( unsigned long)idx1 << lowbits); HashBucket64::iterator bucket= htb64[h][idx1].find(idx2); if(bucket != htb64[h][idx1].end()){ std::vector vp = bucket->second; for(size_t k = 0; k < vp.size() && p < (unsigned int)SP.search_init_num; k++){ if(tbflag.test(vp[k]))continue; tbflag.set(vp[k]); pool[p++]=(vp[k]); } if(p >= (unsigned int)SP.search_init_num) break; } if(p >= (unsigned int)SP.search_init_num) break; } if(p >= (unsigned int)SP.search_init_num) break; } if(p < (unsigned int)SP.search_init_num){ VisitBucketNum[radius+1]++; }else{ 
for(int r=0;r<=radius;r++){ if(j<=HammingRadius[r]){ VisitBucketNum[r]++; break; } } } if (p> result; for(unsigned int i=0; icompare(query.get_row(cur), features_.get_row(pool[i]), features_.get_cols()),pool[i])); } std::partial_sort(result.begin(), result.begin() + K, result.end()); std::vector res; for(unsigned int j = 0; j < K; j++) res.push_back(result[j].second); nn_results.push_back(res); } //std::cout<<"bad query number: " <& query){ int lowbits = codelength - upbits; unsigned int MaxCheck=HammingRadius[radius]; std::cout<<"maxcheck : "< (unsigned)SP.extend_to) resultSize = K; boost::dynamic_bitset<> tbflag(features_.get_rows(), false); nn_results.clear(); VisitBucketNum.clear(); VisitBucketNum.resize(radius+2); for(size_t cur = 0; cur < query.get_rows(); cur++){ tbflag.reset(); std::vector pool(SP.search_init_num); unsigned int p = 0; unsigned int j = 0; for(; j < MaxCheck; j++){ for(size_t h=0; h < QueryCode.size(); h++){ unsigned int searchcode = QueryCode[h][cur] ^ HammingBallMask[j]; unsigned int idx1 = searchcode >> lowbits; unsigned int idx2 = searchcode - (idx1 << lowbits); HashBucket::iterator bucket= htb[h][idx1].find(idx2); if(bucket != htb[h][idx1].end()){ std::vector vp = bucket->second; for(size_t k = 0; k < vp.size() && p < (unsigned int)SP.search_init_num; k++){ if(tbflag.test(vp[k]))continue; tbflag.set(vp[k]); pool[p++]=(vp[k]); } if(p >= (unsigned int)SP.search_init_num) break; } if(p >= (unsigned int)SP.search_init_num) break; } if(p >= (unsigned int)SP.search_init_num) break; } if(p < (unsigned int)SP.search_init_num){ VisitBucketNum[radius+1]++; }else{ for(int r=0;r<=radius;r++){ if(j<=HammingRadius[r]){ VisitBucketNum[r]++; break; } } } int base_n = features_.get_rows(); while(p < (unsigned int)SP.search_init_num){ unsigned int nn = rand() % base_n; if(tbflag.test(nn)) continue; tbflag.set(nn); pool[p++] = (nn); } //sorting the pool std::vector> result; for(unsigned int i=0; icompare(query.get_row(cur), features_.get_row(pool[i]), 
features_.get_cols()),pool[i])); } std::partial_sort(result.begin(), result.begin() + resultSize, result.end()); result.resize(resultSize); pool.clear(); for(int j = 0; j < resultSize; j++) pool.push_back(result[j].second); //nn_exp boost::dynamic_bitset<> newflag(features_.get_rows(), true); newflag.set(); int iter=0; std::vector ids; while(iter++ < SP.search_epoches){ //the heap is max heap ids.clear(); for(unsigned j = 0; j < SP.extend_to ; j++){ if(newflag.test( pool[j] )){ newflag.reset(pool[j]); for(unsigned neighbor=0; neighbor < gs[pool[j]].size(); neighbor++){ unsigned id = gs[pool[j]][neighbor]; if(tbflag.test(id))continue; else tbflag.set(id); ids.push_back(id); } } } for(size_t j = 0; j < ids.size(); j++){ result.push_back(std::make_pair(distance_->compare(query.get_row(cur), features_.get_row(ids[j]), features_.get_cols()),ids[j])); } std::partial_sort(result.begin(), result.begin() + resultSize, result.end()); result.resize(resultSize); pool.clear(); for(int j = 0; j < resultSize; j++) pool.push_back(result[j].second); } if(K<(unsigned)SP.extend_to) pool.resize(K); nn_results.push_back(pool); } } void getNeighborsIEH32_kgraph(size_t K, const Matrix& query){ int lowbits = codelength - upbits; unsigned int MaxCheck=HammingRadius[radius]; std::cout<<"maxcheck : "< tbflag(features_.get_rows(), false); bool bSorted = true; unsigned pool_size = SP.search_epoches * SP.extend_to; if (pool_size >= (unsigned)SP.search_init_num){ SP.search_init_num = pool_size; bSorted = false; } VisitBucketNum.clear(); VisitBucketNum.resize(radius+2); for(size_t cur = 0; cur < query.get_rows(); cur++){ tbflag.reset(); std::vector pool(SP.search_init_num); unsigned int p = 0; unsigned int j = 0; for(; j < MaxCheck; j++){ for(size_t h=0; h < QueryCode.size(); h++){ unsigned int searchcode = QueryCode[h][cur] ^ HammingBallMask[j]; unsigned int idx1 = searchcode >> lowbits; unsigned int idx2 = searchcode - (idx1 << lowbits); HashBucket::iterator bucket= htb[h][idx1].find(idx2); 
if(bucket != htb[h][idx1].end()){ std::vector vp = bucket->second; for(size_t k = 0; k < vp.size() && p < (unsigned int)SP.search_init_num; k++){ if(tbflag.test(vp[k]))continue; tbflag.set(vp[k]); pool[p++]=(vp[k]); } if(p >= (unsigned int)SP.search_init_num) break; } if(p >= (unsigned int)SP.search_init_num) break; } if(p >= (unsigned int)SP.search_init_num) break; } if(p < (unsigned int)SP.search_init_num){ VisitBucketNum[radius+1]++; }else{ for(int r=0;r<=radius;r++){ if(j<=HammingRadius[r]){ VisitBucketNum[r]++; break; } } } int base_n = features_.get_rows(); while(p < (unsigned int)SP.search_init_num){ unsigned int nn = rand() % base_n; if(tbflag.test(nn)) continue; tbflag.set(nn); pool[p++] = (nn); } std::vector> result; for(unsigned int i=0; icompare(query.get_row(cur), features_.get_row(pool[i]), features_.get_cols()),pool[i])); } if(bSorted){ std::partial_sort(result.begin(), result.begin() + pool_size, result.end()); result.resize(pool_size); } tbflag.reset(); std::vector knn(K + SP.extend_to +1); std::vector results; for (unsigned iter = 0; iter < (unsigned)SP.search_epoches; iter++) { unsigned L = 0; for(unsigned j=0; j < SP.extend_to ; j++){ if(!tbflag.test(result[iter*SP.extend_to+j].second)){ tbflag.set(result[iter*SP.extend_to+j].second); knn[L].id = result[iter*SP.extend_to+j].second; knn[L].dist = result[iter*SP.extend_to+j].first; knn[L].flag = true; L++; } } if(~bSorted){ std::sort(knn.begin(), knn.begin() + L); } unsigned int k = 0; while (k < L) { unsigned int nk = L; if (knn[k].flag) { knn[k].flag = false; unsigned n = knn[k].id; for(unsigned neighbor=0; neighbor < gs[n].size(); neighbor++){ unsigned id = gs[n][neighbor]; if(tbflag.test(id))continue; tbflag.set(id); float dist = distance_->compare(query.get_row(cur), features_.get_row(id), features_.get_cols()); Point nn(id, dist); unsigned int r = InsertIntoKnn(&knn[0], L, nn); //if ( (r <= L) && (L + 1 < knn.size())) ++L; if ( L + 1 < knn.size()) ++L; if (r < nk) nk = r; } } if (nk <= k) k 
= nk; else ++k; } if (L > K) L = K; if (results.empty()) { results.reserve(K + 1); results.resize(L + 1); std::copy(knn.begin(), knn.begin() + L, results.begin()); } else { for (unsigned int l = 0; l < L; ++l) { unsigned r = InsertIntoKnn(&results[0], results.size() - 1, knn[l]); if (r < results.size() /* inserted */ && results.size() < (K + 1)) { results.resize(results.size() + 1); } } } } std::vector res; for(size_t i = 0; i < K && i < results.size();i++) res.push_back(results[i].id); nn_results.push_back(res); } } void getNeighborsIEH64_nnexp(size_t K, const Matrix& query){ int lowbits = codelength - upbits; unsigned int MaxCheck=HammingRadius[radius]; std::cout<<"maxcheck : "< (unsigned)SP.extend_to) resultSize = K; boost::dynamic_bitset<> tbflag(features_.get_rows(), false); nn_results.clear(); VisitBucketNum.clear(); VisitBucketNum.resize(radius+2); for(size_t cur = 0; cur < query.get_rows(); cur++){ std::vector pool(SP.search_init_num); unsigned int p = 0; tbflag.reset(); unsigned int j = 0; for(; j < MaxCheck; j++){ for(unsigned int h=0; h < QueryCode64.size(); h++){ unsigned long searchcode = QueryCode64[h][cur] ^ HammingBallMask64[j]; unsigned int idx1 = searchcode >> lowbits; unsigned long idx2 = searchcode - (( unsigned long)idx1 << lowbits); HashBucket64::iterator bucket= htb64[h][idx1].find(idx2); if(bucket != htb64[h][idx1].end()){ std::vector vp = bucket->second; for(size_t k = 0; k < vp.size() && p < (unsigned int)SP.search_init_num; k++){ if(tbflag.test(vp[k]))continue; tbflag.set(vp[k]); pool[p++]=(vp[k]); } if(p >= (unsigned int)SP.search_init_num) break; } if(p >= (unsigned int)SP.search_init_num) break; } if(p >= (unsigned int)SP.search_init_num) break; } if(p < (unsigned int)SP.search_init_num){ VisitBucketNum[radius+1]++; }else{ for(int r=0;r<=radius;r++){ if(j<=HammingRadius[r]){ VisitBucketNum[r]++; break; } } } int base_n = features_.get_rows(); while(p < (unsigned int)SP.search_init_num){ unsigned int nn = rand() % base_n; 
if(tbflag.test(nn)) continue; tbflag.set(nn); pool[p++] = (nn); } //sorting the pool std::vector> result; for(unsigned int i=0; icompare(query.get_row(cur), features_.get_row(pool[i]), features_.get_cols()),pool[i])); } std::partial_sort(result.begin(), result.begin() + resultSize, result.end()); result.resize(resultSize); pool.clear(); for(int j = 0; j < resultSize; j++) pool.push_back(result[j].second); //nn_exp boost::dynamic_bitset<> newflag(features_.get_rows(), true); newflag.set(); int iter=0; std::vector ids; while(iter++ < SP.search_epoches){ //the heap is max heap ids.clear(); for(unsigned j = 0; j < SP.extend_to ; j++){ if(newflag.test( pool[j] )){ newflag.reset(pool[j]); for(unsigned neighbor=0; neighbor < gs[pool[j]].size(); neighbor++){ unsigned id = gs[pool[j]][neighbor]; if(tbflag.test(id))continue; else tbflag.set(id); ids.push_back(id); } } } for(size_t j = 0; j < ids.size(); j++){ result.push_back(std::make_pair(distance_->compare(query.get_row(cur), features_.get_row(ids[j]), features_.get_cols()),ids[j])); } std::partial_sort(result.begin(), result.begin() + resultSize, result.end()); result.resize(resultSize); pool.clear(); for(int j = 0; j < resultSize; j++) pool.push_back(result[j].second); } if(K<(unsigned)SP.extend_to) pool.resize(K); nn_results.push_back(pool); } } void getNeighborsIEH64_kgraph(size_t K, const Matrix& query){ int lowbits = codelength - upbits; unsigned int MaxCheck=HammingRadius[radius]; std::cout<<"maxcheck : "< tbflag(features_.get_rows(), false); bool bSorted = true; unsigned pool_size = SP.search_epoches * SP.extend_to; if (pool_size >= (unsigned)SP.search_init_num){ SP.search_init_num = pool_size; bSorted = false; } VisitBucketNum.clear(); VisitBucketNum.resize(radius+2); for(size_t cur = 0; cur < query.get_rows(); cur++){ std::vector pool(SP.search_init_num); unsigned int p = 0; tbflag.reset(); unsigned int j = 0; for(; j < MaxCheck; j++){ for(unsigned int h=0; h < QueryCode64.size(); h++){ unsigned long searchcode 
= QueryCode64[h][cur] ^ HammingBallMask64[j]; unsigned int idx1 = searchcode >> lowbits; unsigned long idx2 = searchcode - (( unsigned long)idx1 << lowbits); HashBucket64::iterator bucket= htb64[h][idx1].find(idx2); if(bucket != htb64[h][idx1].end()){ std::vector vp = bucket->second; for(size_t k = 0; k < vp.size() && p < (unsigned int)SP.search_init_num; k++){ if(tbflag.test(vp[k]))continue; tbflag.set(vp[k]); pool[p++]=(vp[k]); } if(p >= (unsigned int)SP.search_init_num) break; } if(p >= (unsigned int)SP.search_init_num) break; } if(p >= (unsigned int)SP.search_init_num) break; } if(p < (unsigned int)SP.search_init_num){ VisitBucketNum[radius+1]++; }else{ for(int r=0;r<=radius;r++){ if(j<=HammingRadius[r]){ VisitBucketNum[r]++; break; } } } int base_n = features_.get_rows(); while(p < (unsigned int)SP.search_init_num){ unsigned int nn = rand() % base_n; if(tbflag.test(nn)) continue; tbflag.set(nn); pool[p++] = (nn); } std::vector> result; for(unsigned int i=0; icompare(query.get_row(cur), features_.get_row(pool[i]), features_.get_cols()),pool[i])); } if(bSorted){ std::partial_sort(result.begin(), result.begin() + pool_size, result.end()); result.resize(pool_size); } tbflag.reset(); std::vector knn(K + SP.extend_to +1); std::vector results; for (unsigned iter = 0; iter < (unsigned)SP.search_epoches; iter++) { unsigned L = 0; for(unsigned j=0; j < (unsigned)SP.extend_to ; j++){ if(!tbflag.test(result[iter*SP.extend_to+j].second)){ tbflag.set(result[iter*SP.extend_to+j].second); knn[L].id = result[iter*SP.extend_to+j].second; knn[L].dist = result[iter*SP.extend_to+j].first; knn[L].flag = true; L++; } } if(~bSorted){ std::sort(knn.begin(), knn.begin() + L); } unsigned int k = 0; while (k < L) { unsigned int nk = L; if (knn[k].flag) { knn[k].flag = false; unsigned n = knn[k].id; for(unsigned neighbor=0; neighbor < gs[n].size(); neighbor++){ unsigned id = gs[n][neighbor]; if(tbflag.test(id))continue; tbflag.set(id); float dist = distance_->compare(query.get_row(cur), 
features_.get_row(id), features_.get_cols()); Point nn(id, dist); unsigned int r = InsertIntoKnn(&knn[0], L, nn); //if ( (r <= L) && (L + 1 < knn.size())) ++L; if ( L + 1 < knn.size()) ++L; if (r < nk) nk = r; } } if (nk <= k) k = nk; else ++k; } if (L > K) L = K; if (results.empty()) { results.reserve(K + 1); results.resize(L + 1); std::copy(knn.begin(), knn.begin() + L, results.begin()); } else { for (unsigned int l = 0; l < L; ++l) { unsigned r = InsertIntoKnn(&results[0], results.size() - 1, knn[l]); if (r < results.size() /* inserted */ && results.size() < (K + 1)) { results.resize(results.size() + 1); } } } } std::vector res; for(size_t i = 0; i < K && i < results.size();i++) res.push_back(results[i].id); nn_results.push_back(res); } } void outputVisitBucketNum(){ unsigned i=0; std::cout<< "Radius " << i <<" bucket num: "< heap; in.read((char*)&dim, sizeof(int)); for(int j =0; j < dim; j++){ unsigned id; in.read((char*)&id, sizeof(int)); heap.push_back(id); } gs.push_back(heap); } in.close(); } void saveGraph(char* filename){} void initGraph(){} protected: int tablenum; int upbits; int codelength; int codelengthshift; int radius; USING_BASECLASS_SYMBOLS std::vector htb; std::vector BaseCode; std::vector QueryCode; std::vector HammingBallMask; std::vector htb64; std::vector BaseCode64; std::vector QueryCode64; std::vector HammingBallMask64; std::vector HammingRadius; // for statistic info std::vector VisitBucketNum; }; } #endif ================================================ FILE: algorithm/init_indices.hpp ================================================ #ifndef EFANNA_INIT_INDICES_H_ #define EFANNA_INIT_INDICES_H_ #include "base_index.hpp" #include "kdtreeub_index.hpp" #include "hashing_index.hpp" namespace efanna{ template class Index, typename DataType> inline InitIndex* create_index_(efanna::Matrix data, const efanna::IndexParams& params, const Distance* d) { return new Index(data, d, params); } template inline InitIndex* create_index_by_type(const 
init_algorithm index_type, const Matrix& dataset, const IndexParams& params, const Distance* d) { InitIndex* initIndex = NULL; switch(index_type){ case KDTREE_UB: initIndex = create_index_(dataset, params, d); break; case HASHING: initIndex = create_index_(dataset, params, d); break; } return initIndex; } } #endif ================================================ FILE: algorithm/kdtreeub_index.hpp ================================================ #ifndef EFANNA_KDTREE_UB_INDEX_H_ #define EFANNA_KDTREE_UB_INDEX_H_ #include "algorithm/base_index.hpp" #include #include #include #include #include //#include //using std::bitset; #include namespace efanna{ struct KDTreeUbIndexParams : public IndexParams { KDTreeUbIndexParams(bool rnn_used, int tree_num_total, int merge_level = 4, int epoches = 4, int check = 25, int myL = 30, int building_use_k = 10, int tree_num_build = 0, int myS = 10) { reverse_nn_used = rnn_used; init_index_type = KDTREE_UB; K = building_use_k; build_epoches = epoches; S = myS; ValueType treev; treev.int_val = tree_num_total; extra_params.insert(std::make_pair("trees",treev)); ValueType treeb; treeb.int_val = tree_num_build > 0 ? 
tree_num_build : tree_num_total; extra_params.insert(std::make_pair("treesb",treeb)); ValueType merge_levelv; merge_levelv.int_val = merge_level; extra_params.insert(std::make_pair("ml",merge_levelv)); L = myL; Check_K = check; } }; template class KDTreeUbIndex : public InitIndex { public: typedef InitIndex BaseClass; KDTreeUbIndex(const Matrix& dataset, const Distance* d, const IndexParams& params = KDTreeUbIndexParams(true,4)) : BaseClass(dataset,d,params) { std::cout<<"kdtree ub initial"<second).int_val; #ifdef INFO std::cout << "Using kdtree to build "<< TreeNum << " trees in total" << std::endl; #endif } else{ TreeNum = 4; #ifdef INFO std::cout << "Using kdtree to build "<< TreeNum << " trees in total" << std::endl; #endif } SP.tree_num = TreeNum; it = params_.extra_params.find("treesb"); if(it != params_.extra_params.end()){ TreeNumBuild = (it->second).int_val; #ifdef INFO std::cout << "Building kdtree graph with "<< TreeNumBuild <<" trees"<< std::endl; #endif } else{ TreeNumBuild = TreeNum; #ifdef INFO std::cout << "Building kdtree graph with "<< TreeNumBuild <<" trees"<< std::endl; #endif } it = params_.extra_params.find("ml"); if(it != params_.extra_params.end()){ ml = (it->second).int_val; #ifdef INFO std::cout << "Building kdtree initial index with merge level "<< ml << std::endl; #endif } else{ ml = -1; #ifdef INFO std::cout << "Building kdtree initial index with max merge level "<< std::endl; #endif } max_deepth = 0x0fffffff; error_flag = false; } void buildIndexImpl(){ #ifdef INFO clock_t s,f; s = clock(); #endif initGraph(); #ifdef INFO f = clock(); #endif std::cout << "initial graph finised"<< std::endl; #ifdef INFO std::cout << "initial graph using time: "<< (f-s)*1.0/CLOCKS_PER_SEC<<" seconds"<< std::endl; #endif if(error_flag){ std::cout << "merge level deeper than tree, max merge deepth is" << max_deepth-1<~Node(); if (Rchild!=NULL) Rchild->~Node(); } }; void loadIndex(char* filename){ read_data(filename); } void saveIndex(char* filename){ 
size_t points_num = features_.get_rows(); size_t feature_dim = features_.get_cols(); save_data(filename, params_.K, points_num, feature_dim); } //algorithms copy and rewrite from flann void loadTrees(char* filename){ std::ifstream in(filename, std::ios::binary|std::ios::in); if(!in.is_open()){std::cout<<"open file error"< tree_nodes; for(int j=0;jDivDim),sizeof(tmp->DivDim)); in.read((char*)&(tmp->DivVal),sizeof(tmp->DivVal)); in.read((char*)&(tmp->StartIdx),sizeof(tmp->StartIdx)); in.read((char*)&(tmp->EndIdx),sizeof(tmp->EndIdx)); in.read((char*)&(tmp->Lchild),sizeof(tmp->Lchild)); in.read((char*)&(tmp->Rchild),sizeof(tmp->Rchild)); tmp->Lchild = NULL; tmp->Rchild = NULL; tmp->treeid = i; tree_nodes.push_back(tmp); } //std::cout<<"build "< leaves; for(unsigned int j=0;j::iterator it;//int cnt=0; for(it=tree_roots_.begin(); it!=tree_roots_.end(); it++){ //write tree nodes with depth first trace size_t offset_node_num = out.tellp(); out.seekp(sizeof(int),std::ios::cur); unsigned int node_size = sizeof(struct Node); out.write((char *)&(node_size), sizeof(int)); unsigned int node_num = DepthFirstWrite(out, *it); out.seekg(offset_node_num,std::ios::beg); out.write((char *)&(node_num), sizeof(int)); out.seekp(0,std::ios::end); //std::cout<<"tree: "<row_id; out.write((char*)&id, sizeof(int)); } }//meansize /= knn_graph.size(); //std::cout << "size mean " << meansize << std::endl; out.close(); } */ void saveGraph(char* filename){ std::ofstream out(filename,std::ios::binary); unsigned N = gs.size(); //out.write((char*)&N, sizeof(int)); for(unsigned i=0; i < N; i++){ unsigned k = gs[i].size(); //unsigned m = M[i]; //DataType norm = norms[i]; out.write((char*)&k, sizeof(unsigned)); //out.write((char*)&m, sizeof(unsigned)); //out.write((char*)&norm, sizeof(DataType)); for(unsigned j = 0; j < k; j++){ unsigned id = gs[i][j]; out.write((char*)&id, sizeof(unsigned)); } } out.close(); } //for nn search void SearchQueryToLeaf(Node* node, const DataType* q, unsigned dep, 
std::vector& node_pool){ if(node->Lchild != NULL && node->Rchild !=NULL){ if(q[node->DivDim] < node->DivVal){ SearchQueryToLeaf(node->Lchild, q, dep, node_pool); if(node_pool.size() < dep) SearchQueryToLeaf(node->Rchild, q, dep, node_pool); } else{ SearchQueryToLeaf(node->Rchild, q, dep, node_pool); if(node_pool.size() < dep) SearchQueryToLeaf(node->Lchild, q, dep, node_pool); } } else node_pool.push_back(node); } void getSearchNodeList(Node* node, const DataType* q, unsigned int lsize, std::vector& vn){ if(vn.size() >= lsize) return; if(node->Lchild != NULL && node->Rchild !=NULL){ if(q[node->DivDim] < node->DivVal){ getSearchNodeList(node->Lchild, q, lsize, vn ); getSearchNodeList(node->Rchild, q, lsize, vn); }else{ getSearchNodeList(node->Rchild, q, lsize, vn); getSearchNodeList(node->Lchild, q, lsize, vn); } }else vn.push_back(node); } void getNeighbors(size_t searchK, const Matrix& query){ switch(SP.search_method){ case 0: getNeighbors_nnexp(searchK, query); break; case 1: getNeighbors_kgraph(searchK, query); break; default: std::cout<<"no such searching method"<& query){ #ifdef INFO std::cout<<"using tree num "<< SP.tree_num< tree_roots_.size()){ std::cout<<"wrong tree number"< (unsigned)SP.extend_to) resultSize = K; #pragma omp parallel for for(unsigned int cur = 0; cur < query.get_rows(); cur++){ boost::dynamic_bitset<> tbflag(features_.get_rows(), false); boost::dynamic_bitset<> newflag(features_.get_rows(), true); tbflag.reset(); newflag.set(); std::vector> NodeCandi; NodeCandi.resize(SP.tree_num); const DataType* q_row = query.get_row(cur); _mm_prefetch((char *)q_row, _MM_HINT_T0); unsigned int lsize = SP.search_init_num*2 / (5*SP.tree_num) + 1; for(unsigned int i = 0; i < SP.tree_num; i++){ getSearchNodeList(tree_roots_[i], q_row, lsize, NodeCandi[i]); } std::vector pool(SP.search_init_num); unsigned int p = 0; for(unsigned int ni = 0; ni < lsize; ni++){ for(unsigned int i = 0; i < NodeCandi.size(); i++){ Node* leafn = NodeCandi[i][ni]; for(size_t j = 
leafn->StartIdx; j < leafn->EndIdx && p < (unsigned int)SP.search_init_num; j++){ size_t nn = LeafLists[i][j]; if(tbflag.test(nn))continue; tbflag.set(nn); pool[p++]=(nn); } if(p >= (unsigned int)SP.search_init_num) break; } if(p >= (unsigned int)SP.search_init_num) break; } int base_n = features_.get_rows(); while(p < (unsigned int)SP.search_init_num){ unsigned int nn = rand() % base_n; if(tbflag.test(nn))continue; tbflag.set(nn); pool[p++]=(nn); } std::vector> result; //for(unsigned int i=0; i pool.size() ? pool.size() : s+cache_blocksz; unsigned s_ = s; while(scompare(q_row, features_.get_row(pool[s_]), dim),pool[s_])); s_++; } } std::partial_sort(result.begin(), result.begin() + resultSize, result.end()); result.resize(resultSize); pool.clear(); for(int j = 0; j < resultSize; j++) pool.push_back(result[j].second); int iter=0; std::vector ids; while(iter++ < SP.search_epoches){ ids.clear(); for(unsigned j = 0; j < SP.extend_to ; j++){ if(newflag.test( pool[j] )){ newflag.reset(pool[j]); for(unsigned neighbor=0; neighbor < gs[pool[j]].size(); neighbor++){ unsigned id = gs[pool[j]][neighbor]; if(tbflag.test(id))continue; else tbflag.set(id); ids.push_back(id); } } } //for(unsigned int j=0; j ids.size() ? 
ids.size() : s+cache_blocksz; unsigned s_ = s; while(scompare(q_row, features_.get_row(ids[s_]), dim),ids[s_])); s_++; } //result.push_back(std::make_pair(distance_->compare(q_row, features_.get_row(ids[j]), dim),ids[j])); } std::partial_sort(result.begin(), result.begin() + resultSize, result.end()); result.resize(resultSize); pool.clear(); for(int j = 0; j < resultSize; j++) pool.push_back(result[j].second); } if(K& res = nn_results[cur]; for(unsigned i = 0; i < K ;i++) res.push_back(pool[i]); } } void getNeighbors_kgraph(size_t searchK, const Matrix& query){ #ifdef INFO std::cout<<"using tree num "<< SP.tree_num< tree_roots_.size()){ std::cout<<"wrong tree number"<= (unsigned)SP.search_init_num){ SP.search_init_num = pool_size; bSorted = false; } #pragma omp parallel for for(unsigned int cur = 0; cur < query.get_rows(); cur++){ std::mt19937 rng(1998); boost::dynamic_bitset<> flags(features_.get_rows(), false); std::vector > Vnl; Vnl.resize(SP.tree_num); const DataType* q_row = query.get_row(cur); _mm_prefetch((char *)q_row, _MM_HINT_T0); for(unsigned int i = 0; i < SP.tree_num; i++){ getSearchNodeList(tree_roots_[i], q_row, lsize, Vnl[i]); } std::vector pool(SP.search_init_num); unsigned int p = 0; for(unsigned int ni = 0; ni < lsize; ni++){ for(unsigned int i = 0; i < Vnl.size(); i++){ Node* leafn = Vnl[i][ni]; for(size_t j = leafn->StartIdx; j < leafn->EndIdx && p < (unsigned int)SP.search_init_num; j++){ size_t nn = LeafLists[i][j]; if(flags.test(nn))continue; flags.set(nn); pool[p++]=(nn); } if(p >= (unsigned int)SP.search_init_num) break; } if(p >= (unsigned int)SP.search_init_num) break; } int base_n = features_.get_rows(); while(p < (unsigned int)SP.search_init_num){ unsigned int nn = rand() % base_n; if(flags.test(nn))continue; flags.set(nn); pool[p++]=(nn); } std::vector> result; unsigned cache_blocksz = 80; for(unsigned int i=0; i*cache_blocksz pool.size() ? 
pool.size() : s+cache_blocksz; unsigned s_ = s; while(scompare(q_row, features_.get_row(pool[s_]), dim),pool[s_])); s_++; } } if(bSorted){ std::partial_sort(result.begin(), result.begin() + pool_size, result.end()); result.resize(pool_size); } flags.reset(); std::vector knn(searchK + SP.extend_to +1); std::vector results; for (unsigned iter = 0; iter < (unsigned)SP.search_epoches; ++iter) { unsigned L = 0; for(unsigned j=0; j < (unsigned)SP.extend_to ; j++){ if(!flags.test(result[iter*SP.extend_to+j].second)){ flags.set(result[iter*SP.extend_to+j].second); knn[L].id = result[iter*SP.extend_to+j].second; knn[L].dist = result[iter*SP.extend_to+j].first; knn[L].flag = true; L++; } } if(~bSorted){ std::sort(knn.begin(), knn.begin() + L); } unsigned k = 0; while (k < L) { unsigned nk = L; if (knn[k].flag) { knn[k].flag = false; unsigned n = knn[k].id; //unsigned maxM = M[n]; unsigned maxM = SP.extend_to; //if ((unsigned)SP.extend_to > maxM) maxM = SP.extend_to; auto const &neighbors = gs[n]; if (maxM > neighbors.size()) { maxM = neighbors.size(); } for(unsigned m = 0; m < maxM; ++m){ _mm_prefetch((char *)features_.get_row(neighbors[m]), _MM_HINT_T0); } for (unsigned m = 0; m < maxM; ++m) { unsigned id = neighbors[m]; //BOOST_VERIFY(id < graph.size()); if (flags[id]) continue; flags[id] = true; DataType dist = distance_->compare(q_row, features_.get_row(id), dim); Point nn(id, dist); unsigned r = InsertIntoKnn(&knn[0], L, nn); //BOOST_VERIFY(r <= L); //if (r > L) continue; if (L + 1 < knn.size()) ++L; if (r < nk) { nk = r; } } } if (nk <= k) { k = nk; } else { ++k; } } if (L > searchK) L = searchK; if (results.empty()) { results.reserve(searchK + 1); results.resize(L + 1); std::copy(knn.begin(), knn.begin() + L, results.begin()); } else { for (unsigned int l = 0; l < L; ++l) { unsigned r = InsertIntoKnn(&results[0], results.size() - 1, knn[l]); if (r < results.size() && results.size() < (searchK + 1)) { results.resize(results.size() + 1); } } } } std::vector& res = 
nn_results[cur]; for(size_t i = 0; i < searchK && i < results.size();i++) res.push_back(results[i].id); } } int DepthFirstWrite(std::fstream& out, struct Node *root){ if(root==NULL) return 0; int left_cnt = DepthFirstWrite(out, root->Lchild); int right_cnt = DepthFirstWrite(out, root->Rchild); //std::cout << root->StartIdx <<":" << root->EndIdx<< std::endl; out.write((char *)&(root->DivDim), sizeof(root->DivDim)); out.write((char *)&(root->DivVal), sizeof(root->DivVal)); out.write((char *)&(root->StartIdx), sizeof(root->StartIdx)); out.write((char *)&(root->EndIdx), sizeof(root->EndIdx)); out.write((char *)&(root->Lchild), sizeof(root->Lchild)); out.write((char *)&(root->Rchild), sizeof(root->Rchild)); return (left_cnt + right_cnt + 1); } struct Node* DepthFirstBuildTree(std::vector& tree_nodes){ std::vector root_serial; typename std::vector::iterator it = tree_nodes.begin(); for( ; it!=tree_nodes.end(); it++){ Node* tmp = *it; size_t rsize = root_serial.size(); if(rsize<2){ root_serial.push_back(tmp); //continue; } else{ Node *last1 = root_serial[rsize-1]; Node *last2 = root_serial[rsize-2]; if(last1->EndIdx == tmp->EndIdx && last2->StartIdx == tmp->StartIdx){ tmp->Rchild = last1; tmp->Lchild = last2; root_serial.pop_back(); root_serial.pop_back(); } root_serial.push_back(tmp); } } if(root_serial.size()!=1){ std::cout << "Error constructing trees" << std::endl; return NULL; } return root_serial[0]; } void read_data(char *filename){ std::ifstream in(filename, std::ios::binary|std::ios::in); if(!in.is_open()){std::cout<<"open file error"< tree_nodes; for(int j=0;jDivDim),sizeof(tmp->DivDim)); in.read((char*)&(tmp->DivVal),sizeof(tmp->DivVal)); in.read((char*)&(tmp->StartIdx),sizeof(tmp->StartIdx)); in.read((char*)&(tmp->EndIdx),sizeof(tmp->EndIdx)); in.read((char*)&(tmp->Lchild),sizeof(tmp->Lchild)); in.read((char*)&(tmp->Rchild),sizeof(tmp->Rchild)); tmp->Lchild = NULL; tmp->Rchild = NULL; tree_nodes.push_back(tmp); } //std::cout<<"build "< leaves; for(unsigned int 
j=0;j can(id, -1); heap.insert(can); } knn_graph.push_back(heap); } in.close(); } void save_data(char *filename, unsigned int K, size_t num, size_t dim){ std::fstream out(filename, std::ios::binary|std::ios::out); if(!out.is_open()){std::cout<<"open file error"<::iterator it;//int cnt=0; for(it=tree_roots_.begin(); it!=tree_roots_.end(); it++){ //write tree nodes with depth first trace size_t offset_node_num = out.tellp(); out.seekp(sizeof(int),std::ios::cur); unsigned int node_size = sizeof(struct Node); out.write((char *)&(node_size), sizeof(int)); unsigned int node_num = DepthFirstWrite(out, *it); out.seekg(offset_node_num,std::ios::beg); out.write((char *)&(node_num), sizeof(int)); out.seekp(0,std::ios::end); //std::cout<<"tree: "<row_id; out.write((char*)&id, sizeof(int)); } } out.close(); } /* Node* divideTree(std::mt19937& rng, int* indices, size_t count, size_t offset){ Node* node = new Node(); if(count <= params_.TNS){ node->DivDim = -1; node->Lchild = NULL; node->Rchild = NULL; node->StartIdx = offset; node->EndIdx = offset + count; //add points for(size_t i = 0; i < count; i++){ for(size_t j = i+1; j < count; j++){ DataType dist = distance_->compare( features_.get_row(indices[i]), features_.get_row(indices[j]), features_.get_cols()); if(knn_graph[indices[i]].size() < params_.S || dist < knn_graph[indices[i]].begin()->distance){ Candidate c1(indices[j], dist); knn_graph[indices[i]].insert(c1); if(knn_graph[indices[i]].size() > params_.S)knn_graph[indices[i]].erase(knn_graph[indices[i]].begin()); } else if(nhoods[indices[i]].nn_new.size() < params_.S * 2)nhoods[indices[i]].nn_new.push_back(indices[j]); if(knn_graph[indices[j]].size() < params_.S || dist < knn_graph[indices[j]].begin()->distance){ Candidate c2(indices[i], dist); knn_graph[indices[j]].insert(c2); if(knn_graph[indices[j]].size() > params_.S)knn_graph[indices[j]].erase(knn_graph[indices[j]].begin()); } else if(nhoods[indices[j]].nn_new.size() < params_.S * 
2)nhoods[indices[j]].nn_new.push_back(indices[i]); } } }else{ int idx; int cutdim; DataType cutval; meanSplit(rng, indices, count, idx, cutdim, cutval); node->DivDim = cutdim; node->DivVal = cutval; node->StartIdx = offset; node->EndIdx = offset + count; node->Lchild = divideTree(rng, indices, idx, offset); node->Rchild = divideTree(rng, indices+idx, count-idx, offset+idx); } return node; } Node* divideTreeOnly(std::mt19937& rng, unsigned* indices, size_t count, size_t offset){ Node* node = new Node(); if(count <= params_.TNS){ node->DivDim = -1; node->Lchild = NULL; node->Rchild = NULL; node->StartIdx = offset; node->EndIdx = offset + count; //add points }else{ unsigned idx; unsigned cutdim; DataType cutval; meanSplit(rng, indices, count, idx, cutdim, cutval); node->DivDim = cutdim; node->DivVal = cutval; node->StartIdx = offset; node->EndIdx = offset + count; node->Lchild = divideTreeOnly(rng, indices, idx, offset); node->Rchild = divideTreeOnly(rng, indices+idx, count-idx, offset+idx); } return node; } */ void meanSplit(std::mt19937& rng, unsigned* indices, unsigned count, unsigned& index, unsigned& cutdim, DataType& cutval){ size_t veclen_ = features_.get_cols(); DataType* mean_ = new DataType[veclen_]; DataType* var_ = new DataType[veclen_]; memset(mean_,0,veclen_*sizeof(DataType)); memset(var_,0,veclen_*sizeof(DataType)); /* Compute mean values. Only the first SAMPLE_NUM values need to be sampled to get a good estimate. 
*/ unsigned cnt = std::min((unsigned)SAMPLE_NUM+1, count); for (unsigned j = 0; j < cnt; ++j) { const DataType* v = features_.get_row(indices[j]); for (size_t k=0; kcount/2) index = lim1; else if (lim2=cutval) --right; if (left>right) break; std::swap(indices[left], indices[right]); ++left; --right; } lim1 = left;//lim1 is the id of the leftmost point <= cutval right = count-1; for (;; ) { while (left<=right && features_.get_row(indices[left])[cutdim]<=cutval) ++left; while (left<=right && features_.get_row(indices[right])[cutdim]>cutval) --right; if (left>right) break; std::swap(indices[left], indices[right]); ++left; --right; } lim2 = left;//lim2 is the id of the leftmost point >cutval } int selectDivision(std::mt19937& rng, DataType* v){ int num = 0; size_t topind[RAND_DIM]; //Create a list of the indices of the top RAND_DIM values. for (size_t i = 0; i < features_.get_cols(); ++i) { if ((num < RAND_DIM)||(v[i] > v[topind[num-1]])) { // Put this element at end of topind. if (num < RAND_DIM) { topind[num++] = i; // Add to list. } else { topind[num-1] = i; // Replace last element. } // Bubble end value down to right location by repeated swapping. sort the varience in decrease order int j = num - 1; while (j > 0 && v[topind[j]] > v[topind[j-1]]) { std::swap(topind[j], topind[j-1]); --j; } } } // Select a random integer in range [0,num-1], and return that index. 
int rnd = rng()%num; return (int)topind[rnd]; } void getMergeLevelNodeList(Node* node, size_t treeid, int deepth){ if(node->Lchild != NULL && node->Rchild != NULL && deepth < ml){ deepth++; getMergeLevelNodeList(node->Lchild, treeid, deepth); getMergeLevelNodeList(node->Rchild, treeid, deepth); }else if(deepth == ml){ mlNodeList.push_back(std::make_pair(node,treeid)); }else{ error_flag = true; if(deepth < max_deepth)max_deepth = deepth; } } Node* SearchToLeaf(Node* node, size_t id){ if(node->Lchild != NULL && node->Rchild !=NULL){ if(features_.get_row(id)[node->DivDim] < node->DivVal) return SearchToLeaf(node->Lchild, id); else return SearchToLeaf(node->Rchild, id); } else return node; }int cc = 0; void mergeSubGraphs(size_t treeid, Node* node){ if(node->Lchild != NULL && node->Rchild != NULL){ mergeSubGraphs(treeid, node->Lchild); mergeSubGraphs(treeid, node->Rchild); size_t numL = node->Lchild->EndIdx - node->Lchild->StartIdx; size_t numR = node->Rchild->EndIdx - node->Rchild->StartIdx; size_t start,end; Node * root; if(numL < numR){ root = node->Rchild; start = node->Lchild->StartIdx; end = node->Lchild->EndIdx; }else{ root = node->Lchild; start = node->Rchild->StartIdx; end = node->Rchild->EndIdx; } for(;start < end; start++){ size_t feature_id = LeafLists[treeid][start]; Node* leaf = SearchToLeaf(root, feature_id); for(size_t i = leaf->StartIdx; i < leaf->EndIdx; i++){ size_t tmpfea = LeafLists[treeid][i]; DataType dist = distance_->compare( features_.get_row(tmpfea), features_.get_row(feature_id), features_.get_cols()); {LockGuard g(*nhoods[tmpfea].lock); if(knn_graph[tmpfea].size() < params_.S || dist < knn_graph[tmpfea].begin()->distance){ Candidate c1(feature_id, dist); knn_graph[tmpfea].insert(c1); if(knn_graph[tmpfea].size() > params_.S)knn_graph[tmpfea].erase(knn_graph[tmpfea].begin()); } else if(nhoods[tmpfea].nn_new.size() < params_.S * 2){ nhoods[tmpfea].nn_new.push_back(feature_id); } } {LockGuard g(*nhoods[feature_id].lock); 
//kgraph code
/**
 * Fill addr[0 .. size) with pseudo-random ids below N.
 *
 * For N > size the ids are pairwise distinct: `size` values are drawn from
 * [0, N - size), sorted, bumped into a strictly increasing sequence (so the
 * largest is at most N - 2) and finally rotated by a random offset modulo N.
 *
 * Degenerate requests (N <= size) cannot be served by that scheme, because
 * `rng() % (N - size)` would divide by zero or wrap around. They are answered
 * deterministically with addr[i] = i % N (all zeros when N == 0) and consume
 * no random numbers.
 *
 * @param rng  random engine; advanced only when N > size
 * @param addr output buffer with room for `size` elements
 * @param size number of ids requested
 * @param N    exclusive upper bound of the ids
 */
static void GenRandom (std::mt19937& rng, unsigned *addr, unsigned size, unsigned N) {
    if (N <= size) {
        for (unsigned i = 0; i < size; ++i) {
            addr[i] = (N == 0) ? 0 : i % N;
        }
        return;
    }
    for (unsigned i = 0; i < size; ++i) {
        addr[i] = rng() % (N - size);
    }
    std::sort(addr, addr + size);
    // Resolve duplicates by pushing each repeated value just past its predecessor.
    for (unsigned i = 1; i < size; ++i) {
        if (addr[i] <= addr[i-1]) {
            addr[i] = addr[i-1] + 1;
        }
    }
    // A common rotation keeps the ids distinct while removing the bias
    // towards small values introduced by the draw above.
    unsigned off = rng() % N;
    for (unsigned i = 0; i < size; ++i) {
        addr[i] = (addr[i] + off) % N;
    }
}
cutval; node->StartIdx = offset; node->EndIdx = offset + count; Node* nodeL = new Node(); Node* nodeR = new Node(); node->Lchild = nodeL; nodeL->treeid = node->treeid; DFSbuild(nodeL, rng, indices, idx, offset); node->Rchild = nodeR; nodeR->treeid = node->treeid; DFSbuild(nodeR, rng, indices+idx, count-idx, offset+idx); } } void DFStest(unsigned level, unsigned dim, Node* node){ if(node->Lchild !=NULL){ DFStest(++level, node->DivDim, node->Lchild); //if(level > 15) std::cout<<"dim: "<DivDim<<"--cutval: "<DivVal<<"--S: "<StartIdx<<"--E: "<EndIdx<<" TREE: "<treeid<Lchild->Lchild ==NULL){ std::vector& tmp = LeafLists[node->treeid]; for(unsigned i = node->Rchild->StartIdx; i < node->Rchild->EndIdx; i++) std::cout<DivDim]<<" "; std::cout<Rchild !=NULL){ DFStest(++level, node->DivDim, node->Rchild); } else{ std::cout<<"dim: "<& tmp = LeafLists[node->treeid]; for(unsigned i = node->StartIdx; i < node->EndIdx; i++) std::cout< visited(N, false); knn_graph.resize(N); for (auto &nhood: nhoods) { //nhood.nn_new.resize(params_.S * 2); nhood.pool.resize(params_.L+1); nhood.radius = std::numeric_limits::max(); } //build tree std::vector indices(N); LeafLists.resize(TreeNum); std::vector ActiveSet; std::vector NewSet; for(unsigned i = 0; i < (unsigned)TreeNum; i++){ Node* node = new Node; node->DivDim = -1; node->Lchild = NULL; node->Rchild = NULL; node->StartIdx = 0; node->EndIdx = N; node->treeid = i; tree_roots_.push_back(node); ActiveSet.push_back(node); } #pragma omp parallel for for(unsigned i = 0; i < N; i++)indices[i] = i; #pragma omp parallel for for(unsigned i = 0; i < (unsigned)TreeNum; i++){ std::vector& myids = LeafLists[i]; myids.resize(N); std::copy(indices.begin(), indices.end(),myids.begin()); std::random_shuffle(myids.begin(), myids.end()); } omp_init_lock(&rootlock); while(!ActiveSet.empty() && ActiveSet.size() < 1100){ #pragma omp parallel for for(unsigned i = 0; i < ActiveSet.size(); i++){ Node* node = ActiveSet[i]; unsigned mid; unsigned cutdim; DataType 
cutval; std::mt19937 rng(seed ^ omp_get_thread_num()); std::vector& myids = LeafLists[node->treeid]; meanSplit(rng, &myids[0]+node->StartIdx, node->EndIdx - node->StartIdx, mid, cutdim, cutval); node->DivDim = cutdim; node->DivVal = cutval; //node->StartIdx = offset; //node->EndIdx = offset + count; Node* nodeL = new Node(); Node* nodeR = new Node(); nodeR->treeid = nodeL->treeid = node->treeid; nodeL->StartIdx = node->StartIdx; nodeL->EndIdx = node->StartIdx+mid; nodeR->StartIdx = nodeL->EndIdx; nodeR->EndIdx = node->EndIdx; node->Lchild = nodeL; node->Rchild = nodeR; omp_set_lock(&rootlock); if(mid>params_.S)NewSet.push_back(nodeL); if(nodeR->EndIdx - nodeR->StartIdx > params_.S)NewSet.push_back(nodeR); omp_unset_lock(&rootlock); } ActiveSet.resize(NewSet.size()); std::copy(NewSet.begin(), NewSet.end(),ActiveSet.begin()); NewSet.clear(); } #pragma omp parallel for for(unsigned i = 0; i < ActiveSet.size(); i++){ Node* node = ActiveSet[i]; //omp_set_lock(&rootlock); //std::cout<EndIdx-node->StartIdx<& myids = LeafLists[node->treeid]; DFSbuild(node, rng, &myids[0]+node->StartIdx, node->EndIdx-node->StartIdx, node->StartIdx); } } void outputVisitBucketNum(){} void initGraph(){ //initial unsigned N = features_.get_rows(); unsigned seed = 1998; std::mt19937 rng(seed); nhoods.resize(N); g.resize(N); boost::dynamic_bitset<> visited(N, false); knn_graph.resize(N); for (auto &nhood: nhoods) { //nhood.nn_new.resize(params_.S * 2); nhood.pool.resize(params_.L+1); nhood.radius = std::numeric_limits::max(); } //build tree std::vector indices(N); LeafLists.resize(TreeNum); std::vector ActiveSet; std::vector NewSet; for(unsigned i = 0; i < (unsigned)TreeNum; i++){ Node* node = new Node; node->DivDim = -1; node->Lchild = NULL; node->Rchild = NULL; node->StartIdx = 0; node->EndIdx = N; node->treeid = i; tree_roots_.push_back(node); ActiveSet.push_back(node); } #pragma omp parallel for for(unsigned i = 0; i < N; i++)indices[i] = i; #pragma omp parallel for for(unsigned i = 0; i < 
(unsigned)TreeNum; i++){ std::vector& myids = LeafLists[i]; myids.resize(N); std::copy(indices.begin(), indices.end(),myids.begin()); std::random_shuffle(myids.begin(), myids.end()); } omp_init_lock(&rootlock); while(!ActiveSet.empty() && ActiveSet.size() < 1100){ #pragma omp parallel for for(unsigned i = 0; i < ActiveSet.size(); i++){ Node* node = ActiveSet[i]; unsigned mid; unsigned cutdim; DataType cutval; std::mt19937 rng(seed ^ omp_get_thread_num()); std::vector& myids = LeafLists[node->treeid]; meanSplit(rng, &myids[0]+node->StartIdx, node->EndIdx - node->StartIdx, mid, cutdim, cutval); node->DivDim = cutdim; node->DivVal = cutval; //node->StartIdx = offset; //node->EndIdx = offset + count; Node* nodeL = new Node(); Node* nodeR = new Node(); nodeR->treeid = nodeL->treeid = node->treeid; nodeL->StartIdx = node->StartIdx; nodeL->EndIdx = node->StartIdx+mid; nodeR->StartIdx = nodeL->EndIdx; nodeR->EndIdx = node->EndIdx; node->Lchild = nodeL; node->Rchild = nodeR; omp_set_lock(&rootlock); if(mid>params_.S)NewSet.push_back(nodeL); if(nodeR->EndIdx - nodeR->StartIdx > params_.S)NewSet.push_back(nodeR); omp_unset_lock(&rootlock); } ActiveSet.resize(NewSet.size()); std::copy(NewSet.begin(), NewSet.end(),ActiveSet.begin()); NewSet.clear(); } #pragma omp parallel for for(unsigned i = 0; i < ActiveSet.size(); i++){ Node* node = ActiveSet[i]; //omp_set_lock(&rootlock); //std::cout<EndIdx-node->StartIdx<& myids = LeafLists[node->treeid]; DFSbuild(node, rng, &myids[0]+node->StartIdx, node->EndIdx-node->StartIdx, node->StartIdx); } //DFStest(0,0,tree_roots_[0]); //build tree completed for(size_t i = 0; i < (unsigned)TreeNumBuild; i++){ getMergeLevelNodeList(tree_roots_[i], i ,0); } #pragma omp parallel for for(size_t i = 0; i < mlNodeList.size(); i++){ mergeSubGraphs(mlNodeList[i].second, mlNodeList[i].first); } #pragma omp parallel { #ifdef _OPENMP std::mt19937 rng(seed ^ omp_get_thread_num()); #else std::mt19937 rng(seed); #endif std::vector random(params_.S + 1); #pragma 
omp for for (unsigned n = 0; n < N; ++n) { auto &nhood = nhoods[n]; Points &pool = nhood.pool; if(nhood.nn_new.size()compare( features_.get_row(n), features_.get_row(rand_id), features_.get_cols()); Candidate c(rand_id,dist); knn_graph[n].insert(c); } //omp_set_lock(&rootlock); //if(knn_graph[n].size() < nhood.L)std::cout<row_id;//random[i++]; nhood.nn_new[l] = it->row_id; nn.dist = it->distance;//distance_->compare(features_.get_row(n), features_.get_row(nn.id), features_.get_cols()); nn.flag = true;it++; //if(it == knn_graph[n].rend())break; } sort(pool.begin(), pool.begin() + nhood.L); } } knn_graph.clear(); #ifdef INFO std::cout<<"initial completed"<. All Rights Reserved. #ifndef EFANNA #define EFANNA #include "general/distance.hpp" #include "general/matrix.hpp" #include "general/params.hpp" #include "algorithm/init_indices.hpp" namespace efanna{ template class FIndex{ public: typedef InitIndex IndexType; FIndex(const Matrix& features, Distance* d, const IndexParams& params) : index_params_(params) { init_algorithm init_index_type= params.init_index_type; index_params_ = params; initIndex_ = create_index_by_type(init_index_type, features, params, d); } virtual ~FIndex () { } void buildIndex() { initIndex_->buildIndex(); } void buildTrees() { initIndex_->buildTrees(); } void knnSearch(int k, const Matrix& query){ initIndex_->knnSearch(k, query); } void saveIndex(char* filename){ initIndex_->saveIndex(filename); } void loadIndex(char* filename){ initIndex_->loadIndex(filename); } void saveTrees(char* filename){ initIndex_->saveTrees(filename); } void loadTrees(char* filename){ initIndex_->loadTrees(filename); } void saveGraph(char* filename){ initIndex_->saveGraph(filename); } void loadGraph(char* filename){ initIndex_->loadGraph(filename); } void saveResults(char* filename){ initIndex_->saveResults(filename); } void setSearchParams(int epochs, int init_num, int extend_to, int search_trees = 0, int search_lv=-1, int search_method = 0){ 
/**
 * A neighbour candidate: the row index of a data point together with its
 * distance to some reference point.
 *
 * Candidates are ordered by distance; when two distances compare equal the
 * row index decides.
 */
template<typename T>
struct Candidate {
    size_t row_id;   // index of the point in the feature matrix
    T distance;      // distance from the reference point

    Candidate(const size_t row_id, const T distance): row_id(row_id), distance(distance) { }

    /** True when this candidate is farther away (or ties and has the larger id). */
    bool operator >(const Candidate& rhs) const {
        return (this->distance == rhs.distance)
            ? (this->row_id > rhs.row_id)
            : (this->distance > rhs.distance);
    }

    /** True when this candidate is closer (or ties and has the smaller id). */
    bool operator <(const Candidate& rhs) const {
        return (this->distance == rhs.distance)
            ? (this->row_id < rhs.row_id)
            : (this->distance < rhs.distance);
    }
};
*/ ResultType compare(const T* a, const T* b, size_t size) const { __m128 sum; __m128 l0, l1, l2, l3; __m128 r0, r1, r2, r3; unsigned D = (size + 3) & ~3U; unsigned DR = D % 16; unsigned DD = D - DR; const float *l = a; const float *r = b; const float *e_l = l + DD; const float *e_r = r + DD; float unpack[4] __attribute__ ((aligned (16))) = {0, 0, 0, 0}; ResultType ret = 0.0; sum = _mm_load_ps(unpack); switch (DR) { case 12: SSE_L2SQR(e_l+8, e_r+8, sum, l2, r2); case 8: SSE_L2SQR(e_l+4, e_r+4, sum, l1, r1); case 4: SSE_L2SQR(e_l, e_r, sum, l0, r0); } for (unsigned i = 0; i < DD; i += 16, l += 16, r += 16) { SSE_L2SQR(l, r, sum, l0, r0); SSE_L2SQR(l + 4, r + 4, sum, l1, r1); SSE_L2SQR(l + 8, r + 8, sum, l2, r2); SSE_L2SQR(l + 12, r + 12, sum, l3, r3); } _mm_storeu_ps(unpack, sum); ret = unpack[0] + unpack[1] + unpack[2] + unpack[3]; return ret;//sqrt(ret); } T norm(const T* a, size_t length) const { return 0; } T dot(const T* a, const T* b, size_t length) const { return 0; } }; template class L2Distance: public Distance { public: typedef T ResultType; /** * Copied from flann * We do not want msse intrinstic here to avoid misalign problems. */ ResultType compare(const T* a, const T* b, size_t size) const { ResultType result = ResultType(); ResultType diff0, diff1, diff2, diff3; const T* last = a + size; const T* lastgroup = last - 3; /* Process 4 items with each loop for efficiency. */ while (a < lastgroup) { diff0 = (ResultType)(a[0] - b[0]); diff1 = (ResultType)(a[1] - b[1]); diff2 = (ResultType)(a[2] - b[2]); diff3 = (ResultType)(a[3] - b[3]); result += diff0 * diff0 + diff1 * diff1 + diff2 * diff2 + diff3 * diff3; a += 4; b += 4; } /* Process last 0-3 pixels. Not needed for standard vector lengths. 
*/ while (a < last) { diff0 = (ResultType)(*a++ - *b++); result += diff0 * diff0; } return result;//sqrt(result); } T norm(const T* a, size_t length) const { return 0; } T dot(const T* a, const T* b, size_t length) const { return 0; } }; template class L2DistanceAVXr4: public Distance { public: typedef T ResultType; /** * * We use msse intrinstic here, we should ensure data align. */ #define AVX_L2SQR(addr1, addr2, dest, tmp1, tmp2) \ tmp1 = _mm256_loadu_ps(addr1);\ tmp2 = _mm256_loadu_ps(addr2);\ tmp1 = _mm256_sub_ps(tmp1, tmp2); \ tmp1 = _mm256_mul_ps(tmp1, tmp1); \ dest = _mm256_add_ps(dest, tmp1); ResultType compare(const T* a, const T* b, size_t size) const{ /* __m256 sum; __m256 l0, l1; __m256 r0, r1; unsigned D = (size + 7) & ~7U; unsigned DR = D % 16; unsigned DD = D - DR; const float *l = a; const float *r = b; const float *e_l = l + DD; const float *e_r = r + DD; float unpack[8] __attribute__ ((aligned (32))) = {0, 0, 0, 0, 0, 0, 0, 0}; ResultType ret = 0.0; sum = _mm256_loadu_ps(unpack); if(DR){AVX_L2SQR(e_l, e_r, sum, l0, r0);} for (unsigned i = 0; i < DD; i += 16, l += 16, r += 16) { AVX_L2SQR(l, r, sum, l0, r0); AVX_L2SQR(l + 8, r + 8, sum, l1, r1); } _mm256_storeu_ps(unpack, sum); ret = unpack[0] + unpack[1] + unpack[2] + unpack[3] + unpack[4] + unpack[5] + unpack[6] + unpack[7]; return ret;//sqrt(ret); */ __m256 sum; __m256 l0, l1, l2, l3; __m256 r0, r1, r2, r3; unsigned D = (size + 7) & ~7U; unsigned DR = D % 32; unsigned DD = D - DR; const float *l = a; const float *r = b; const float *e_l = l + DD; const float *e_r = r + DD; float unpack[8] __attribute__ ((aligned (32))) = {0, 0, 0, 0, 0, 0, 0, 0}; ResultType ret = 0.0; sum = _mm256_loadu_ps(unpack); switch (DR) { case 24: AVX_L2SQR(e_l+16, e_r+16, sum, l2, r2); case 16: AVX_L2SQR(e_l+8, e_r+8, sum, l1, r1); case 8: AVX_L2SQR(e_l, e_r, sum, l0, r0); } for (unsigned i = 0; i < DD; i += 32, l += 32, r += 32) { AVX_L2SQR(l, r, sum, l0, r0); AVX_L2SQR(l + 8, r + 8, sum, l1, r1); AVX_L2SQR(l + 16, r + 
16, sum, l2, r2); AVX_L2SQR(l + 24, r + 24, sum, l3, r3); } _mm256_storeu_ps(unpack, sum); ret = unpack[0] + unpack[1] + unpack[2] + unpack[3] + unpack[4] + unpack[5] + unpack[6] + unpack[7]; return ret; } #define AVX_L2NORM(addr, dest, tmp) \ tmp = _mm256_loadu_ps(addr); \ tmp = _mm256_mul_ps(tmp, tmp); \ dest = _mm256_add_ps(dest, tmp); T norm(const T* a, size_t size) const { __m256 sum; __m256 l0, l1; unsigned D = (size + 7) & ~7U; unsigned DR = D % 16; unsigned DD = D - DR; const float *l = a; const float *e_l = l + DD; float unpack[8] __attribute__ ((aligned (32))) = {0, 0, 0, 0, 0, 0, 0, 0}; ResultType ret = 0.0; sum = _mm256_loadu_ps(unpack); if(DR){AVX_L2NORM(e_l, sum, l0);} for (unsigned i = 0; i < DD; i += 16, l += 16) { AVX_L2NORM(l, sum, l0); AVX_L2NORM(l + 8, sum, l1); } _mm256_storeu_ps(unpack, sum); ret = unpack[0] + unpack[1] + unpack[2] + unpack[3] + unpack[4] + unpack[5] + unpack[6] + unpack[7]; return ret; } #define AVX_L2DOT(addr1, addr2, dest, tmp1, tmp2) \ tmp1 = _mm256_loadu_ps(addr1);\ tmp2 = _mm256_loadu_ps(addr2);\ tmp1 = _mm256_mul_ps(tmp1, tmp2); \ dest = _mm256_add_ps(dest, tmp1); T dot(const T* a, const T* b, size_t size) const { __m256 sum; __m256 l0, l1; __m256 r0, r1; unsigned D = (size + 7) & ~7U; unsigned DR = D % 16; unsigned DD = D - DR; const float *l = a; const float *r = b; const float *e_l = l + DD; const float *e_r = r + DD; float unpack[8] __attribute__ ((aligned (32))) = {0, 0, 0, 0, 0, 0, 0, 0}; ResultType ret = 0.0; sum = _mm256_loadu_ps(unpack); if(DR){AVX_L2DOT(e_l, e_r, sum, l0, r0);} for (unsigned i = 0; i < DD; i += 16, l += 16, r += 16) { AVX_L2DOT(l, r, sum, l0, r0); AVX_L2DOT(l + 8, r + 8, sum, l1, r1); } _mm256_storeu_ps(unpack, sum); ret = unpack[0] + unpack[1] + unpack[2] + unpack[3] + unpack[4] + unpack[5] + unpack[6] + unpack[7]; return ret; } }; template class L2DistanceAVX: public Distance { public: typedef T ResultType; /** * * We use msse intrinstic here, we should ensure data align. 
*/ #define AVX_L2SQR(addr1, addr2, dest, tmp1, tmp2) \ tmp1 = _mm256_loadu_ps(addr1);\ tmp2 = _mm256_loadu_ps(addr2);\ tmp1 = _mm256_sub_ps(tmp1, tmp2); \ tmp1 = _mm256_mul_ps(tmp1, tmp1); \ dest = _mm256_add_ps(dest, tmp1); ResultType compare(const T* a, const T* b, size_t size) const{ __m256 sum; __m256 l0, l1; __m256 r0, r1; unsigned D = (size + 7) & ~7U; unsigned DR = D % 32; unsigned DD = D - DR; const float *l = a; const float *r = b; const float *e_l = l + DD; const float *e_r = r + DD; float unpack[8] __attribute__ ((aligned (32))) = {0, 0, 0, 0, 0, 0, 0, 0}; ResultType ret = 0.0; sum = _mm256_loadu_ps(unpack); if(DR){AVX_L2SQR(e_l, e_r, sum, l0, r0);} for (unsigned i = 0; i < DD; i += 16, l += 16, r += 16) { AVX_L2SQR(l, r, sum, l0, r0); AVX_L2SQR(l + 8, r + 8, sum, l1, r1); } _mm256_storeu_ps(unpack, sum); ret = unpack[0] + unpack[1] + unpack[2] + unpack[3] + unpack[4] + unpack[5] + unpack[6] + unpack[7]; return ret;//sqrt(ret); } #define AVX_L2NORM(addr, dest, tmp) \ tmp = _mm256_loadu_ps(addr); \ tmp = _mm256_mul_ps(tmp, tmp); \ dest = _mm256_add_ps(dest, tmp); T norm(const T* a, size_t size) const { __m256 sum; __m256 l0, l1; unsigned D = (size + 7) & ~7U; unsigned DR = D % 16; unsigned DD = D - DR; const float *l = a; const float *e_l = l + DD; float unpack[8] __attribute__ ((aligned (32))) = {0, 0, 0, 0, 0, 0, 0, 0}; ResultType ret = 0.0; sum = _mm256_loadu_ps(unpack); if(DR){AVX_L2NORM(e_l, sum, l0);} for (unsigned i = 0; i < DD; i += 16, l += 16) { AVX_L2NORM(l, sum, l0); AVX_L2NORM(l + 8, sum, l1); } _mm256_storeu_ps(unpack, sum); ret = unpack[0] + unpack[1] + unpack[2] + unpack[3] + unpack[4] + unpack[5] + unpack[6] + unpack[7]; return ret; } #define AVX_L2DOT(addr1, addr2, dest, tmp1, tmp2) \ tmp1 = _mm256_loadu_ps(addr1);\ tmp2 = _mm256_loadu_ps(addr2);\ tmp1 = _mm256_mul_ps(tmp1, tmp2); \ dest = _mm256_add_ps(dest, tmp1); T dot(const T* a, const T* b, size_t size) const { __m256 sum; __m256 l0, l1; __m256 r0, r1; unsigned D = (size + 7) & ~7U; 
unsigned DR = D % 16; unsigned DD = D - DR; const float *l = a; const float *r = b; const float *e_l = l + DD; const float *e_r = r + DD; float unpack[8] __attribute__ ((aligned (32))) = {0, 0, 0, 0, 0, 0, 0, 0}; ResultType ret = 0.0; sum = _mm256_loadu_ps(unpack); if(DR){AVX_L2DOT(e_l, e_r, sum, l0, r0);} for (unsigned i = 0; i < DD; i += 16, l += 16, r += 16) { AVX_L2DOT(l, r, sum, l0, r0); AVX_L2DOT(l + 8, r + 8, sum, l1, r1); } _mm256_storeu_ps(unpack, sum); ret = unpack[0] + unpack[1] + unpack[2] + unpack[3] + unpack[4] + unpack[5] + unpack[6] + unpack[7]; return ret; } }; } #endif ================================================ FILE: general/matrix.hpp ================================================ #ifndef EFANNA_MATRIX_H #define EFANNA_MATRIX_H #include #include #include #include #include #include "distance.hpp" namespace efanna { template class Matrix { public: /** * Create a matrix using number of rows and cols with rawdata. * rawdata should be compact without any alignments. * Notice that efnn::Matrix itself just keep a reference of data and * does not make a copy of data, so keep it available. 
*/ Matrix(const size_t rows, const size_t cols, const void* data): rows_(rows), cols_(cols) { size_t align_cols; #ifdef __GNUC__ #ifdef __AVX__ align_cols = (cols + 7)/8*8;//re align to sse format #else #ifdef __SSE2__ align_cols = (cols + 3)/4*4; #else align_cols = cols; #endif #endif #endif //std::cout<<" DD: "<(data) + (align_cols * i)); } } size_t get_cols() const { return cols_; } size_t get_rows() const { return rows_; } const T* get_row(const size_t index) const { if (index >= rows_) { throw std::runtime_error("index out of range"); } return row_pointers_[index]; } // Debug usage only std::vector > brute_force_search(size_t idx, size_t k, Distance* distance) const { printf("idx: %lu\n", idx); std::vector > result; for (size_t i = 0; i < rows_; i++) { result.push_back(std::make_pair( distance->compare(get_row(i), get_row(idx), cols_), i)); } std::partial_sort(result.begin(), result.begin() + k, result.end()); result.resize(k); return result; } private: size_t rows_, cols_; std::vector row_pointers_; }; } #endif ================================================ FILE: general/params.hpp ================================================ #ifndef EFANNA_PARAMS_H_ #define EFANNA_PARAMS_H_ #include #include namespace efanna { enum init_algorithm { KDTREE_UB, HASHING }; union ValueType{ int int_val; float float_val; char* str_pt; }; typedef std::map ExtraParamsMap; struct IndexParams{ init_algorithm init_index_type; size_t K; //build knn table with nn = K size_t S; //nn sets' max size size_t L =30;//rnn size size_t TNS = 10;//tree node size size_t Check_K = 40; int build_epoches; int extend_num; //number to extend each time size_t pool_size = 100; size_t init_num = 100; ExtraParamsMap extra_params; bool reverse_nn_used; }; struct SearchParams{ int search_init_num; int search_epoches; int search_method; unsigned extend_to; unsigned tree_num; unsigned search_depth; }; } #endif ================================================ FILE: matlab/.gitignore 
================================================ *.mexa32 *.mexa64 *.mexw32 *.mexw64 *.mexmac ================================================ FILE: matlab/README.md ================================================ **Compilation** 1. Compile with: ``mex CXXFLAGS="\$CXXFLAGS -std=c++11 -O3 -march=native -fopenmp -Wall -lboost_timer -lboost_system" LDFLAGS="\$LDFLAGS -fopenmp" findex.cc -I../ -largeArrayDims`` 2. Run any matlab program with efanna! ----- **Dependencies** * Matlab 2010b or above. * Other prerequisites are the same as the C++ prerequisites; see the ``README.md`` in the repository root. ----- **Examples** * Example programs are provided under the folder "samples". * For instance, use ``matlab -nodesktop -nosplash -r "run('./samples/example_buildgraph')"`` to try one. (Don't forget to provide inputs! Default inputs are placed under ``~/data/sift/``. You may change this path in the \*.m programs.) * Every sample does the same job as the corresponding C++ program under ``../samples/``, and their parameters are mostly identical. * Additionally, a *row-wise* organized sparse matrix describing the nearest neighbours of all points is returned from the functions for building or loading graphs. * Look inside the samples for more API details. ----- **FAQ** Common errors and their solutions: * If the error ``redeclaration of C++ built-in type ‘char16_t’`` occurs during compilation * Try uncommenting the lines starting with ``define`` and ``undef`` at the beginning of BOTH ``findex.cc`` and ``handle_wrapper.hpp``. * Compile again. Problem solved. * This may indicate that you are using a very old version of matlab that does not fit your g++ version well. * If an error related to ``{matlab root}/sys/os/glnxa64/libstdc++.so.6`` occurs at runtime (typically, ``version 'CXXABI_1.x.x' not found (required by ...)``, or `` version `GLIBCXX_3.x.x' not found (required by ...)``) * Try ``export LD_PRELOAD=$LD_PRELOAD:/usr/lib/x86_64-linux-gnu/libstdc++.so.6``. 
You may need to rewrite the command with your own path to ``libstdc++.so.6``, according to where your gcc compiler's libraries are set up. * OR, directly substitute ``/usr/local/MATLAB/{your version}/sys/os/glnxa64/libstdc++.so.6`` with your compiler's ``libstdc++.so.6``. You may need to adjust this path according to where your matlab is set up. * This error indicates that, during the compilation step, your mex compiler automatically links the g++ lib under your matlab directory, which is not compatible with your g++ compiler. ================================================ FILE: matlab/efanna.m ================================================ classdef efanna < handle properties (SetAccess = private, Hidden = true) objectHandle; index_name; end methods % Constructor function this = efanna(data, varargin) this.objectHandle = findex('new', data, 'kdtreeub', 'l2', varargin{:}); this.index_name = 'kdtreeub'; end % Destructor function delete(this) findex('destruct', this.objectHandle); end function graph_mat = build_index(this) graph_mat = findex('build_index', this.objectHandle); end function build_trees(this) findex('build_trees', this.objectHandle); end function load_index(this, filename) findex('load_index', this.objectHandle, filename); end function save_index(this, filename) findex('save_index', this.objectHandle, filename); end function graph_mat = load_graph(this, filename) graph_mat = findex('load_graph', this.objectHandle, filename); end function save_graph(this, filename) findex('save_graph', this.objectHandle, filename); end function load_trees(this, filename) findex('load_trees', this.objectHandle, filename); end function save_trees(this, filename) findex('save_trees', this.objectHandle, filename); end % set search params (different index may share the same knn_search with different params) function set_search_params(this, varargin) findex('set_search_params', this.objectHandle, this.index_name, varargin{:}); end function knn_search(this, k, query) findex('knn_search', 
this.objectHandle, k, query); end function save_result(this, filename) findex('save_result', this.objectHandle, filename); end end end ================================================ FILE: matlab/findex.cc ================================================ #include #include "handle_wrapper.hpp" #include #include #include //#define char16_t LIBRARY_char16_t #include //#undef char16_t using namespace efanna; template Matrix copy_matrix(const mxArray *array) { int points_num = mxGetN(array); int dim = mxGetM(array); size_t mem_size = mxGetN(array)*mxGetM(array)*sizeof(T); void* data = malloc(mem_size); memcpy(data, mxGetData(array), mem_size); return Matrix(points_num, dim, (T*)data); } template struct construct_func { typedef FIndex* (*entry)(Matrix, Distance*, int, const mxArray**); }; //TODO: default params template FIndex* _construct_kdtreeub(Matrix dataset, Distance* dist, int in_n, const mxArray *in_array[]){ if (in_n!=7 && in_n!=8 && in_n!=1) { mexErrMsgTxt("Incorrect number of input arguments"); } bool rnn_used = true; int trees = (int)(*mxGetPr(in_array[0])); if (in_n==1) { mexPrintf("kdtreeub params : %d\n", trees); return new FIndex(dataset, dist, KDTreeUbIndexParams(rnn_used, trees, 10, 10, 10, 10, 10, trees, 10)); } int mlevel = (int)(*mxGetPr(in_array[1])); int epoches = (int)(*mxGetPr(in_array[2])); int L = (int)(*mxGetPr(in_array[3])); int check_k = (int)(*mxGetPr(in_array[4])); int K = (int)(*mxGetPr(in_array[5])); int S = (int)(*mxGetPr(in_array[6])); if (in_n==7) { mexPrintf("kdtreeub params : %d %d %d %d %d %d %d\n", trees, mlevel, epoches, L, check_k, K, S); return new FIndex(dataset, dist, KDTreeUbIndexParams(rnn_used, trees, mlevel, epoches, check_k, L, K, trees, S)); } else if (in_n==8) { int build_trees = (int)(*mxGetPr(in_array[7])); mexPrintf("kdtreeub params : %d %d %d %d %d %d %d %d\n", trees, mlevel, epoches, L, check_k, K, S, build_trees); return new FIndex(dataset, dist, KDTreeUbIndexParams(rnn_used, trees, mlevel, epoches, check_k, L, 
K, build_trees, S)); } return NULL; // ERROR if this line is reached } template void _construct(int out_n, mxArray* out_array[], int in_n, const mxArray *in_array[]) { if ((in_n<3) || (!mxIsChar(in_array[1])) || (!mxIsChar(in_array[2]))) { mexErrMsgTxt("Incorrect number of input arguments"); } if (out_n!=1) { mexErrMsgTxt("Incorrect number of output arguments"); } std::map* > dist_table = { {"l2", new L2DistanceAVX() } }; std::map::entry> index_table = { {"kdtreeub", &_construct_kdtreeub }, {"nndescent", &_construct_kdtreeub }, {"nnexp", &_construct_kdtreeub } }; std::string index_name = mxArrayToString(in_array[1]); if (index_table.find(index_name)==index_table.end()) { mexErrMsgTxt("Error: bad distance selector: wrong distance name."); } std::string dist_name = mxArrayToString(in_array[2]); if (dist_table.find(dist_name)==dist_table.end()) { mexErrMsgTxt("Error: bad distance selector: wrong distance name."); } //TODO: type check Matrix dataset = copy_matrix(in_array[0]); FIndex* result = (*index_table[index_name])(dataset, dist_table[dist_name], in_n-3, in_array+3); out_array[0] = handle2mat >(result); //mxFree(index_name); //mxFree(dist_name); } template void _destruct(int out_n, mxArray* out_array[], int in_n, const mxArray *in_array[]) { if (in_n !=1) { mexErrMsgTxt("Incorrect number of input arguments"); } delete_wrapper >(in_array[0]); } template void _get_graph_mat(int out_n, mxArray* out_array[], int in_n, const mxArray *in_array[]) { //TODO: verify there're no bugs FIndex* handle = mat2handle >(in_array[0]); size_t nrows = handle->getGraphSize(); std::map*> col_graph; int maxnnz = 0; for (unsigned i = 0; i < nrows; i++) { std::vector nodes = handle->getGraphRow(i); maxnnz += nodes.size(); for (unsigned x : nodes) { if (col_graph.find(x) == col_graph.end()) { col_graph[x] = new std::vector(); } col_graph[x]->push_back(i); } } //the type must be double* double* pr = (double *)mxCalloc(maxnnz, sizeof(double)); mwIndex* ir = (size_t *)mxCalloc(maxnnz, 
sizeof(mwIndex)); mwIndex* jc = (size_t *)mxCalloc(nrows + 1, sizeof(mwIndex)); int nfilled = -1; int njc = 0; jc[0] = 0; for (unsigned i = 0; i < nrows; i++) { if (col_graph.find(i) == col_graph.end()) { njc ++; jc[njc] = jc[njc-1]; continue; } for (unsigned x: *col_graph[i]) { nfilled ++; pr[nfilled] = 1; ir[nfilled] = x; } njc ++; jc[njc] = jc[njc-1] + col_graph[i]->size(); delete col_graph[i]; } out_array[0] = mxCreateSparse(nrows, nrows, maxnnz, mxREAL); mxSetPr(out_array[0], pr); mxSetIr(out_array[0], ir); mxSetJc(out_array[0], jc); } template void _build_index(int out_n, mxArray* out_array[], int in_n, const mxArray *in_array[]) { if (in_n != 1) { mexErrMsgTxt("Incorrect number of input arguments"); } FIndex* handle = mat2handle >(in_array[0]); handle->buildIndex(); _get_graph_mat(out_n, out_array, in_n, in_array); } template void _build_trees(int out_n, mxArray* out_array[], int in_n, const mxArray *in_array[]) { if (in_n != 1) { mexErrMsgTxt("Incorrect number of input arguments"); } FIndex* handle = mat2handle >(in_array[0]); handle->buildTrees(); } template void _load_index(int out_n, mxArray* out_array[], int in_n, const mxArray *in_array[]) { if ((in_n != 2) || (!mxIsChar(in_array[1]))) { mexErrMsgTxt("Error: bad path name"); } char* path = mxArrayToString(in_array[1]); FIndex* handle = mat2handle >(in_array[0]); handle->loadIndex(path); mxFree(path); } template void _save_index(int out_n, mxArray* out_array[], int in_n, const mxArray *in_array[]) { if ((in_n != 2) || (!mxIsChar(in_array[1]))) { mexErrMsgTxt("Error: bad path name"); } char* path = mxArrayToString(in_array[1]); FIndex* handle = mat2handle >(in_array[0]); handle->saveIndex(path); mxFree(path); } template void _load_graph(int out_n, mxArray* out_array[], int in_n, const mxArray *in_array[]) { if ((in_n != 2) || (!mxIsChar(in_array[1]))) { mexErrMsgTxt("Error: bad path name"); } char* path = mxArrayToString(in_array[1]); FIndex* handle = mat2handle >(in_array[0]); handle->loadGraph(path); 
_get_graph_mat(out_n, out_array, in_n, in_array); mxFree(path); } template void _save_graph(int out_n, mxArray* out_array[], int in_n, const mxArray *in_array[]) { if ((in_n != 2) || (!mxIsChar(in_array[1]))) { mexErrMsgTxt("Error: bad path name"); } char* path = mxArrayToString(in_array[1]); FIndex* handle = mat2handle >(in_array[0]); handle->saveGraph(path); mxFree(path); } template void _load_trees(int out_n, mxArray* out_array[], int in_n, const mxArray *in_array[]) { if ((in_n != 2) || (!mxIsChar(in_array[1]))) { mexErrMsgTxt("Error: bad path name"); } char* path = mxArrayToString(in_array[1]); FIndex* handle = mat2handle >(in_array[0]); handle->loadTrees(path); mxFree(path); } template void _save_trees(int out_n, mxArray* out_array[], int in_n, const mxArray *in_array[]) { if ((in_n != 2) || (!mxIsChar(in_array[1]))) { mexErrMsgTxt("Error: bad path name"); } char* path = mxArrayToString(in_array[1]); FIndex* handle = mat2handle >(in_array[0]); handle->saveTrees(path); mxFree(path); } template void _set_search_params_kdtreeub(FIndex* handle, int in_n, const mxArray *in_array[]) { int search_trees = (int)(*mxGetPr(in_array[0])); int search_epoc = (int)(*mxGetPr(in_array[1])); int poolsz = (int)(*mxGetPr(in_array[2])); int search_extend = (int)(*mxGetPr(in_array[3])); int search_method = (int)(*mxGetPr(in_array[4])); handle->setSearchParams(search_epoc, poolsz, search_extend, search_trees, search_method); } template void _set_search_params_nndescent(FIndex* handle, int in_n, const mxArray *in_array[]) { mexPrintf("No param needs to be set.\n"); } template void _set_search_params_nnexp(FIndex* handle, int in_n, const mxArray *in_array[]) { mexPrintf("No param needs to be set.\n"); } template void _set_search_params(int out_n, mxArray* out_array[], int in_n, const mxArray *in_array[]) { if ((in_n < 2) || (!mxIsChar(in_array[1]))) { mexErrMsgTxt("Incorrect number of input arguments"); } //XXX: why this does not need template type for the function? 
typedef void (*funcp)(FIndex*, int, const mxArray**); std::map index_table = { {"kdtreeub", &_set_search_params_kdtreeub}, {"nndescent", &_set_search_params_nndescent}, {"nnexp", &_set_search_params_nnexp} }; std::string index_name = mxArrayToString(in_array[1]); if (index_table.find(index_name)==index_table.end()) { mexErrMsgTxt("Error: bad distance selector: wrong distance name."); } FIndex* handle = mat2handle >(in_array[0]); (*index_table[index_name])(handle, in_n-2, in_array+2); //mxFree(index_name); } template void _knn_search(int out_n, mxArray* out_array[], int in_n, const mxArray *in_array[]) { FIndex* handle = mat2handle >(in_array[0]); int k = (int)(*mxGetPr(in_array[1])); //TODO: type check Matrix query = copy_matrix(in_array[2]); handle->knnSearch(k, query); } template void _save_result(int out_n, mxArray* out_array[], int in_n, const mxArray *in_array[]) { if ((in_n != 2) || (!mxIsChar(in_array[1]))) { mexErrMsgTxt("Error: bad path name"); } char* path = mxArrayToString(in_array[1]); FIndex* handle = mat2handle >(in_array[0]); handle->saveResults(path); mxFree(path); } template struct mexfunc { typedef void (*entry)(int, mxArray**, int, const mxArray**); }; void mexFunction(int out_n, mxArray* out_array[], int in_n, const mxArray *in_array[]) { std::map::entry> ftable = { {"new", &_construct}, {"destruct", &_destruct}, {"build_index", &_build_index}, {"build_trees", &_build_trees}, {"load_index", &_load_index}, {"save_index", &_save_index}, {"load_graph", &_load_graph}, {"save_graph", &_save_graph}, {"load_trees", &_load_trees}, {"save_trees", &_save_trees}, {"set_search_params", &_set_search_params}, {"knn_search", &_knn_search}, {"save_result", &_save_result} // {"set_distance_type", &_set_distance_type} //TODO: only L2 distance is provided in general/distance.hpp now }; if ((in_n<=1) || (!mxIsChar(in_array[0]))) { mexErrMsgTxt("Error: bad function selector: wrong type of function name."); } std::string func_name = mxArrayToString(in_array[0]); if 
(ftable.find(func_name)==ftable.end()) { mexErrMsgTxt("Error: bad function selector: wrong function name."); } mexPrintf("Running Function : %s ...\n", func_name.c_str()); (*ftable[func_name])(out_n, out_array, in_n-1, in_array+1); //mxFree(func_name); } ================================================ FILE: matlab/fvecs_read.m ================================================ % Read a set of vectors stored in the fvec format (int + n * float) % The function returns a set of output vector (one vector per column) % % Syntax: % v = fvecs_read (filename) -> read all vectors % v = fvecs_read (filename, n) -> read n vectors % v = fvecs_read (filename, [a b]) -> read the vectors from a to b (indices starts from 1) function v = fvecs_read (filename, bounds) % open the file and count the number of descriptors fid = fopen (filename, 'rb'); if fid == -1 error ('I/O error : Unable to open the file %s\n', filename) end % Read the vector size d = fread (fid, 1, 'int'); vecsizeof = 1 * 4 + d * 4; % Get the number of vectrors fseek (fid, 0, 1); a = 1; bmax = ftell (fid) / vecsizeof; b = bmax; if nargin >= 2 if length (bounds) == 1 b = bounds; elseif length (bounds) == 2 a = bounds(1); b = bounds(2); end end assert (a >= 1); if b > bmax b = bmax; end if b == 0 | b < a v = []; fclose (fid); return; end % compute the number of vectors that are really read and go in starting positions n = b - a + 1; fseek (fid, (a - 1) * vecsizeof, -1); % read n vectors v = fread (fid, (d + 1) * n, 'float=>single'); v = reshape (v, d + 1, n); % Check if the first column (dimension of the vectors) is correct assert (sum (v (1, 2:end) == v(1, 1)) == n - 1); v = v (2:end, :); fclose (fid); ================================================ FILE: matlab/handle_wrapper.hpp ================================================ //#define char16_t LIBRARY_char16_t #include //#undef char16_t template class handle_wrapper { public: handle_wrapper(base *ptr) : handle(ptr) {} ~handle_wrapper() {delete handle;} base 
*get_handle() {return handle;} private: base *handle; }; template inline mxArray *handle2mat(base *ptr) { mxArray *mat = mxCreateNumericMatrix(1, 1, mxUINT64_CLASS, mxREAL); *((uint64_t *)mxGetData(mat)) = reinterpret_cast(new handle_wrapper(ptr)); return mat; } template inline handle_wrapper *mat2wrapper(const mxArray *mat) { if (mxGetNumberOfElements(mat) != 1 || mxGetClassID(mat) != mxUINT64_CLASS || mxIsComplex(mat)) mexErrMsgTxt("Input must be a real uint64 scalar."); handle_wrapper *wrapper = reinterpret_cast *>(*((uint64_t *)mxGetData(mat))); return wrapper; } template inline base *mat2handle(const mxArray *mat) { return mat2wrapper(mat)->get_handle(); } template inline void delete_wrapper(const mxArray *mat) { delete mat2wrapper(mat); } ================================================ FILE: matlab/samples/example_buildall.m ================================================ % add the path of efanna library addpath('../') % use fvecs_read util function provided to load datasets dataset = fvecs_read('~/data/sift/sift_base.fvecs'); disp('Data size:'); disp(size(dataset)); % params: dataset, number of trees for overall building, conquer-to-depth, number of iterations, L, checkK, K, S, number of trees for graph building % click following link for more information: https://github.com/fc731097343/efanna/blob/master/README.md ef = efanna(dataset, 16, 8, 8, 30, 25, 10, 10, 8); % build graph and get a sparse matrix describing the NN results spmat = ef.build_index(); disp('Adjacency matrix of KNN graph acquired. 
Shape:'); disp(size(spmat)); disp('Number of non-zero elements in adjacency matrix of KNN graph:'); disp(nnz(spmat)); % save trees and graph ef.save_trees('./sift.trees'); ef.save_graph('./sift.graph'); ================================================ FILE: matlab/samples/example_buildgraph.m ================================================ % add the path of efanna library addpath('../') % use fvecs_read util function provided to load datasets dataset = fvecs_read('~/data/sift/sift_base.fvecs'); disp('Data size:'); disp(size(dataset)); % params: dataset, number of trees for graph building, conquer-to-depth, number of iterations, L, checkK, K, S % click following link for more information: https://github.com/fc731097343/efanna/blob/master/README.md ef = efanna(dataset, 8, 8, 7, 30, 25, 10, 10); % build graph and get a sparse matrix describing the NN results % the sparse matrix is row-wise organized, i.e. the first row lies the 0-1 vector describes k nearest neighbours for node 1. tic; spmat = ef.build_index(); toc; gdgraph = ivecs_read('~/data/sift/sift_10NN_groundtruth.graph'); spmat = spmat'; [row,col]=find(spmat); row = row-1; nCorrect = 0; for i=1:1000000 for j=1:10 if(find(gdgraph(:,i)==row((i-1)*10+j))) nCorrect = nCorrect + 1; end end end disp(['10NN accuracy: ',num2str(nCorrect/10000000)]); % save graph ef.save_graph('./sift.graph'); ================================================ FILE: matlab/samples/example_buildtree.m ================================================ % add the path of efanna library addpath('../') % use fvecs_read util function provided to load datasets dataset = fvecs_read('~/data/sift/sift_base.fvecs'); disp('Data size:'); disp(size(dataset)); % params: dataset, number of trees to build % click following link for more information: https://github.com/fc731097343/efanna/blob/master/README.md ef = efanna(dataset, 16); % build trees and save ef.build_trees(); ef.save_trees('./sift.trees'); ================================================ 
FILE: matlab/samples/example_search.m ================================================ % add the path of efanna library addpath('../') % use fvecs_read util function provided to load datasets and queries dataset = fvecs_read('~/data/sift/sift_base.fvecs'); disp('Data size:'); disp(size(dataset)); query = fvecs_read('~/data/sift/sift_query.fvecs'); disp('Query size:'); disp(size(query)); % params: dataset, number of trees for overall building, conquer-to-depth, number of iterations, L, checkK, K, S, number of trees for graph building % all params should be strictly corresponds to the params in building process of loaded graph and trees % click following link for more information: https://github.com/fc731097343/efanna/blob/master/README.md ef = efanna(dataset, 16, 8, 8, 30, 25, 10, 10, 8); % build, or load graph & trees ef.load_trees('./sift.trees'); ef.load_graph('./sift.graph'); %%% ef.build_index(); % search params: number of trees to use, number of epoches, pool size factor, extend factor, searching methods % more details can be found in README.md of efanna root as well. ef.set_search_params(16, 4, 1200, 200, 0); % params: required number of returned neighbors (i.e. 
k for knn, here are searching for 10-nn) ef.knn_search(10, query); ef.save_result('./sift_search.result'); ================================================ FILE: samples/efanna_index_buildall.cc ================================================ #include #include #include #include #include using namespace efanna; using namespace std; void load_data(char* filename, float*& data, size_t& num,int& dim){// load data with sift10K pattern ifstream in(filename, ios::binary); if(!in.is_open()){cout<<"open file error"< dataset(points_num,dim,data_load); //Matrix query(q_num,qdim,query_load); unsigned int trees = atoi(argv[4]); int mlevel = atoi(argv[5]); unsigned int epochs = atoi(argv[6]); int L = atoi(argv[7]); int checkK = atoi(argv[8]); int kNN = atoi(argv[9]); int S = atoi(argv[10]); //srand(time(NULL)); FIndex index(dataset, new L2DistanceAVX(), efanna::KDTreeUbIndexParams(true, trees ,mlevel ,epochs,checkK,L, kNN, trees, S)); clock_t s,f; s = clock(); index.buildIndex(); f = clock(); cout<<"Index building time : "<<(f-s)*1.0/CLOCKS_PER_SEC<<" seconds"< #include #include #include #include #include using namespace efanna; using namespace std; void load_data(char* filename, float*& data, size_t& num,int& dim){// load data with sift10K pattern ifstream in(filename, ios::binary); if(!in.is_open()){cout<<"open file error"< dataset(points_num,dim,data_load); //Matrix query(q_num,qdim,query_load); unsigned int trees = atoi(argv[3]); int mlevel = atoi(argv[4]); unsigned int epochs = atoi(argv[5]); int L = atoi(argv[6]); int checkK = atoi(argv[7]); int kNN = atoi(argv[8]); int S = atoi(argv[9]); //srand(time(NULL)); FIndex index(dataset, new L2DistanceAVX(), efanna::KDTreeUbIndexParams(true, trees ,mlevel ,epochs,checkK,L, kNN, trees, S)); boost::timer::auto_cpu_timer timer; index.buildIndex(); cout< #include #include #include #include using namespace efanna; using namespace std; void load_data(char* filename, float*& data, size_t& num,int& dim){// load data with sift10K pattern 
ifstream in(filename, ios::binary); if(!in.is_open()){cout<<"open file error"< dataset(points_num,dim,data_load); unsigned int trees = atoi(argv[3]); FIndex index(dataset, new L2DistanceAVX(), efanna::KDTreeUbIndexParams(true, trees ,8 ,8,25,30, 10, 8, 10)); clock_t s,f; s = clock(); index.buildTrees(); f = clock(); cout<<"Index building time : "<<(f-s)*1.0/CLOCKS_PER_SEC<<" seconds"< #include #include #include #include #include using namespace boost; using namespace efanna; using namespace std; void load_data(char* filename, float*& data, size_t& num,int& dim){// load data with sift10K pattern ifstream in(filename, ios::binary); if(!in.is_open()){cout<<"open file error"< dataset(points_num,dim,data_load); Matrix query(q_num,qdim,query_load); FIndex index(dataset, new L2DistanceAVX(), efanna::KDTreeUbIndexParams(true, 8 ,8 ,10,25,30,10)); index.loadTrees(argv[2]); index.loadGraph(argv[3]); int search_trees = atoi(argv[6]); int search_epoc = atoi(argv[7]); int poolsz = atoi(argv[8]); int search_extend = atoi(argv[9]); int search_method = argc == 12 ? atoi(argv[11]) : 0; index.setSearchParams(search_epoc, poolsz, search_extend, search_trees, search_method); //clock_t s,f; boost::timer::auto_cpu_timer timer; //s=clock(); index.knnSearch(atoi(argv[10])/*query nn*/,query); //f=clock(); //cout<<(f-s)*1.0/CLOCKS_PER_SEC/8< #include #include using namespace std; void load_data(char* filename, int*& data, int& num,int& dim){// load data with sift10K pattern ifstream in(filename, ios::binary); if(!in.is_open()){cout<<"open file error"< dim1){cout<< "result file not enough for k="< result; for(int i=0; i < p1; i++){ result.clear(); for(int j=0; j