Repository: spotify/annoy Branch: main Commit: 379f744667ab Files: 50 Total size: 211.3 KB Directory structure: gitextract__vrufcg9/ ├── .github/ │ └── workflows/ │ ├── ci.yml │ └── publish.yml ├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── MANIFEST.in ├── README.rst ├── README_GO.rst ├── README_Lua.md ├── RELEASE.md ├── annoy/ │ ├── __init__.py │ ├── __init__.pyi │ └── py.typed ├── annoy-dev-1.rockspec ├── debian/ │ ├── changelog │ ├── compat │ ├── control │ └── rules ├── examples/ │ ├── mmap_test.py │ ├── precision_test.cpp │ ├── precision_test.py │ ├── s_compile_cpp.sh │ └── simple_test.py ├── setup.cfg ├── setup.py ├── src/ │ ├── annoygomodule.h │ ├── annoygomodule.i │ ├── annoylib.h │ ├── annoyluamodule.cc │ ├── annoymodule.cc │ ├── kissrandom.h │ └── mman.h ├── test/ │ ├── accuracy_test.py │ ├── angular_index_test.py │ ├── annoy_test.go │ ├── annoy_test.lua │ ├── dot_index_test.py │ ├── euclidean_index_test.py │ ├── examples_test.py │ ├── hamming_index_test.py │ ├── holes_test.py │ ├── index_test.py │ ├── manhattan_index_test.py │ ├── memory_leak_test.py │ ├── multithreaded_build_test.py │ ├── on_disk_build_test.py │ ├── seed_test.py │ ├── threading_test.py │ └── types_test.py └── tox.ini ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/ci.yml ================================================ name: Annoy on: push: branches: - main pull_request: jobs: unit-tests: runs-on: ubuntu-22.04 strategy: fail-fast: false matrix: python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13"] os: ["ubuntu-20.04", "macos-latest", "windows-latest"] steps: - uses: actions/checkout@v3 # Pull the repository - uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - run: pip install . 
- run: pip install h5py numpy pytest - run: pytest -v ================================================ FILE: .github/workflows/publish.yml ================================================ name: Publish on: push: tags: - 'v*.*.*' jobs: build: runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: os: [ubuntu-latest, windows-latest, macos-latest] steps: - name: Checkout code uses: actions/checkout@v2 - name: Set up QEMU (for Linux aarch64) if: runner.os == 'Linux' uses: docker/setup-qemu-action@v3 with: platforms: arm64 - name: Set up Python uses: actions/setup-python@v5 with: python-version: '3.x' - name: Install cibuildwheel run: python -m pip install cibuildwheel==3.2.1 - name: Build wheels run: python -m cibuildwheel --output-dir dist env: CIBW_BEFORE_BUILD: python -m pip install -U pip && rm -rf build CIBW_ARCHS_LINUX: auto aarch64 - name: Upload wheels uses: actions/upload-artifact@v4 with: name: built-wheels-${{ matrix.os }}-${{ strategy.job-index }} path: ./dist/*.whl - name: Build source distribution if: matrix.os == 'ubuntu-latest' run: python -m pip install build && python -m build --sdist --outdir dist - name: Upload sdist if: matrix.os == 'ubuntu-latest' uses: actions/upload-artifact@v4 with: name: built-sdist path: ./dist/*.tar.gz publish: needs: build runs-on: ubuntu-latest # pypi trusted publishing via OIDC permissions: id-token: write steps: - name: Download all artifacts uses: actions/download-artifact@v4 with: pattern: built-* path: dist merge-multiple: true - name: Publish package uses: pypa/gh-action-pypi-publish@release/v1 if: startsWith(github.ref, 'refs/tags/v') && github.event_name == 'push' with: password: ${{ secrets.PYPI_API_TOKEN }} ================================================ FILE: .gitignore ================================================ *.egg-info/ *.egg/ *.so *.o build/ dist/ .vscode/ *.pdb MANIFEST *.py[cod] *.idea # testing *.ann *.tree *.annoy *.idx *.hdf5 ================================================ FILE: 
CMakeLists.txt ================================================
cmake_minimum_required(VERSION 3.15...3.25 FATAL_ERROR)

project(Annoy
        DESCRIPTION "Approximate Nearest Neighbors Oh Yeah"
        VERSION 1.17.1
        LANGUAGES CXX)

# Provides CMAKE_INSTALL_INCLUDEDIR / CMAKE_INSTALL_LIBDIR. Included before the
# target is configured so the install-interface include path below expands to
# the same directory the headers are installed into.
include(GNUInstallDirs)

# Header-only library: an INTERFACE target carries usage requirements only.
add_library(Annoy INTERFACE)
add_library(Annoy::Annoy ALIAS Annoy)

# Stage the public headers under <build>/include/annoy/ so that the build tree
# and the install tree expose the same "annoy/<header>" layout.
foreach (HEADER annoylib.h kissrandom.h mman.h)
    configure_file("${CMAKE_CURRENT_SOURCE_DIR}/src/${HEADER}"
                   "${CMAKE_CURRENT_BINARY_DIR}/include/annoy/${HEADER}"
                   COPYONLY)
endforeach ()

# Consumers of the build tree use the staged copies; consumers of an installed
# package use the install prefix's include directory.
target_include_directories(Annoy INTERFACE
        $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>
        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)

# Install
install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include/
        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
install(TARGETS Annoy EXPORT AnnoyTargets)
install(EXPORT AnnoyTargets
        FILE AnnoyConfig.cmake
        NAMESPACE Annoy::
        DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/annoy)

# Also write AnnoyConfig.cmake into the build tree so the target can be
# imported without installing.
export(TARGETS Annoy NAMESPACE Annoy:: FILE AnnoyConfig.cmake)
================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. 
Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright 2021 (c) Spotify and its affiliates. 
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: MANIFEST.in ================================================ include README.rst LICENSE ann.png include src/annoylib.h include src/kissrandom.h include src/mman.h ================================================ FILE: README.rst ================================================ Annoy ----- .. figure:: https://raw.github.com/spotify/annoy/master/ann.png :alt: Annoy example :align: center .. image:: https://github.com/spotify/annoy/actions/workflows/ci.yml/badge.svg :target: https://github.com/spotify/annoy/actions Annoy (`Approximate Nearest Neighbors `__ Oh Yeah) is a C++ library with Python bindings to search for points in space that are close to a given query point. It also creates large read-only file-based data structures that are `mmapped `__ into memory so that many processes may share the same data. Install ------- To install, simply do ``pip install --user annoy`` to pull down the latest version from `PyPI `_. For the C++ version, just clone the repo and ``#include "annoylib.h"``. Background ---------- There are some other libraries to do nearest neighbor search. Annoy is almost as fast as the fastest libraries, (see below), but there is actually another feature that really sets Annoy apart: it has the ability to **use static files as indexes**. In particular, this means you can **share index across processes**. 
Annoy also decouples creating indexes from loading them, so you can pass around indexes as files and map them into memory quickly. Another nice thing of Annoy is that it tries to minimize memory footprint so the indexes are quite small. Why is this useful? If you want to find nearest neighbors and you have many CPU's, you only need to build the index once. You can also pass around and distribute static files to use in production environment, in Hadoop jobs, etc. Any process will be able to load (mmap) the index into memory and will be able to do lookups immediately. We use it at `Spotify `__ for music recommendations. After running matrix factorization algorithms, every user/item can be represented as a vector in f-dimensional space. This library helps us search for similar users/items. We have many millions of tracks in a high-dimensional space, so memory usage is a prime concern. Annoy was built by `Erik Bernhardsson `__ in a couple of afternoons during `Hack Week `__. Summary of features ------------------- * `Euclidean distance `__, `Manhattan distance `__, `cosine distance `__, `Hamming distance `__, or `Dot (Inner) Product distance `__ * Cosine distance is equivalent to Euclidean distance of normalized vectors = sqrt(2-2*cos(u, v)) * Works better if you don't have too many dimensions (like <100) but seems to perform surprisingly well even up to 1,000 dimensions * Small memory usage * Lets you share memory between multiple processes * Index creation is separate from lookup (in particular you can not add more items once the tree has been created) * Native Python support, tested with 2.7, 3.6, and 3.7. * Build index on disk to enable indexing big datasets that won't fit into memory (contributed by `Rene Hollander `__) Python code example ------------------- .. 
code-block:: python from annoy import AnnoyIndex import random f = 40 # Length of item vector that will be indexed t = AnnoyIndex(f, 'angular') for i in range(1000): v = [random.gauss(0, 1) for z in range(f)] t.add_item(i, v) t.build(10) # 10 trees t.save('test.ann') # ... u = AnnoyIndex(f, 'angular') u.load('test.ann') # super fast, will just mmap the file print(u.get_nns_by_item(0, 1000)) # will find the 1000 nearest neighbors Right now it only accepts integers as identifiers for items. Note that it will allocate memory for max(id)+1 items because it assumes your items are numbered 0 … n-1. If you need other id's, you will have to keep track of a map yourself. Full Python API --------------- * ``AnnoyIndex(f, metric)`` returns a new index that's read-write and stores vector of ``f`` dimensions. Metric can be ``"angular"``, ``"euclidean"``, ``"manhattan"``, ``"hamming"``, or ``"dot"``. * ``a.add_item(i, v)`` adds item ``i`` (any nonnegative integer) with vector ``v``. Note that it will allocate memory for ``max(i)+1`` items. * ``a.build(n_trees, n_jobs=-1)`` builds a forest of ``n_trees`` trees. More trees gives higher precision when querying. After calling ``build``, no more items can be added. ``n_jobs`` specifies the number of threads used to build the trees. ``n_jobs=-1`` uses all available CPU cores. * ``a.save(fn, prefault=False)`` saves the index to disk and loads it (see next function). After saving, no more items can be added. * ``a.load(fn, prefault=False)`` loads (mmaps) an index from disk. If `prefault` is set to `True`, it will pre-read the entire file into memory (using mmap with `MAP_POPULATE`). Default is `False`. * ``a.unload()`` unloads. * ``a.get_nns_by_item(i, n, search_k=-1, include_distances=False)`` returns the ``n`` closest items. During the query it will inspect up to ``search_k`` nodes which defaults to ``n_trees * n`` if not provided. ``search_k`` gives you a run-time tradeoff between better accuracy and speed. 
If you set ``include_distances`` to ``True``, it will return a 2 element tuple with two lists in it: the second one containing all corresponding distances. * ``a.get_nns_by_vector(v, n, search_k=-1, include_distances=False)`` same but query by vector ``v``. * ``a.get_item_vector(i)`` returns the vector for item ``i`` that was previously added. * ``a.get_distance(i, j)`` returns the distance between items ``i`` and ``j``. NOTE: this used to return the *squared* distance, but has been changed as of Aug 2016. * ``a.get_n_items()`` returns the number of items in the index. * ``a.get_n_trees()`` returns the number of trees in the index. * ``a.on_disk_build(fn)`` prepares annoy to build the index in the specified file instead of RAM (execute before adding items, no need to save after build) * ``a.set_seed(seed)`` will initialize the random number generator with the given seed. Only used for building up the tree, i. e. only necessary to pass this before adding the items. Will have no effect after calling `a.build(n_trees)` or `a.load(fn)`. Notes: * There's no bounds checking performed on the values so be careful. * Annoy uses Euclidean distance of normalized vectors for its angular distance, which for two vectors u,v is equal to ``sqrt(2(1-cos(u,v)))`` The C++ API is very similar: just ``#include "annoylib.h"`` to get access to it. Tradeoffs --------- There are just two main parameters needed to tune Annoy: the number of trees ``n_trees`` and the number of nodes to inspect during searching ``search_k``. * ``n_trees`` is provided during build time and affects the build time and the index size. A larger value will give more accurate results, but larger indexes. * ``search_k`` is provided in runtime and affects the search performance. A larger value will give more accurate results, but will take longer time to return. If ``search_k`` is not provided, it will default to ``n * n_trees`` where ``n`` is the number of approximate nearest neighbors. 
Otherwise, ``search_k`` and ``n_trees`` are roughly independent, i.e. the value of ``n_trees`` will not affect search time if ``search_k`` is held constant and vice versa. Basically it's recommended to set ``n_trees`` as large as possible given the amount of memory you can afford, and it's recommended to set ``search_k`` as large as possible given the time constraints you have for the queries. You can also accept slower search times in favour of reduced loading times, memory usage, and disk IO. On supported platforms the index is prefaulted during ``load`` and ``save``, causing the file to be pre-emptively read from disk into memory. If you set ``prefault`` to ``False``, pages of the mmapped index are instead read from disk and cached in memory on-demand, as necessary for a search to complete. This can significantly increase early search times but may be better suited for systems with low memory compared to index size, when few queries are executed against a loaded index, and/or when large areas of the index are unlikely to be relevant to search queries. How does it work ---------------- Using `random projections `__ and by building up a tree. At every intermediate node in the tree, a random hyperplane is chosen, which divides the space into two subspaces. This hyperplane is chosen by sampling two points from the subset and taking the hyperplane equidistant from them. We do this k times so that we get a forest of trees. k has to be tuned to your need, by looking at what tradeoff you have between precision and performance. Hamming distance (contributed by `Martin Aumüller `__) packs the data into 64-bit integers under the hood and uses built-in bit count primitives so it could be quite fast. All splits are axis-aligned. 
Dot Product distance (contributed by `Peter Sobot `__ and `Pavel Korobov `__) reduces the provided vectors from dot (or "inner-product") space to a more query-friendly cosine space using `a method by Bachrach et al., at Microsoft Research, published in 2014 `__. More info --------- * `Dirk Eddelbuettel `__ provides an `R version of Annoy `__. * `Andy Sloane `__ provides a `Java version of Annoy `__ although currently limited to cosine and read-only. * `Pishen Tsai `__ provides a `Scala wrapper of Annoy `__ which uses JNA to call the C++ library of Annoy. * `Atsushi Tatsuma `__ provides `Ruby bindings for Annoy `__. * There is `experimental support for Go `__ provided by `Taneli Leppä `__. * `Boris Nagaev `__ wrote `Lua bindings `__. * During part of Spotify Hack Week 2016 (and a bit afterward), `Jim Kang `__ wrote `Node bindings `__ for Annoy. * `Min-Seok Kim `__ built a `Scala version `__ of Annoy. * `hanabi1224 `__ built a read-only `Rust version `__ of Annoy, together with **dotnet, jvm and dart** read-only bindings. * `Presentation from New York Machine Learning meetup `__ about Annoy * Annoy is available as a `conda package `__ on Linux, OS X, and Windows. * `ann-benchmarks `__ is a benchmark for several approximate nearest neighbor libraries. Annoy seems to be fairly competitive, especially at higher precisions: .. figure:: https://raw.githubusercontent.com/erikbern/ann-benchmarks/main/results/glove-100-angular.png :alt: ANN benchmarks :align: center :target: https://github.com/erikbern/ann-benchmarks Source code ----------- It's all written in C++ with a handful of ugly optimizations for performance and memory usage. You have been warned :) The code should support Windows, thanks to `Qiang Kou `__ and `Timothy Riley `__. To run the tests, execute `python setup.py nosetests`. The test suite includes a big real world dataset that is downloaded from the internet, so it will take a few minutes to execute. 
Discuss ------- Feel free to post any questions or comments to the `annoy-user `__ group. I'm `@fulhack `__ on Twitter. ================================================ FILE: README_GO.rst ================================================ Install ------- To install, you'll need Swig (tested with Swig 4.2.1 on Ubuntu 24.04), and then just:: swig -go -intgosize 64 -cgo -c++ src/annoygomodule.i mkdir -p $(go env GOPATH)/src/annoy cp src/annoygomodule_wrap.cxx src/annoy.go src/annoygomodule.h src/annoylib.h src/kissrandom.h test/annoy_test.go $(go env GOPATH)/src/annoy cd $(go env GOPATH)/src/annoy go mod init github.com/spotify/annoy go mod tidy go test Background ---------- See the main README. Go code example ------------------- .. code-block:: go package main import ( "fmt" "math/rand" "github.com/spotify/annoy" ) func main() { f := 40 t := annoy.NewAnnoyIndexAngular(f) for i := 0; i < 1000; i++ { item := make([]float32, 0, f) for x:= 0; x < f; x++ { item = append(item, rand.Float32()) } t.AddItem(i, item) } t.Build(10) t.Save("test.ann") annoy.DeleteAnnoyIndexAngular(t) t = annoy.NewAnnoyIndexAngular(f) t.Load("test.ann") result := annoy.NewAnnoyVectorInt() defer result.Free() t.GetNnsByItem(0, 1000, -1, result) fmt.Printf("%v\n", result.ToSlice()) } Right now it only accepts integers as identifiers for items. Note that it will allocate memory for max(id)+1 items because it assumes your items are numbered 0 … n-1. If you need other id's, you will have to keep track of a map yourself. Full Go API --------------- See annoygomodule.h. Generally the same as Python API except some arguments are not optional. Go binding does not support multithreaded build. Tests ------- A simple test is supplied in test/annoy_test.go. Discuss ------- The memory leak in previous versions has been fixed thanks to https://github.com/swig/swig/issues/2292. (memory leak fix is implemented in https://github.com/Rikanishu/annoy-go) Go glue written by Taneli Leppä (@rosmo).
You can contact me via email (see https://github.com/rosmo). ================================================ FILE: README_Lua.md ================================================ Install ------- To install, you'll need Lua (binary + library) and LuaRocks. If you have Python and Pip, you can get Lua and LuaRocks using [hererocks](https://github.com/mpeterv/hererocks/), written by Peter Melnichenko. ``` pip install hererocks hererocks here --lua 5.1 --luarocks 2.2 ``` This command installs Lua and LuaRocks locally to directory `here`. To activate it, add `here/bin` to `PATH`: ``` export PATH="$(pwd)/here/bin/:$PATH" ``` Then you can use commands `lua`, `luarocks`, and tools installed by `luarocks`. To build and install `annoy`, type: ``` luarocks make ``` Background ---------- See the main README. Lua code example ---------------- ```lua local annoy = require "annoy" local f = 3 local t = annoy.AnnoyIndex(f) -- Length of item vector that will be indexed for i = 0, 999 do local v = {math.random(), math.random(), math.random()} t:add_item(i, v) end t:build(10) -- 10 trees t:save('test.ann') -- ... local u = annoy.AnnoyIndex(f) u:load('test.ann') -- super fast, will just mmap the file -- find the 10 nearest neighbors local neighbors = u:get_nns_by_item(0, 10) for rank, i in ipairs(neighbors) do print("neighbor", rank, "is", i) end ``` Full Lua API ------------ Lua API closely resembles Python API, see main README. Lua binding does not support multithreaded build. Tests ------- File `test/annoy_test.lua` is the literal translation of `test/annoy_test.py` from Python+Nosetests to Lua+Busted. To run tests, you need [Busted](http://olivinelabs.com/busted/), Elegant Lua unit testing. To install it, type: ``` luarocks install busted ``` To run tests, type: ``` busted test/annoy_test.lua ``` It will take few minutes to execute. Discuss ------- There might be some memory leaks if inputs are incorrect. 
Some functions allocate stack objects and then call Lua functions that can throw Lua errors (e.g., `luaL_checkinteger`). A Lua error may skip calling C++ destructors when unwinding the stack. (Whether it does depends on the Lua implementation and platform in use.) Lua binding was written by Boris Nagaev. You can contact me via email (see https://github.com/starius). ================================================ FILE: RELEASE.md ================================================ How to release -------------- 1. Make sure you're on main. `git checkout main && git fetch && git reset --hard origin/main` 1. Update `setup.py` to the newest version, `git add setup.py && git commit -m "version 1.2.3"` 1. `python setup.py sdist bdist_wheel` 1. `git tag -a v1.2.3 -m "version 1.2.3"` 1. `git push --tags origin main` to push the last version to GitHub 1. Go to https://github.com/spotify/annoy/releases and click "Draft a new release" 1. `twine upload dist/annoy-1.2.3*` TODO ---- * Wheel ================================================ FILE: annoy/__init__.py ================================================ # Copyright (c) 2013 Spotify AB # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy of # the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations under # the License. # This module is a dummy wrapper around the underlying C++ module.
from .annoylib import Annoy as AnnoyIndex ================================================ FILE: annoy/__init__.pyi ================================================ from typing import Sized, overload from typing_extensions import Literal, Protocol class _Vector(Protocol, Sized): def __getitem__(self, __index: int) -> float: ... class AnnoyIndex: f: int def __init__(self, f: int, metric: Literal["angular", "euclidean", "manhattan", "hamming", "dot"]) -> None: ... def load(self, fn: str, prefault: bool = ...) -> Literal[True]: ... def save(self, fn: str, prefault: bool = ...) -> Literal[True]: ... @overload def get_nns_by_item(self, i: int, n: int, search_k: int = ..., include_distances: Literal[False] = ...) -> list[int]: ... @overload def get_nns_by_item( self, i: int, n: int, search_k: int, include_distances: Literal[True] ) -> tuple[list[int], list[float]]: ... @overload def get_nns_by_item( self, i: int, n: int, search_k: int = ..., *, include_distances: Literal[True] ) -> tuple[list[int], list[float]]: ... @overload def get_nns_by_vector( self, vector: _Vector, n: int, search_k: int = ..., include_distances: Literal[False] = ... ) -> list[int]: ... @overload def get_nns_by_vector( self, vector: _Vector, n: int, search_k: int, include_distances: Literal[True] ) -> tuple[list[int], list[float]]: ... @overload def get_nns_by_vector( self, vector: _Vector, n: int, search_k: int = ..., *, include_distances: Literal[True] ) -> tuple[list[int], list[float]]: ... def get_item_vector(self, __i: int) -> list[float]: ... def add_item(self, i: int, vector: _Vector) -> None: ... def on_disk_build(self, fn: str) -> Literal[True]: ... def build(self, n_trees: int, n_jobs: int = ...) -> Literal[True]: ... def unbuild(self) -> Literal[True]: ... def unload(self) -> Literal[True]: ... def get_distance(self, __i: int, __j: int) -> float: ... def get_n_items(self) -> int: ... def get_n_trees(self) -> int: ... def verbose(self, __v: bool) -> Literal[True]: ... 
def set_seed(self, __s: int) -> None: ... ================================================ FILE: annoy/py.typed ================================================ ================================================ FILE: annoy-dev-1.rockspec ================================================ -- Copyright (c) 2016 Boris Nagaev -- -- Licensed under the Apache License, Version 2.0 (the "License"); you may not -- use this file except in compliance with the License. You may obtain a copy of -- the License at -- -- http://www.apache.org/licenses/LICENSE-2.0 -- -- Unless required by applicable law or agreed to in writing, software -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -- License for the specific language governing permissions and limitations under -- the License. package = "annoy" version = "dev-1" source = { url = "git://github.com/spotify/annoy.git", } description = { summary = "Approximate Nearest Neighbors Oh Yeah", homepage = "https://github.com/spotify/annoy", license = "Apache", detailed = [[ Annoy (Approximate Nearest Neighbors Oh Yeah) is a C++ library with Python Go and Lua bindings to search for points in space that are close to a given query point. It also creates large read-only file-based data structures that are mmapped into memory so that many processes may share the same data. ]], } dependencies = { "lua >= 5.1", } build = { type = "builtin", modules = { ['annoy'] = { sources = { "src/annoyluamodule.cc", }, }, }, platforms = { unix = { modules = { ['annoy'] = { libraries = {"stdc++"}, }, }, }, mingw32 = { modules = { ['annoy'] = { libraries = {"stdc++"}, }, }, }, }, } ================================================ FILE: debian/changelog ================================================ spotify-annoy (1.0.0) unstable; urgency=low * Initial release. 
-- Erik Bernhardsson Wed, 20 Feb 2013 00:00:00 +0000 ================================================ FILE: debian/compat ================================================ 7 ================================================ FILE: debian/control ================================================ Source: spotify-annoy Section: non-free/net Priority: extra Maintainer: Erik Bernhardsson Build-Depends: debhelper (>= 7), python-all-dev, python-setuptools Standards-Version: 3.7.2 XS-Python-Version: >= 2.6 Package: spotify-annoy Architecture: any Depends: ${python:Depends} Description: Python module (written in C++) for high-dimensional approximate nearest neigbor (ANN) queries ================================================ FILE: debian/rules ================================================ #!/usr/bin/make -f %: dh $@ ================================================ FILE: examples/mmap_test.py ================================================ from annoy import AnnoyIndex a = AnnoyIndex(3, 'angular') a.add_item(0, [1, 0, 0]) a.add_item(1, [0, 1, 0]) a.add_item(2, [0, 0, 1]) a.build(-1) a.save('test.tree') b = AnnoyIndex(3) b.load('test.tree') print(b.get_nns_by_item(0, 100)) print(b.get_nns_by_vector([1.0, 0.5, 0.5], 100)) ================================================ FILE: examples/precision_test.cpp ================================================ /* * precision_test.cpp * * Created on: Jul 13, 2016 * Author: Claudio Sanhueza * Contact: csanhuezalobos@gmail.com */ #include #include #include "../src/kissrandom.h" #include "../src/annoylib.h" #include #include #include #include using namespace Annoy; int precision(int f=40, int n=1000000){ std::chrono::high_resolution_clock::time_point t_start, t_end; std::default_random_engine generator; std::normal_distribution distribution(0.0, 1.0); //****************************************************** //Building the tree AnnoyIndex t = AnnoyIndex(f); std::cout << "Building index ... be patient !!" 
<< std::endl; std::cout << "\"Trees that are slow to grow bear the best fruit\" (Moliere)" << std::endl; for(int i=0; i( t_end - t_start ).count(); std::cout << " Done in "<< duration << " secs." << std::endl; std::cout << "Saving index ..."; t.save("precision.tree"); std::cout << " Done" << std::endl; //****************************************************** std::vector limits = {10, 100, 1000, 10000}; int K=10; int prec_n = 1000; std::map prec_sum; std::map time_sum; std::vector closest; //init precision and timers map for(std::vector::iterator it = limits.begin(); it!=limits.end(); ++it){ prec_sum[(*it)] = 0.0; time_sum[(*it)] = 0.0; } // doing the work for(int i=0; i toplist; std::vector intersection; for(std::vector::iterator limit = limits.begin(); limit!=limits.end(); ++limit){ t_start = std::chrono::high_resolution_clock::now(); t.get_nns_by_item(j, (*limit), (size_t) -1, &toplist, nullptr); //search_k defaults to "n_trees * n" if not provided. t_end = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::duration_cast( t_end - t_start ).count(); //intersecting results std::sort(closest.begin(), closest.end(), std::less()); std::sort(toplist.begin(), toplist.end(), std::less()); intersection.resize(std::max(closest.size(), toplist.size())); std::vector::iterator it_set = std::set_intersection(closest.begin(), closest.end(), toplist.begin(), toplist.end(), intersection.begin()); intersection.resize(it_set-intersection.begin()); // storing metrics int found = intersection.size(); double hitrate = found / (double) K; prec_sum[(*limit)] += hitrate; time_sum[(*limit)] += duration; //deallocate memory vector().swap(intersection); vector().swap(toplist); } //print resulting metrics for(std::vector::iterator limit = limits.begin(); limit!=limits.end(); ++limit){ std::cout << "limit: " << (*limit) << "\tprecision: "<< std::fixed << std::setprecision(2) << (100.0 * prec_sum[(*limit)] / (i + 1)) << "% \tavg. 
time: "<< std::fixed<< std::setprecision(6) << (time_sum[(*limit)] / (i + 1)) * 1e-04 << "s" << std::endl; } closest.clear(); vector().swap(closest); } std::cout << "\nDone" << std::endl; return 0; } void help(){ std::cout << "Annoy Precision C++ example" << std::endl; std::cout << "Usage:" << std::endl; std::cout << "(default) ./precision" << std::endl; std::cout << "(using parameters) ./precision num_features num_nodes" << std::endl; std::cout << std::endl; } void feedback(int f, int n){ std::cout<<"Runing precision example with:" << std::endl; std::cout<<"num. features: "<< f << std::endl; std::cout<<"num. nodes: "<< n << std::endl; std::cout << std::endl; } int main(int argc, char **argv) { int f, n; if(argc == 1){ f = 40; n = 1000000; feedback(f,n); precision(40, 1000000); } else if(argc == 3){ f = atoi(argv[1]); n = atoi(argv[2]); feedback(f,n); precision(f, n); } else { help(); return EXIT_FAILURE; } return EXIT_SUCCESS; } ================================================ FILE: examples/precision_test.py ================================================ from __future__ import print_function import random, time from annoy import AnnoyIndex try: xrange except NameError: # Python 3 compat xrange = range n, f = 100000, 40 t = AnnoyIndex(f, 'angular') for i in xrange(n): v = [] for z in xrange(f): v.append(random.gauss(0, 1)) t.add_item(i, v) t.build(2 * f) t.save('test.tree') limits = [10, 100, 1000, 10000] k = 10 prec_sum = {} prec_n = 1000 time_sum = {} for i in xrange(prec_n): j = random.randrange(0, n) closest = set(t.get_nns_by_item(j, k, n)) for limit in limits: t0 = time.time() toplist = t.get_nns_by_item(j, k, limit) T = time.time() - t0 found = len(closest.intersection(toplist)) hitrate = 1.0 * found / k prec_sum[limit] = prec_sum.get(limit, 0.0) + hitrate time_sum[limit] = time_sum.get(limit, 0.0) + T for limit in limits: print('limit: %-9d precision: %6.2f%% avg time: %.6fs' % (limit, 100.0 * prec_sum[limit] / (i + 1), time_sum[limit] / (i + 1))) 
================================================ FILE: examples/s_compile_cpp.sh ================================================ #!/bin/bash echo "compiling precision example..." cmd="g++ precision_test.cpp -DANNOYLIB_MULTITHREADED_BUILD -o precision_test -std=c++14 -pthread" eval $cmd echo "Done" ================================================ FILE: examples/simple_test.py ================================================ from annoy import AnnoyIndex a = AnnoyIndex(3, 'angular') a.add_item(0, [1, 0, 0]) a.add_item(1, [0, 1, 0]) a.add_item(2, [0, 0, 1]) a.build(-1) print(a.get_nns_by_item(0, 100)) print(a.get_nns_by_vector([1.0, 0.5, 0.5], 100)) ================================================ FILE: setup.cfg ================================================ [nosetests] attr=!slow nocapture=1 ================================================ FILE: setup.py ================================================ #!/usr/bin/env python # -*- coding: utf-8 -*- # # Copyright (c) 2013 Spotify AB # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy of # the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations under # the License. from setuptools import setup, Extension import os import platform import sys readme_note = """\ .. note:: For the latest source, discussion, etc, please visit the `GitHub repository `_\n\n .. 
image:: https://img.shields.io/github/stars/spotify/annoy.svg :target: https://github.com/spotify/annoy """ with open('README.rst', encoding='utf-8') as fobj: long_description = readme_note + fobj.read() # Various platform-dependent extras extra_compile_args = ['-D_CRT_SECURE_NO_WARNINGS', '-fpermissive'] extra_link_args = [] if platform.machine() == 'ppc64le': extra_compile_args += ['-mcpu=native',] if platform.machine() == 'x86_64': # do not apply march on Intel Darwin if platform.system() != 'Darwin': # Not all CPUs have march as a tuning parameter extra_compile_args += ['-march=native',] if os.name != 'nt': extra_compile_args += ['-O3', '-ffast-math', '-fno-associative-math'] # Add multithreaded build flag for all platforms using Python 3 and # for non-Windows Python 2 platforms python_major_version = sys.version_info[0] if python_major_version == 3 or (python_major_version == 2 and os.name != 'nt'): extra_compile_args += ['-DANNOYLIB_MULTITHREADED_BUILD'] if os.name != 'nt': extra_compile_args += ['-std=c++14'] # #349: something with OS X Mojave causes libstd not to be found if platform.system() == 'Darwin': extra_compile_args += ['-mmacosx-version-min=10.12'] extra_link_args += ['-stdlib=libc++', '-mmacosx-version-min=10.12'] # Manual configuration, you're on your own here. 
manual_compiler_args = os.environ.get('ANNOY_COMPILER_ARGS', None) if manual_compiler_args: extra_compile_args = manual_compiler_args.split(',') manual_linker_args = os.environ.get('ANNOY_LINKER_ARGS', None) if manual_linker_args: extra_link_args = manual_linker_args.split(',') setup(name='annoy', version='1.17.3', description='Approximate Nearest Neighbors in C++/Python optimized for memory usage and loading/saving to disk.', packages=['annoy'], package_data={'annoy': ['__init__.pyi', 'py.typed']}, ext_modules=[ Extension( 'annoy.annoylib', ['src/annoymodule.cc'], depends=['src/annoylib.h', 'src/kissrandom.h', 'src/mman.h'], extra_compile_args=extra_compile_args, extra_link_args=extra_link_args, ) ], long_description=long_description, long_description_content_type='text/x-rst', author='Erik Bernhardsson', author_email='mail@erikbern.com', url='https://github.com/spotify/annoy', license='Apache License 2.0', classifiers=[ 'Development Status :: 5 - Production/Stable', 'Programming Language :: Python', 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', 'Programming Language :: Python :: 3.12', 'Programming Language :: Python :: 3.13', ], keywords='nns, approximate nearest neighbor search', setup_requires=['nose>=1.0'], tests_require=['numpy', 'h5py'] ) ================================================ FILE: src/annoygomodule.h ================================================ #include "annoylib.h" #include "kissrandom.h" using namespace Annoy; namespace GoAnnoy { class AnnoyVectorFloat { protected: float *ptr; int len; public: ~AnnoyVectorFloat() { free(ptr); }; float* 
ArrayPtr() { return ptr; }; int Len() { return len; }; /* Bounds-checked read: any index outside [0, len), including a negative one, yields 0 instead of reading outside the buffer. */ float Get(int i) { if (i < 0 || i >= len) { return 0.0; } return ptr[i]; }; void fill_from_vector(vector* v) { if (ptr != NULL) { free(ptr); } ptr = (float*) malloc(v->size() * sizeof(float)); for (int i = 0; i < v->size(); i++) { ptr[i] = (float)(*v)[i]; } len = v->size(); }; }; class AnnoyVectorInt { protected: int32_t *ptr; int len; public: ~AnnoyVectorInt() { free(ptr); }; int32_t* ArrayPtr() { return ptr; }; int Len() { return len; }; /* Bounds-checked read: any index outside [0, len), including a negative one, yields 0 instead of reading outside the buffer. */ int32_t Get(int i) { if (i < 0 || i >= len) { return 0; } return ptr[i]; }; void fill_from_vector(vector* v) { if (ptr != NULL) { free(ptr); } ptr = (int32_t*) malloc(v->size() * sizeof(int32_t)); for (int i = 0; i < v->size(); i++) { ptr[i] = (int32_t)(*v)[i]; } len = v->size(); }; }; class AnnoyIndex { protected: ::AnnoyIndexInterface *ptr; int f; public: ~AnnoyIndex() { delete ptr; }; void addItem(int item, const float* w) { ptr->add_item(item, w); }; void build(int q) { ptr->build(q, 1); }; bool save(const char* filename, bool prefault) { return ptr->save(filename, prefault); }; bool save(const char* filename) { return ptr->save(filename, true); }; void unload() { ptr->unload(); }; bool load(const char* filename, bool prefault) { return ptr->load(filename, prefault); }; bool load(const char* filename) { return ptr->load(filename, true); }; float getDistance(int i, int j) { return ptr->get_distance(i, j); }; void getNnsByItem(int item, int n, int search_k, AnnoyVectorInt* out_result, AnnoyVectorFloat* out_distances) { vector* result = new vector(); vector* distances = new vector(); ptr->get_nns_by_item(item, n, search_k, result, distances); out_result->fill_from_vector(result); out_distances->fill_from_vector(distances); delete result; delete distances; }; void getNnsByVector(const float* w, int n, int search_k, AnnoyVectorInt* out_result, AnnoyVectorFloat* out_distances) { vector* result = new vector(); vector* distances = new vector(); ptr->get_nns_by_vector(w, n, search_k, result,
distances); out_result->fill_from_vector(result); out_distances->fill_from_vector(distances); delete result; delete distances; }; void getNnsByItem(int item, int n, int search_k, AnnoyVectorInt* out_result) { vector* result = new vector(); ptr->get_nns_by_item(item, n, search_k, result, NULL); out_result->fill_from_vector(result); delete result; }; void getNnsByVector(const float* w, int n, int search_k, AnnoyVectorInt* out_result) { vector* result = new vector(); ptr->get_nns_by_vector(w, n, search_k, result, NULL); out_result->fill_from_vector(result); delete result; }; int getNItems() { return (int)ptr->get_n_items(); }; void verbose(bool v) { ptr->verbose(v); }; /* Copies the stored vector of `item` into `v`. The temporary is released once its contents have been handed to fill_from_vector, as the getNns* methods do with their temporaries, so repeated calls do not leak. */ void getItem(int item, AnnoyVectorFloat *v) { vector* r = new vector(); r->resize(this->f); ptr->get_item(item, &r->front()); v->fill_from_vector(r); delete r; }; bool onDiskBuild(const char* filename) { return ptr->on_disk_build(filename); }; }; class AnnoyIndexAngular : public AnnoyIndex { public: AnnoyIndexAngular(int f) { ptr = new ::AnnoyIndex(f); this->f = f; } }; class AnnoyIndexEuclidean : public AnnoyIndex { public: AnnoyIndexEuclidean(int f) { ptr = new ::AnnoyIndex(f); this->f = f; } }; class AnnoyIndexManhattan : public AnnoyIndex { public: AnnoyIndexManhattan(int f) { ptr = new ::AnnoyIndex(f); this->f = f; } }; class AnnoyIndexDotProduct : public AnnoyIndex { public: AnnoyIndexDotProduct(int f) { ptr = new ::AnnoyIndex(f); this->f = f; } }; } ================================================ FILE: src/annoygomodule.i ================================================ %module annoy namespace Annoy {} %{ #include "annoygomodule.h" %} // const float * %typemap(gotype) (const float *) "[]float32" %typemap(gotype) (int32_t) "int32" %typemap(in) (const float *) %{ float *v; vector w; v = (float *)$input.array; for (int i = 0; i < $input.len; i++) { w.push_back(v[i]); } $1 = &w[0]; %} %typemap(gotype) (const char *) "string" %typemap(in) (const char *) %{ $1 = (char *)calloc((((_gostring_)$input).n + 1),
sizeof(char)); strncpy($1, (((_gostring_)$input).p), ((_gostring_)$input).n); %} %typemap(freearg) (const char *) %{ free($1); %} %ignore fill_from_vector; %rename(X_RawAnnoyVectorInt) AnnoyVectorInt; %rename(X_RawAnnoyVectorFloat) AnnoyVectorFloat; %insert(go_wrapper) %{ type AnnoyVectorInt interface { X_RawAnnoyVectorInt ToSlice() []int32 Copy(in *[]int32) InnerArray() []int32 Free() } func NewAnnoyVectorInt() AnnoyVectorInt { vec := NewX_RawAnnoyVectorInt() return vec.(SwigcptrX_RawAnnoyVectorInt) } func (p SwigcptrX_RawAnnoyVectorInt) ToSlice() []int32 { var out []int32 p.Copy(&out) return out } func (p SwigcptrX_RawAnnoyVectorInt) Copy(in *[]int32) { out := *in inner := p.InnerArray() if cap(out) >= len(inner) { if len(out) != len(inner) { out = out[:len(inner)] } } else { out = make([]int32, len(inner)) } copy(out, inner) *in = out } func (p SwigcptrX_RawAnnoyVectorInt) Free() { DeleteX_RawAnnoyVectorInt(p) } func (p SwigcptrX_RawAnnoyVectorInt) InnerArray() []int32 { length := p.Len() ptr := unsafe.Pointer(p.ArrayPtr()) return ((*[1 << 30]int32)(ptr))[:length:length] } %} %insert(go_wrapper) %{ type AnnoyVectorFloat interface { X_RawAnnoyVectorFloat ToSlice() []float32 Copy(in *[]float32) InnerArray() []float32 Free() } func NewAnnoyVectorFloat() AnnoyVectorFloat { vec := NewX_RawAnnoyVectorFloat() return vec.(SwigcptrX_RawAnnoyVectorFloat) } func (p SwigcptrX_RawAnnoyVectorFloat) ToSlice() []float32 { var out []float32 p.Copy(&out) return out } func (p SwigcptrX_RawAnnoyVectorFloat) Copy(in *[]float32) { out := *in inner := p.InnerArray() if cap(out) >= len(inner) { if len(out) != len(inner) { out = out[:len(inner)] } } else { out = make([]float32, len(inner)) } copy(out, inner) *in = out } func (p SwigcptrX_RawAnnoyVectorFloat) Free() { DeleteX_RawAnnoyVectorFloat(p) } func (p SwigcptrX_RawAnnoyVectorFloat) InnerArray() []float32 { length := p.Len() ptr := unsafe.Pointer(p.ArrayPtr()) return ((*[1 << 30]float32)(ptr))[:length:length] } %} /* Let's just 
grab the original header file here */ %include "annoygomodule.h" %feature("notabstract") GoAnnoyIndexAngular; %feature("notabstract") GoAnnoyIndexEuclidean; %feature("notabstract") GoAnnoyIndexManhattan; %feature("notabstract") GoAnnoyIndexDotProduct; ================================================ FILE: src/annoylib.h ================================================ // Copyright (c) 2013 Spotify AB // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy of // the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the // License for the specific language governing permissions and limitations under // the License. #ifndef ANNOY_ANNOYLIB_H #define ANNOY_ANNOYLIB_H #include #include #ifndef _MSC_VER #include #endif #include #include #include #include #include #if defined(_MSC_VER) && _MSC_VER == 1500 typedef unsigned char uint8_t; typedef signed __int32 int32_t; typedef unsigned __int64 uint64_t; typedef signed __int64 int64_t; #else #include #endif #if defined(_MSC_VER) || defined(__MINGW32__) // a bit hacky, but override some definitions to support 64 bit #define off_t int64_t #define lseek_getsize(fd) _lseeki64(fd, 0, SEEK_END) #ifndef NOMINMAX #define NOMINMAX #endif #include "mman.h" #include #else #include #define lseek_getsize(fd) lseek(fd, 0, SEEK_END) #endif #include #include #include #include #include #include #include #if __cplusplus >= 201103L #include #endif #ifdef ANNOYLIB_MULTITHREADED_BUILD #include #include #include #endif #ifdef _MSC_VER // Needed for Visual Studio to disable runtime checks for mempcy #pragma runtime_checks("s", off) #endif // This allows others to supply their own logger / error printer without // 
requiring Annoy to import their headers. See RcppAnnoy for a use case. #ifndef __ERROR_PRINTER_OVERRIDE__ #define annoylib_showUpdate(...) { fprintf(stderr, __VA_ARGS__ ); } #else #define annoylib_showUpdate(...) { __ERROR_PRINTER_OVERRIDE__( __VA_ARGS__ ); } #endif // Portable alloc definition, cf Writing R Extensions, Section 1.6.4 #ifdef __GNUC__ // Includes GCC, clang and Intel compilers # undef alloca # define alloca(x) __builtin_alloca((x)) #elif defined(__sun) || defined(_AIX) // this is necessary (and sufficient) for Solaris 10 and AIX 6: # include #endif // We let the v array in the Node struct take whatever space is needed, so this is a mostly insignificant number. // Compilers need *some* size defined for the v array, and some memory checking tools will flag for buffer overruns if this is set too low. #define ANNOYLIB_V_ARRAY_SIZE 65536 #ifndef _MSC_VER #define annoylib_popcount __builtin_popcountll #else // See #293, #358 #define annoylib_popcount cole_popcount #endif #if !defined(NO_MANUAL_VECTORIZATION) && defined(__GNUC__) && (__GNUC__ >6) && defined(__AVX512F__) // See #402 #define ANNOYLIB_USE_AVX512 #elif !defined(NO_MANUAL_VECTORIZATION) && defined(__AVX__) && defined (__SSE__) && defined(__SSE2__) && defined(__SSE3__) #define ANNOYLIB_USE_AVX #else #endif #if defined(ANNOYLIB_USE_AVX) || defined(ANNOYLIB_USE_AVX512) #if defined(_MSC_VER) #include #elif defined(__GNUC__) #include #endif #endif #if !defined(__MINGW32__) #define ANNOYLIB_FTRUNCATE_SIZE(x) static_cast(x) #else #define ANNOYLIB_FTRUNCATE_SIZE(x) (x) #endif namespace Annoy { inline void set_error_from_errno(char **error, const char* msg) { annoylib_showUpdate("%s: %s (%d)\n", msg, strerror(errno), errno); if (error) { *error = (char *)malloc(256); // TODO: win doesn't support snprintf snprintf(*error, 255, "%s: %s (%d)", msg, strerror(errno), errno); } } inline void set_error_from_string(char **error, const char* msg) { annoylib_showUpdate("%s\n", msg); if (error) { *error = (char 
*)malloc(strlen(msg) + 1); strcpy(*error, msg); } } using std::vector; using std::pair; using std::numeric_limits; using std::make_pair; inline bool remap_memory_and_truncate(void** _ptr, int _fd, size_t old_size, size_t new_size) { #ifdef __linux__ *_ptr = mremap(*_ptr, old_size, new_size, MREMAP_MAYMOVE); bool ok = ftruncate(_fd, new_size) != -1; #else munmap(*_ptr, old_size); bool ok = ftruncate(_fd, ANNOYLIB_FTRUNCATE_SIZE(new_size)) != -1; #ifdef MAP_POPULATE *_ptr = mmap(*_ptr, new_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, _fd, 0); #else *_ptr = mmap(*_ptr, new_size, PROT_READ | PROT_WRITE, MAP_SHARED, _fd, 0); #endif #endif return ok; } namespace { template inline Node* get_node_ptr(const void* _nodes, const size_t _s, const S i) { return (Node*)((uint8_t *)_nodes + (_s * i)); } template inline T dot(const T* x, const T* y, int f) { T s = 0; for (int z = 0; z < f; z++) { s += (*x) * (*y); x++; y++; } return s; } template inline T manhattan_distance(const T* x, const T* y, int f) { T d = 0.0; for (int i = 0; i < f; i++) d += fabs(x[i] - y[i]); return d; } template inline T euclidean_distance(const T* x, const T* y, int f) { // Don't use dot-product: avoid catastrophic cancellation in #314. T d = 0.0; for (int i = 0; i < f; ++i) { const T tmp=*x - *y; d += tmp * tmp; ++x; ++y; } return d; } #ifdef ANNOYLIB_USE_AVX // Horizontal single sum of 256bit vector. inline float hsum256_ps_avx(__m256 v) { const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(v, 1), _mm256_castps256_ps128(v)); const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128)); const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55)); return _mm_cvtss_f32(x32); } template<> inline float dot(const float* x, const float *y, int f) { float result = 0; if (f > 7) { __m256 d = _mm256_setzero_ps(); for (; f > 7; f -= 8) { d = _mm256_add_ps(d, _mm256_mul_ps(_mm256_loadu_ps(x), _mm256_loadu_ps(y))); x += 8; y += 8; } // Sum all floats in dot register. 
result += hsum256_ps_avx(d); } // Don't forget the remaining values. for (; f > 0; f--) { result += *x * *y; x++; y++; } return result; } template<> inline float manhattan_distance(const float* x, const float* y, int f) { float result = 0; int i = f; if (f > 7) { __m256 manhattan = _mm256_setzero_ps(); __m256 minus_zero = _mm256_set1_ps(-0.0f); for (; i > 7; i -= 8) { const __m256 x_minus_y = _mm256_sub_ps(_mm256_loadu_ps(x), _mm256_loadu_ps(y)); const __m256 distance = _mm256_andnot_ps(minus_zero, x_minus_y); // Absolute value of x_minus_y (forces sign bit to zero) manhattan = _mm256_add_ps(manhattan, distance); x += 8; y += 8; } // Sum all floats in manhattan register. result = hsum256_ps_avx(manhattan); } // Don't forget the remaining values. for (; i > 0; i--) { result += fabsf(*x - *y); x++; y++; } return result; } template<> inline float euclidean_distance(const float* x, const float* y, int f) { float result=0; if (f > 7) { __m256 d = _mm256_setzero_ps(); for (; f > 7; f -= 8) { const __m256 diff = _mm256_sub_ps(_mm256_loadu_ps(x), _mm256_loadu_ps(y)); d = _mm256_add_ps(d, _mm256_mul_ps(diff, diff)); // no support for fmadd in AVX... x += 8; y += 8; } // Sum all floats in dot register. result = hsum256_ps_avx(d); } // Don't forget the remaining values. for (; f > 0; f--) { float tmp = *x - *y; result += tmp * tmp; x++; y++; } return result; } #endif #ifdef ANNOYLIB_USE_AVX512 template<> inline float dot(const float* x, const float *y, int f) { float result = 0; if (f > 15) { __m512 d = _mm512_setzero_ps(); for (; f > 15; f -= 16) { //AVX512F includes FMA d = _mm512_fmadd_ps(_mm512_loadu_ps(x), _mm512_loadu_ps(y), d); x += 16; y += 16; } // Sum all floats in dot register. result += _mm512_reduce_add_ps(d); } // Don't forget the remaining values. 
for (; f > 0; f--) { result += *x * *y; x++; y++; } return result; } template<> inline float manhattan_distance(const float* x, const float* y, int f) { float result = 0; int i = f; if (f > 15) { __m512 manhattan = _mm512_setzero_ps(); for (; i > 15; i -= 16) { const __m512 x_minus_y = _mm512_sub_ps(_mm512_loadu_ps(x), _mm512_loadu_ps(y)); manhattan = _mm512_add_ps(manhattan, _mm512_abs_ps(x_minus_y)); x += 16; y += 16; } // Sum all floats in manhattan register. result = _mm512_reduce_add_ps(manhattan); } // Don't forget the remaining values. for (; i > 0; i--) { result += fabsf(*x - *y); x++; y++; } return result; } template<> inline float euclidean_distance(const float* x, const float* y, int f) { float result=0; if (f > 15) { __m512 d = _mm512_setzero_ps(); for (; f > 15; f -= 16) { const __m512 diff = _mm512_sub_ps(_mm512_loadu_ps(x), _mm512_loadu_ps(y)); d = _mm512_fmadd_ps(diff, diff, d); x += 16; y += 16; } // Sum all floats in dot register. result = _mm512_reduce_add_ps(d); } // Don't forget the remaining values. for (; f > 0; f--) { float tmp = *x - *y; result += tmp * tmp; x++; y++; } return result; } #endif template inline void two_means(const vector& nodes, int f, Random& random, bool cosine, Node* p, Node* q) { /* This algorithm is a huge heuristic. Empirically it works really well, but I can't motivate it well. The basic idea is to keep two centroids and assign points to either one of them. We weight each centroid by the number of points assigned to it, so to balance it. 
*/ static int iteration_steps = 200; size_t count = nodes.size(); size_t i = random.index(count); size_t j = random.index(count-1); j += (j >= i); // ensure that i != j Distance::template copy_node(p, nodes[i], f); Distance::template copy_node(q, nodes[j], f); if (cosine) { Distance::template normalize(p, f); Distance::template normalize(q, f); } Distance::init_node(p, f); Distance::init_node(q, f); int ic = 1, jc = 1; for (int l = 0; l < iteration_steps; l++) { size_t k = random.index(count); T di = ic * Distance::distance(p, nodes[k], f), dj = jc * Distance::distance(q, nodes[k], f); T norm = cosine ? Distance::template get_norm(nodes[k], f) : 1; if (!(norm > T(0))) { continue; } if (di < dj) { Distance::update_mean(p, nodes[k], norm, ic, f); Distance::init_node(p, f); ic++; } else if (dj < di) { Distance::update_mean(q, nodes[k], norm, jc, f); Distance::init_node(q, f); jc++; } } } } // namespace struct Base { template static inline void preprocess(void* nodes, size_t _s, const S node_count, const int f) { // Override this in specific metric structs below if you need to do any pre-processing // on the entire set of nodes passed into this index. } template static inline void postprocess(void* nodes, size_t _s, const S node_count, const int f) { // Override this in specific metric structs below if you need to do any post-processing // on the entire set of nodes passed into this index. } template static inline void zero_value(Node* dest) { // Initialize any fields that require sane defaults within this node. 
} template static inline void copy_node(Node* dest, const Node* source, const int f) { memcpy(dest->v, source->v, f * sizeof(T)); } template static inline T get_norm(Node* node, int f) { return sqrt(dot(node->v, node->v, f)); } template static inline void normalize(Node* node, int f) { T norm = Base::get_norm(node, f); if (norm > 0) { for (int z = 0; z < f; z++) node->v[z] /= norm; } } template static inline void update_mean(Node* mean, Node* new_node, T norm, int c, int f) { for (int z = 0; z < f; z++) mean->v[z] = (mean->v[z] * c + new_node->v[z] / norm) / (c + 1); } }; struct Angular : Base { template struct Node { /* * We store a binary tree where each node has two things * - A vector associated with it * - Two children * All nodes occupy the same amount of memory * All nodes with n_descendants == 1 are leaf nodes. * A memory optimization is that for nodes with 2 <= n_descendants <= K, * we skip the vector. Instead we store a list of all descendants. K is * determined by the number of items that fits in the space of the vector. * For nodes with n_descendants == 1 the vector is a data point. * For nodes with n_descendants > K the vector is the normal of the split plane. * Note that we can't really do sizeof(node) because we cheat and allocate * more memory to be able to fit the vector outside */ S n_descendants; union { S children[2]; // Will possibly store more than 2 T norm; }; T v[ANNOYLIB_V_ARRAY_SIZE]; }; template static inline T distance(const Node* x, const Node* y, int f) { // want to calculate (a/|a| - b/|b|)^2 // = a^2 / a^2 + b^2 / b^2 - 2ab/|a||b| // = 2 - 2cos T pp = x->norm ? x->norm : dot(x->v, x->v, f); // For backwards compatibility reasons, we need to fall back and compute the norm here T qq = y->norm ? 
y->norm : dot(y->v, y->v, f); T pq = dot(x->v, y->v, f); T ppqq = pp * qq; if (ppqq > 0) return 2.0 - 2.0 * pq / sqrt(ppqq); else return 2.0; // cos is 0 } template static inline T margin(const Node* n, const T* y, int f) { return dot(n->v, y, f); } template static inline bool side(const Node* n, const T* y, int f, Random& random) { T dot = margin(n, y, f); if (dot != 0) return (dot > 0); else return (bool)random.flip(); } template static inline bool side(const Node* n, const Node* y, int f, Random& random) { return side(n, y->v, f, random); } template static inline void create_split(const vector*>& nodes, int f, size_t s, Random& random, Node* n) { Node* p = (Node*)alloca(s); Node* q = (Node*)alloca(s); two_means >(nodes, f, random, true, p, q); for (int z = 0; z < f; z++) n->v[z] = p->v[z] - q->v[z]; Base::normalize >(n, f); } template static inline T normalized_distance(T distance) { // Used when requesting distances from Python layer // Turns out sometimes the squared distance is -0.0 // so we have to make sure it's a positive number. return sqrt(std::max(distance, T(0))); } template static inline T pq_distance(T distance, T margin, int child_nr) { if (child_nr == 0) margin = -margin; return std::min(distance, margin); } template static inline T pq_initial_value() { return numeric_limits::infinity(); } template static inline void init_node(Node* n, int f) { n->norm = dot(n->v, n->v, f); } static const char* name() { return "angular"; } }; struct DotProduct : Angular { template struct Node { /* * This is an extension of the Angular node with extra attributes for the DotProduct metric. * It has dot_factor which is needed to reduce the task to Angular distance metric (see the preprocess method) * and also a built flag that helps to compute exact dot products when an index is already built. 
*/ S n_descendants; S children[2]; // Will possibly store more than 2 T dot_factor; T norm; bool built; T v[ANNOYLIB_V_ARRAY_SIZE]; }; static const char* name() { return "dot"; } template static inline T get_norm(Node* node, int f) { return sqrt(dot(node->v, node->v, f) + node->dot_factor * node->dot_factor); } template static inline void update_mean(Node* mean, Node* new_node, T norm, int c, int f) { for (int z = 0; z < f; z++) mean->v[z] = (mean->v[z] * c + new_node->v[z] / norm) / (c + 1); mean->dot_factor = (mean->dot_factor * c + new_node->dot_factor / norm) / (c + 1); } template static inline T distance(const Node* x, const Node* y, int f) { if (x->built || y->built) { // When index is already built, we don't need angular distances to retrieve NNs // Thus, we can return dot product scores itself return -dot(x->v, y->v, f); } // Calculated by analogy with the angular case T pp = x->norm ? x->norm : dot(x->v, x->v, f) + x->dot_factor * x->dot_factor; T qq = y->norm ? y->norm : dot(y->v, y->v, f) + y->dot_factor * y->dot_factor; T pq = dot(x->v, y->v, f) + x->dot_factor * y->dot_factor; T ppqq = pp * qq; if (ppqq > 0) return 2.0 - 2.0 * pq / sqrt(ppqq); else return 2.0; } template static inline void zero_value(Node* dest) { dest->dot_factor = 0; } template static inline void init_node(Node* n, int f) { n->built = false; n->norm = dot(n->v, n->v, f) + n->dot_factor * n->dot_factor; } template static inline void copy_node(Node* dest, const Node* source, const int f) { memcpy(dest->v, source->v, f * sizeof(T)); dest->dot_factor = source->dot_factor; } template static inline void create_split(const vector*>& nodes, int f, size_t s, Random& random, Node* n) { Node* p = (Node*)alloca(s); Node* q = (Node*)alloca(s); DotProduct::zero_value(p); DotProduct::zero_value(q); two_means >(nodes, f, random, true, p, q); for (int z = 0; z < f; z++) n->v[z] = p->v[z] - q->v[z]; n->dot_factor = p->dot_factor - q->dot_factor; DotProduct::normalize >(n, f); } template static inline 
void normalize(Node* node, int f) { T norm = sqrt(dot(node->v, node->v, f) + pow(node->dot_factor, 2)); if (norm > 0) { for (int z = 0; z < f; z++) node->v[z] /= norm; node->dot_factor /= norm; } } template static inline T margin(const Node* n, const T* y, int f) { return dot(n->v, y, f); } template static inline T margin(const Node* n, const Node* y, int f) { return dot(n->v, y->v, f) + n->dot_factor * y->dot_factor; } template static inline bool side(const Node* n, const Node* y, int f, Random& random) { T dot = margin(n, y, f); if (dot != 0) return (dot > 0); else return (bool)random.flip(); } template static inline bool side(const Node* n, const T* y, int f, Random& random) { T dot = margin(n, y, f); if (dot != 0) return (dot > 0); else return (bool)random.flip(); } template static inline T normalized_distance(T distance) { return -distance; } template static inline void preprocess(void* nodes, size_t _s, const S node_count, const int f) { // This uses a method from Microsoft Research for transforming inner product spaces to cosine/angular-compatible spaces. // (Bachrach et al., 2014, see https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/XboxInnerProduct.pdf) // Step one: compute the norm of each vector and store that in its extra dimension (f-1) for (S i = 0; i < node_count; i++) { Node* node = get_node_ptr(nodes, _s, i); T d = dot(node->v, node->v, f); T norm = d < 0 ? 0 : sqrt(d); node->dot_factor = norm; node->built = false; } // Step two: find the maximum norm T max_norm = 0; for (S i = 0; i < node_count; i++) { Node* node = get_node_ptr(nodes, _s, i); if (node->dot_factor > max_norm) { max_norm = node->dot_factor; } } // Step three: set each vector's extra dimension to sqrt(max_norm^2 - norm^2) for (S i = 0; i < node_count; i++) { Node* node = get_node_ptr(nodes, _s, i); T node_norm = node->dot_factor; T squared_norm_diff = pow(max_norm, static_cast(2.0)) - pow(node_norm, static_cast(2.0)); T dot_factor = squared_norm_diff < 0 ? 
0 : sqrt(squared_norm_diff); node->norm = pow(max_norm, static_cast(2.0)); node->dot_factor = dot_factor; } } template static inline void postprocess(void* nodes, size_t _s, const S node_count, const int f) { for (S i = 0; i < node_count; i++) { Node* node = get_node_ptr(nodes, _s, i); // When an index is built, we will remember it in index item nodes to compute distances differently node->built = true; } } }; struct Hamming : Base { template struct Node { S n_descendants; S children[2]; T v[ANNOYLIB_V_ARRAY_SIZE]; }; static const size_t max_iterations = 20; template static inline T pq_distance(T distance, T margin, int child_nr) { return distance - (margin != (unsigned int) child_nr); } template static inline T pq_initial_value() { return numeric_limits::max(); } template static inline int cole_popcount(T v) { // Note: Only used with MSVC 9, which lacks intrinsics and fails to // calculate std::bitset::count for v > 32bit. Uses the generalized // approach by Eric Cole. // See https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSet64 v = v - ((v >> 1) & (T)~(T)0/3); v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3); v = (v + (v >> 4)) & (T)~(T)0/255*15; return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * 8; } template static inline T distance(const Node* x, const Node* y, int f) { size_t dist = 0; for (int i = 0; i < f; i++) { dist += annoylib_popcount(x->v[i] ^ y->v[i]); } return dist; } template static inline bool margin(const Node* n, const T* y, int f) { static const size_t n_bits = sizeof(T) * 8; T chunk = n->v[0] / n_bits; return (y[chunk] & (static_cast(1) << (n_bits - 1 - (n->v[0] % n_bits)))) != 0; } template static inline bool side(const Node* n, const T* y, int f, Random& random) { return margin(n, y, f); } template static inline bool side(const Node* n, const Node* y, int f, Random& random) { return side(n, y->v, f, random); } template static inline void create_split(const vector*>& nodes, int f, size_t s, Random& random, Node* n) { size_t 
cur_size = 0; size_t i = 0; int dim = f * 8 * sizeof(T); for (; i < max_iterations; i++) { // choose random position to split at n->v[0] = random.index(dim); cur_size = 0; for (typename vector*>::const_iterator it = nodes.begin(); it != nodes.end(); ++it) { if (margin(n, (*it)->v, f)) { cur_size++; } } if (cur_size > 0 && cur_size < nodes.size()) { break; } } // brute-force search for splitting coordinate if (i == max_iterations) { int j = 0; for (; j < dim; j++) { n->v[0] = j; cur_size = 0; for (typename vector*>::const_iterator it = nodes.begin(); it != nodes.end(); ++it) { if (margin(n, (*it)->v, f)) { cur_size++; } } if (cur_size > 0 && cur_size < nodes.size()) { break; } } } } template static inline T normalized_distance(T distance) { return distance; } template static inline void init_node(Node* n, int f) { } static const char* name() { return "hamming"; } }; struct Minkowski : Base { template struct Node { S n_descendants; T a; // need an extra constant term to determine the offset of the plane S children[2]; T v[ANNOYLIB_V_ARRAY_SIZE]; }; template static inline T margin(const Node* n, const T* y, int f) { return n->a + dot(n->v, y, f); } template static inline bool side(const Node* n, const T* y, int f, Random& random) { T dot = margin(n, y, f); if (dot != 0) return (dot > 0); else return (bool)random.flip(); } template static inline bool side(const Node* n, const Node* y, int f, Random& random) { return side(n, y->v, f, random); } template static inline T pq_distance(T distance, T margin, int child_nr) { if (child_nr == 0) margin = -margin; return std::min(distance, margin); } template static inline T pq_initial_value() { return numeric_limits::infinity(); } }; struct Euclidean : Minkowski { template static inline T distance(const Node* x, const Node* y, int f) { return euclidean_distance(x->v, y->v, f); } template static inline void create_split(const vector*>& nodes, int f, size_t s, Random& random, Node* n) { Node* p = (Node*)alloca(s); Node* q = 
(Node*)alloca(s); two_means >(nodes, f, random, false, p, q); for (int z = 0; z < f; z++) n->v[z] = p->v[z] - q->v[z]; Base::normalize >(n, f); n->a = 0.0; for (int z = 0; z < f; z++) n->a += -n->v[z] * (p->v[z] + q->v[z]) / 2; } template static inline T normalized_distance(T distance) { return sqrt(std::max(distance, T(0))); } template static inline void init_node(Node* n, int f) { } static const char* name() { return "euclidean"; } }; struct Manhattan : Minkowski { template static inline T distance(const Node* x, const Node* y, int f) { return manhattan_distance(x->v, y->v, f); } template static inline void create_split(const vector*>& nodes, int f, size_t s, Random& random, Node* n) { Node* p = (Node*)alloca(s); Node* q = (Node*)alloca(s); two_means >(nodes, f, random, false, p, q); for (int z = 0; z < f; z++) n->v[z] = p->v[z] - q->v[z]; Base::normalize >(n, f); n->a = 0.0; for (int z = 0; z < f; z++) n->a += -n->v[z] * (p->v[z] + q->v[z]) / 2; } template static inline T normalized_distance(T distance) { return std::max(distance, T(0)); } template static inline void init_node(Node* n, int f) { } static const char* name() { return "manhattan"; } }; template class AnnoyIndexInterface { public: // Note that the methods with an **error argument will allocate memory and write the pointer to that string if error is non-NULL virtual ~AnnoyIndexInterface() {}; virtual bool add_item(S item, const T* w, char** error=NULL) = 0; virtual bool build(int q, int n_threads=-1, char** error=NULL) = 0; virtual bool unbuild(char** error=NULL) = 0; virtual bool save(const char* filename, bool prefault=false, char** error=NULL) = 0; virtual void unload() = 0; virtual bool load(const char* filename, bool prefault=false, char** error=NULL) = 0; virtual T get_distance(S i, S j) const = 0; virtual void get_nns_by_item(S item, size_t n, int search_k, vector* result, vector* distances) const = 0; virtual void get_nns_by_vector(const T* w, size_t n, int search_k, vector* result, vector* 
distances) const = 0; virtual S get_n_items() const = 0; virtual S get_n_trees() const = 0; virtual void verbose(bool v) = 0; virtual void get_item(S item, T* v) const = 0; virtual void set_seed(R q) = 0; virtual bool on_disk_build(const char* filename, char** error=NULL) = 0; }; template class AnnoyIndex : public AnnoyIndexInterface= 201103L typename std::remove_const::type #else typename Random::seed_type #endif > { /* * We use random projection to build a forest of binary trees of all items. * Basically just split the hyperspace into two sides by a hyperplane, * then recursively split each of those subtrees etc. * We create a tree like this q times. The default q is determined automatically * in such a way that we at most use 2x as much memory as the vectors take. */ public: typedef Distance D; typedef typename D::template Node Node; #if __cplusplus >= 201103L typedef typename std::remove_const::type R; #else typedef typename Random::seed_type R; #endif protected: const int _f; size_t _s; S _n_items; void* _nodes; // Could either be mmapped, or point to a memory buffer that we reallocate S _n_nodes; S _nodes_size; vector _roots; S _K; R _seed; bool _loaded; bool _verbose; int _fd; bool _on_disk; bool _built; public: AnnoyIndex(int f) : _f(f), _seed(Random::default_seed) { _s = offsetof(Node, v) + _f * sizeof(T); // Size of each node _verbose = false; _built = false; _K = (S) (((size_t) (_s - offsetof(Node, children))) / sizeof(S)); // Max number of descendants to fit into node reinitialize(); // Reset everything } ~AnnoyIndex() { unload(); } int get_f() const { return _f; } bool add_item(S item, const T* w, char** error=NULL) { return add_item_impl(item, w, error); } template bool add_item_impl(S item, const W& w, char** error=NULL) { if (_loaded) { set_error_from_string(error, "You can't add an item to a loaded index"); return false; } _allocate_size(item + 1); Node* n = _get(item); D::zero_value(n); n->children[0] = 0; n->children[1] = 0; n->n_descendants = 1; 
for (int z = 0; z < _f; z++) n->v[z] = w[z]; D::init_node(n, _f); if (item >= _n_items) _n_items = item + 1; return true; } bool on_disk_build(const char* file, char** error=NULL) { _on_disk = true; #ifndef _MSC_VER _fd = open(file, O_RDWR | O_CREAT | O_TRUNC, (int) 0600); #else _fd = _open(file, _O_RDWR | _O_CREAT | _O_TRUNC, (int) 0600); #endif if (_fd == -1) { set_error_from_errno(error, "Unable to open"); _fd = 0; return false; } _nodes_size = 1; if (ftruncate(_fd, ANNOYLIB_FTRUNCATE_SIZE(_s) * ANNOYLIB_FTRUNCATE_SIZE(_nodes_size)) == -1) { set_error_from_errno(error, "Unable to truncate"); return false; } #ifdef MAP_POPULATE _nodes = (Node*) mmap(0, _s * _nodes_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, _fd, 0); #else _nodes = (Node*) mmap(0, _s * _nodes_size, PROT_READ | PROT_WRITE, MAP_SHARED, _fd, 0); #endif return true; } bool build(int q, int n_threads=-1, char** error=NULL) { if (_loaded) { set_error_from_string(error, "You can't build a loaded index"); return false; } if (_built) { set_error_from_string(error, "You can't build a built index"); return false; } D::template preprocess(_nodes, _s, _n_items, _f); _n_nodes = _n_items; ThreadedBuildPolicy::template build(this, q, n_threads); // Also, copy the roots into the last segment of the array // This way we can load them faster without reading the whole file _allocate_size(_n_nodes + (S)_roots.size()); for (size_t i = 0; i < _roots.size(); i++) memcpy(_get(_n_nodes + (S)i), _get(_roots[i]), _s); _n_nodes += _roots.size(); if (_verbose) annoylib_showUpdate("has %d nodes\n", _n_nodes); if (_on_disk) { if (!remap_memory_and_truncate(&_nodes, _fd, static_cast(_s) * static_cast(_nodes_size), static_cast(_s) * static_cast(_n_nodes))) { // TODO: this probably creates an index in a corrupt state... 
not sure what to do set_error_from_errno(error, "Unable to truncate"); return false; } _nodes_size = _n_nodes; } D::template postprocess(_nodes, _s, _n_items, _f); _built = true; return true; } bool unbuild(char** error=NULL) { if (_loaded) { set_error_from_string(error, "You can't unbuild a loaded index"); return false; } _roots.clear(); _n_nodes = _n_items; _built = false; return true; } bool save(const char* filename, bool prefault=false, char** error=NULL) { if (!_built) { set_error_from_string(error, "You can't save an index that hasn't been built"); return false; } if (_on_disk) { return true; } else { // Delete file if it already exists (See issue #335) #ifndef _MSC_VER unlink(filename); #else _unlink(filename); #endif FILE *f = fopen(filename, "wb"); if (f == NULL) { set_error_from_errno(error, "Unable to open"); return false; } if (fwrite(_nodes, _s, _n_nodes, f) != (size_t) _n_nodes) { set_error_from_errno(error, "Unable to write"); return false; } if (fclose(f) == EOF) { set_error_from_errno(error, "Unable to close"); return false; } unload(); return load(filename, prefault, error); } } void reinitialize() { _fd = 0; _nodes = NULL; _loaded = false; _n_items = 0; _n_nodes = 0; _nodes_size = 0; _on_disk = false; _seed = Random::default_seed; _roots.clear(); } void unload() { if (_on_disk && _fd) { #ifndef _MSC_VER close(_fd); #else _close(_fd); #endif munmap(_nodes, _s * _nodes_size); } else { if (_fd) { // we have mmapped data #ifndef _MSC_VER close(_fd); #else _close(_fd); #endif munmap(_nodes, _n_nodes * _s); } else if (_nodes) { // We have heap allocated data free(_nodes); } } reinitialize(); if (_verbose) annoylib_showUpdate("unloaded\n"); } bool load(const char* filename, bool prefault=false, char** error=NULL) { #ifndef _MSC_VER _fd = open(filename, O_RDONLY, (int)0400); #else _fd = _open(filename, _O_RDONLY, (int)0400); #endif if (_fd == -1) { set_error_from_errno(error, "Unable to open"); _fd = 0; return false; } off_t size = lseek_getsize(_fd); 
if (size == -1) { set_error_from_errno(error, "Unable to get size"); return false; } else if (size == 0) { set_error_from_errno(error, "Size of file is zero"); return false; } else if (size % _s) { // Something is fishy with this index! set_error_from_errno(error, "Index size is not a multiple of vector size. Ensure you are opening using the same metric you used to create the index."); return false; } int flags = MAP_SHARED; if (prefault) { #ifdef MAP_POPULATE flags |= MAP_POPULATE; #else annoylib_showUpdate("prefault is set to true, but MAP_POPULATE is not defined on this platform"); #endif } _nodes = (Node*)mmap(0, size, PROT_READ, flags, _fd, 0); _n_nodes = (S)(size / _s); // Find the roots by scanning the end of the file and taking the nodes with most descendants _roots.clear(); S m = -1; for (S i = _n_nodes - 1; i >= 0; i--) { S k = _get(i)->n_descendants; if (m == -1 || k == m) { _roots.push_back(i); m = k; } else { break; } } // hacky fix: since the last root precedes the copy of all roots, delete it if (_roots.size() > 1 && _get(_roots.front())->children[0] == _get(_roots.back())->children[0]) _roots.pop_back(); _loaded = true; _built = true; _n_items = m; if (_verbose) annoylib_showUpdate("found %zu roots with degree %d\n", _roots.size(), m); return true; } T get_distance(S i, S j) const { return D::normalized_distance(D::distance(_get(i), _get(j), _f)); } void get_nns_by_item(S item, size_t n, int search_k, vector* result, vector* distances) const { // TODO: handle OOB const Node* m = _get(item); _get_all_nns(m->v, n, search_k, result, distances); } void get_nns_by_vector(const T* w, size_t n, int search_k, vector* result, vector* distances) const { _get_all_nns(w, n, search_k, result, distances); } S get_n_items() const { return _n_items; } S get_n_trees() const { return (S)_roots.size(); } void verbose(bool v) { _verbose = v; } void get_item(S item, T* v) const { // TODO: handle OOB Node* m = _get(item); memcpy(v, m->v, (_f) * sizeof(T)); } void 
set_seed(R seed) { _seed = seed; } void thread_build(int q, int thread_idx, ThreadedBuildPolicy& threaded_build_policy) { // Each thread needs its own seed, otherwise each thread would be building the same tree(s) Random _random(_seed + thread_idx); vector thread_roots; while (1) { if (q == -1) { threaded_build_policy.lock_n_nodes(); if (_n_nodes >= 2 * _n_items) { threaded_build_policy.unlock_n_nodes(); break; } threaded_build_policy.unlock_n_nodes(); } else { if (thread_roots.size() >= (size_t)q) { break; } } if (_verbose) annoylib_showUpdate("pass %zd...\n", thread_roots.size()); vector indices; threaded_build_policy.lock_shared_nodes(); for (S i = 0; i < _n_items; i++) { if (_get(i)->n_descendants >= 1) { // Issue #223 indices.push_back(i); } } threaded_build_policy.unlock_shared_nodes(); thread_roots.push_back(_make_tree(indices, true, _random, threaded_build_policy)); } threaded_build_policy.lock_roots(); _roots.insert(_roots.end(), thread_roots.begin(), thread_roots.end()); threaded_build_policy.unlock_roots(); } protected: void _reallocate_nodes(S n) { const double reallocation_factor = 1.3; S new_nodes_size = std::max(n, (S) ((_nodes_size + 1) * reallocation_factor)); void *old = _nodes; if (_on_disk) { if (!remap_memory_and_truncate(&_nodes, _fd, static_cast(_s) * static_cast(_nodes_size), static_cast(_s) * static_cast(new_nodes_size)) && _verbose) annoylib_showUpdate("File truncation error\n"); } else { _nodes = realloc(_nodes, _s * new_nodes_size); memset((char *) _nodes + (_nodes_size * _s) / sizeof(char), 0, (new_nodes_size - _nodes_size) * _s); } _nodes_size = new_nodes_size; if (_verbose) annoylib_showUpdate("Reallocating to %d nodes: old_address=%p, new_address=%p\n", new_nodes_size, old, _nodes); } void _allocate_size(S n, ThreadedBuildPolicy& threaded_build_policy) { if (n > _nodes_size) { threaded_build_policy.lock_nodes(); _reallocate_nodes(n); threaded_build_policy.unlock_nodes(); } } void _allocate_size(S n) { if (n > _nodes_size) { 
_reallocate_nodes(n); } } Node* _get(const S i) const { return get_node_ptr(_nodes, _s, i); } double _split_imbalance(const vector& left_indices, const vector& right_indices) { double ls = (float)left_indices.size(); double rs = (float)right_indices.size(); float f = ls / (ls + rs + 1e-9); // Avoid 0/0 return std::max(f, 1-f); } S _make_tree(const vector& indices, bool is_root, Random& _random, ThreadedBuildPolicy& threaded_build_policy) { // The basic rule is that if we have <= _K items, then it's a leaf node, otherwise it's a split node. // There's some regrettable complications caused by the problem that root nodes have to be "special": // 1. We identify root nodes by the arguable logic that _n_items == n->n_descendants, regardless of how many descendants they actually have // 2. Root nodes with only 1 child need to be a "dummy" parent // 3. Due to the _n_items "hack", we need to be careful with the cases where _n_items <= _K or _n_items > _K if (indices.size() == 1 && !is_root) return indices[0]; if (indices.size() <= (size_t)_K && (!is_root || (size_t)_n_items <= (size_t)_K || indices.size() == 1)) { threaded_build_policy.lock_n_nodes(); _allocate_size(_n_nodes + 1, threaded_build_policy); S item = _n_nodes++; threaded_build_policy.unlock_n_nodes(); threaded_build_policy.lock_shared_nodes(); Node* m = _get(item); m->n_descendants = is_root ? _n_items : (S)indices.size(); // Using std::copy instead of a loop seems to resolve issues #3 and #13, // probably because gcc 4.8 goes overboard with optimizations. // Using memcpy instead of std::copy for MSVC compatibility. #235 // Only copy when necessary to avoid crash in MSVC 9. 
#293 if (!indices.empty()) memcpy(m->children, &indices[0], indices.size() * sizeof(S)); threaded_build_policy.unlock_shared_nodes(); return item; } threaded_build_policy.lock_shared_nodes(); vector children; for (size_t i = 0; i < indices.size(); i++) { S j = indices[i]; Node* n = _get(j); if (n) children.push_back(n); } vector children_indices[2]; Node* m = (Node*)alloca(_s); for (int attempt = 0; attempt < 3; attempt++) { children_indices[0].clear(); children_indices[1].clear(); D::create_split(children, _f, _s, _random, m); for (size_t i = 0; i < indices.size(); i++) { S j = indices[i]; Node* n = _get(j); if (n) { bool side = D::side(m, n, _f, _random); children_indices[side].push_back(j); } else { annoylib_showUpdate("No node for index %d?\n", j); } } if (_split_imbalance(children_indices[0], children_indices[1]) < 0.95) break; } threaded_build_policy.unlock_shared_nodes(); // If we didn't find a hyperplane, just randomize sides as a last option while (_split_imbalance(children_indices[0], children_indices[1]) > 0.99) { if (_verbose) annoylib_showUpdate("\tNo hyperplane found (left has %zu children, right has %zu children)\n", children_indices[0].size(), children_indices[1].size()); children_indices[0].clear(); children_indices[1].clear(); // Set the vector to 0.0 for (int z = 0; z < _f; z++) m->v[z] = 0; for (size_t i = 0; i < indices.size(); i++) { S j = indices[i]; // Just randomize... children_indices[_random.flip()].push_back(j); } } int flip = (children_indices[0].size() > children_indices[1].size()); m->n_descendants = is_root ? 
_n_items : (S)indices.size(); for (int side = 0; side < 2; side++) { // run _make_tree for the smallest child first (for cache locality) m->children[side^flip] = _make_tree(children_indices[side^flip], false, _random, threaded_build_policy); } threaded_build_policy.lock_n_nodes(); _allocate_size(_n_nodes + 1, threaded_build_policy); S item = _n_nodes++; threaded_build_policy.unlock_n_nodes(); threaded_build_policy.lock_shared_nodes(); memcpy(_get(item), m, _s); threaded_build_policy.unlock_shared_nodes(); return item; } void _get_all_nns(const T* v, size_t n, int search_k, vector* result, vector* distances) const { Node* v_node = (Node *)alloca(_s); D::template zero_value(v_node); memcpy(v_node->v, v, sizeof(T) * _f); D::init_node(v_node, _f); std::priority_queue > q; if (search_k == -1) { search_k = n * _roots.size(); } for (size_t i = 0; i < _roots.size(); i++) { q.push(make_pair(Distance::template pq_initial_value(), _roots[i])); } std::vector nns; while (nns.size() < (size_t)search_k && !q.empty()) { const pair& top = q.top(); T d = top.first; S i = top.second; Node* nd = _get(i); q.pop(); if (nd->n_descendants == 1 && i < _n_items) { nns.push_back(i); } else if (nd->n_descendants <= _K) { const S* dst = nd->children; nns.insert(nns.end(), dst, &dst[nd->n_descendants]); } else { T margin = D::margin(nd, v, _f); q.push(make_pair(D::pq_distance(d, margin, 1), static_cast(nd->children[1]))); q.push(make_pair(D::pq_distance(d, margin, 0), static_cast(nd->children[0]))); } } // Get distances for all items // To avoid calculating distance multiple times for any items, sort by id std::sort(nns.begin(), nns.end()); vector > nns_dist; S last = -1; for (size_t i = 0; i < nns.size(); i++) { S j = nns[i]; if (j == last) continue; last = j; if (_get(j)->n_descendants == 1) // This is only to guard a really obscure case, #284 nns_dist.push_back(make_pair(D::distance(v_node, _get(j), _f), j)); } size_t m = nns_dist.size(); size_t p = n < m ? 
n : m; // Return this many items std::partial_sort(nns_dist.begin(), nns_dist.begin() + p, nns_dist.end()); for (size_t i = 0; i < p; i++) { if (distances) distances->push_back(D::normalized_distance(nns_dist[i].first)); result->push_back(nns_dist[i].second); } } }; class AnnoyIndexSingleThreadedBuildPolicy { public: template static void build(AnnoyIndex* annoy, int q, int n_threads) { AnnoyIndexSingleThreadedBuildPolicy threaded_build_policy; annoy->thread_build(q, 0, threaded_build_policy); } void lock_n_nodes() {} void unlock_n_nodes() {} void lock_nodes() {} void unlock_nodes() {} void lock_shared_nodes() {} void unlock_shared_nodes() {} void lock_roots() {} void unlock_roots() {} }; #ifdef ANNOYLIB_MULTITHREADED_BUILD class AnnoyIndexMultiThreadedBuildPolicy { private: std::shared_timed_mutex nodes_mutex; std::mutex n_nodes_mutex; std::mutex roots_mutex; public: template static void build(AnnoyIndex* annoy, int q, int n_threads) { AnnoyIndexMultiThreadedBuildPolicy threaded_build_policy; if (n_threads == -1) { // If the hardware_concurrency() value is not well defined or not computable, it returns 0. // We guard against this by using at least 1 thread. n_threads = std::max(1, (int)std::thread::hardware_concurrency()); } vector threads(n_threads); for (int thread_idx = 0; thread_idx < n_threads; thread_idx++) { int trees_per_thread = q == -1 ? 
-1 : (int)floor((q + thread_idx) / n_threads); threads[thread_idx] = std::thread( &AnnoyIndex::thread_build, annoy, trees_per_thread, thread_idx, std::ref(threaded_build_policy) ); } for (auto& thread : threads) { thread.join(); } } void lock_n_nodes() { n_nodes_mutex.lock(); } void unlock_n_nodes() { n_nodes_mutex.unlock(); } void lock_nodes() { nodes_mutex.lock(); } void unlock_nodes() { nodes_mutex.unlock(); } void lock_shared_nodes() { nodes_mutex.lock_shared(); } void unlock_shared_nodes() { nodes_mutex.unlock_shared(); } void lock_roots() { roots_mutex.lock(); } void unlock_roots() { roots_mutex.unlock(); } }; #endif } #endif // vim: tabstop=2 shiftwidth=2 ================================================ FILE: src/annoyluamodule.cc ================================================ // Copyright (c) 2016 Boris Nagaev // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy of // the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the // License for the specific language governing permissions and limitations under // the License. 
#include #include #include #include "annoylib.h" #include "kissrandom.h" #if LUA_VERSION_NUM == 501 #define compat_setfuncs(L, funcs) luaL_register(L, NULL, funcs) #define compat_rawlen lua_objlen #else #define compat_setfuncs(L, funcs) luaL_setfuncs(L, funcs, 0) #define compat_rawlen lua_rawlen #endif using namespace Annoy; template class LuaAnnoy { public: typedef int32_t AnnoyS; typedef float AnnoyT; typedef AnnoyIndex Impl; typedef LuaAnnoy ThisClass; class LuaArrayProxy { public: LuaArrayProxy(lua_State* L, int object, int f) : L_(L) , object_(object) { luaL_checktype(L, object, LUA_TTABLE); int v_len = compat_rawlen(L, object); luaL_argcheck(L, v_len == f, object, "Length of v != f"); } double operator[](int index) const { lua_rawgeti(L_, object_, index + 1); double result = lua_tonumber(L_, -1); lua_pop(L_, 1); return result; } private: lua_State* L_; int object_; }; static void toVector(lua_State* L, int object, int f, AnnoyT* dst) { LuaArrayProxy proxy(L, object, f); for (int i = 0; i < f; i++) { dst[i] = proxy[i]; } } template static void pushVector(lua_State* L, const Vector& v) { lua_createtable(L, v.size(), 0); for (int j = 0; j < v.size(); j++) { lua_pushnumber(L, v[j]); lua_rawseti(L, -2, j + 1); } } static const char* typeAsString() { return typeid(Impl).name(); } static Impl* getAnnoy(lua_State* L, int object) { return reinterpret_cast( luaL_checkudata(L, object, typeAsString()) ); } static int getItemIndex(lua_State* L, int object, int size = -1) { int item = luaL_checkinteger(L, object); luaL_argcheck(L, item >= 0, object, "Index must be >= 0"); if (size != -1) { luaL_argcheck(L, item < size, object, "Index must be < size"); } return item; } static int gc(lua_State* L) { Impl* self = getAnnoy(L, 1); self->~Impl(); return 0; } static int tostring(lua_State* L) { Impl* self = getAnnoy(L, 1); lua_pushfstring( L, "annoy.AnnoyIndex object (%dx%d, %s distance)", self->get_n_items(), self->get_f(), Distance::name() ); return 1; } static int 
add_item(lua_State* L) { Impl* self = getAnnoy(L, 1); int item = getItemIndex(L, 2); self->add_item_impl(item, LuaArrayProxy(L, 3, self->get_f())); return 0; } static int build(lua_State* L) { int nargs = lua_gettop(L); Impl* self = getAnnoy(L, 1); int n_trees = luaL_checkinteger(L, 2); self->build(n_trees, 1); lua_pushboolean(L, true); return 1; } static int on_disk_build(lua_State* L) { Impl* self = getAnnoy(L, 1); const char* filename = luaL_checkstring(L, 2); self->on_disk_build(filename); lua_pushboolean(L, true); return 1; } static int save(lua_State* L) { int nargs = lua_gettop(L); Impl* self = getAnnoy(L, 1); const char* filename = luaL_checkstring(L, 2); bool prefault = true; if (nargs >= 3) { prefault = lua_toboolean(L, 3); } self->save(filename, prefault); lua_pushboolean(L, true); return 1; } static int load(lua_State* L) { Impl* self = getAnnoy(L, 1); int nargs = lua_gettop(L); const char* filename = luaL_checkstring(L, 2); bool prefault = true; if (nargs >= 3) { prefault = lua_toboolean(L, 3); } if (!self->load(filename, prefault)) { return luaL_error(L, "Can't load file: %s", filename); } lua_pushboolean(L, true); return 1; } static int unload(lua_State* L) { Impl* self = getAnnoy(L, 1); self->unload(); lua_pushboolean(L, true); return 1; } struct Searcher { std::vector result; std::vector distances; Impl* self; int n; int search_k; bool include_distances; Searcher(lua_State* L) { int nargs = lua_gettop(L); self = getAnnoy(L, 1); n = luaL_checkinteger(L, 3); search_k = -1; if (nargs >= 4) { search_k = luaL_checkinteger(L, 4); } include_distances = false; if (nargs >= 5) { include_distances = lua_toboolean(L, 5); } } int pushResults(lua_State* L) { pushVector(L, result); if (include_distances) { pushVector(L, distances); } return include_distances ? 2 : 1; } }; static int get_nns_by_item(lua_State* L) { Searcher s(L); int item = getItemIndex(L, 2, s.self->get_n_items()); s.self->get_nns_by_item(item, s.n, s.search_k, &s.result, s.include_distances ? 
&s.distances : NULL); return s.pushResults(L); } static int get_nns_by_vector(lua_State* L) { Searcher s(L); std::vector _vec(s.self->get_f()); AnnoyT* vec = &(_vec[0]); toVector(L, 2, s.self->get_f(), vec); s.self->get_nns_by_vector(vec, s.n, s.search_k, &s.result, s.include_distances ? &s.distances : NULL); return s.pushResults(L); } static int get_item_vector(lua_State* L) { Impl* self = getAnnoy(L, 1); int item = getItemIndex(L, 2, self->get_n_items()); std::vector _vec(self->get_f()); AnnoyT* vec = &(_vec[0]); self->get_item(item, vec); pushVector(L, _vec); return 1; } static int get_distance(lua_State* L) { Impl* self = getAnnoy(L, 1); int i = getItemIndex(L, 2, self->get_n_items()); int j = getItemIndex(L, 3, self->get_n_items()); AnnoyT distance = self->get_distance(i, j); lua_pushnumber(L, distance); return 1; } static int get_n_items(lua_State* L) { Impl* self = getAnnoy(L, 1); lua_pushnumber(L, self->get_n_items()); return 1; } static const luaL_Reg* getMetatable() { static const luaL_Reg funcs[] = { {"__gc", &ThisClass::gc}, {"__tostring", &ThisClass::tostring}, {NULL, NULL}, }; return funcs; } static const luaL_Reg* getMethods() { static const luaL_Reg funcs[] = { {"add_item", &ThisClass::add_item}, {"build", &ThisClass::build}, {"save", &ThisClass::save}, {"load", &ThisClass::load}, {"unload", &ThisClass::unload}, {"get_nns_by_item", &ThisClass::get_nns_by_item}, {"get_nns_by_vector", &ThisClass::get_nns_by_vector}, {"get_item_vector", &ThisClass::get_item_vector}, {"get_distance", &ThisClass::get_distance}, {"get_n_items", &ThisClass::get_n_items}, {"on_disk_build", &ThisClass::on_disk_build}, {NULL, NULL}, }; return funcs; } static void createNew(lua_State* L, int f) { void* self = lua_newuserdata(L, sizeof(Impl)); if (luaL_newmetatable(L, typeAsString())) { compat_setfuncs(L, getMetatable()); lua_newtable(L); compat_setfuncs(L, getMethods()); lua_setfield(L, -2, "__index"); } new (self) Impl(f); lua_setmetatable(L, -2); } }; static int 
lua_an_make(lua_State* L) { int f = luaL_checkinteger(L, 1); const char* metric = "angular"; if (lua_gettop(L) >= 2) { metric = luaL_checkstring(L, 2); } if (strcmp(metric, "angular") == 0) { LuaAnnoy::createNew(L, f); return 1; } else if (strcmp(metric, "euclidean") == 0) { LuaAnnoy::createNew(L, f); return 1; } else if (strcmp(metric, "manhattan") == 0) { LuaAnnoy::createNew(L, f); return 1; } else { return luaL_error(L, "Unknown metric: %s", metric); } } static const luaL_Reg LUA_ANNOY_FUNCS[] = { {"AnnoyIndex", lua_an_make}, {NULL, NULL}, }; extern "C" { int luaopen_annoy(lua_State* L) { lua_newtable(L); compat_setfuncs(L, LUA_ANNOY_FUNCS); return 1; } } // vim: tabstop=2 shiftwidth=2 ================================================ FILE: src/annoymodule.cc ================================================ // Copyright (c) 2013 Spotify AB // // Licensed under the Apache License, Version 2.0 (the "License"); you may not // use this file except in compliance with the License. You may obtain a copy of // the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the // License for the specific language governing permissions and limitations under // the License. 
class HammingWrapper : public AnnoyIndexInterface {
  // Wrapper class for Hamming distance, using composition.
  // This translates binary (float) vectors into packed uint64_t vectors.
  // This is questionable from a performance point of view. Should reconsider this solution.
  //
  // Callers see vectors of _f_external floats; the wrapped _index stores
  // _f_internal = ceil(_f_external / 64) uint64_t words, one bit per
  // external component.
  //
  // NOTE(review): the template arguments of the base class, of _index and of
  // the vector declarations are not visible in this copy -- confirm them
  // against the original file.
private:
  int32_t _f_external, _f_internal;  // external (float) length / internal (word) length
  AnnoyIndex _index;                 // the index that actually stores the packed words
  // Packs _f_external floats from src into _f_internal words in dst.
  // Bit j of dst[i] is set when src[i*64+j] > 0.5; bits past _f_external
  // stay zero.
  void _pack(const float* src, uint64_t* dst) const {
    for (int32_t i = 0; i < _f_internal; i++) {
      dst[i] = 0;
      for (int32_t j = 0; j < 64 && i*64+j < _f_external; j++) {
        dst[i] |= (uint64_t)(src[i * 64 + j] > 0.5) << j;
      }
    }
  };
  // Inverse of _pack: writes 0 or 1 into dst[i] from bit (i % 64) of
  // src[i / 64], for every external component.
  void _unpack(const uint64_t* src, float* dst) const {
    for (int32_t i = 0; i < _f_external; i++) {
      dst[i] = (src[i / 64] >> (i % 64)) & 1;
    }
  };
public:
  // f is the external vector length; the wrapped index is created with the
  // rounded-up number of 64-bit words.
  HammingWrapper(int f) : _f_external(f), _f_internal((f + 63) / 64), _index((f + 63) / 64) {};
  // Packs w and forwards to the wrapped index; returns its result.
  bool add_item(int32_t item, const float* w, char**error) {
    vector w_internal(_f_internal, 0);
    _pack(w, &w_internal[0]);
    return _index.add_item(item, &w_internal[0], error);
  };
  // The following members forward unchanged to the wrapped index.
  bool build(int q, int n_threads, char** error) { return _index.build(q, n_threads, error); };
  bool unbuild(char** error) { return _index.unbuild(error); };
  bool save(const char* filename, bool prefault, char** error) { return _index.save(filename, prefault, error); };
  void unload() { _index.unload(); };
  bool load(const char* filename, bool prefault, char** error) { return _index.load(filename, prefault, error); };
  // The wrapped index's distance is converted to float on return.
  float get_distance(int32_t i, int32_t j) const { return _index.get_distance(i, j); };
  // Nearest neighbours of a stored item. When distances is non-NULL the
  // wrapped index fills a temporary vector whose contents are then inserted
  // at the front of *distances (converting the element type).
  void get_nns_by_item(int32_t item, size_t n, int search_k, vector* result, vector* distances) const {
    if (distances) {
      vector distances_internal;
      _index.get_nns_by_item(item, n, search_k, result, &distances_internal);
      distances->insert(distances->begin(), distances_internal.begin(), distances_internal.end());
    } else {
      _index.get_nns_by_item(item, n, search_k, result, NULL);
    }
  };
  // Same as get_nns_by_item, but the query is an external float vector that
  // is packed first.
  void get_nns_by_vector(const float* w, size_t n, int search_k, vector* result, vector* distances) const {
    vector w_internal(_f_internal, 0);
    _pack(w, &w_internal[0]);
    if (distances) {
      vector distances_internal;
      _index.get_nns_by_vector(&w_internal[0], n, search_k, result, &distances_internal);
      distances->insert(distances->begin(), distances_internal.begin(), distances_internal.end());
    } else {
      _index.get_nns_by_vector(&w_internal[0], n, search_k, result, NULL);
    }
  };
  int32_t get_n_items() const { return _index.get_n_items(); };
  int32_t get_n_trees() const { return _index.get_n_trees(); };
  void verbose(bool v) { _index.verbose(v); };
  // Reads the packed item and unpacks it into v (_f_external floats, each
  // 0 or 1).
  void get_item(int32_t item, float* v) const {
    vector v_internal(_f_internal, 0);
    _index.get_item(item, &v_internal[0]);
    _unpack(&v_internal[0], v);
  };
  void set_seed(uint64_t q) { _index.set_seed(q); };
  bool on_disk_build(const char* filename, char** error) { return _index.on_disk_build(filename, error); };
};
Please pass metric='angular' explicitly.", 1); self->ptr = new AnnoyIndex(self->f); } else if (!strcmp(metric, "angular")) { self->ptr = new AnnoyIndex(self->f); } else if (!strcmp(metric, "euclidean")) { self->ptr = new AnnoyIndex(self->f); } else if (!strcmp(metric, "manhattan")) { self->ptr = new AnnoyIndex(self->f); } else if (!strcmp(metric, "hamming")) { self->ptr = new HammingWrapper(self->f); } else if (!strcmp(metric, "dot")) { self->ptr = new AnnoyIndex(self->f); } else { PyErr_SetString(PyExc_ValueError, "No such metric"); return NULL; } return (PyObject *)self; } static int py_an_init(py_annoy *self, PyObject *args, PyObject *kwargs) { // Seems to be needed for Python 3 const char *metric = NULL; int f; static char const * kwlist[] = {"f", "metric", NULL}; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "i|s", (char**)kwlist, &f, &metric)) return (int) NULL; return 0; } static void py_an_dealloc(py_annoy* self) { delete self->ptr; Py_TYPE(self)->tp_free((PyObject*)self); } static PyMemberDef py_annoy_members[] = { {(char*)"f", T_INT, offsetof(py_annoy, f), 0, (char*)""}, {NULL} /* Sentinel */ }; static PyObject * py_an_load(py_annoy *self, PyObject *args, PyObject *kwargs) { char *filename, *error; bool prefault = false; if (!self->ptr) return NULL; static char const * kwlist[] = {"fn", "prefault", NULL}; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|b", (char**)kwlist, &filename, &prefault)) return NULL; if (!self->ptr->load(filename, prefault, &error)) { PyErr_SetString(PyExc_IOError, error); free(error); return NULL; } Py_RETURN_TRUE; } static PyObject * py_an_save(py_annoy *self, PyObject *args, PyObject *kwargs) { char *filename, *error; bool prefault = false; if (!self->ptr) return NULL; static char const * kwlist[] = {"fn", "prefault", NULL}; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|b", (char**)kwlist, &filename, &prefault)) return NULL; if (!self->ptr->save(filename, prefault, &error)) { PyErr_SetString(PyExc_IOError, error); 
free(error); return NULL; } Py_RETURN_TRUE; } PyObject* get_nns_to_python(const vector& result, const vector& distances, int include_distances) { PyObject* l = NULL; PyObject* d = NULL; PyObject* t = NULL; if ((l = PyList_New(result.size())) == NULL) { goto error; } for (size_t i = 0; i < result.size(); i++) { PyObject* res = PyInt_FromLong(result[i]); if (res == NULL) { goto error; } PyList_SetItem(l, i, res); } if (!include_distances) return l; if ((d = PyList_New(distances.size())) == NULL) { goto error; } for (size_t i = 0; i < distances.size(); i++) { PyObject* dist = PyFloat_FromDouble(distances[i]); if (dist == NULL) { goto error; } PyList_SetItem(d, i, dist); } if ((t = PyTuple_Pack(2, l, d)) == NULL) { goto error; } Py_XDECREF(l); Py_XDECREF(d); return t; error: Py_XDECREF(l); Py_XDECREF(d); Py_XDECREF(t); return NULL; } bool check_constraints(py_annoy *self, int32_t item, bool building) { if (item < 0) { PyErr_SetString(PyExc_IndexError, "Item index can not be negative"); return false; } else if (!building && item >= self->ptr->get_n_items()) { PyErr_SetString(PyExc_IndexError, "Item index larger than the largest item index"); return false; } else { return true; } } static PyObject* py_an_get_nns_by_item(py_annoy *self, PyObject *args, PyObject *kwargs) { int32_t item, n, search_k=-1, include_distances=0; if (!self->ptr) return NULL; static char const * kwlist[] = {"i", "n", "search_k", "include_distances", NULL}; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "ii|ii", (char**)kwlist, &item, &n, &search_k, &include_distances)) return NULL; if (!check_constraints(self, item, false)) { return NULL; } vector result; vector distances; Py_BEGIN_ALLOW_THREADS; self->ptr->get_nns_by_item(item, n, search_k, &result, include_distances ? 
&distances : NULL); Py_END_ALLOW_THREADS; return get_nns_to_python(result, distances, include_distances); } bool convert_list_to_vector(PyObject* v, int f, vector* w) { Py_ssize_t length = PyObject_Size(v); if (length == -1) { return false; } if (length != f) { PyErr_Format(PyExc_IndexError, "Vector has wrong length (expected %d, got %ld)", f, length); return false; } for (int z = 0; z < f; z++) { PyObject *key = PyInt_FromLong(z); if (key == NULL) { return false; } PyObject *pf = PyObject_GetItem(v, key); Py_DECREF(key); if (pf == NULL) { return false; } double value = PyFloat_AsDouble(pf); Py_DECREF(pf); if (value == -1.0 && PyErr_Occurred()) { return false; } (*w)[z] = value; } return true; } static PyObject* py_an_get_nns_by_vector(py_annoy *self, PyObject *args, PyObject *kwargs) { PyObject* v; int32_t n, search_k=-1, include_distances=0; if (!self->ptr) return NULL; static char const * kwlist[] = {"vector", "n", "search_k", "include_distances", NULL}; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "Oi|ii", (char**)kwlist, &v, &n, &search_k, &include_distances)) return NULL; vector w(self->f); if (!convert_list_to_vector(v, self->f, &w)) { return NULL; } vector result; vector distances; Py_BEGIN_ALLOW_THREADS; self->ptr->get_nns_by_vector(&w[0], n, search_k, &result, include_distances ? 
&distances : NULL); Py_END_ALLOW_THREADS; return get_nns_to_python(result, distances, include_distances); } static PyObject* py_an_get_item_vector(py_annoy *self, PyObject *args) { int32_t item; if (!self->ptr) return NULL; if (!PyArg_ParseTuple(args, "i", &item)) return NULL; if (!check_constraints(self, item, false)) { return NULL; } vector v(self->f); self->ptr->get_item(item, &v[0]); PyObject* l = PyList_New(self->f); if (l == NULL) { return NULL; } for (int z = 0; z < self->f; z++) { PyObject* dist = PyFloat_FromDouble(v[z]); if (dist == NULL) { goto error; } PyList_SetItem(l, z, dist); } return l; error: Py_XDECREF(l); return NULL; } static PyObject* py_an_add_item(py_annoy *self, PyObject *args, PyObject* kwargs) { PyObject* v; int32_t item; if (!self->ptr) return NULL; static char const * kwlist[] = {"i", "vector", NULL}; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "iO", (char**)kwlist, &item, &v)) return NULL; if (!check_constraints(self, item, true)) { return NULL; } vector w(self->f); if (!convert_list_to_vector(v, self->f, &w)) { return NULL; } char* error; if (!self->ptr->add_item(item, &w[0], &error)) { PyErr_SetString(PyExc_Exception, error); free(error); return NULL; } Py_RETURN_NONE; } static PyObject * py_an_on_disk_build(py_annoy *self, PyObject *args, PyObject *kwargs) { char *filename, *error; if (!self->ptr) return NULL; static char const * kwlist[] = {"fn", NULL}; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s", (char**)kwlist, &filename)) return NULL; if (!self->ptr->on_disk_build(filename, &error)) { PyErr_SetString(PyExc_IOError, error); free(error); return NULL; } Py_RETURN_TRUE; } static PyObject * py_an_build(py_annoy *self, PyObject *args, PyObject *kwargs) { int q; int n_jobs = -1; if (!self->ptr) return NULL; static char const * kwlist[] = {"n_trees", "n_jobs", NULL}; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "i|i", (char**)kwlist, &q, &n_jobs)) return NULL; bool res; char* error; Py_BEGIN_ALLOW_THREADS; res = 
self->ptr->build(q, n_jobs, &error); Py_END_ALLOW_THREADS; if (!res) { PyErr_SetString(PyExc_Exception, error); free(error); return NULL; } Py_RETURN_TRUE; } static PyObject * py_an_unbuild(py_annoy *self) { if (!self->ptr) return NULL; char* error; if (!self->ptr->unbuild(&error)) { PyErr_SetString(PyExc_Exception, error); free(error); return NULL; } Py_RETURN_TRUE; } static PyObject * py_an_unload(py_annoy *self) { if (!self->ptr) return NULL; self->ptr->unload(); Py_RETURN_TRUE; } static PyObject * py_an_get_distance(py_annoy *self, PyObject *args) { int32_t i, j; if (!self->ptr) return NULL; if (!PyArg_ParseTuple(args, "ii", &i, &j)) return NULL; if (!check_constraints(self, i, false) || !check_constraints(self, j, false)) { return NULL; } double d = self->ptr->get_distance(i,j); return PyFloat_FromDouble(d); } static PyObject * py_an_get_n_items(py_annoy *self) { if (!self->ptr) return NULL; int32_t n = self->ptr->get_n_items(); return PyInt_FromLong(n); } static PyObject * py_an_get_n_trees(py_annoy *self) { if (!self->ptr) return NULL; int32_t n = self->ptr->get_n_trees(); return PyInt_FromLong(n); } static PyObject * py_an_verbose(py_annoy *self, PyObject *args) { int verbose; if (!self->ptr) return NULL; if (!PyArg_ParseTuple(args, "i", &verbose)) return NULL; self->ptr->verbose((bool)verbose); Py_RETURN_TRUE; } static PyObject * py_an_set_seed(py_annoy *self, PyObject *args) { int q; if (!self->ptr) return NULL; if (!PyArg_ParseTuple(args, "i", &q)) return NULL; self->ptr->set_seed(q); Py_RETURN_NONE; } static PyMethodDef AnnoyMethods[] = { {"load", (PyCFunction)py_an_load, METH_VARARGS | METH_KEYWORDS, "Loads (mmaps) an index from disk."}, {"save", (PyCFunction)py_an_save, METH_VARARGS | METH_KEYWORDS, "Saves the index to disk."}, {"get_nns_by_item",(PyCFunction)py_an_get_nns_by_item, METH_VARARGS | METH_KEYWORDS, "Returns the `n` closest items to item `i`.\n\n:param search_k: the query will inspect up to `search_k` nodes.\n`search_k` gives you a 
run-time tradeoff between better accuracy and speed.\n`search_k` defaults to `n_trees * n` if not provided.\n\n:param include_distances: If `True`, this function will return a\n2 element tuple of lists. The first list contains the `n` closest items.\nThe second list contains the corresponding distances."}, {"get_nns_by_vector",(PyCFunction)py_an_get_nns_by_vector, METH_VARARGS | METH_KEYWORDS, "Returns the `n` closest items to vector `vector`.\n\n:param search_k: the query will inspect up to `search_k` nodes.\n`search_k` gives you a run-time tradeoff between better accuracy and speed.\n`search_k` defaults to `n_trees * n` if not provided.\n\n:param include_distances: If `True`, this function will return a\n2 element tuple of lists. The first list contains the `n` closest items.\nThe second list contains the corresponding distances."}, {"get_item_vector",(PyCFunction)py_an_get_item_vector, METH_VARARGS, "Returns the vector for item `i` that was previously added."}, {"add_item",(PyCFunction)py_an_add_item, METH_VARARGS | METH_KEYWORDS, "Adds item `i` (any nonnegative integer) with vector `v`.\n\nNote that it will allocate memory for `max(i)+1` items."}, {"on_disk_build",(PyCFunction)py_an_on_disk_build, METH_VARARGS | METH_KEYWORDS, "Build will be performed with storage on disk instead of RAM."}, {"build",(PyCFunction)py_an_build, METH_VARARGS | METH_KEYWORDS, "Builds a forest of `n_trees` trees.\n\nMore trees give higher precision when querying. After calling `build`,\nno more items can be added. `n_jobs` specifies the number of threads used to build the trees. 
`n_jobs=-1` uses all available CPU cores."}, {"unbuild",(PyCFunction)py_an_unbuild, METH_NOARGS, "Unbuilds the tree in order to allows adding new items.\n\nbuild() has to be called again afterwards in order to\nrun queries."}, {"unload",(PyCFunction)py_an_unload, METH_NOARGS, "Unloads an index from disk."}, {"get_distance",(PyCFunction)py_an_get_distance, METH_VARARGS, "Returns the distance between items `i` and `j`."}, {"get_n_items",(PyCFunction)py_an_get_n_items, METH_NOARGS, "Returns the number of items in the index."}, {"get_n_trees",(PyCFunction)py_an_get_n_trees, METH_NOARGS, "Returns the number of trees in the index."}, {"verbose",(PyCFunction)py_an_verbose, METH_VARARGS, ""}, {"set_seed",(PyCFunction)py_an_set_seed, METH_VARARGS, "Sets the seed of Annoy's random number generator."}, {NULL, NULL, 0, NULL} /* Sentinel */ }; static PyTypeObject PyAnnoyType = { PyVarObject_HEAD_INIT(NULL, 0) "annoy.Annoy", /*tp_name*/ sizeof(py_annoy), /*tp_basicsize*/ 0, /*tp_itemsize*/ (destructor)py_an_dealloc, /*tp_dealloc*/ 0, /*tp_print*/ 0, /*tp_getattr*/ 0, /*tp_setattr*/ 0, /*tp_compare*/ 0, /*tp_repr*/ 0, /*tp_as_number*/ 0, /*tp_as_sequence*/ 0, /*tp_as_mapping*/ 0, /*tp_hash */ 0, /*tp_call*/ 0, /*tp_str*/ 0, /*tp_getattro*/ 0, /*tp_setattro*/ 0, /*tp_as_buffer*/ Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/ ANNOY_DOC, /* tp_doc */ 0, /* tp_traverse */ 0, /* tp_clear */ 0, /* tp_richcompare */ 0, /* tp_weaklistoffset */ 0, /* tp_iter */ 0, /* tp_iternext */ AnnoyMethods, /* tp_methods */ py_annoy_members, /* tp_members */ 0, /* tp_getset */ 0, /* tp_base */ 0, /* tp_dict */ 0, /* tp_descr_get */ 0, /* tp_descr_set */ 0, /* tp_dictoffset */ (initproc)py_an_init, /* tp_init */ 0, /* tp_alloc */ py_an_new, /* tp_new */ }; static PyMethodDef module_methods[] = { {NULL} /* Sentinel */ }; #if PY_MAJOR_VERSION >= 3 static struct PyModuleDef moduledef = { PyModuleDef_HEAD_INIT, "annoylib", /* m_name */ ANNOY_DOC, /* m_doc */ -1, /* m_size */ module_methods, /* 
m_methods */ NULL, /* m_reload */ NULL, /* m_traverse */ NULL, /* m_clear */ NULL, /* m_free */ }; #endif PyObject *create_module(void) { PyObject *m; if (PyType_Ready(&PyAnnoyType) < 0) return NULL; #if PY_MAJOR_VERSION >= 3 m = PyModule_Create(&moduledef); #else m = Py_InitModule("annoylib", module_methods); #endif if (m == NULL) return NULL; Py_INCREF(&PyAnnoyType); PyModule_AddObject(m, "Annoy", (PyObject *)&PyAnnoyType); return m; } #if PY_MAJOR_VERSION >= 3 PyMODINIT_FUNC PyInit_annoylib(void) { return create_module(); // it should return moudule object in py3 } #else PyMODINIT_FUNC initannoylib(void) { create_module(); } #endif // vim: tabstop=2 shiftwidth=2 ================================================ FILE: src/kissrandom.h ================================================ #ifndef ANNOY_KISSRANDOM_H #define ANNOY_KISSRANDOM_H #if defined(_MSC_VER) && _MSC_VER == 1500 typedef unsigned __int32 uint32_t; typedef unsigned __int64 uint64_t; #else #include #endif namespace Annoy { // KISS = "keep it simple, stupid", but high quality random number generator // http://www0.cs.ucl.ac.uk/staff/d.jones/GoodPracticeRNG.pdf -> "Use a good RNG and build it into your code" // http://mathforum.org/kb/message.jspa?messageID=6627731 // https://de.wikipedia.org/wiki/KISS_(Zufallszahlengenerator) // 32 bit KISS struct Kiss32Random { uint32_t x; uint32_t y; uint32_t z; uint32_t c; static const uint32_t default_seed = 123456789; #if __cplusplus < 201103L typedef uint32_t seed_type; #endif // seed must be != 0 Kiss32Random(uint32_t seed = default_seed) { x = seed; y = 362436000; z = 521288629; c = 7654321; } uint32_t kiss() { // Linear congruence generator x = 69069 * x + 12345; // Xor shift y ^= y << 13; y ^= y >> 17; y ^= y << 5; // Multiply-with-carry uint64_t t = 698769069ULL * z + c; c = t >> 32; z = (uint32_t) t; return x + y + z; } inline int flip() { // Draw random 0 or 1 return kiss() & 1; } inline size_t index(size_t n) { // Draw random integer between 0 and n-1 
where n is at most the number of data points you have return kiss() % n; } inline void set_seed(uint32_t seed) { x = seed; } }; // 64 bit KISS. Use this if you have more than about 2^24 data points ("big data" ;) ) struct Kiss64Random { uint64_t x; uint64_t y; uint64_t z; uint64_t c; static const uint64_t default_seed = 1234567890987654321ULL; #if __cplusplus < 201103L typedef uint64_t seed_type; #endif // seed must be != 0 Kiss64Random(uint64_t seed = default_seed) { x = seed; y = 362436362436362436ULL; z = 1066149217761810ULL; c = 123456123456123456ULL; } uint64_t kiss() { // Linear congruence generator z = 6906969069LL*z+1234567; // Xor shift y ^= (y<<13); y ^= (y>>17); y ^= (y<<43); // Multiply-with-carry (uint128_t t = (2^58 + 1) * x + c; c = t >> 64; x = (uint64_t) t) uint64_t t = (x<<58)+c; c = (x>>6); x += t; c += (x #include #include #include #define PROT_NONE 0 #define PROT_READ 1 #define PROT_WRITE 2 #define PROT_EXEC 4 #define MAP_FILE 0 #define MAP_SHARED 1 #define MAP_PRIVATE 2 #define MAP_TYPE 0xf #define MAP_FIXED 0x10 #define MAP_ANONYMOUS 0x20 #define MAP_ANON MAP_ANONYMOUS #define MAP_FAILED ((void *)-1) /* Flags for msync. */ #define MS_ASYNC 1 #define MS_SYNC 2 #define MS_INVALIDATE 4 #ifndef FILE_MAP_EXECUTE #define FILE_MAP_EXECUTE 0x0020 #endif static int __map_mman_error(const DWORD err, const int deferr) { if (err == 0) return 0; //TODO: implement return err; } static DWORD __map_mmap_prot_page(const int prot) { DWORD protect = 0; if (prot == PROT_NONE) return protect; if ((prot & PROT_EXEC) != 0) { protect = ((prot & PROT_WRITE) != 0) ? PAGE_EXECUTE_READWRITE : PAGE_EXECUTE_READ; } else { protect = ((prot & PROT_WRITE) != 0) ? 
PAGE_READWRITE : PAGE_READONLY; } return protect; } static DWORD __map_mmap_prot_file(const int prot) { DWORD desiredAccess = 0; if (prot == PROT_NONE) return desiredAccess; if ((prot & PROT_READ) != 0) desiredAccess |= FILE_MAP_READ; if ((prot & PROT_WRITE) != 0) desiredAccess |= FILE_MAP_WRITE; if ((prot & PROT_EXEC) != 0) desiredAccess |= FILE_MAP_EXECUTE; return desiredAccess; } inline void* mmap(void *addr, size_t len, int prot, int flags, int fildes, off_t off) { HANDLE fm, h; void * map = MAP_FAILED; #ifdef _MSC_VER #pragma warning(push) #pragma warning(disable: 4293) #endif const DWORD dwFileOffsetLow = (sizeof(off_t) <= sizeof(DWORD)) ? (DWORD)off : (DWORD)(off & 0xFFFFFFFFL); const DWORD dwFileOffsetHigh = (sizeof(off_t) <= sizeof(DWORD)) ? (DWORD)0 : (DWORD)((off >> 32) & 0xFFFFFFFFL); const DWORD protect = __map_mmap_prot_page(prot); const DWORD desiredAccess = __map_mmap_prot_file(prot); const off_t maxSize = off + (off_t)len; const DWORD dwMaxSizeLow = (sizeof(off_t) <= sizeof(DWORD)) ? (DWORD)maxSize : (DWORD)(maxSize & 0xFFFFFFFFL); const DWORD dwMaxSizeHigh = (sizeof(off_t) <= sizeof(DWORD)) ? (DWORD)0 : (DWORD)((maxSize >> 32) & 0xFFFFFFFFL); #ifdef _MSC_VER #pragma warning(pop) #endif errno = 0; if (len == 0 /* Unsupported flag combinations */ || (flags & MAP_FIXED) != 0 /* Usupported protection combinations */ || prot == PROT_EXEC) { errno = EINVAL; return MAP_FAILED; } h = ((flags & MAP_ANONYMOUS) == 0) ? 
(HANDLE)_get_osfhandle(fildes) : INVALID_HANDLE_VALUE; if ((flags & MAP_ANONYMOUS) == 0 && h == INVALID_HANDLE_VALUE) { errno = EBADF; return MAP_FAILED; } fm = CreateFileMapping(h, NULL, protect, dwMaxSizeHigh, dwMaxSizeLow, NULL); if (fm == NULL) { errno = __map_mman_error(GetLastError(), EPERM); return MAP_FAILED; } map = MapViewOfFile(fm, desiredAccess, dwFileOffsetHigh, dwFileOffsetLow, len); CloseHandle(fm); if (map == NULL) { errno = __map_mman_error(GetLastError(), EPERM); return MAP_FAILED; } return map; } inline int munmap(void *addr, size_t len) { if (UnmapViewOfFile(addr)) return 0; errno = __map_mman_error(GetLastError(), EPERM); return -1; } inline int mprotect(void *addr, size_t len, int prot) { DWORD newProtect = __map_mmap_prot_page(prot); DWORD oldProtect = 0; if (VirtualProtect(addr, len, newProtect, &oldProtect)) return 0; errno = __map_mman_error(GetLastError(), EPERM); return -1; } inline int msync(void *addr, size_t len, int flags) { if (FlushViewOfFile(addr, len)) return 0; errno = __map_mman_error(GetLastError(), EPERM); return -1; } inline int mlock(const void *addr, size_t len) { if (VirtualLock((LPVOID)addr, len)) return 0; errno = __map_mman_error(GetLastError(), EPERM); return -1; } inline int munlock(const void *addr, size_t len) { if (VirtualUnlock((LPVOID)addr, len)) return 0; errno = __map_mman_error(GetLastError(), EPERM); return -1; } #if !defined(__MINGW32__) inline int ftruncate(const int fd, const int64_t size) { if (fd < 0) { errno = EBADF; return -1; } HANDLE h = reinterpret_cast(_get_osfhandle(fd)); LARGE_INTEGER li_start, li_size; li_start.QuadPart = static_cast(0); li_size.QuadPart = size; if (SetFilePointerEx(h, li_start, NULL, FILE_CURRENT) == ~0 || SetFilePointerEx(h, li_size, NULL, FILE_BEGIN) == ~0 || !SetEndOfFile(h)) { unsigned long error = GetLastError(); fprintf(stderr, "I/O error while truncating: %lu\n", error); switch (error) { case ERROR_INVALID_HANDLE: errno = EBADF; break; default: errno = EIO; break; } 
return -1; } return 0; } #endif #endif ================================================ FILE: test/accuracy_test.py ================================================ # Copyright (c) 2013 Spotify AB # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy of # the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations under # the License. from __future__ import print_function import os import h5py from annoy import AnnoyIndex try: from urllib import urlretrieve except ImportError: from urllib.request import urlretrieve # Python 3 def _get_index(dataset, custom_distance=None, custom_dim=None): url = 'http://ann-benchmarks.com/%s.hdf5' % dataset vectors_fn = os.path.join("test", dataset + ".hdf5") index_fn = os.path.join("test", dataset + ".annoy") if not os.path.exists(vectors_fn): print("downloading", url, "->", vectors_fn) urlretrieve(url, vectors_fn) dataset_f = h5py.File(vectors_fn, "r") distance = dataset_f.attrs["distance"] if custom_distance is not None: distance = custom_distance f = dataset_f["train"].shape[1] if custom_dim: f = custom_dim if custom_distance: dataset = dataset.rsplit('-', 2)[0] + "-%d-%s" % (f, custom_distance) index_fn = os.path.join('test', dataset + '.annoy') annoy = AnnoyIndex(f, distance) if not os.path.exists(index_fn): print("adding items", distance, f) for i, v in enumerate(dataset_f["train"]): if len(v) > f: v = v[:f] annoy.add_item(i, v) print("building index") annoy.build(10) annoy.save(index_fn) else: annoy.load(index_fn) return annoy, dataset_f, dataset def _test_index(dataset, exp_accuracy, custom_metric=None, custom_dim=None): annoy, 
def test_lastfm_angular():
    """Run the lastfm accuracy check with the metric overridden to angular.

    Uses the same dataset file as the dot-product test but passes a custom
    metric and a custom dimension of 65 to ``_test_index``.
    """
    _test_index(
        'lastfm-64-dot',
        60.00,
        custom_metric='angular',
        custom_dim=65,
    )
import random

import numpy
import pytest

from annoy import AnnoyIndex


def test_get_nns_by_vector():
    """Three axis-aligned items are ranked by angle to each query vector."""
    index = AnnoyIndex(3, "angular")
    for item, vector in enumerate(([0, 0, 1], [0, 1, 0], [1, 0, 0])):
        index.add_item(item, vector)
    index.build(10)

    cases = (
        ([3, 2, 1], [2, 1, 0]),
        ([1, 2, 3], [0, 1, 2]),
        ([2, 0, 1], [2, 0, 1]),
    )
    for query, expected in cases:
        assert index.get_nns_by_vector(query, 3) == expected


def test_get_nns_by_item():
    """Neighbours of a stored item start with the item itself."""
    index = AnnoyIndex(3, "angular")
    for item, vector in enumerate(([2, 1, 0], [1, 2, 0], [0, 0, 1])):
        index.add_item(item, vector)
    index.build(10)

    assert index.get_nns_by_item(0, 3) == [0, 1, 2]
    assert index.get_nns_by_item(1, 3) == [1, 0, 2]
    # Items 0 and 1 are equally far from item 2, so either order is accepted.
    assert index.get_nns_by_item(2, 3) in [[2, 0, 1], [2, 1, 0]]


def test_dist():
    """Distance between (0, 1) and (1, 1) matches the closed-form value."""
    index = AnnoyIndex(2, "angular")
    index.add_item(0, [0, 1])
    index.add_item(1, [1, 1])

    expected = (2 * (1.0 - 2**-0.5)) ** 0.5
    assert index.get_distance(0, 1) == pytest.approx(expected)


def test_dist_2():
    """Parallel vectors of different length are at distance zero."""
    index = AnnoyIndex(2, "angular")
    index.add_item(0, [1000, 0])
    index.add_item(1, [10, 0])

    assert index.get_distance(0, 1) == pytest.approx(0)


def test_dist_3():
    """Distance between (97, 0) and (42, 42) matches the closed-form value."""
    index = AnnoyIndex(2, "angular")
    index.add_item(0, [97, 0])
    index.add_item(1, [42, 42])

    expected = ((1 - 2**-0.5) ** 2 + (2**-0.5) ** 2) ** 0.5
    assert index.get_distance(0, 1) == pytest.approx(expected)


def test_dist_degen():
    """Distance to the zero vector is sqrt(2)."""
    index = AnnoyIndex(2, "angular")
    index.add_item(0, [1, 0])
    index.add_item(1, [0, 0])

    assert index.get_distance(0, 1) == pytest.approx(2.0**0.5)


def test_large_index():
    """Each item of a tightly clustered pair finds its partner first."""
    dim = 10
    index = AnnoyIndex(dim, "angular")
    for first in range(0, 10000, 2):
        base = [random.gauss(0, 1) for _ in range(dim)]
        scale_a = random.random() + 1
        scale_b = random.random() + 1
        # Both members point (almost) along `base`, with independent scales.
        member_a = [scale_a * c + random.gauss(0, 1e-2) for c in base]
        member_b = [scale_b * c + random.gauss(0, 1e-2) for c in base]
        index.add_item(first, member_a)
        index.add_item(first + 1, member_b)

    index.build(10)

    for first in range(0, 10000, 2):
        second = first + 1
        assert index.get_nns_by_item(first, 2) == [first, second]
        assert index.get_nns_by_item(second, 2) == [second, first]


def precision(n, n_trees=10, n_points=10000, n_rounds=10, search_k=100000):
    """Return the fraction of the true ``n`` nearest items that were found.

    Every round builds a fresh index whose item ``j`` is 1000 on the first
    axis plus a random direction scaled to length ``j`` on the other axes,
    then queries with (1000, 0, 0, ...). The result ids must come back sorted,
    and the ids below ``n`` are counted as hits.
    """
    hits = 0
    dim = 10
    query = [1000] + [0] * (dim - 1)
    for _ in range(n_rounds):
        index = AnnoyIndex(dim, "angular")
        for item in range(n_points):
            direction = [random.gauss(0, 1) for _ in range(dim - 1)]
            length = sum([c**2 for c in direction]) ** 0.5
            index.add_item(item, [1000] + [c / length * item for c in direction])

        index.build(n_trees)

        neighbours = index.get_nns_by_vector(query, n, search_k)
        assert neighbours == sorted(neighbours)  # should be in order
        hits += sum(1 for item in neighbours if item < n)

    return 1.0 * hits / (n * n_rounds)


def test_precision_1():
    assert precision(1) >= 0.98


def test_precision_10():
    assert precision(10) >= 0.98


def test_precision_100():
    assert precision(100) >= 0.98


def test_precision_1000():
    assert precision(1000) >= 0.98


def test_load_save_get_item_vector():
    """Item vectors are readable before build, after save and after load."""
    dim = 3
    stored = ([1.1, 2.2, 3.3], [4.4, 5.5, 6.6], [7.7, 8.8, 9.9])

    index = AnnoyIndex(dim, "angular")
    for item, vector in enumerate(stored):
        index.add_item(item, vector)
    numpy.testing.assert_array_almost_equal(index.get_item_vector(0), stored[0])

    assert index.build(10)
    assert index.save("blah.ann")
    numpy.testing.assert_array_almost_equal(index.get_item_vector(1), stored[1])

    loaded = AnnoyIndex(dim, "angular")
    assert loaded.load("blah.ann")
    numpy.testing.assert_array_almost_equal(loaded.get_item_vector(2), stored[2])


def test_get_nns_search_k():
    """Passing an explicit search_k gives the same ranking on a tiny index."""
    index = AnnoyIndex(3, "angular")
    for item, vector in enumerate(([0, 0, 1], [0, 1, 0], [1, 0, 0])):
        index.add_item(item, vector)
    index.build(10)

    assert index.get_nns_by_item(0, 3, 10) == [0, 1, 2]
    assert index.get_nns_by_vector([3, 2, 1], 3, 10) == [2, 1, 0]


def test_include_dists():
    """A vector and its negation are at distances 0 and 2 (issue 112)."""
    dim = 40
    index = AnnoyIndex(dim, "angular")
    vector = numpy.random.normal(size=dim)
    index.add_item(0, vector)
    index.add_item(1, -vector)
    index.build(10)

    items, distances = index.get_nns_by_item(0, 2, 10, True)
    assert items == [0, 1]
    assert distances[0] == pytest.approx(0.0)
    assert distances[1] == pytest.approx(2.0)


def test_include_dists_check_ranges():
    """All returned angular distances lie between 0 and 2."""
    dim = 3
    index = AnnoyIndex(dim, "angular")
    for item in range(100000):
        index.add_item(item, numpy.random.normal(size=dim))
    index.build(10)

    _, distances = index.get_nns_by_item(0, 100000, include_distances=True)
    assert max(distances) <= 2.0
    assert min(distances) == pytest.approx(0.0)


def test_distance_consistency():
    """Search distances agree with get_distance and with a manual computation."""
    count, dim = 1000, 3
    index = AnnoyIndex(dim, "angular")
    for item in range(count):
        # Redraw until the squared norm exceeds 0.1.
        candidate = numpy.random.normal(size=dim)
        while not numpy.dot(candidate, candidate) > 0.1:
            candidate = numpy.random.normal(size=dim)
        index.add_item(item, candidate)
    index.build(10)

    tolerance = dict(rel=1e-3, abs=1e-3)
    for a in random.sample(range(count), 100):
        neighbours, distances = index.get_nns_by_item(a, 100, include_distances=True)
        for b, dist in zip(neighbours, distances):
            u = index.get_item_vector(a)
            v = index.get_item_vector(b)
            assert dist == pytest.approx(index.get_distance(a, b), **tolerance)

            # Squared distance between the normalised vectors, computed twice:
            # once with numpy and once with plain Python arithmetic.
            u_unit = numpy.array(u) * numpy.dot(u, u) ** -0.5
            v_unit = numpy.array(v) * numpy.dot(v, v) ** -0.5
            delta = u_unit - v_unit
            assert dist**2 == pytest.approx(numpy.dot(delta, delta), **tolerance)
            assert dist**2 == pytest.approx(
                sum([(x - y) ** 2 for x, y in zip(u_unit, v_unit)]), **tolerance
            )


def test_only_one_item():
    """A saved single-item index returns just that item after reload."""
    # reported to annoy-user by Kireet Reddy
    writer = AnnoyIndex(100, "angular")
    writer.add_item(0, numpy.random.randn(100))
    writer.build(n_trees=10)
    writer.save("foo.idx")

    reader = AnnoyIndex(100, "angular")
    reader.load("foo.idx")
    assert reader.get_n_items() == 1
    found = reader.get_nns_by_vector(
        vector=numpy.random.randn(100), n=50, include_distances=False
    )
    assert found == [0]


def test_no_items():
    """A saved empty index reloads with no items and returns no neighbours."""
    writer = AnnoyIndex(100, "angular")
    writer.build(n_trees=10)
    writer.save("foo.idx")

    reader = AnnoyIndex(100, "angular")
    reader.load("foo.idx")
    assert reader.get_n_items() == 0
    found = reader.get_nns_by_vector(
        vector=numpy.random.randn(100), n=50, include_distances=False
    )
    assert found == []


def test_single_vector():
    """Querying with the only stored vector returns it at distance zero."""
    # https://github.com/spotify/annoy/issues/194
    index = AnnoyIndex(3, "angular")
    index.add_item(0, [1, 0, 0])
    index.build(10)
    index.save("1.ann")

    items, distances = index.get_nns_by_vector([1, 0, 0], 3, include_distances=True)
    assert items == [0]
    assert distances[0] ** 2 == pytest.approx(0.0)
================================================ FILE: test/annoy_test.go ================================================ /* # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy of # the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations under # the License. */ package annoy_test import ( "math" "math/rand" "os" "testing" "github.com/spotify/annoy" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/stretchr/testify/suite" ) type AnnoyTestSuite struct { suite.Suite } func Round(f float64) float64 { return math.Floor(f + 0.5) } func RoundPlus(f float64, places int) float64 { shift := math.Pow(10, float64(places)) return Round(f*shift) / shift } func (suite *AnnoyTestSuite) SetupTest() { } func (suite *AnnoyTestSuite) TestFileHandling() { index := annoy.NewAnnoyIndexAngular(3) index.AddItem(0, []float32{0, 0, 1}) index.AddItem(1, []float32{0, 1, 0}) index.AddItem(2, []float32{1, 0, 0}) index.Build(10) index.Save("go_test.ann") info, err := os.Stat("go_test.ann") if err != nil { assert.Fail(suite.T(), "Failed to create file, file not found") } if info.Size() == 0 { assert.Fail(suite.T(), "Failed to create file, file size zero") } annoy.DeleteAnnoyIndexAngular(index) index = annoy.NewAnnoyIndexAngular(3) if ret := index.Load("go_test.ann"); ret == false { assert.Fail(suite.T(), "Failed to load file") } os.Remove("go_test.ann") index.Save("go_test2.ann", false) info, err = os.Stat("go_test2.ann") if err != nil { assert.Fail(suite.T(), "Failed to create file without prefault, file not found") } if info.Size() == 0 { 
assert.Fail(suite.T(), "Failed to create file without prefault, file size zero") } annoy.DeleteAnnoyIndexAngular(index) index = annoy.NewAnnoyIndexAngular(3) if ret := index.Load("go_test2.ann", false); ret == false { assert.Fail(suite.T(), "Failed to load file without prefault") } os.Remove("go_test2.ann") index.Save("go_test3.ann", true) info, err = os.Stat("go_test3.ann") if err != nil { assert.Fail(suite.T(), "Failed to create file allowing prefault, file not found") } if info.Size() == 0 { assert.Fail(suite.T(), "Failed to create file allowing prefault, file size zero") } annoy.DeleteAnnoyIndexAngular(index) index = annoy.NewAnnoyIndexAngular(3) if ret := index.Load("go_test3.ann", true); ret == false { assert.Fail(suite.T(), "Failed to load file allowing prefault") } annoy.DeleteAnnoyIndexAngular(index) os.Remove("go_test3.ann") } func (suite *AnnoyTestSuite) TestOnDiskBuild() { index := annoy.NewAnnoyIndexAngular(3) index.OnDiskBuild("go_test.ann") info, err := os.Stat("go_test.ann") if err != nil { assert.Fail(suite.T(), "Failed to create file, file not found") } if info.Size() == 0 { assert.Fail(suite.T(), "Failed to create file, file size zero") } index.AddItem(0, []float32{0, 0, 1}) index.AddItem(1, []float32{0, 1, 0}) index.AddItem(2, []float32{1, 0, 0}) index.Build(10) index.Unload() index.Load("go_test.ann") result := annoy.NewAnnoyVectorInt() defer result.Free() index.GetNnsByVector([]float32{3, 2, 1}, 3, -1, result) assert.Equal(suite.T(), []int32{2, 1, 0}, result.ToSlice()) index.GetNnsByVector([]float32{1, 2, 3}, 3, -1, result) assert.Equal(suite.T(), []int32{0, 1, 2}, result.ToSlice()) index.GetNnsByVector([]float32{2, 0, 1}, 3, -1, result) assert.Equal(suite.T(), []int32{2, 0, 1}, result.ToSlice()) annoy.DeleteAnnoyIndexAngular(index) os.Remove("go_test.ann") } func (suite *AnnoyTestSuite) TestGetNnsByVector() { t := suite.T() index := annoy.NewAnnoyIndexAngular(3) index.AddItem(0, []float32{0, 0, 1}) index.AddItem(1, []float32{0, 1, 0}) 
index.AddItem(2, []float32{1, 0, 0}) index.Build(10) t.Run("regular", func(t *testing.T) { result := annoy.NewAnnoyVectorInt() defer result.Free() index.GetNnsByVector([]float32{3, 2, 1}, 3, -1, result) assert.Equal(t, []int32{2, 1, 0}, result.ToSlice()) index.GetNnsByVector([]float32{1, 2, 3}, 3, -1, result) assert.Equal(t, []int32{0, 1, 2}, result.ToSlice()) index.GetNnsByVector([]float32{2, 0, 1}, 3, -1, result) assert.Equal(t, []int32{2, 0, 1}, result.ToSlice()) }) t.Run("with copying", func(t *testing.T) { result := annoy.NewAnnoyVectorInt() defer result.Free() var notAllocated []int32 index.GetNnsByVector([]float32{3, 2, 1}, 3, -1, result) result.Copy(¬Allocated) assert.Equal(t, []int32{2, 1, 0}, notAllocated) // to make sure it will be overwritten var alreadyAllocated = make([]int32, 10) for i := 0; i < len(alreadyAllocated); i++ { alreadyAllocated[i] = -1 } index.GetNnsByVector([]float32{3, 2, 1}, 3, -1, result) result.Copy(&alreadyAllocated) assert.Equal(t, []int32{2, 1, 0}, alreadyAllocated) var alreadyAllocatedCap = make([]int32, 0, 00) index.GetNnsByVector([]float32{3, 2, 1}, 3, -1, result) result.Copy(&alreadyAllocatedCap) assert.Equal(t, []int32{2, 1, 0}, alreadyAllocatedCap) }) t.Run("with inner array", func(t *testing.T) { result := annoy.NewAnnoyVectorInt() defer result.Free() index.GetNnsByVector([]float32{3, 2, 1}, 3, -1, result) assert.Equal(t, []int32{2, 1, 0}, result.InnerArray()) }) annoy.DeleteAnnoyIndexAngular(index) } func (suite *AnnoyTestSuite) TestGetNnsByItem() { index := annoy.NewAnnoyIndexAngular(3) index.AddItem(0, []float32{2, 1, 0}) index.AddItem(1, []float32{1, 2, 0}) index.AddItem(2, []float32{0, 0, 1}) index.Build(10) var result = annoy.NewAnnoyVectorInt() defer result.Free() index.GetNnsByItem(0, 3, -1, result) assert.Equal(suite.T(), []int32{0, 1, 2}, result.ToSlice()) index.GetNnsByItem(1, 3, -1, result) assert.Equal(suite.T(), []int32{1, 0, 2}, result.ToSlice()) annoy.DeleteAnnoyIndexAngular(index) } func (suite 
*AnnoyTestSuite) TestGetItem() { index := annoy.NewAnnoyIndexAngular(3) index.AddItem(0, []float32{2, 1, 0}) index.AddItem(1, []float32{1, 2, 0}) index.AddItem(2, []float32{0, 0, 1}) index.Build(10) var result = annoy.NewAnnoyVectorFloat() defer result.Free() index.GetItem(0, result) assert.Equal(suite.T(), []float32{2, 1, 0}, result.ToSlice()) index.GetItem(1, result) assert.Equal(suite.T(), []float32{1, 2, 0}, result.ToSlice()) index.GetItem(2, result) assert.Equal(suite.T(), []float32{0, 0, 1}, result.ToSlice()) annoy.DeleteAnnoyIndexAngular(index) } func (suite *AnnoyTestSuite) TestGetDistance() { index := annoy.NewAnnoyIndexAngular(2) index.AddItem(0, []float32{0, 1}) index.AddItem(1, []float32{1, 1}) index.Build(10) assert.Equal(suite.T(), RoundPlus(math.Pow(2*(1.0-math.Pow(2, -0.5)), 0.5), 3), RoundPlus(float64(index.GetDistance(0, 1)), 3)) annoy.DeleteAnnoyIndexAngular(index) } func (suite *AnnoyTestSuite) TestGetDotProductDistance() { index := annoy.NewAnnoyIndexDotProduct(2) index.AddItem(0, []float32{0, 1}) index.AddItem(1, []float32{1, 1}) index.Build(10) assert.True(suite.T(), math.Abs(1.0-float64(index.GetDistance(0, 1))) < 0.00001) annoy.DeleteAnnoyIndexDotProduct(index) } func (suite *AnnoyTestSuite) TestLargeEuclideanIndex() { index := annoy.NewAnnoyIndexEuclidean(10) for j := 0; j < 10000; j += 2 { p := make([]float32, 0, 10) for i := 0; i < 10; i++ { p = append(p, rand.Float32()) } x := make([]float32, 0, 10) for i := 0; i < 10; i++ { x = append(x, 1+p[i]+rand.Float32()*1e-2) } y := make([]float32, 0, 10) for i := 0; i < 10; i++ { y = append(y, 1+p[i]+rand.Float32()*1e-2) } index.AddItem(j, x) index.AddItem(j+1, y) } index.Build(10) result := annoy.NewAnnoyVectorInt() defer result.Free() for j := 0; j < 10000; j += 2 { index.GetNnsByItem(j, 2, -1, result) require.Equal(suite.T(), result.ToSlice(), []int32{int32(j), int32(j + 1)}) index.GetNnsByItem(j+1, 2, -1, result) require.Equal(suite.T(), result.ToSlice(), []int32{int32(j) + 1, int32(j)}) } 
annoy.DeleteAnnoyIndexEuclidean(index) } func TestAnnoyTestSuite(t *testing.T) { suite.Run(t, new(AnnoyTestSuite)) }
================================================ FILE: test/annoy_test.lua ================================================
-- Copyright (c) 2016 Boris Nagaev
--
-- Licensed under the Apache License, Version 2.0 (the "License"); you may not
-- use this file except in compliance with the License. You may obtain a copy of
-- the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-- License for the specific language governing permissions and limitations under
-- the License.

local AnnoyIndex = require 'annoy'.AnnoyIndex

-- Approximate normal sample: the sum of 12 uniform draws minus 6, scaled by
-- sigma and shifted by mu.
local function gauss(mu, sigma)
    local sum = -6
    for _ = 1, 12 do
        sum = sum + math.random()
    end
    return mu + sum * sigma
end

-- Returns an array of f samples drawn with gauss(mu, sigma).
-- mu and sigma are required: gauss does arithmetic on both.
local function randomVector(f, mu, sigma)
    local v = {}
    for i = 1, f do
        v[i] = gauss(mu, sigma)
    end
    return v
end

-- Formats x with three decimals so floats can be compared as strings.
local function round(x)
    return ("%.3f"):format(x)
end

-- Applies round() to every element of array.
local function roundArray(array)
    local rounded_array = {}
    for k, v in ipairs(array) do
        rounded_array[k] = round(v)
    end
    return rounded_array
end

-- True when v is in non-decreasing order.
local function isSorted(v)
    for i = 2, #v do
        if v[i-1] > v[i] then
            return false
        end
    end
    return true
end

-- Largest element of a non-empty array.
local function max(array)
    local ans = assert(array[1])
    for _, v in ipairs(array) do
        ans = math.max(ans, v)
    end
    return ans
end

-- Smallest element of a non-empty array.
local function min(array)
    local ans = assert(array[1])
    for _, v in ipairs(array) do
        ans = math.min(ans, v)
    end
    return ans
end

-- Returns the fraction of the true n nearest items that the index finds.
-- Item j is a random direction scaled to length j; when first1000 is set the
-- first coordinate is pinned to 1000 and only the remaining ones are random.
-- n_trees, n_points and n_rounds default to 10, 10000 and 10.
-- NOTE(review): the index is created as 'euclidean' even when called from the
-- angular suite (first1000 = true) - confirm this is intended.
local function precision(first1000, n, n_trees, n_points, n_rounds)
    if not n_trees then n_trees = 10 end
    if not n_points then n_points = 10000 end
    if not n_rounds then n_rounds = 10 end
    local found = 0
    for _ = 1, n_rounds do
        local f = 10
        local p_size
        if first1000 then
            -- create random points at distance x from (1000, 0, 0, ...)
            p_size = f - 1
        else
            -- create random points at distance x
            p_size = f
        end
        local i = AnnoyIndex(f, 'euclidean')
        for j = 0, n_points - 1 do
            local p = randomVector(p_size, 0, 1)
            local norm
            do
                norm = 0
                for _, pi in ipairs(p) do
                    norm = norm + pi ^ 2
                end
                norm = norm ^ 0.5
            end
            local x = {}
            do
                if first1000 then
                    x[1] = 1000
                end
                for _, pi in ipairs(p) do
                    table.insert(x, pi / norm * j)
                end
            end
            i:add_item(j, x)
        end
        i:build(n_trees)
        local v = {}
        do
            for k = 1, f do
                v[k] = 0
            end
            if first1000 then
                v[1] = 1000
            end
        end
        local nns = i:get_nns_by_vector(v, n)
        assert(isSorted(nns))
        -- The number of gaps should be equal to the last item minus n-1
        for _, x in ipairs(nns) do
            if x < n then
                found = found + 1
            end
        end
    end
    return 1.0 * found / (n * n_rounds)
end

describe("angular annoy test", function()

    it("get_nns_by_vector", function()
        local f = 3
        local i = AnnoyIndex(f)
        i:add_item(0, {0, 0, 1})
        i:add_item(1, {0, 1, 0})
        i:add_item(2, {1, 0, 0})
        i:build(10)

        assert.same({2, 1, 0}, i:get_nns_by_vector({3, 2, 1}, 3))
        assert.same({0, 1, 2}, i:get_nns_by_vector({1, 2, 3}, 3))
        assert.same({2, 0, 1}, i:get_nns_by_vector({2, 0, 1}, 3))
    end)

    it("get_nns_by_item", function()
        local f = 3
        local i = AnnoyIndex(f)
        i:add_item(0, {2, 1, 0})
        i:add_item(1, {1, 2, 0})
        i:add_item(2, {0, 0, 1})
        i:build(10)

        assert.same({0, 1, 2}, i:get_nns_by_item(0, 3))
        assert.same({1, 0, 2}, i:get_nns_by_item(1, 3))
        do
            -- Items 0 and 1 are equally far from item 2: accept both orders.
            local close_to_2 = i:get_nns_by_item(2, 3)
            assert.equal(close_to_2[1], 2)
            assert.truthy(
                (close_to_2[2] == 0 and close_to_2[3] == 1) or
                (close_to_2[2] == 1 and close_to_2[3] == 0)
            )
        end
    end)

    it("dist", function()
        local f = 2
        local i = AnnoyIndex(f)
        i:add_item(0, {0, 1})
        i:add_item(1, {1, 1})

        assert.equal(round((2 * (1.0 - 2 ^ -0.5)) ^ 0.5), round(i:get_distance(0, 1)))
    end)

    it("dist_2", function()
        local f = 2
        local i = AnnoyIndex(f)
        i:add_item(0, {1000, 0})
        i:add_item(1, {10, 0})

        assert.equal(round(0), round(i:get_distance(0, 1)))
    end)

    it("dist_3", function()
        local f = 2
        local i = AnnoyIndex(f)
        i:add_item(0, {97, 0})
        i:add_item(1, {42, 42})

        local dist = ((1 - 2 ^ -0.5) ^ 2 + (2 ^ -0.5) ^ 2) ^ 0.5

        assert.equal(round(dist), round(i:get_distance(0, 1)))
    end)

    it("dist_degen", function()
        local f = 2
        local i = AnnoyIndex(f)
        i:add_item(0, {1, 0})
        i:add_item(1, {0, 0})

        assert.equal(round(2.0 ^ 0.5), round(i:get_distance(0, 1)))
    end)

    it("large_index", function()
        -- Generate pairs of random points where the pair is super close
        local f = 10
        local i = AnnoyIndex(f)
        for j = 0, 10000 - 1, 2 do
            local p = randomVector(f, 0, 1)
            local f1 = math.random() + 1
            local f2 = math.random() + 1
            local x = {}
            local y = {}
            for k, pi in ipairs(p) do
                x[k] = f1 * pi + gauss(0, 1e-2)
                y[k] = f2 * pi + gauss(0, 1e-2)
            end
            i:add_item(j, x)
            i:add_item(j+1, y)
        end
        i:build(10)
        for j = 0, 10000 - 1, 2 do
            assert.same({j, j+1}, i:get_nns_by_item(j, 2))
            assert.same({j+1, j}, i:get_nns_by_item(j+1, 2))
        end
    end)

    it("precision_1", function()
        assert.truthy(precision(true, 1) >= 0.98)
    end)

    it("precision_10", function()
        assert.truthy(precision(true, 10) >= 0.98)
    end)

    it("precision_100", function()
        assert.truthy(precision(true, 100) >= 0.98)
    end)

    it("precision_1000", function()
        assert.truthy(precision(true, 1000) >= 0.98)
    end)

    it("load_save_get_item_vector", function()
        local f = 3
        local i = AnnoyIndex(f)
        i:add_item(0, {1.1, 2.2, 3.3})
        i:add_item(1, {4.4, 5.5, 6.6})
        i:add_item(2, {7.7, 8.8, 9.9})
        assert.same(roundArray({1.1, 2.2, 3.3}), roundArray(i:get_item_vector(0)))
        assert.truthy(i:build(10))
        assert.truthy(i:save('blah.ann'))
        assert.same(roundArray({4.4, 5.5, 6.6}), roundArray(i:get_item_vector(1)))
        local j = AnnoyIndex(f)
        assert.truthy(j:load('blah.ann'))
        -- Read from the freshly loaded index so the load is actually verified.
        assert.same(roundArray({7.7, 8.8, 9.9}), roundArray(j:get_item_vector(2)))
    end)

    it("get_nns_search_k", function()
        local f = 3
        local i = AnnoyIndex(f)
        i:add_item(0, {0, 0, 1})
        i:add_item(1, {0, 1, 0})
        i:add_item(2, {1, 0, 0})
        i:build(10)

        assert.same({0, 1, 2}, i:get_nns_by_item(0, 3, 10))
        assert.same({2, 1, 0}, i:get_nns_by_vector({3, 2, 1}, 3, 10))
    end)

    it("include_dists", function()
        -- Double checking issue 112
        local f = 40
        local i = AnnoyIndex(f)
        local v = randomVector(f, 0, 1)
        i:add_item(0, v)
        local neg_v = {}
        do
            for k, value in ipairs(v) do
                neg_v[k] = -value
            end
        end
        i:add_item(1, neg_v)
        i:build(10)

        local indices, dists = i:get_nns_by_item(0, 2, 10, true)
        assert.same({0, 1}, indices)
        assert.same(roundArray({0.0, 2.0}), roundArray(dists))
    end)

    it("include_dists_check_ranges", function()
        local f = 3
        local i = AnnoyIndex(f)
        for j = 0, 100000 - 1 do
            i:add_item(j, randomVector(f, 0, 1))
        end
        i:build(10)
        local include_distances = true
        local _, dists = i:get_nns_by_item(0, 100000, -1, include_distances)
        assert.truthy(max(dists) < 2.0)
        assert.equal(round(0.0), round(min(dists)))
    end)

end)

describe("euclidean annoy test", function()

    it("get_nns_by_vector", function()
        local f = 2
        local i = AnnoyIndex(f, 'euclidean')
        i:add_item(0, {2, 2})
        i:add_item(1, {3, 2})
        i:add_item(2, {3, 3})
        i:build(10)

        assert.same({2, 1, 0}, i:get_nns_by_vector({4, 4}, 3))
        assert.same({0, 1, 2}, i:get_nns_by_vector({1, 1}, 3))
        assert.same({1, 2, 0}, i:get_nns_by_vector({4, 2}, 3))
    end)

    it("get_nns_by_item", function()
        local f = 2
        local i = AnnoyIndex(f, 'euclidean')
        i:add_item(0, {2, 2})
        i:add_item(1, {3, 2})
        i:add_item(2, {3, 3})
        i:build(10)

        assert.same({0, 1, 2}, i:get_nns_by_item(0, 3))
        assert.same({2, 1, 0}, i:get_nns_by_item(2, 3))
    end)

    it("dist", function()
        local f = 2
        local i = AnnoyIndex(f, 'euclidean')
        i:add_item(0, {0, 1})
        i:add_item(1, {1, 1})

        assert.equal(round(1.0), round(i:get_distance(0, 1)))
    end)

    it("large_index", function()
        -- Generate pairs of random points where the pair is super close
        local f = 10
        -- local q = randomVector(f, 0, 10)
        local i = AnnoyIndex(f, 'euclidean')
        for j = 0, 10000 - 1, 2 do
            local p = randomVector(f, 0, 1)
            local x = {}
            local y = {}
            for k, pi in ipairs(p) do
                x[k] = 1 + pi + gauss(0, 1e-2) -- todo: should be q[i]
                y[k] = 1 + pi + gauss(0, 1e-2)
            end
            i:add_item(j, x)
            i:add_item(j+1, y)
        end
        i:build(10)
        for j = 0, 10000 - 1, 2 do
            assert.same({j, j+1}, i:get_nns_by_item(j, 2))
            assert.same({j+1, j}, i:get_nns_by_item(j+1, 2))
        end
    end)

    it("precision_1", function()
        assert.truthy(precision(false, 1) >= 0.98)
    end)

    it("precision_10", function()
        assert.truthy(precision(false, 10) >= 0.98)
    end)

    it("precision_100", function()
        assert.truthy(precision(false, 100) >= 0.98)
    end)

    it("precision_1000", function()
        assert.truthy(precision(false, 1000) >= 0.98)
    end)

    it("get_nns_with_distances", function()
        local f = 3
        local i = AnnoyIndex(f, 'euclidean')
        i:add_item(0, {0, 0, 2})
        i:add_item(1, {0, 1, 1})
        i:add_item(2, {1, 0, 0})
        i:build(10)

        do
            local l, d = i:get_nns_by_item(0, 3, -1, true)
            assert.same({0, 1, 2}, l)
            assert.same(
                roundArray({0, 2, 5}),
                roundArray({d[1]^2, d[2]^2, d[3]^2})
            )
        end

        do
            local l, d = i:get_nns_by_vector({2, 2, 2}, 3, -1, true)
            assert.same({1, 0, 2}, l)
            assert.same(
                roundArray({6, 8, 9}),
                roundArray({d[1]^2, d[2]^2, d[3]^2})
            )
        end
    end)

    it("include_dists", function()
        local f = 40
        -- This suite covers the euclidean metric, so name it explicitly
        -- instead of relying on the constructor's default.
        local i = AnnoyIndex(f, 'euclidean')
        local v = randomVector(f, 0, 1)
        i:add_item(0, v)
        local neg_v = {}
        do
            for k, value in ipairs(v) do
                neg_v[k] = -value
            end
        end
        i:add_item(1, neg_v)
        i:build(10)

        local indices, dists = i:get_nns_by_item(0, 2, 10, true)
        assert.same({0, 1}, indices)
        assert.same(round(0.0), round(dists[1]))
    end)

end)

describe("index test", function()

    it("not_found_tree", function()
        local i = AnnoyIndex(10)
        assert.has_error(function()
            i:load('nonexists.tree')
        end)
    end)

    it("binary_compatibility", function()
        local i = AnnoyIndex(10)
        i:load('test/test.tree')

        -- This might change in the future if we change the search
        -- algorithm, but in that case let's update the test
        assert.same(
            {0, 85, 42, 11, 54, 38, 53, 66, 19, 31},
            i:get_nns_by_item(0, 10)
        )
    end)

    it("load_unload", function()
        -- Issue #108
        local i = AnnoyIndex(10)
        for _ = 1, 100000 do
            i:load('test/test.tree')
            i:unload()
        end
    end)

    it("construct_load_destruct", function()
        for x = 1, 100000 do
            local i = AnnoyIndex(10)
            i:load('test/test.tree')
            if x % 100 == 0 then
                collectgarbage()
            end
        end
    end)

    it("construct_destruct", function()
        for _ = 1, 100000 do
            local i = AnnoyIndex(10)
            i:add_item(1000, randomVector(10, 0, 1))
        end
    end)

    it("save_twice", function()
        -- Issue #100
        local t = AnnoyIndex(10)
        t:save("t.ann")
        t:save("t.ann")
    end)

    it("load_save", function()
        -- Issue #61
        local i = AnnoyIndex(10)
        i:load('test/test.tree')
        local u = i:get_item_vector(99)
        i:save('i.tree')
        local v = i:get_item_vector(99)
        assert.same(u, v)
        local j = AnnoyIndex(10)
        j:load('test/test.tree')
        -- Read from j: the point is that a second index loaded from the same
        -- file returns the same vector.
        local w = j:get_item_vector(99)
        assert.same(u, w)
        -- Ensure specifying if prefault is allowed does not impact result
        j:save('j.tree', true)
        local k = AnnoyIndex(10)
        k:load('j.tree', true)
        local x = k:get_item_vector(99)
        assert.same(u, x)
        k:save('k.tree', false)
        local l = AnnoyIndex(10)
        l:load('k.tree', false)
        local y = l:get_item_vector(99)
        assert.same(u, y)
    end)

    it("on_disk_build", function()
        local f = 2
        local i = AnnoyIndex(f, 'euclidean')
        i:on_disk_build('x.tree')
        i:add_item(0, {2, 2})
        i:add_item(1, {3, 2})
        i:add_item(2, {3, 3})
        i:build(10)
        i:unload()
        i:load('x.tree')

        assert.same({2, 1, 0}, i:get_nns_by_vector({4, 4}, 3))
        assert.same({0, 1, 2}, i:get_nns_by_vector({1, 1}, 3))
        assert.same({1, 2, 0}, i:get_nns_by_vector({4, 2}, 3))
    end)

end)

describe("types test", function()

    local n_points = 1000
    local n_trees = 10

    -- tests "numpy" and "tuple" are not applicable to Lua

    it("wrong_length", function()
        local f = 10
        local i = AnnoyIndex(f, 'euclidean')
        i:add_item(0, randomVector(f, 0, 1))
        assert.has_error(function()
            i:add_item(1, randomVector(f + 1000, 0, 1))
        end)
        assert.has_error(function()
            i:add_item(2, {})
        end)
        i:build(n_trees)
    end)

    it("range_errors", function()
        local f = 10
        local i = AnnoyIndex(f, 'euclidean')
        for j = 0, n_points - 1 do
            i:add_item(j, randomVector(f, 0, 1))
        end
        assert.has_error(function()
            -- Pass mu and sigma: with them missing, randomVector itself
            -- raises and the negative index is never handed to add_item.
            i:add_item(-1, randomVector(f, 0, 1))
        end)
        i:build(n_trees)
        for _, bad_index in ipairs({-1000, -1, n_points, n_points + 1000}) do
            assert.has_error(function()
                i:get_distance(0, bad_index)
            end)
            assert.has_error(function()
                i:get_nns_by_item(bad_index, 1)
            end)
            assert.has_error(function()
                i:get_item_vector(bad_index)
            end)
        end
    end)

end)

describe("memory leaks", function()

    it("get_item_vector", function()
        local f = 10
        local i = AnnoyIndex(f, 'euclidean')
        i:add_item(0, randomVector(f, 0, 1))
        for j = 0, 100 - 1 do
            print(j, '...')
            for _ = 1, 1000 * 1000 do
                i:get_item_vector(0)
            end
        end
    end)

    it("get_lots_of_nns", function()
        local f = 10
        local i = AnnoyIndex(f, 'euclidean')
        i:add_item(0, randomVector(f, 0, 1))
        i:build(10)
        for _ = 1, 100 do
            assert.same({0}, i:get_nns_by_item(0, 999999999))
        end
    end)

end)
================================================ FILE: test/dot_index_test.py ================================================
# Copyright (c) 2018 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
import random import numpy import pytest from annoy import AnnoyIndex def dot_metric(a, b): return -numpy.dot(a, b) def recall(retrieved, relevant): return float(len(set(relevant) & set(retrieved))) / float(len(set(relevant))) def test_get_nns_by_vector(): f = 2 i = AnnoyIndex(f, "dot") i.add_item(0, [2, 2]) i.add_item(1, [3, 2]) i.add_item(2, [3, 3]) i.build(10) assert i.get_nns_by_vector([4, 4], 3) == [2, 1, 0] assert i.get_nns_by_vector([1, 1], 3) == [2, 1, 0] assert i.get_nns_by_vector([4, 2], 3) == [2, 1, 0] def test_get_nns_by_item(): f = 2 i = AnnoyIndex(f, "dot") i.add_item(0, [2, 2]) i.add_item(1, [3, 2]) i.add_item(2, [3, 3]) i.build(10) assert i.get_nns_by_item(0, 3) == [2, 1, 0] assert i.get_nns_by_item(2, 3) == [2, 1, 0] def test_dist(): f = 2 i = AnnoyIndex(f, "dot") i.add_item(0, [0, 1]) i.add_item(1, [1, 1]) i.add_item(2, [0, 0]) i.build(10) assert i.get_distance(0, 1) == pytest.approx(1.0) assert i.get_distance(1, 2) == pytest.approx(0.0) def recall_at(n, n_trees=10, n_points=1000, n_rounds=5): # the best movie/variable name total_recall = 0.0 for r in range(n_rounds): # create random points at distance x f = 10 idx = AnnoyIndex(f, "dot") data = numpy.array( [[random.gauss(0, 1) for z in range(f)] for j in range(n_points)] ) expected_results = [ sorted(range(n_points), key=lambda j: dot_metric(data[i], data[j]))[:n] for i in range(n_points) ] for i, vec in enumerate(data): idx.add_item(i, vec) idx.build(n_trees) for i in range(n_points): nns = idx.get_nns_by_vector(data[i], n) total_recall += recall(nns, expected_results[i]) return total_recall / float(n_rounds * n_points) def test_recall_at_10(): value = recall_at(10) assert value >= 0.65 def test_recall_at_100(): value = recall_at(100) assert value >= 0.95 def test_recall_at_1000(): value = recall_at(1000) assert value >= 0.99 def test_recall_at_1000_fewer_trees(): value = recall_at(1000, n_trees=4) assert value >= 0.99 def test_get_nns_with_distances(): f = 3 i = AnnoyIndex(f, "dot") 
i.add_item(0, [0, 0, 2]) i.add_item(1, [0, 1, 1]) i.add_item(2, [1, 0, 0]) i.build(10) l, d = i.get_nns_by_item(0, 3, -1, True) assert l == [0, 1, 2] assert d[0] == pytest.approx(4) assert d[1] == pytest.approx(2) assert d[2] == pytest.approx(0) l, d = i.get_nns_by_vector([2, 2, 2], 3, -1, True) assert l == [0, 1, 2] assert d[0] == pytest.approx(4) assert d[1] == pytest.approx(4) assert d[2] == pytest.approx(2) def test_include_dists(): f = 40 i = AnnoyIndex(f, "dot") v = numpy.random.normal(size=f) i.add_item(0, v) i.add_item(1, -v) i.build(10) indices, dists = i.get_nns_by_item(0, 2, 10, True) assert indices == [0, 1] assert dists[0] == pytest.approx(numpy.dot(v, v)) def test_distance_consistency(): n, f = 1000, 3 i = AnnoyIndex(f, "dot") for j in range(n): i.add_item(j, numpy.random.normal(size=f)) i.build(10) for a in random.sample(range(n), 100): indices, dists = i.get_nns_by_item(a, 100, include_distances=True) for b, dist in zip(indices, dists): assert dist == pytest.approx( numpy.dot(i.get_item_vector(a), i.get_item_vector(b)) ) assert dist == pytest.approx(i.get_distance(a, b)) ================================================ FILE: test/euclidean_index_test.py ================================================ # Copyright (c) 2013 Spotify AB # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy of # the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations under # the License. 
import random import numpy import pytest from annoy import AnnoyIndex def test_get_nns_by_vector(): f = 2 i = AnnoyIndex(f, "euclidean") i.add_item(0, [2, 2]) i.add_item(1, [3, 2]) i.add_item(2, [3, 3]) i.build(10) assert i.get_nns_by_vector([4, 4], 3) == [2, 1, 0] assert i.get_nns_by_vector([1, 1], 3) == [0, 1, 2] assert i.get_nns_by_vector([4, 2], 3) == [1, 2, 0] def test_get_nns_by_item(): f = 2 i = AnnoyIndex(f, "euclidean") i.add_item(0, [2, 2]) i.add_item(1, [3, 2]) i.add_item(2, [3, 3]) i.build(10) assert i.get_nns_by_item(0, 3) == [0, 1, 2] assert i.get_nns_by_item(2, 3) == [2, 1, 0] def test_dist(): f = 2 i = AnnoyIndex(f, "euclidean") i.add_item(0, [0, 1]) i.add_item(1, [1, 1]) i.add_item(2, [0, 0]) assert i.get_distance(0, 1) == pytest.approx(1.0**0.5) assert i.get_distance(1, 2) == pytest.approx(2.0**0.5) def test_large_index(): # Generate pairs of random points where the pair is super close f = 10 [random.gauss(0, 10) for z in range(f)] i = AnnoyIndex(f, "euclidean") for j in range(0, 10000, 2): p = [random.gauss(0, 1) for z in range(f)] x = [1 + pi + random.gauss(0, 1e-2) for pi in p] # todo: should be q[i] y = [1 + pi + random.gauss(0, 1e-2) for pi in p] i.add_item(j, x) i.add_item(j + 1, y) i.build(10) for j in range(0, 10000, 2): assert i.get_nns_by_item(j, 2) == [j, j + 1] assert i.get_nns_by_item(j + 1, 2) == [j + 1, j] def precision(n, n_trees=10, n_points=10000, n_rounds=10): found = 0 for r in range(n_rounds): # create random points at distance x f = 10 i = AnnoyIndex(f, "euclidean") for j in range(n_points): p = [random.gauss(0, 1) for z in range(f)] norm = sum([pi**2 for pi in p]) ** 0.5 x = [pi / norm * j for pi in p] i.add_item(j, x) i.build(n_trees) nns = i.get_nns_by_vector([0] * f, n) assert nns == sorted(nns) # should be in order # The number of gaps should be equal to the last item minus n-1 found += len([x for x in nns if x < n]) return 1.0 * found / (n * n_rounds) def test_precision_1(): assert precision(1) >= 0.98 def 
test_precision_10(): assert precision(10) >= 0.98 def test_precision_100(): assert precision(100) >= 0.98 def test_precision_1000(): assert precision(1000) >= 0.98 def test_get_nns_with_distances(): f = 3 i = AnnoyIndex(f, "euclidean") i.add_item(0, [0, 0, 2]) i.add_item(1, [0, 1, 1]) i.add_item(2, [1, 0, 0]) i.build(10) l, d = i.get_nns_by_item(0, 3, -1, True) assert l == [0, 1, 2] assert d[0] ** 2 == pytest.approx(0) assert d[1] ** 2 == pytest.approx(2) assert d[2] ** 2 == pytest.approx(5) l, d = i.get_nns_by_vector([2, 2, 2], 3, -1, True) assert l == [1, 0, 2] assert d[0] ** 2 == pytest.approx(6) assert d[1] ** 2 == pytest.approx(8) assert d[2] ** 2 == pytest.approx(9) def test_include_dists(): f = 40 i = AnnoyIndex(f, "euclidean") v = numpy.random.normal(size=f) i.add_item(0, v) i.add_item(1, -v) i.build(10) indices, dists = i.get_nns_by_item(0, 2, 10, True) assert indices == [0, 1] assert dists[0] == pytest.approx(0) def test_distance_consistency(): n, f = 1000, 3 i = AnnoyIndex(f, "euclidean") for j in range(n): i.add_item(j, numpy.random.normal(size=f)) i.build(10) for a in random.sample(range(n), 100): indices, dists = i.get_nns_by_item(a, 100, include_distances=True) for b, dist in zip(indices, dists): assert dist == pytest.approx(i.get_distance(a, b)) u = numpy.array(i.get_item_vector(a)) v = numpy.array(i.get_item_vector(b)) assert dist == pytest.approx(numpy.dot(u - v, u - v) ** 0.5) assert dist == pytest.approx( sum([(x - y) ** 2 for x, y in zip(u, v)]) ** 0.5 ) def test_rounding_error(): # https://github.com/spotify/annoy/issues/314 i = AnnoyIndex(1, "euclidean") i.add_item(0, [0.7125930]) i.add_item(1, [0.7123166]) assert i.get_distance(0, 1) >= 0.0 ================================================ FILE: test/examples_test.py ================================================ def execfile(fn): with open(fn) as f: exec(f.read()) def simple_test(): execfile("examples/simple_test.py") def mmap_test(): execfile("examples/mmap_test.py") def precision_test(): 
execfile("examples/precision_test.py") ================================================ FILE: test/hamming_index_test.py ================================================ # Copyright (c) 2013 Spotify AB # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy of # the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations under # the License. import numpy import pytest from annoy import AnnoyIndex def test_basic_conversion(): f = 100 i = AnnoyIndex(f, "hamming") u = numpy.random.binomial(1, 0.5, f) v = numpy.random.binomial(1, 0.5, f) i.add_item(0, u) i.add_item(1, v) u2 = i.get_item_vector(0) v2 = i.get_item_vector(1) assert numpy.dot(u - u2, u - u2) == pytest.approx(0.0) assert numpy.dot(v - v2, v - v2) == pytest.approx(0.0) assert i.get_distance(0, 0) == pytest.approx(0.0) assert i.get_distance(1, 1) == pytest.approx(0.0) assert i.get_distance(0, 1) == pytest.approx(numpy.dot(u - v, u - v)) assert i.get_distance(1, 0) == pytest.approx(numpy.dot(u - v, u - v)) def test_basic_nns(): f = 100 i = AnnoyIndex(f, "hamming") u = numpy.random.binomial(1, 0.5, f) v = numpy.random.binomial(1, 0.5, f) i.add_item(0, u) i.add_item(1, v) i.build(10) assert i.get_nns_by_item(0, 99) == [0, 1] assert i.get_nns_by_item(1, 99) == [1, 0] rs, ds = i.get_nns_by_item(0, 99, include_distances=True) assert rs == [0, 1] assert ds[0] == pytest.approx(0) assert ds[1] == pytest.approx(numpy.dot(u - v, u - v)) def test_save_load(): f = 100 i = AnnoyIndex(f, "hamming") u = numpy.random.binomial(1, 0.5, f) v = numpy.random.binomial(1, 0.5, f) i.add_item(0, u) i.add_item(1, v) i.build(10) 
i.save("blah.ann") j = AnnoyIndex(f, "hamming") j.load("blah.ann") rs, ds = j.get_nns_by_item(0, 99, include_distances=True) assert rs == [0, 1] assert ds[0] == pytest.approx(0) assert ds[1] == pytest.approx(numpy.dot(u - v, u - v)) def test_many_vectors(): f = 10 i = AnnoyIndex(f, "hamming") for x in range(100000): i.add_item(x, numpy.random.binomial(1, 0.5, f)) i.build(10) rs, ds = i.get_nns_by_vector([0] * f, 10000, include_distances=True) assert min(ds) >= 0 assert max(ds) <= f dists = [] for x in range(1000): rs, ds = i.get_nns_by_vector( numpy.random.binomial(1, 0.5, f), 1, search_k=1000, include_distances=True ) dists.append(ds[0]) avg_dist = 1.0 * sum(dists) / len(dists) assert avg_dist <= 0.42 @pytest.mark.skip # will fix later def test_zero_vectors(): # Mentioned on the annoy-user list bitstrings = [ "0000000000011000001110000011111000101110111110000100000100000000", "0000000000011000001110000011111000101110111110000100000100000001", "0000000000011000001110000011111000101110111110000100000100000010", "0010010100011001001000010001100101011110000000110000011110001100", "1001011010000110100101101001111010001110100001101000111000001110", "0111100101111001011110010010001100010111000111100001101100011111", "0011000010011101000011010010111000101110100101111000011101001011", "0011000010011100000011010010111000101110100101111000011101001011", "1001100000111010001010000010110000111100100101001001010000000111", "0000000000111101010100010001000101101001000000011000001101000000", "1000101001010001011100010111001100110011001100110011001111001100", "1110011001001111100110010001100100001011000011010010111100100111", ] vectors = [[int(bit) for bit in bitstring] for bitstring in bitstrings] f = 64 idx = AnnoyIndex(f, "hamming") for i, v in enumerate(vectors): idx.add_item(i, v) idx.build(10) idx.save("idx.ann") idx = AnnoyIndex(f, "hamming") idx.load("idx.ann") js, ds = idx.get_nns_by_item(0, 5, include_distances=True) assert js[0] == 0 assert ds[:4] == [0, 1, 1, 22] 
================================================ FILE: test/holes_test.py ================================================ # Copyright (c) 2013 Spotify AB # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy of # the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations under # the License. import random import numpy from annoy import AnnoyIndex def test_random_holes(): f = 10 index = AnnoyIndex(f, "angular") valid_indices = random.sample(range(2000), 1000) # leave holes for i in valid_indices: v = numpy.random.normal(size=(f,)) index.add_item(i, v) index.build(10) for i in valid_indices: js = index.get_nns_by_item(i, 10000) for j in js: assert j in valid_indices for i in range(1000): v = numpy.random.normal(size=(f,)) js = index.get_nns_by_vector(v, 10000) for j in js: assert j in valid_indices def _test_holes_base(n, f=100, base_i=100000): annoy = AnnoyIndex(f, "angular") for i in range(n): annoy.add_item(base_i + i, numpy.random.normal(size=(f,))) annoy.build(100) res = annoy.get_nns_by_item(base_i, n) assert set(res) == set([base_i + i for i in range(n)]) def test_root_one_child(): # See https://github.com/spotify/annoy/issues/223 _test_holes_base(1) def test_root_two_children(): _test_holes_base(2) def test_root_some_children(): # See https://github.com/spotify/annoy/issues/295 _test_holes_base(10) def test_root_many_children(): _test_holes_base(1000) ================================================ FILE: test/index_test.py ================================================ # Copyright (c) 2013 Spotify AB # # Licensed under the Apache License, Version 2.0 (the 
"License"); you may not # use this file except in compliance with the License. You may obtain a copy of # the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations under # the License. import os import random import pytest from annoy import AnnoyIndex def test_not_found_tree(): i = AnnoyIndex(10, "angular") with pytest.raises(IOError): i.load("nonexists.tree") def test_binary_compatibility(): i = AnnoyIndex(10, "angular") i.load("test/test.tree") # This might change in the future if we change the search algorithm, but in that case let's update the test assert i.get_nns_by_item(0, 10) == [0, 85, 42, 11, 54, 38, 53, 66, 19, 31] def test_load_unload(): # Issue #108 i = AnnoyIndex(10, "angular") for x in range(100000): i.load("test/test.tree") i.unload() def test_construct_load_destruct(): for x in range(100000): i = AnnoyIndex(10, "angular") i.load("test/test.tree") def test_construct_destruct(): for x in range(100000): i = AnnoyIndex(10, "angular") i.add_item(1000, [random.gauss(0, 1) for z in range(10)]) def test_save_twice(): # Issue #100 t = AnnoyIndex(10, "angular") for i in range(100): t.add_item(i, [random.gauss(0, 1) for z in range(10)]) t.build(10) t.save("t1.ann") t.save("t2.ann") def test_load_save(): # Issue #61 i = AnnoyIndex(10, "angular") i.load("test/test.tree") u = i.get_item_vector(99) i.save("i.tree") v = i.get_item_vector(99) assert u == v j = AnnoyIndex(10, "angular") j.load("test/test.tree") w = i.get_item_vector(99) assert u == w # Ensure specifying if prefault is allowed does not impact result j.save("j.tree", True) k = AnnoyIndex(10, "angular") k.load("j.tree", True) x = k.get_item_vector(99) assert u == x k.save("k.tree", False) l = AnnoyIndex(10, 
"angular") l.load("k.tree", False) y = l.get_item_vector(99) assert u == y def test_save_without_build(): t = AnnoyIndex(10, "angular") for i in range(100): t.add_item(i, [random.gauss(0, 1) for z in range(10)]) # Note: in earlier version, this was allowed (see eg #61) with pytest.raises(Exception): t.save("x.tree") def test_unbuild_with_loaded_tree(): i = AnnoyIndex(10, "angular") i.load("test/test.tree") with pytest.raises(Exception): i.unbuild() def test_seed(): i = AnnoyIndex(10, "angular") i.load("test/test.tree") i.set_seed(42) def test_unknown_distance(): with pytest.raises(Exception): AnnoyIndex(10, "banana") def test_metric_kwarg(): # Issue 211 i = AnnoyIndex(2, metric="euclidean") i.add_item(0, [1, 0]) i.add_item(1, [9, 0]) assert i.get_distance(0, 1) == pytest.approx(8) assert i.f == 2 def test_metric_f_kwargs(): AnnoyIndex(f=3, metric="euclidean") def test_item_vector_after_save(): # Issue #279 a = AnnoyIndex(3, "angular") a.verbose(True) a.add_item(1, [1, 0, 0]) a.add_item(2, [0, 1, 0]) a.add_item(3, [0, 0, 1]) a.build(-1) assert a.get_n_items() == 4 assert a.get_item_vector(3) == [0, 0, 1] assert set(a.get_nns_by_item(1, 999)) == set([1, 2, 3]) a.save("something.annoy") assert a.get_n_items() == 4 assert a.get_item_vector(3) == [0, 0, 1] assert set(a.get_nns_by_item(1, 999)) == set([1, 2, 3]) def test_prefault(): i = AnnoyIndex(10, "angular") i.load("test/test.tree", prefault=True) assert i.get_nns_by_item(0, 10) == [0, 85, 42, 11, 54, 38, 53, 66, 19, 31] def test_fail_save(): t = AnnoyIndex(40, "angular") with pytest.raises(IOError): t.save("") def test_overwrite_index(): # Issue #335 f = 40 # Build the initial index t = AnnoyIndex(f, "angular") for i in range(1000): v = [random.gauss(0, 1) for z in range(f)] t.add_item(i, v) t.build(10) t.save("test.ann") # Load index file t2 = AnnoyIndex(f, "angular") t2.load("test.ann") # Overwrite index file t3 = AnnoyIndex(f, "angular") for i in range(500): v = [random.gauss(0, 1) for z in range(f)] 
t3.add_item(i, v) t3.build(10) if os.name == "nt": # Can't overwrite on Windows with pytest.raises(IOError): t3.save("test.ann") else: t3.save("test.ann") # Get nearest neighbors v = [random.gauss(0, 1) for z in range(f)] t2.get_nns_by_vector(v, 1000) # Should not crash def test_get_n_trees(): i = AnnoyIndex(10, "angular") i.load("test/test.tree") assert i.get_n_trees() == 10 def test_write_failed(): f = 40 # Build the initial index t = AnnoyIndex(f, "angular") t.verbose(True) for i in range(1000): v = [random.gauss(0, 1) for z in range(f)] t.add_item(i, v) t.build(10) if os.name == "nt": path = "Z:\\xyz.annoy" else: path = "/x/y/z.annoy" with pytest.raises(Exception): t.save(path) def test_dimension_mismatch(): t = AnnoyIndex(100, "angular") for i in range(1000): t.add_item(i, [random.gauss(0, 1) for z in range(100)]) t.build(10) t.save("test.annoy") u = AnnoyIndex(200, "angular") with pytest.raises(IOError): u.load("test.annoy") u = AnnoyIndex(50, "angular") with pytest.raises(IOError): u.load("test.annoy") def test_add_after_save(): # 398 t = AnnoyIndex(100, "angular") for i in range(1000): t.add_item(i, [random.gauss(0, 1) for z in range(100)]) t.build(10) t.save("test.annoy") # Used to segfault: v = [random.gauss(0, 1) for z in range(100)] with pytest.raises(Exception): t.add_item(i, v) def test_build_twice(): # 420 t = AnnoyIndex(100, "angular") for i in range(1000): t.add_item(i, [random.gauss(0, 1) for z in range(100)]) t.build(10) # Used to segfault: with pytest.raises(Exception): t.build(10) def test_very_large_index(): # 388 f = 3 dangerous_size = 2**31 size_per_vector = 4 * (f + 3) n_vectors = int(dangerous_size / size_per_vector) m = AnnoyIndex(3, "angular") m.verbose(True) for i in range(100): m.add_item(n_vectors + i, [random.gauss(0, 1) for z in range(f)]) n_trees = 10 m.build(n_trees) path = "test_big.annoy" m.save(path) # Raises on Windows # Sanity check size of index assert os.path.getsize(path) >= dangerous_size assert os.path.getsize(path) < 
dangerous_size + 100e3 # Sanity check number of trees assert m.get_n_trees() == n_trees ================================================ FILE: test/manhattan_index_test.py ================================================ # Copyright (c) 2013 Spotify AB # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy of # the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations under # the License. import random import numpy import pytest from annoy import AnnoyIndex def test_get_nns_by_vector(): f = 2 i = AnnoyIndex(f, "manhattan") i.add_item(0, [2, 2]) i.add_item(1, [3, 2]) i.add_item(2, [3, 3]) i.build(10) assert i.get_nns_by_vector([4, 4], 3) == [2, 1, 0] assert i.get_nns_by_vector([1, 1], 3) == [0, 1, 2] assert i.get_nns_by_vector([5, 3], 3) == [2, 1, 0] def test_get_nns_by_item(): f = 2 i = AnnoyIndex(f, "manhattan") i.add_item(0, [2, 2]) i.add_item(1, [3, 2]) i.add_item(2, [3, 3]) i.build(10) assert i.get_nns_by_item(0, 3) == [0, 1, 2] assert i.get_nns_by_item(2, 3) == [2, 1, 0] def test_dist(): f = 2 i = AnnoyIndex(f, "manhattan") i.add_item(0, [0, 1]) i.add_item(1, [1, 1]) i.add_item(2, [0, 0]) assert i.get_distance(0, 1) == pytest.approx(1.0) assert i.get_distance(1, 2) == pytest.approx(2.0) def test_large_index(): # Generate pairs of random points where the pair is super close f = 10 i = AnnoyIndex(f, "manhattan") for j in range(0, 10000, 2): p = [random.gauss(0, 1) for z in range(f)] x = [1 + pi + random.gauss(0, 1e-2) for pi in p] y = [1 + pi + random.gauss(0, 1e-2) for pi in p] i.add_item(j, x) i.add_item(j + 1, y) i.build(10) for j in range(0, 10000, 2): assert 
i.get_nns_by_item(j, 2) == [j, j + 1] assert i.get_nns_by_item(j + 1, 2) == [j + 1, j] def precision(n, n_trees=10, n_points=10000, n_rounds=10): found = 0 for r in range(n_rounds): # create random points at distance x f = 10 i = AnnoyIndex(f, "manhattan") for j in range(n_points): p = [random.gauss(0, 1) for z in range(f)] norm = sum([pi**2 for pi in p]) ** 0.5 x = [pi / norm + j for pi in p] i.add_item(j, x) i.build(n_trees) nns = i.get_nns_by_vector([0] * f, n) assert nns == sorted(nns) # should be in order # The number of gaps should be equal to the last item minus n-1 found += len([x for x in nns if x < n]) return 1.0 * found / (n * n_rounds) def test_precision_1(): assert precision(1) >= 0.98 def test_precision_10(): assert precision(10) >= 0.98 def test_precision_100(): assert precision(100) >= 0.98 def test_precision_1000(): assert precision(1000) >= 0.98 def test_get_nns_with_distances(): f = 3 i = AnnoyIndex(f, "manhattan") i.add_item(0, [0, 0, 2]) i.add_item(1, [0, 1, 1]) i.add_item(2, [1, 0, 0]) i.build(10) l, d = i.get_nns_by_item(0, 3, -1, True) assert l == [0, 1, 2] assert d[0] == pytest.approx(0) assert d[1] == pytest.approx(2) assert d[2] == pytest.approx(3) l, d = i.get_nns_by_vector([2, 2, 1], 3, -1, True) assert l == [1, 2, 0] assert d[0] == pytest.approx(3) assert d[1] == pytest.approx(4) assert d[2] == pytest.approx(5) def test_include_dists(): f = 40 i = AnnoyIndex(f, "manhattan") v = numpy.random.normal(size=f) i.add_item(0, v) i.add_item(1, -v) i.build(10) indices, dists = i.get_nns_by_item(0, 2, 10, True) assert indices == [0, 1] assert dists[0] == pytest.approx(0) def test_distance_consistency(): n, f = 1000, 3 i = AnnoyIndex(f, "manhattan") for j in range(n): i.add_item(j, numpy.random.normal(size=f)) i.build(10) for a in random.sample(range(n), 100): indices, dists = i.get_nns_by_item(a, 100, include_distances=True) for b, dist in zip(indices, dists): assert dist == pytest.approx(i.get_distance(a, b)) u = 
numpy.array(i.get_item_vector(a)) v = numpy.array(i.get_item_vector(b)) assert dist == pytest.approx(numpy.sum(numpy.fabs(u - v))) assert dist == pytest.approx( sum([abs(float(x) - float(y)) for x, y in zip(u, v)]) ) ================================================ FILE: test/memory_leak_test.py ================================================ # Copyright (c) 2013 Spotify AB # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy of # the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations under # the License. import pytest import random from annoy import AnnoyIndex def test_get_item_vector(): f = 10 i = AnnoyIndex(f, "euclidean") i.add_item(0, [random.gauss(0, 1) for x in range(f)]) for j in range(100): print(j, "...") for k in range(1000 * 1000): i.get_item_vector(0) def test_get_lots_of_nns(): f = 10 i = AnnoyIndex(f, "euclidean") i.add_item(0, [random.gauss(0, 1) for x in range(f)]) i.build(10) for j in range(100): assert i.get_nns_by_item(0, 999999999) == [0] def test_build_unbuid(): f = 10 i = AnnoyIndex(f, "euclidean") for j in range(1000): i.add_item(j, [random.gauss(0, 1) for x in range(f)]) i.build(10) for j in range(100): i.unbuild() i.build(10) assert i.get_n_items() == 1000 def test_include_distances(): # See #633 # (Not able to repro it though) f = 10 i = AnnoyIndex(f, "euclidean") for j in range(10000): i.add_item(j, [random.gauss(0, 1) for x in range(f)]) i.build(10) v = [random.gauss(0, 1) for x in range(f)] for _ in range(10000000): indices, distances = i.get_nns_by_vector(v, 1, include_distances=True) ================================================ 
FILE: test/multithreaded_build_test.py ================================================ import numpy from annoy import AnnoyIndex def _test_building_with_threads(n_jobs): n, f = 10000, 10 n_trees = 31 i = AnnoyIndex(f, "euclidean") for j in range(n): i.add_item(j, numpy.random.normal(size=f)) assert i.build(n_trees, n_jobs=n_jobs) assert n_trees == i.get_n_trees() def test_one_thread(): _test_building_with_threads(1) def test_two_threads(): _test_building_with_threads(2) def test_four_threads(): _test_building_with_threads(4) def test_eight_threads(): _test_building_with_threads(8) ================================================ FILE: test/on_disk_build_test.py ================================================ # Copyright (c) 2013 Spotify AB # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy of # the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations under # the License. 
import os import pytest from annoy import AnnoyIndex @pytest.fixture(scope="module", autouse=True) def setUp(): if os.path.exists("on_disk.ann"): os.remove("on_disk.ann") def add_items(i): i.add_item(0, [2, 2]) i.add_item(1, [3, 2]) i.add_item(2, [3, 3]) def check_nns(i): assert i.get_nns_by_vector([4, 4], 3) == [2, 1, 0] assert i.get_nns_by_vector([1, 1], 3) == [0, 1, 2] assert i.get_nns_by_vector([4, 2], 3) == [1, 2, 0] def test_on_disk(): f = 2 i = AnnoyIndex(f, "euclidean") i.on_disk_build("on_disk.ann") add_items(i) i.build(10) check_nns(i) i.unload() i.load("on_disk.ann") check_nns(i) j = AnnoyIndex(f, "euclidean") j.load("on_disk.ann") check_nns(j) ================================================ FILE: test/seed_test.py ================================================ # Copyright (c) 2013 Spotify AB # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy of # the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations under # the License. 
import numpy from annoy import AnnoyIndex def test_seeding(): f = 10 X = numpy.random.rand(1000, f) Y = numpy.random.rand(50, f) indexes = [] for i in range(2): index = AnnoyIndex(f, "angular") index.set_seed(42) for j in range(X.shape[0]): index.add_item(j, X[j]) index.build(10) indexes.append(index) for k in range(Y.shape[0]): assert indexes[0].get_nns_by_vector(Y[k], 100) == indexes[1].get_nns_by_vector( Y[k], 100 ) ================================================ FILE: test/threading_test.py ================================================ # Copyright (c) 2013 Spotify AB # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. You may obtain a copy of # the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations under # the License. import multiprocessing.pool import numpy from annoy import AnnoyIndex def test_threads(): n, f = 10000, 10 i = AnnoyIndex(f, "euclidean") for j in range(n): i.add_item(j, numpy.random.normal(size=f)) i.build(10) pool = multiprocessing.pool.ThreadPool() def query_f(j): i.get_nns_by_item(1, 1000) pool.map(query_f, range(n)) ================================================ FILE: test/types_test.py ================================================ # Copyright (c) 2013 Spotify AB # # Licensed under the Apache License, Version 2.0 (the "License"); you may not # use this file except in compliance with the License. 
You may obtain a copy of # the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the # License for the specific language governing permissions and limitations under # the License. import random import numpy import pytest from annoy import AnnoyIndex def test_numpy(n_points=1000, n_trees=10): f = 10 i = AnnoyIndex(f, "euclidean") for j in range(n_points): a = numpy.random.normal(size=f) a = a.astype( random.choice([numpy.float64, numpy.float32, numpy.uint8, numpy.int16]) ) i.add_item(j, a) i.build(n_trees) def test_tuple(n_points=1000, n_trees=10): f = 10 i = AnnoyIndex(f, "euclidean") for j in range(n_points): i.add_item(j, tuple(random.gauss(0, 1) for x in range(f))) i.build(n_trees) def test_wrong_length(n_points=1000, n_trees=10): f = 10 i = AnnoyIndex(f, "euclidean") i.add_item(0, [random.gauss(0, 1) for x in range(f)]) with pytest.raises(IndexError): i.add_item(1, [random.gauss(0, 1) for x in range(f + 1000)]) with pytest.raises(IndexError): i.add_item(2, []) i.build(n_trees) def test_range_errors(n_points=1000, n_trees=10): f = 10 i = AnnoyIndex(f, "euclidean") for j in range(n_points): i.add_item(j, [random.gauss(0, 1) for x in range(f)]) with pytest.raises(IndexError): i.add_item(-1, [random.gauss(0, 1) for x in range(f)]) i.build(n_trees) for bad_index in [-1000, -1, n_points, n_points + 1000]: with pytest.raises(IndexError): i.get_distance(0, bad_index) with pytest.raises(IndexError): i.get_nns_by_item(bad_index, 1) with pytest.raises(IndexError): i.get_item_vector(bad_index) def test_missing_len(): """ We should get a helpful error message if our vector doesn't have a __len__ method. 
""" class FakeCollection: pass i = AnnoyIndex(10, "euclidean") with pytest.raises(TypeError) as excinfo: i.add_item(1, FakeCollection()) assert str(excinfo.value) == "object of type 'FakeCollection' has no len()" def test_missing_getitem(): """ We should get a helpful error message if our vector doesn't have a __getitem__ method. """ class FakeCollection: def __len__(self): return 5 i = AnnoyIndex(5, "euclidean") with pytest.raises(TypeError) as excinfo: i.add_item(1, FakeCollection()) assert str(excinfo.value) == "'FakeCollection' object is not subscriptable" def test_short(): """ Ensure we handle our vector not being long enough. """ class FakeCollection: def __len__(self): return 3 def __getitem__(self, i): raise IndexError i = AnnoyIndex(3, "euclidean") with pytest.raises(IndexError): i.add_item(1, FakeCollection()) def test_non_float(): """ We should error gracefully if non-floats are provided in our vector. """ array_strings = ["1", "2", "3"] i = AnnoyIndex(3, "euclidean") with pytest.raises(TypeError) as excinfo: i.add_item(1, array_strings) assert str(excinfo.value) == "must be real number, not str" ================================================ FILE: tox.ini ================================================ [tox] envlist=py{26,27,33,34,35,36,37,38,39,310,311,312,313}, go, lua [testenv] setenv = TRAVIS = {env:TRAVIS:} commands = pip install numpy h5py pip install . 
python setup.py nosetests --verbosity=3 [testenv:go] setenv = GOPATH = {env:HOME:}/gopath GOROOT = /usr/local/go whitelist_externals=* commands = mkdir -p {env:GOPATH:}/src/annoyindex wget https://storage.googleapis.com/golang/go1.5.linux-amd64.tar.gz sudo tar -C /usr/local -xzf go1.5.linux-amd64.tar.gz sudo add-apt-repository -y ppa:timsc/swig-3.0.12 sudo apt-get update -qq sudo apt-get install -y swig3.0 swig3.0 -go -intgosize 64 -cgo -c++ src/annoygomodule.i cp src/annoygomodule_wrap.cxx src/annoyindex.go src/annoygomodule.h src/annoylib.h src/kissrandom.h {env:GOPATH:}/src/annoyindex {env:GOROOT}/bin/go build annoyindex [testenv:lua] setenv = HOME = {env:HOME} whitelist_externals=* commands = pip install hererocks hererocks {toxworkdir}/here --{env:LUA:} --luarocks 2.2 {toxworkdir}/here/bin/luarocks make {toxworkdir}/here/bin/luarocks install busted {toxworkdir}/here/bin/busted test/annoy_test.lua