[
  {
    "path": ".github/workflows/ci.yml",
    "content": "name: Annoy\n\non:\n  push:\n    branches:\n      - main\n  pull_request:\n\njobs:\n  unit-tests:\n    runs-on: ubuntu-22.04\n    strategy:\n      fail-fast: false\n      matrix:\n        python-version: [\"3.7\", \"3.8\", \"3.9\", \"3.10\", \"3.11\", \"3.12\", \"3.13\"]\n        os: [\"ubuntu-20.04\", \"macos-latest\", \"windows-latest\"]\n\n    steps:\n      - uses: actions/checkout@v3 # Pull the repository\n      - uses: actions/setup-python@v4\n        with:\n          python-version: ${{ matrix.python-version }}\n      - run: pip install .\n      - run: pip install h5py numpy pytest\n      - run: pytest -v\n"
  },
  {
    "path": ".github/workflows/publish.yml",
    "content": "name: Publish\n\non:\n  push:\n    tags:\n      - 'v*.*.*'\n\njobs:\n  build:\n    runs-on: ${{ matrix.os }}\n    strategy:\n      fail-fast: false\n      matrix:\n        os: [ubuntu-latest, windows-latest, macos-latest]\n\n    steps:\n      - name: Checkout code\n        uses: actions/checkout@v2\n\n      - name: Set up QEMU (for Linux aarch64)\n        if: runner.os == 'Linux'\n        uses: docker/setup-qemu-action@v3\n        with:\n          platforms: arm64\n\n      - name: Set up Python\n        uses: actions/setup-python@v5\n        with:\n          python-version: '3.x'\n\n      - name: Install cibuildwheel\n        run: python -m pip install cibuildwheel==3.2.1\n\n      - name: Build wheels\n        run: python -m cibuildwheel --output-dir dist\n        env:\n          CIBW_BEFORE_BUILD: python -m pip install -U pip && rm -rf build\n          CIBW_ARCHS_LINUX: auto aarch64\n\n      - name: Upload wheels\n        uses: actions/upload-artifact@v4\n        with:\n          name: built-wheels-${{ matrix.os }}-${{ strategy.job-index }}\n          path: ./dist/*.whl\n\n      - name: Build source distribution\n        if: matrix.os == 'ubuntu-latest'\n        run: python -m pip install build && python -m build --sdist --outdir dist\n\n      - name: Upload sdist\n        if: matrix.os == 'ubuntu-latest'\n        uses: actions/upload-artifact@v4\n        with:\n          name: built-sdist\n          path: ./dist/*.tar.gz\n\n  publish:\n    needs: build\n    runs-on: ubuntu-latest\n    # pypi trusted publishing via OIDC\n    permissions:\n      id-token: write\n    steps:\n      - name: Download all artifacts\n        uses: actions/download-artifact@v4\n        with:\n          pattern: built-*\n          path: dist\n          merge-multiple: true\n\n      - name: Publish package\n        uses: pypa/gh-action-pypi-publish@release/v1\n        if: startsWith(github.ref, 'refs/tags/v') && github.event_name == 'push'\n        with:\n          password: 
${{ secrets.PYPI_API_TOKEN }}\n"
  },
  {
    "path": ".gitignore",
    "content": "*.egg-info/\n*.egg/\n*.so\n*.o\nbuild/\ndist/\n.vscode/\n*.pdb\n\nMANIFEST\n*.py[cod]\n*.idea\n\n# testing\n*.ann\n*.tree\n*.annoy\n*.idx\n*.hdf5\n"
  },
  {
    "path": "CMakeLists.txt",
    "content": "cmake_minimum_required(VERSION 3.15...3.25 FATAL_ERROR)\n\nproject(Annoy\n  DESCRIPTION \"Approximate Nearest Neighbors Oh Yeah\"\n  VERSION 1.17.1\n  LANGUAGES CXX)\n\nadd_library(Annoy INTERFACE)\nadd_library(Annoy::Annoy ALIAS Annoy)\n\nforeach (HEADER annoylib.h kissrandom.h mman.h)\n  configure_file(\"${CMAKE_CURRENT_SOURCE_DIR}/src/${HEADER}\" \"${CMAKE_CURRENT_BINARY_DIR}/include/annoy/${HEADER}\" COPYONLY)\nendforeach ()\n\ntarget_include_directories(Annoy INTERFACE\n  $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>\n  $<INSTALL_INTERFACE:include>)\n\n# Install\ninclude(GNUInstallDirs)\n\ninstall(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/include/\n  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})\n\ninstall(TARGETS Annoy\n  EXPORT AnnoyTargets)\n\ninstall(EXPORT AnnoyTargets\n  FILE AnnoyConfig.cmake\n  NAMESPACE Annoy::\n  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/annoy)\n\nexport(TARGETS Annoy NAMESPACE Annoy:: FILE AnnoyConfig.cmake)\n"
  },
  {
    "path": "LICENSE",
    "content": "\n                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright 2021 (c) Spotify and its affiliates.\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "MANIFEST.in",
    "content": "include README.rst LICENSE ann.png\ninclude src/annoylib.h\ninclude src/kissrandom.h\ninclude src/mman.h\n"
  },
  {
    "path": "README.rst",
    "content": "Annoy\n-----\n\n\n\n.. figure:: https://raw.github.com/spotify/annoy/master/ann.png\n   :alt: Annoy example\n   :align: center\n\n.. image:: https://github.com/spotify/annoy/actions/workflows/ci.yml/badge.svg\n   :target: https://github.com/spotify/annoy/actions\n\nAnnoy (`Approximate Nearest Neighbors <http://en.wikipedia.org/wiki/Nearest_neighbor_search#Approximate_nearest_neighbor>`__ Oh Yeah) is a C++ library with Python bindings to search for points in space that are close to a given query point. It also creates large read-only file-based data structures that are `mmapped <https://en.wikipedia.org/wiki/Mmap>`__ into memory so that many processes may share the same data.\n\nInstall\n-------\n\nTo install, simply do ``pip install --user annoy`` to pull down the latest version from `PyPI <https://pypi.python.org/pypi/annoy>`_.\n\nFor the C++ version, just clone the repo and ``#include \"annoylib.h\"``.\n\nBackground\n----------\n\nThere are some other libraries to do nearest neighbor search. Annoy is almost as fast as the fastest libraries, (see below), but there is actually another feature that really sets Annoy apart: it has the ability to **use static files as indexes**. In particular, this means you can **share index across processes**. Annoy also decouples creating indexes from loading them, so you can pass around indexes as files and map them into memory quickly. Another nice thing of Annoy is that it tries to minimize memory footprint so the indexes are quite small.\n\nWhy is this useful? If you want to find nearest neighbors and you have many CPU's, you only need to build the index once. You can also pass around and distribute static files to use in production environment, in Hadoop jobs, etc. Any process will be able to load (mmap) the index into memory and will be able to do lookups immediately.\n\nWe use it at `Spotify <http://www.spotify.com/>`__ for music recommendations. 
After running matrix factorization algorithms, every user/item can be represented as a vector in f-dimensional space. This library helps us search for similar users/items. We have many millions of tracks in a high-dimensional space, so memory usage is a prime concern.\n\nAnnoy was built by `Erik Bernhardsson <http://www.erikbern.com>`__ in a couple of afternoons during `Hack Week <http://labs.spotify.com/2013/02/15/organizing-a-hack-week/>`__.\n\nSummary of features\n-------------------\n\n* `Euclidean distance <https://en.wikipedia.org/wiki/Euclidean_distance>`__, `Manhattan distance <https://en.wikipedia.org/wiki/Taxicab_geometry>`__, `cosine distance <https://en.wikipedia.org/wiki/Cosine_similarity>`__, `Hamming distance <https://en.wikipedia.org/wiki/Hamming_distance>`__, or `Dot (Inner) Product distance <https://en.wikipedia.org/wiki/Dot_product>`__\n* Cosine distance is equivalent to Euclidean distance of normalized vectors = sqrt(2-2*cos(u, v))\n* Works better if you don't have too many dimensions (like <100) but seems to perform surprisingly well even up to 1,000 dimensions\n* Small memory usage\n* Lets you share memory between multiple processes\n* Index creation is separate from lookup (in particular you can not add more items once the tree has been created)\n* Native Python support, tested with 2.7, 3.6, and 3.7.\n* Build index on disk to enable indexing big datasets that won't fit into memory (contributed by `Rene Hollander <https://github.com/ReneHollander>`__)\n\nPython code example\n-------------------\n\n.. 
code-block:: python\n\n  from annoy import AnnoyIndex\n  import random\n\n  f = 40  # Length of item vector that will be indexed\n\n  t = AnnoyIndex(f, 'angular')\n  for i in range(1000):\n      v = [random.gauss(0, 1) for z in range(f)]\n      t.add_item(i, v)\n\n  t.build(10) # 10 trees\n  t.save('test.ann')\n\n  # ...\n\n  u = AnnoyIndex(f, 'angular')\n  u.load('test.ann') # super fast, will just mmap the file\n  print(u.get_nns_by_item(0, 1000)) # will find the 1000 nearest neighbors\n\nRight now it only accepts integers as identifiers for items. Note that it will allocate memory for max(id)+1 items because it assumes your items are numbered 0 … n-1. If you need other id's, you will have to keep track of a map yourself.\n\nFull Python API\n---------------\n\n* ``AnnoyIndex(f, metric)`` returns a new index that's read-write and stores vector of ``f`` dimensions. Metric can be ``\"angular\"``, ``\"euclidean\"``, ``\"manhattan\"``, ``\"hamming\"``, or ``\"dot\"``.\n* ``a.add_item(i, v)`` adds item ``i`` (any nonnegative integer) with vector ``v``. Note that it will allocate memory for ``max(i)+1`` items.\n* ``a.build(n_trees, n_jobs=-1)`` builds a forest of ``n_trees`` trees. More trees gives higher precision when querying. After calling ``build``, no more items can be added. ``n_jobs`` specifies the number of threads used to build the trees. ``n_jobs=-1`` uses all available CPU cores.\n* ``a.save(fn, prefault=False)`` saves the index to disk and loads it (see next function). After saving, no more items can be added.\n* ``a.load(fn, prefault=False)`` loads (mmaps) an index from disk. If `prefault` is set to `True`, it will pre-read the entire file into memory (using mmap with `MAP_POPULATE`). Default is `False`.\n* ``a.unload()`` unloads.\n* ``a.get_nns_by_item(i, n, search_k=-1, include_distances=False)`` returns the ``n`` closest items. During the query it will inspect up to ``search_k`` nodes which defaults to ``n_trees * n`` if not provided. 
``search_k`` gives you a run-time tradeoff between better accuracy and speed. If you set ``include_distances`` to ``True``, it will return a 2 element tuple with two lists in it: the second one containing all corresponding distances.\n* ``a.get_nns_by_vector(v, n, search_k=-1, include_distances=False)`` same but query by vector ``v``.\n* ``a.get_item_vector(i)`` returns the vector for item ``i`` that was previously added.\n* ``a.get_distance(i, j)`` returns the distance between items ``i`` and ``j``. NOTE: this used to return the *squared* distance, but has been changed as of Aug 2016.\n* ``a.get_n_items()`` returns the number of items in the index.\n* ``a.get_n_trees()`` returns the number of trees in the index.\n* ``a.on_disk_build(fn)`` prepares annoy to build the index in the specified file instead of RAM (execute before adding items, no need to save after build)\n* ``a.set_seed(seed)`` will initialize the random number generator with the given seed.  Only used for building up the tree, i. e. only necessary to pass this before adding the items.  Will have no effect after calling `a.build(n_trees)` or `a.load(fn)`.\n\nNotes:\n\n* There's no bounds checking performed on the values so be careful.\n* Annoy uses Euclidean distance of normalized vectors for its angular distance, which for two vectors u,v is equal to ``sqrt(2(1-cos(u,v)))``\n\n\nThe C++ API is very similar: just ``#include \"annoylib.h\"`` to get access to it.\n\nTradeoffs\n---------\n\nThere are just two main parameters needed to tune Annoy: the number of trees ``n_trees`` and the number of nodes to inspect during searching ``search_k``.\n\n* ``n_trees`` is provided during build time and affects the build time and the index size. A larger value will give more accurate results, but larger indexes.\n* ``search_k`` is provided in runtime and affects the search performance. 
A larger value will give more accurate results, but will take longer time to return.\n\nIf ``search_k`` is not provided, it will default to ``n * n_trees`` where ``n`` is the number of approximate nearest neighbors. Otherwise, ``search_k`` and ``n_trees`` are roughly independent, i.e. the value of ``n_trees`` will not affect search time if ``search_k`` is held constant and vice versa. Basically it's recommended to set ``n_trees`` as large as possible given the amount of memory you can afford, and it's recommended to set ``search_k`` as large as possible given the time constraints you have for the queries.\n\nYou can also accept slower search times in favour of reduced loading times, memory usage, and disk IO. On supported platforms the index is prefaulted during ``load`` and ``save``, causing the file to be pre-emptively read from disk into memory. If you set ``prefault`` to ``False``, pages of the mmapped index are instead read from disk and cached in memory on-demand, as necessary for a search to complete. This can significantly increase early search times but may be better suited for systems with low memory compared to index size, when few queries are executed against a loaded index, and/or when large areas of the index are unlikely to be relevant to search queries.\n\n\nHow does it work\n----------------\n\nUsing `random projections <http://en.wikipedia.org/wiki/Locality-sensitive_hashing#Random_projection>`__ and by building up a tree. At every intermediate node in the tree, a random hyperplane is chosen, which divides the space into two subspaces. This hyperplane is chosen by sampling two points from the subset and taking the hyperplane equidistant from them.\n\nWe do this k times so that we get a forest of trees. 
k has to be tuned to your need, by looking at what tradeoff you have between precision and performance.\n\nHamming distance (contributed by `Martin Aumüller <https://github.com/maumueller>`__) packs the data into 64-bit integers under the hood and uses built-in bit count primitives so it could be quite fast. All splits are axis-aligned.\n\nDot Product distance (contributed by `Peter Sobot <https://github.com/psobot>`__ and `Pavel Korobov <https://github.com/pkorobov>`__) reduces the provided vectors from dot (or \"inner-product\") space to a more query-friendly cosine space using `a method by Bachrach et al., at Microsoft Research, published in 2014 <https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/XboxInnerProduct.pdf>`__.\n\n\n\nMore info\n---------\n\n* `Dirk Eddelbuettel <https://github.com/eddelbuettel>`__ provides an `R version of Annoy <http://dirk.eddelbuettel.com/code/rcpp.annoy.html>`__.\n* `Andy Sloane <https://github.com/a1k0n>`__ provides a `Java version of Annoy <https://github.com/spotify/annoy-java>`__ although currently limited to cosine and read-only.\n* `Pishen Tsai <https://github.com/pishen>`__ provides a `Scala wrapper of Annoy <https://github.com/pishen/annoy4s>`__ which uses JNA to call the C++ library of Annoy.\n* `Atsushi Tatsuma <https://github.com/yoshoku>`__ provides `Ruby bindings for Annoy <https://github.com/yoshoku/annoy.rb>`__.\n* There is `experimental support for Go <https://github.com/spotify/annoy/blob/master/README_GO.rst>`__ provided by `Taneli Leppä <https://github.com/rosmo>`__.\n* `Boris Nagaev <https://github.com/starius>`__ wrote `Lua bindings <https://github.com/spotify/annoy/blob/master/README_Lua.md>`__.\n* During part of Spotify Hack Week 2016 (and a bit afterward), `Jim Kang <https://github.com/jimkang>`__ wrote `Node bindings <https://github.com/jimkang/annoy-node>`__ for Annoy.\n* `Min-Seok Kim <https://github.com/mskimm>`__ built a `Scala version <https://github.com/mskimm/ann4s>`__ of 
Annoy.\n* `hanabi1224 <https://github.com/hanabi1224>`__ built a read-only `Rust version <https://github.com/hanabi1224/RuAnnoy>`__ of Annoy, together with **dotnet, jvm and dart** read-only bindings.\n* `Presentation from New York Machine Learning meetup <http://www.slideshare.net/erikbern/approximate-nearest-neighbor-methods-and-vector-models-nyc-ml-meetup>`__ about Annoy\n* Annoy is available as a `conda package <https://anaconda.org/conda-forge/python-annoy>`__ on Linux, OS X, and Windows.\n* `ann-benchmarks <https://github.com/erikbern/ann-benchmarks>`__ is a benchmark for several approximate nearest neighbor libraries. Annoy seems to be fairly competitive, especially at higher precisions:\n\n.. figure:: https://raw.githubusercontent.com/erikbern/ann-benchmarks/main/results/glove-100-angular.png\n   :alt: ANN benchmarks\n   :align: center\n   :target: https://github.com/erikbern/ann-benchmarks\n\nSource code\n-----------\n\nIt's all written in C++ with a handful of ugly optimizations for performance and memory usage. You have been warned :)\n\nThe code should support Windows, thanks to `Qiang Kou <https://github.com/thirdwing>`__ and `Timothy Riley <https://github.com/tjrileywisc>`__.\n\nTo run the tests, install the package and the test dependencies (``pip install .`` followed by ``pip install h5py numpy pytest``) and then execute ``pytest -v``. The test suite includes a big real world dataset that is downloaded from the internet, so it will take a few minutes to execute.\n\nDiscuss\n-------\n\nFeel free to post any questions or comments to the `annoy-user <https://groups.google.com/group/annoy-user>`__ group. I'm `@fulhack <https://twitter.com/fulhack>`__ on Twitter.\n"
  },
  {
    "path": "README_GO.rst",
    "content": "Install\n-------\n\nTo install, you'll need Swig (tested with Swig 4.2.1 on Ubuntu 24.04), and then just::\n\n  swig -go -intgosize 64 -cgo -c++ src/annoygomodule.i\n  mkdir -p $(go env GOPATH)/src/annoy\n  cp src/annoygomodule_wrap.cxx src/annoy.go src/annoygomodule.h src/annoylib.h src/kissrandom.h test/annoy_test.go $(go env GOPATH)/src/annoy\n  cd $(go env GOPATH)/src/annoy\n  go mod init github.com/spotify/annoy\n  go mod tidy\n  go test\n\nBackground\n----------\n\nSee the main README.\n\nGo code example\n-------------------\n\n.. code-block:: go\n\n  package main\n  \n  import (\n         \"fmt\"\n         \"math/rand\"\n\n         \"github.com/spotify/annoy\"\n  )\n  \n  func main() {\n       f := 40\n       t := annoy.NewAnnoyIndexAngular(f)\n       for i := 0; i < 1000; i++ {\n       \t item := make([]float32, 0, f)\n       \t for x:= 0; x < f; x++ {\n  \t     item = append(item, rand.Float32())\n  \t }\n  \t t.AddItem(i, item)\n       }\n       t.Build(10)\n       t.Save(\"test.ann\")\n  \n       annoy.DeleteAnnoyIndexAngular(t)\n       \n       t = annoy.NewAnnoyIndexAngular(f)\n       t.Load(\"test.ann\")\n       \n       result := annoy.NewAnnoyVectorInt()\n       defer result.Free()\n       t.GetNnsByItem(0, 1000, -1, result)\n       fmt.Printf(\"%v\\n\", result.ToSlice())\n  \n  }\n  \nRight now it only accepts integers as identifiers for items. Note that it will allocate memory for max(id)+1 items because it assumes your items are numbered 0 … n-1. If you need other id's, you will have to keep track of a map yourself.\n\nFull Go API\n---------------\n\nSee annoygomodule.h. Generally the same as Python API except some arguments are not optional. Go binding does not support multithreaded build.\n\nTests\n-------\nA simple test is supplied in test/annoy_test.go.\n\nDiscuss\n-------\n\nMemory leak in the previous versions has been fixed thanks to https://github.com/swig/swig/issues/2292. 
(memory leak fix is implemented in https://github.com/Rikanishu/annoy-go)\n\nGo glue written by Taneli Leppä (@rosmo). You can contact me via email (see https://github.com/rosmo).\n"
  },
  {
    "path": "README_Lua.md",
    "content": "Install\n-------\n\nTo install, you'll need Lua (binary + library) and LuaRocks.\n\nIf you have Python and Pip, you can get Lua and LuaRocks\nusing [hererocks](https://github.com/mpeterv/hererocks/),\nwritten by Peter Melnichenko.\n\n```\n  pip install hererocks\n  hererocks here --lua 5.1 --luarocks 2.2\n```\n\nThis command installs Lua and LuaRocks locally to directory `here`.\nTo activate it, add `here/bin` to `PATH`:\n\n```\n  export PATH=\"$(pwd)/here/bin/:$PATH\"\n```\n\nThen you can use commands `lua`, `luarocks`,\nand tools installed by `luarocks`.\n\nTo build and install `annoy`, type:\n\n```\n  luarocks make\n```\n\nBackground\n----------\n\nSee the main README.\n\nLua code example\n----------------\n\n```lua\nlocal annoy = require \"annoy\"\n\nlocal f = 3\nlocal t = annoy.AnnoyIndex(f) -- Length of item vector that will be indexed\nfor i = 0, 999 do\n  local v = {math.random(), math.random(), math.random()}\n  t:add_item(i, v)\nend\n\nt:build(10) -- 10 trees\nt:save('test.ann')\n\n-- ...\n\nlocal u = annoy.AnnoyIndex(f)\nu:load('test.ann') -- super fast, will just mmap the file\n\n-- find the 10 nearest neighbors\nlocal neighbors = u:get_nns_by_item(0, 10)\nfor rank, i in ipairs(neighbors) do\n  print(\"neighbor\", rank, \"is\", i)\nend\n```\n\nFull Lua API\n------------\n\nLua API closely resembles Python API, see main README. Lua binding does not support multithreaded build.\n\n\nTests\n-------\n\nFile `test/annoy_test.lua` is the literal translation of\n`test/annoy_test.py` from Python+Nosetests to Lua+Busted.\n\nTo run tests, you need [Busted](http://olivinelabs.com/busted/),\nElegant Lua unit testing. 
To install it, type:\n\n```\n  luarocks install busted\n```\n\nTo run tests, type:\n\n```\n  busted test/annoy_test.lua\n```\n\nIt will take a few minutes to execute.\n\nDiscuss\n-------\n\nThere might be some memory leaks if inputs are incorrect.\nSome functions allocate stack objects calling Lua functions throwing\nLua errors (e.g., `luaL_checkinteger`). A Lua error may omit calling\nC++ destructors when unwinding the stack. (Whether it does depends on\nthe Lua implementation and platform being in use.)\n\nLua binding was written by Boris Nagaev.\nYou can contact me via email (see https://github.com/starius).\n"
  },
  {
    "path": "RELEASE.md",
    "content": "How to release\n--------------\n\n1. Make sure you're on master. `git checkout master && git fetch && git reset --hard origin/master`\n1. Update `setup.py` to the newest version, `git add setup.py && git commit -m \"version 1.2.3\"`\n1. `python setup.py sdist bdist_wheel`\n1. `git tag -a v1.2.3 -m \"version 1.2.3\"`\n1. `git push --tags origin master` to push the last version to Github\n1. Go to https://github.com/spotify/annoy/releases and click \"Draft a new release\"\n1. `twine upload dist/annoy-1.2.3*`\n\nTODO\n----\n\n* Wheel\n"
  },
  {
    "path": "annoy/__init__.py",
    "content": "# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance with the License. You may obtain a copy of\n# the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n# License for the specific language governing permissions and limitations under\n# the License.\n\n# This module is a dummy wrapper around the underlying C++ module.\nfrom .annoylib import Annoy as AnnoyIndex\n"
  },
  {
    "path": "annoy/__init__.pyi",
    "content": "\nfrom typing import Sized, overload\nfrom typing_extensions import Literal, Protocol\n\nclass _Vector(Protocol, Sized):\n    def __getitem__(self, __index: int) -> float: ...\n\nclass AnnoyIndex:\n    f: int\n    def __init__(self, f: int, metric: Literal[\"angular\", \"euclidean\", \"manhattan\", \"hamming\", \"dot\"]) -> None: ...\n    def load(self, fn: str, prefault: bool = ...) -> Literal[True]: ...\n    def save(self, fn: str, prefault: bool = ...) -> Literal[True]: ...\n    @overload\n    def get_nns_by_item(self, i: int, n: int, search_k: int = ..., include_distances: Literal[False] = ...) -> list[int]: ...\n    @overload\n    def get_nns_by_item(\n        self, i: int, n: int, search_k: int, include_distances: Literal[True]\n    ) -> tuple[list[int], list[float]]: ...\n    @overload\n    def get_nns_by_item(\n        self, i: int, n: int, search_k: int = ..., *, include_distances: Literal[True]\n    ) -> tuple[list[int], list[float]]: ...\n    @overload\n    def get_nns_by_vector(\n        self, vector: _Vector, n: int, search_k: int = ..., include_distances: Literal[False] = ...\n    ) -> list[int]: ...\n    @overload\n    def get_nns_by_vector(\n        self, vector: _Vector, n: int, search_k: int, include_distances: Literal[True]\n    ) -> tuple[list[int], list[float]]: ...\n    @overload\n    def get_nns_by_vector(\n        self, vector: _Vector, n: int, search_k: int = ..., *, include_distances: Literal[True]\n    ) -> tuple[list[int], list[float]]: ...\n    def get_item_vector(self, __i: int) -> list[float]: ...\n    def add_item(self, i: int, vector: _Vector) -> None: ...\n    def on_disk_build(self, fn: str) -> Literal[True]: ...\n    def build(self, n_trees: int, n_jobs: int = ...) 
-> Literal[True]: ...\n    def unbuild(self) -> Literal[True]: ...\n    def unload(self) -> Literal[True]: ...\n    def get_distance(self, __i: int, __j: int) -> float: ...\n    def get_n_items(self) -> int: ...\n    def get_n_trees(self) -> int: ...\n    def verbose(self, __v: bool) -> Literal[True]: ...\n    def set_seed(self, __s: int) -> None: ...\n"
  },
  {
    "path": "annoy/py.typed",
    "content": ""
  },
  {
    "path": "annoy-dev-1.rockspec",
    "content": "-- Copyright (c) 2016 Boris Nagaev\n--\n-- Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n-- use this file except in compliance with the License. You may obtain a copy of\n-- the License at\n--\n-- http://www.apache.org/licenses/LICENSE-2.0\n--\n-- Unless required by applicable law or agreed to in writing, software\n-- distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n-- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n-- License for the specific language governing permissions and limitations under\n-- the License.\n\npackage = \"annoy\"\nversion = \"dev-1\"\nsource = {\n    url = \"git://github.com/spotify/annoy.git\",\n}\ndescription = {\n    summary = \"Approximate Nearest Neighbors Oh Yeah\",\n    homepage = \"https://github.com/spotify/annoy\",\n    license = \"Apache\",\n    detailed = [[\nAnnoy (Approximate Nearest Neighbors Oh Yeah) is a C++ library with Python\nGo and Lua bindings to search for points in space that are close to a given\nquery point. It also creates large read-only file-based data structures\nthat are mmapped into memory so that many processes may share the same data.\n]],\n}\ndependencies = {\n    \"lua >= 5.1\",\n}\nbuild = {\n    type = \"builtin\",\n    modules = {\n        ['annoy'] = {\n            sources = {\n                \"src/annoyluamodule.cc\",\n            },\n        },\n    },\n    platforms = {\n        unix = {\n            modules = {\n                ['annoy'] = {\n                    libraries = {\"stdc++\"},\n                },\n            },\n        },\n        mingw32 = {\n            modules = {\n                ['annoy'] = {\n                    libraries = {\"stdc++\"},\n                },\n            },\n        },\n    },\n}\n"
  },
  {
    "path": "debian/changelog",
    "content": "spotify-annoy (1.0.0) unstable; urgency=low\n\n  * Initial release.\n\n -- Erik Bernhardsson <erikbern@spotify.com>  Wed, 20 Feb 2013 00:00:00 +0000\n\n\n"
  },
  {
    "path": "debian/compat",
    "content": "7\n"
  },
  {
    "path": "debian/control",
    "content": "Source: spotify-annoy\nSection: non-free/net\nPriority: extra\nMaintainer: Erik Bernhardsson <erikbern@spotify.com>\nBuild-Depends: debhelper (>= 7), python-all-dev, python-setuptools\nStandards-Version: 3.7.2\nXS-Python-Version: >= 2.6\n\nPackage: spotify-annoy\nArchitecture: any\nDepends: ${python:Depends}\nDescription: Python module (written in C++) for high-dimensional approximate nearest neigbor (ANN) queries\n"
  },
  {
    "path": "debian/rules",
    "content": "#!/usr/bin/make -f\n\n%:\n\tdh $@\n"
  },
  {
    "path": "examples/mmap_test.py",
    "content": "from annoy import AnnoyIndex\n\na = AnnoyIndex(3, 'angular')\na.add_item(0, [1, 0, 0])\na.add_item(1, [0, 1, 0])\na.add_item(2, [0, 0, 1])\na.build(-1)\na.save('test.tree')\n\nb = AnnoyIndex(3)\nb.load('test.tree')\n\nprint(b.get_nns_by_item(0, 100))\nprint(b.get_nns_by_vector([1.0, 0.5, 0.5], 100))\n"
  },
  {
    "path": "examples/precision_test.cpp",
    "content": "/*\n * precision_test.cpp\n\n *\n *  Created on: Jul 13, 2016\n *      Author: Claudio Sanhueza\n *      Contact: csanhuezalobos@gmail.com\n */\n\n#include <iostream>\n#include <iomanip>\n#include \"../src/kissrandom.h\"\n#include \"../src/annoylib.h\"\n#include <chrono>\n#include <algorithm>\n#include <map>\n#include <random>\n\nusing namespace Annoy;\nint precision(int f=40, int n=1000000){\n\tstd::chrono::high_resolution_clock::time_point t_start, t_end;\n\n\tstd::default_random_engine generator;\n\tstd::normal_distribution<double> distribution(0.0, 1.0);\n\n\t//******************************************************\n\t//Building the tree\n\tAnnoyIndex<int, double, Angular, Kiss32Random, AnnoyIndexMultiThreadedBuildPolicy> t = AnnoyIndex<int, double, Angular, Kiss32Random, AnnoyIndexMultiThreadedBuildPolicy>(f);\n\n\tstd::cout << \"Building index ... be patient !!\" << std::endl;\n\tstd::cout << \"\\\"Trees that are slow to grow bear the best fruit\\\" (Moliere)\" << std::endl;\n\n\n\n\tfor(int i=0; i<n; ++i){\n\t\tdouble *vec = (double *) malloc( f * sizeof(double) );\n\n\t\tfor(int z=0; z<f; ++z){\n\t\t\tvec[z] = (distribution(generator));\n\t\t}\n\n\t\tt.add_item(i, vec);\n\n\t\tstd::cout << \"Loading objects ...\\t object: \"<< i+1 << \"\\tProgress:\"<< std::fixed << std::setprecision(2) << (double) i / (double)(n + 1) * 100 << \"%\\r\";\n\n\t}\n\tstd::cout << std::endl;\n\tstd::cout << \"Building index num_trees = 2 * num_features ...\";\n\tt_start = std::chrono::high_resolution_clock::now();\n\tt.build(2 * f);\n\tt_end = std::chrono::high_resolution_clock::now();\n\tauto duration = std::chrono::duration_cast<std::chrono::seconds>( t_end - t_start ).count();\n\tstd::cout << \" Done in \"<< duration << \" secs.\" << std::endl;\n\n\n\tstd::cout << \"Saving index ...\";\n\tt.save(\"precision.tree\");\n\tstd::cout << \" Done\" << std::endl;\n\n\n\n\t//******************************************************\n\tstd::vector<int> limits = {10, 100, 
1000, 10000};\n\tint K=10;\n\tint prec_n = 1000;\n\n\tstd::map<int, double> prec_sum;\n\tstd::map<int, double> time_sum;\n\tstd::vector<int> closest;\n\n\t//init precision and timers map\n\tfor(std::vector<int>::iterator it = limits.begin(); it!=limits.end(); ++it){\n\t\tprec_sum[(*it)] = 0.0;\n\t\ttime_sum[(*it)] = 0.0;\n\t}\n\n\t// doing the work\n\tfor(int i=0; i<prec_n; ++i){\n\n\t\t//select a random node\n\t\tint j = rand() % n;\n\n\t\tstd::cout << \"finding nbs for \" << j << std::endl;\n\n\t\t// getting the K closest\n\t\tt.get_nns_by_item(j, K, n, &closest, nullptr);\n\n\t\tstd::vector<int> toplist;\n\t\tstd::vector<int> intersection;\n\n\t\tfor(std::vector<int>::iterator limit = limits.begin(); limit!=limits.end(); ++limit){\n\n\t\t\tt_start = std::chrono::high_resolution_clock::now();\n\t\t\tt.get_nns_by_item(j, (*limit), (size_t) -1, &toplist, nullptr); //search_k defaults to \"n_trees * n\" if not provided.\n\t\t\tt_end = std::chrono::high_resolution_clock::now();\n\t\t\tauto duration = std::chrono::duration_cast<std::chrono::milliseconds>( t_end - t_start ).count();\n\n\t\t\t//intersecting results\n\t\t\tstd::sort(closest.begin(), closest.end(), std::less<int>());\n\t\t\tstd::sort(toplist.begin(), toplist.end(), std::less<int>());\n\t\t\tintersection.resize(std::max(closest.size(), toplist.size()));\n\t\t\tstd::vector<int>::iterator it_set = std::set_intersection(closest.begin(), closest.end(), toplist.begin(), toplist.end(), intersection.begin());\n\t\t\tintersection.resize(it_set-intersection.begin());\n\n\t\t\t// storing metrics\n\t\t\tint found = intersection.size();\n\t\t\tdouble hitrate = found / (double) K;\n\t\t\tprec_sum[(*limit)] += hitrate;\n\n\t\t\ttime_sum[(*limit)] += duration;\n\n\n\t\t\t//deallocate memory\n\t\t\tvector<int>().swap(intersection);\n\t\t\tvector<int>().swap(toplist);\n\t\t}\n\n\t\t//print resulting metrics\n\t\tfor(std::vector<int>::iterator limit = limits.begin(); limit!=limits.end(); ++limit){\n\t\t\tstd::cout << 
\"limit: \" << (*limit) << \"\\tprecision: \"<< std::fixed << std::setprecision(2) << (100.0 * prec_sum[(*limit)] / (i + 1)) << \"% \\tavg. time: \"<< std::fixed<< std::setprecision(6) << (time_sum[(*limit)] / (i + 1)) * 1e-04 << \"s\" << std::endl;\n\t\t}\n\n\t\tclosest.clear(); vector<int>().swap(closest);\n\n\t}\n\n\tstd::cout << \"\\nDone\" << std::endl;\n\treturn 0;\n}\n\n\nvoid help(){\n\tstd::cout << \"Annoy Precision C++ example\" << std::endl;\n\tstd::cout << \"Usage:\" << std::endl;\n\tstd::cout << \"(default)\t\t./precision\" << std::endl;\n\tstd::cout << \"(using parameters)\t./precision num_features num_nodes\" << std::endl;\n\tstd::cout << std::endl;\n}\n\nvoid feedback(int f, int n){\n\tstd::cout<<\"Runing precision example with:\" << std::endl;\n\tstd::cout<<\"num. features: \"<< f << std::endl;\n\tstd::cout<<\"num. nodes: \"<< n << std::endl;\n\tstd::cout << std::endl;\n}\n\n\nint main(int argc, char **argv) {\n\tint f, n;\n\n\n\tif(argc == 1){\n\t\tf = 40;\n\t\tn = 1000000;\n\n\t\tfeedback(f,n);\n\n\t\tprecision(40, 1000000);\n\t}\n\telse if(argc == 3){\n\n\t\tf = atoi(argv[1]);\n\t\tn = atoi(argv[2]);\n\n\t\tfeedback(f,n);\n\n\t\tprecision(f, n);\n\t}\n\telse {\n\t\thelp();\n\t\treturn EXIT_FAILURE;\n\t}\n\n\n\treturn EXIT_SUCCESS;\n}\n"
  },
  {
    "path": "examples/precision_test.py",
    "content": "from __future__ import print_function\nimport random, time\nfrom annoy import AnnoyIndex\n\ntry:\n    xrange\nexcept NameError:\n    # Python 3 compat\n    xrange = range\n\nn, f = 100000, 40\n\nt = AnnoyIndex(f, 'angular')\nfor i in xrange(n):\n    v = []\n    for z in xrange(f):\n        v.append(random.gauss(0, 1))\n    t.add_item(i, v)\n\nt.build(2 * f)\nt.save('test.tree')\n\nlimits = [10, 100, 1000, 10000]\nk = 10\nprec_sum = {}\nprec_n = 1000\ntime_sum = {}\n\nfor i in xrange(prec_n):\n    j = random.randrange(0, n)\n        \n    closest = set(t.get_nns_by_item(j, k, n))\n    for limit in limits:\n        t0 = time.time()\n        toplist = t.get_nns_by_item(j, k, limit)\n        T = time.time() - t0\n            \n        found = len(closest.intersection(toplist))\n        hitrate = 1.0 * found / k\n        prec_sum[limit] = prec_sum.get(limit, 0.0) + hitrate\n        time_sum[limit] = time_sum.get(limit, 0.0) + T\n\nfor limit in limits:\n    print('limit: %-9d precision: %6.2f%% avg time: %.6fs'\n          % (limit, 100.0 * prec_sum[limit] / (i + 1),\n             time_sum[limit] / (i + 1)))\n"
  },
  {
    "path": "examples/s_compile_cpp.sh",
    "content": "#!/bin/bash\n\n\necho \"compiling precision example...\"\ncmd=\"g++ precision_test.cpp -DANNOYLIB_MULTITHREADED_BUILD -o precision_test -std=c++14 -pthread\"\neval $cmd\necho \"Done\"\n"
  },
  {
    "path": "examples/simple_test.py",
    "content": "from annoy import AnnoyIndex\n\na = AnnoyIndex(3, 'angular')\na.add_item(0, [1, 0, 0])\na.add_item(1, [0, 1, 0])\na.add_item(2, [0, 0, 1])\na.build(-1)\n\nprint(a.get_nns_by_item(0, 100))\nprint(a.get_nns_by_vector([1.0, 0.5, 0.5], 100))\n"
  },
  {
    "path": "setup.cfg",
    "content": "[nosetests]\nattr=!slow\nnocapture=1\n\n"
  },
  {
    "path": "setup.py",
    "content": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\n#\n# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance with the License. You may obtain a copy of\n# the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n# License for the specific language governing permissions and limitations under\n# the License.\n\nfrom setuptools import setup, Extension\nimport os\nimport platform\nimport sys\n\nreadme_note = \"\"\"\\\n.. note::\n\n   For the latest source, discussion, etc, please visit the\n   `GitHub repository <https://github.com/spotify/annoy>`_\\n\\n\n\n.. image:: https://img.shields.io/github/stars/spotify/annoy.svg\n    :target: https://github.com/spotify/annoy\n\n\"\"\"\n\nwith open('README.rst', encoding='utf-8') as fobj:\n    long_description = readme_note + fobj.read()\n\n# Various platform-dependent extras\nextra_compile_args = ['-D_CRT_SECURE_NO_WARNINGS', '-fpermissive']\nextra_link_args = []\nif platform.machine() == 'ppc64le':\n    extra_compile_args += ['-mcpu=native',]\n\nif platform.machine() == 'x86_64':\n    # do not apply march on Intel Darwin\n    if platform.system() != 'Darwin':\n        # Not all CPUs have march as a tuning parameter\n        extra_compile_args += ['-march=native',]\n\nif os.name != 'nt':\n    extra_compile_args += ['-O3', '-ffast-math', '-fno-associative-math']\n\n# Add multithreaded build flag for all platforms using Python 3 and\n# for non-Windows Python 2 platforms\npython_major_version = sys.version_info[0]\nif python_major_version == 3 or (python_major_version == 2 and os.name != 'nt'):\n    extra_compile_args += ['-DANNOYLIB_MULTITHREADED_BUILD']\n\n    if os.name != 'nt':\n     
   extra_compile_args += ['-std=c++14']\n\n# #349: something with OS X Mojave causes libstd not to be found\nif platform.system() == 'Darwin':\n    extra_compile_args += ['-mmacosx-version-min=10.12']\n    extra_link_args += ['-stdlib=libc++', '-mmacosx-version-min=10.12']\n\n# Manual configuration, you're on your own here.\nmanual_compiler_args = os.environ.get('ANNOY_COMPILER_ARGS', None)\nif manual_compiler_args:\n    extra_compile_args = manual_compiler_args.split(',')\nmanual_linker_args = os.environ.get('ANNOY_LINKER_ARGS', None)\nif manual_linker_args:\n    extra_link_args = manual_linker_args.split(',')\n\nsetup(name='annoy',\n      version='1.17.3',\n      description='Approximate Nearest Neighbors in C++/Python optimized for memory usage and loading/saving to disk.',\n      packages=['annoy'],\n      package_data={'annoy': ['__init__.pyi', 'py.typed']},\n      ext_modules=[\n          Extension(\n              'annoy.annoylib', ['src/annoymodule.cc'],\n              depends=['src/annoylib.h', 'src/kissrandom.h', 'src/mman.h'],\n              extra_compile_args=extra_compile_args,\n              extra_link_args=extra_link_args,\n          )\n      ],\n      long_description=long_description,\n      long_description_content_type='text/x-rst',\n      author='Erik Bernhardsson',\n      author_email='mail@erikbern.com',\n      url='https://github.com/spotify/annoy',\n      license='Apache License 2.0',\n      classifiers=[\n          'Development Status :: 5 - Production/Stable',\n          'Programming Language :: Python',\n          'Programming Language :: Python :: 2.6',\n          'Programming Language :: Python :: 2.7',\n          'Programming Language :: Python :: 3.3',\n          'Programming Language :: Python :: 3.4',\n          'Programming Language :: Python :: 3.5',\n          'Programming Language :: Python :: 3.6',\n          'Programming Language :: Python :: 3.7',\n          'Programming Language :: Python :: 3.8',\n          'Programming 
Language :: Python :: 3.9',\n          'Programming Language :: Python :: 3.10',\n          'Programming Language :: Python :: 3.11',\n          'Programming Language :: Python :: 3.12',\n          'Programming Language :: Python :: 3.13',\n      ],\n      keywords='nns, approximate nearest neighbor search',\n      setup_requires=['nose>=1.0'],\n      tests_require=['numpy', 'h5py']\n      )\n"
  },
  {
    "path": "src/annoygomodule.h",
    "content": "#include \"annoylib.h\"\n#include \"kissrandom.h\"\n\nusing namespace Annoy;\n\nnamespace GoAnnoy {\n\n\nclass AnnoyVectorFloat {\n    protected:\n        float *ptr;\n        int len;\n\n    public:\n      ~AnnoyVectorFloat() {\n        free(ptr);\n      };\n      float* ArrayPtr() {\n        return ptr;\n      };\n      int Len() {\n        return len;\n      };\n      float Get(int i) {\n        if (i >= len) {\n            return 0.0;\n        }\n        return ptr[i];\n      };\n      void fill_from_vector(vector<float>* v) {\n            if (ptr != NULL) {\n               free(ptr);\n            }\n            ptr = (float*) malloc(v->size() * sizeof(float));\n            for (int i = 0; i < v->size(); i++) {\n                ptr[i] = (float)(*v)[i];\n            }\n            len = v->size();\n      };\n};\n\nclass AnnoyVectorInt {\n    protected:\n        int32_t *ptr;\n        int len;\n\n    public:\n      ~AnnoyVectorInt() {\n        free(ptr);\n      };\n      int32_t* ArrayPtr() {\n        return ptr;\n      };\n      int Len() {\n        return len;\n      };\n      int32_t Get(int i) {\n        if (i >= len) {\n            return 0.0;\n        }\n        return ptr[i];\n      };\n      void fill_from_vector(vector<int32_t>* v) {\n            if (ptr != NULL) {\n                free(ptr);\n            }\n            ptr = (int32_t*) malloc(v->size() * sizeof(int32_t));\n            for (int i = 0; i < v->size(); i++) {\n                ptr[i] = (int32_t)(*v)[i];\n            }\n            len = v->size();\n      };\n};\n\nclass AnnoyIndex {\n protected:\n  ::AnnoyIndexInterface<int32_t, float> *ptr;\n\n  int f;\n\n public:\n  ~AnnoyIndex() {\n    delete ptr;\n  };\n  void addItem(int item, const float* w) {\n    ptr->add_item(item, w);\n  };\n  void build(int q) {\n    ptr->build(q, 1);\n  };\n  bool save(const char* filename, bool prefault) {\n    return ptr->save(filename, prefault);\n  };\n  bool save(const char* filename) {\n    
return ptr->save(filename, true);\n  };\n  void unload() {\n    ptr->unload();\n  };\n  bool load(const char* filename, bool prefault) {\n    return ptr->load(filename, prefault);\n  };\n  bool load(const char* filename) {\n    return ptr->load(filename, true);\n  };\n  float getDistance(int i, int j) {\n    return ptr->get_distance(i, j);\n  };\n  void getNnsByItem(int item, int n, int search_k, AnnoyVectorInt* out_result, AnnoyVectorFloat* out_distances) {\n    vector<int32_t>* result = new vector<int32_t>();\n    vector<float>* distances = new vector<float>();\n\n    ptr->get_nns_by_item(item, n, search_k, result, distances);\n\n    out_result->fill_from_vector(result);\n    out_distances->fill_from_vector(distances);\n    delete result;\n    delete distances;\n  };\n  void getNnsByVector(const float* w, int n, int search_k, AnnoyVectorInt* out_result, AnnoyVectorFloat* out_distances) {\n    vector<int32_t>* result = new vector<int32_t>();\n    vector<float>* distances = new vector<float>();\n\n    ptr->get_nns_by_vector(w, n, search_k, result, distances);\n\n    out_result->fill_from_vector(result);\n    out_distances->fill_from_vector(distances);\n    delete result;\n    delete distances;\n  };\n  void getNnsByItem(int item, int n, int search_k, AnnoyVectorInt* out_result) {\n    vector<int32_t>* result = new vector<int32_t>();\n\n    ptr->get_nns_by_item(item, n, search_k, result, NULL);\n\n    out_result->fill_from_vector(result);\n    delete result;\n  };\n  void getNnsByVector(const float* w, int n, int search_k, AnnoyVectorInt* out_result) {\n    vector<int32_t>* result = new vector<int32_t>();\n\n    ptr->get_nns_by_vector(w, n, search_k, result, NULL);\n\n    out_result->fill_from_vector(result);\n    delete result;\n  };\n\n  int getNItems() {\n    return (int)ptr->get_n_items();\n  };\n  void verbose(bool v) {\n    ptr->verbose(v);\n  };\n  void getItem(int item, AnnoyVectorFloat *v) {\n    vector<float>* r = new vector<float>();\n    
r->resize(this->f);\n    ptr->get_item(item, &r->front());\n    v->fill_from_vector(r);\n  };\n  bool onDiskBuild(const char* filename) {\n    return ptr->on_disk_build(filename);\n  };\n};\n\nclass AnnoyIndexAngular : public AnnoyIndex \n{\n public:\n  AnnoyIndexAngular(int f) {\n    ptr = new ::AnnoyIndex<int32_t, float, ::Angular, ::Kiss64Random, AnnoyIndexSingleThreadedBuildPolicy>(f);\n    this->f = f;\n  }\n};\n\nclass AnnoyIndexEuclidean : public AnnoyIndex {\n public:\n  AnnoyIndexEuclidean(int f) {\n    ptr = new ::AnnoyIndex<int32_t, float, ::Euclidean, ::Kiss64Random, AnnoyIndexSingleThreadedBuildPolicy>(f);\n    this->f = f;\n  }\n};\n\nclass AnnoyIndexManhattan : public AnnoyIndex {\n public:\n  AnnoyIndexManhattan(int f) {\n    ptr = new ::AnnoyIndex<int32_t, float, ::Manhattan, ::Kiss64Random, AnnoyIndexSingleThreadedBuildPolicy>(f);\n    this->f = f;\n  }\n};\n\nclass AnnoyIndexDotProduct : public AnnoyIndex {\n public:\n  AnnoyIndexDotProduct(int f) {\n    ptr = new ::AnnoyIndex<int32_t, float, ::DotProduct, ::Kiss64Random, AnnoyIndexSingleThreadedBuildPolicy>(f);\n    this->f = f;\n  }\n};\n}\n"
  },
  {
    "path": "src/annoygomodule.i",
    "content": "%module annoy\n\nnamespace Annoy {}\n\n%{\n#include \"annoygomodule.h\"\n%}\n\n\n// const float *\n%typemap(gotype) (const float *)  \"[]float32\"\n%typemap(gotype) (int32_t)  \"int32\"\n\n%typemap(in) (const float *)\n%{\n    float *v;\n    vector<float> w;\n    v = (float *)$input.array;\n    for (int i = 0; i < $input.len; i++) {\n       w.push_back(v[i]);\n    }\n    $1 = &w[0];\n%}\n\n\n%typemap(gotype) (const char *) \"string\"\n\n%typemap(in) (const char *)\n%{\n  $1 = (char *)calloc((((_gostring_)$input).n + 1), sizeof(char));\n  strncpy($1, (((_gostring_)$input).p), ((_gostring_)$input).n);\n%}\n\n%typemap(freearg) (const char *)\n%{\n  free($1);\n%}\n\n\n%ignore fill_from_vector;\n%rename(X_RawAnnoyVectorInt) AnnoyVectorInt;\n%rename(X_RawAnnoyVectorFloat) AnnoyVectorFloat;\n\n%insert(go_wrapper) %{\n\ntype AnnoyVectorInt interface {\n  X_RawAnnoyVectorInt\n  ToSlice() []int32\n  Copy(in *[]int32)\n  InnerArray() []int32\n  Free()\n}\n\nfunc NewAnnoyVectorInt() AnnoyVectorInt {\n    vec := NewX_RawAnnoyVectorInt()\n    return vec.(SwigcptrX_RawAnnoyVectorInt)\n}\n\nfunc (p SwigcptrX_RawAnnoyVectorInt) ToSlice() []int32 {\n    var out []int32\n    p.Copy(&out)\n    return out\n}\n\nfunc (p SwigcptrX_RawAnnoyVectorInt) Copy(in *[]int32)  {\n    out := *in\n    inner := p.InnerArray()\n    if cap(out) >= len(inner) {\n        if len(out) != len(inner) {\n          out = out[:len(inner)]\n        }\n    } else {\n        out = make([]int32, len(inner))\n    }\n\n    copy(out, inner)\n    *in = out\n}\n\nfunc (p SwigcptrX_RawAnnoyVectorInt) Free() {\n    DeleteX_RawAnnoyVectorInt(p)\n}\n\nfunc (p SwigcptrX_RawAnnoyVectorInt) InnerArray() []int32 {\n\tlength := p.Len()\n    ptr := unsafe.Pointer(p.ArrayPtr())\n\treturn ((*[1 << 30]int32)(ptr))[:length:length]\n}\n\n%}\n\n%insert(go_wrapper) %{\n\ntype AnnoyVectorFloat interface {\n  X_RawAnnoyVectorFloat\n  ToSlice() []float32\n  Copy(in *[]float32)\n  InnerArray() []float32\n  Free()\n}\n\nfunc 
NewAnnoyVectorFloat() AnnoyVectorFloat {\n    vec := NewX_RawAnnoyVectorFloat()\n    return vec.(SwigcptrX_RawAnnoyVectorFloat)\n}\n\nfunc (p SwigcptrX_RawAnnoyVectorFloat) ToSlice() []float32 {\n    var out []float32\n    p.Copy(&out)\n    return out\n}\n\nfunc (p SwigcptrX_RawAnnoyVectorFloat) Copy(in *[]float32)  {\n    out := *in\n    inner := p.InnerArray()\n    if cap(out) >= len(inner) {\n        if len(out) != len(inner) {\n          out = out[:len(inner)]\n        }\n    } else {\n        out = make([]float32, len(inner))\n    }\n\n    copy(out, inner)\n    *in = out\n}\n\nfunc (p SwigcptrX_RawAnnoyVectorFloat) Free() {\n    DeleteX_RawAnnoyVectorFloat(p)\n}\n\nfunc (p SwigcptrX_RawAnnoyVectorFloat) InnerArray() []float32 {\n    length := p.Len()\n    ptr := unsafe.Pointer(p.ArrayPtr())\n    return ((*[1 << 30]float32)(ptr))[:length:length]\n}\n\n%}\n\n/* Let's just grab the original header file here */\n%include \"annoygomodule.h\"\n\n%feature(\"notabstract\") GoAnnoyIndexAngular;\n%feature(\"notabstract\") GoAnnoyIndexEuclidean;\n%feature(\"notabstract\") GoAnnoyIndexManhattan;\n%feature(\"notabstract\") GoAnnoyIndexDotProduct;"
  },
  {
    "path": "src/annoylib.h",
    "content": "// Copyright (c) 2013 Spotify AB\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n// use this file except in compliance with the License. You may obtain a copy of\n// the License at\n//\n// http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n// License for the specific language governing permissions and limitations under\n// the License.\n\n\n#ifndef ANNOY_ANNOYLIB_H\n#define ANNOY_ANNOYLIB_H\n\n#include <stdio.h>\n#include <sys/stat.h>\n#ifndef _MSC_VER\n#include <unistd.h>\n#endif\n#include <stdio.h>\n#include <stdlib.h>\n#include <sys/types.h>\n#include <fcntl.h>\n#include <stddef.h>\n\n#if defined(_MSC_VER) && _MSC_VER == 1500\ntypedef unsigned char     uint8_t;\ntypedef signed __int32    int32_t;\ntypedef unsigned __int64  uint64_t;\ntypedef signed __int64    int64_t;\n#else\n#include <stdint.h>\n#endif\n\n#if defined(_MSC_VER) || defined(__MINGW32__)\n // a bit hacky, but override some definitions to support 64 bit\n #define off_t int64_t\n #define lseek_getsize(fd) _lseeki64(fd, 0, SEEK_END)\n #ifndef NOMINMAX\n  #define NOMINMAX\n #endif\n #include \"mman.h\"\n #include <windows.h>\n#else\n #include <sys/mman.h>\n #define lseek_getsize(fd) lseek(fd, 0, SEEK_END)\n#endif\n\n#include <cerrno>\n#include <string.h>\n#include <math.h>\n#include <vector>\n#include <algorithm>\n#include <queue>\n#include <limits>\n\n#if __cplusplus >= 201103L\n#include <type_traits>\n#endif\n\n#ifdef ANNOYLIB_MULTITHREADED_BUILD\n#include <thread>\n#include <mutex>\n#include <shared_mutex>\n#endif\n\n#ifdef _MSC_VER\n// Needed for Visual Studio to disable runtime checks for mempcy\n#pragma runtime_checks(\"s\", off)\n#endif\n\n// This allows others to supply their own logger / error printer without\n// 
requiring Annoy to import their headers. See RcppAnnoy for a use case.\n#ifndef __ERROR_PRINTER_OVERRIDE__\n  #define annoylib_showUpdate(...) { fprintf(stderr, __VA_ARGS__ ); }\n#else\n  #define annoylib_showUpdate(...) { __ERROR_PRINTER_OVERRIDE__( __VA_ARGS__ ); }\n#endif\n\n// Portable alloc definition, cf Writing R Extensions, Section 1.6.4\n#ifdef __GNUC__\n  // Includes GCC, clang and Intel compilers\n  # undef alloca\n  # define alloca(x) __builtin_alloca((x))\n#elif defined(__sun) || defined(_AIX)\n  // this is necessary (and sufficient) for Solaris 10 and AIX 6:\n  # include <alloca.h>\n#endif\n\n// We let the v array in the Node struct take whatever space is needed, so this is a mostly insignificant number.\n// Compilers need *some* size defined for the v array, and some memory checking tools will flag for buffer overruns if this is set too low.\n#define ANNOYLIB_V_ARRAY_SIZE 65536\n\n#ifndef _MSC_VER\n#define annoylib_popcount __builtin_popcountll\n#else // See #293, #358\n#define annoylib_popcount cole_popcount\n#endif\n\n#if !defined(NO_MANUAL_VECTORIZATION) && defined(__GNUC__) && (__GNUC__ >6) && defined(__AVX512F__)  // See #402\n#define ANNOYLIB_USE_AVX512\n#elif !defined(NO_MANUAL_VECTORIZATION) && defined(__AVX__) && defined (__SSE__) && defined(__SSE2__) && defined(__SSE3__)\n#define ANNOYLIB_USE_AVX\n#else\n#endif\n\n#if defined(ANNOYLIB_USE_AVX) || defined(ANNOYLIB_USE_AVX512)\n#if defined(_MSC_VER)\n#include <intrin.h>\n#elif defined(__GNUC__)\n#include <x86intrin.h>\n#endif\n#endif\n\n#if !defined(__MINGW32__)\n#define ANNOYLIB_FTRUNCATE_SIZE(x) static_cast<int64_t>(x)\n#else\n#define ANNOYLIB_FTRUNCATE_SIZE(x) (x)\n#endif\n\nnamespace Annoy {\n\ninline void set_error_from_errno(char **error, const char* msg) {\n  annoylib_showUpdate(\"%s: %s (%d)\\n\", msg, strerror(errno), errno);\n  if (error) {\n    *error = (char *)malloc(256);  // TODO: win doesn't support snprintf\n    snprintf(*error, 255, \"%s: %s (%d)\", msg, strerror(errno), 
errno);\n  }\n}\n\ninline void set_error_from_string(char **error, const char* msg) {\n  annoylib_showUpdate(\"%s\\n\", msg);\n  if (error) {\n    *error = (char *)malloc(strlen(msg) + 1);\n    strcpy(*error, msg);\n  }\n}\n\n\nusing std::vector;\nusing std::pair;\nusing std::numeric_limits;\nusing std::make_pair;\n\ninline bool remap_memory_and_truncate(void** _ptr, int _fd, size_t old_size, size_t new_size) {\n#ifdef __linux__\n    *_ptr = mremap(*_ptr, old_size, new_size, MREMAP_MAYMOVE);\n    bool ok = ftruncate(_fd, new_size) != -1;\n#else\n    munmap(*_ptr, old_size);\n    bool ok = ftruncate(_fd, ANNOYLIB_FTRUNCATE_SIZE(new_size)) != -1;\n#ifdef MAP_POPULATE\n    *_ptr = mmap(*_ptr, new_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, _fd, 0);\n#else\n    *_ptr = mmap(*_ptr, new_size, PROT_READ | PROT_WRITE, MAP_SHARED, _fd, 0);\n#endif\n#endif\n    return ok;\n}\n\nnamespace {\n\ntemplate<typename S, typename Node>\ninline Node* get_node_ptr(const void* _nodes, const size_t _s, const S i) {\n  return (Node*)((uint8_t *)_nodes + (_s * i));\n}\n\ntemplate<typename T>\ninline T dot(const T* x, const T* y, int f) {\n  T s = 0;\n  for (int z = 0; z < f; z++) {\n    s += (*x) * (*y);\n    x++;\n    y++;\n  }\n  return s;\n}\n\ntemplate<typename T>\ninline T manhattan_distance(const T* x, const T* y, int f) {\n  T d = 0.0;\n  for (int i = 0; i < f; i++)\n    d += fabs(x[i] - y[i]);\n  return d;\n}\n\ntemplate<typename T>\ninline T euclidean_distance(const T* x, const T* y, int f) {\n  // Don't use dot-product: avoid catastrophic cancellation in #314.\n  T d = 0.0;\n  for (int i = 0; i < f; ++i) {\n    const T tmp=*x - *y;\n    d += tmp * tmp;\n    ++x;\n    ++y;\n  }\n  return d;\n}\n\n#ifdef ANNOYLIB_USE_AVX\n// Horizontal single sum of 256bit vector.\ninline float hsum256_ps_avx(__m256 v) {\n  const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(v, 1), _mm256_castps256_ps128(v));\n  const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));\n  const 
__m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));\n  return _mm_cvtss_f32(x32);\n}\n\ntemplate<>\ninline float dot<float>(const float* x, const float *y, int f) {\n  float result = 0;\n  if (f > 7) {\n    __m256 d = _mm256_setzero_ps();\n    for (; f > 7; f -= 8) {\n      d = _mm256_add_ps(d, _mm256_mul_ps(_mm256_loadu_ps(x), _mm256_loadu_ps(y)));\n      x += 8;\n      y += 8;\n    }\n    // Sum all floats in dot register.\n    result += hsum256_ps_avx(d);\n  }\n  // Don't forget the remaining values.\n  for (; f > 0; f--) {\n    result += *x * *y;\n    x++;\n    y++;\n  }\n  return result;\n}\n\ntemplate<>\ninline float manhattan_distance<float>(const float* x, const float* y, int f) {\n  float result = 0;\n  int i = f;\n  if (f > 7) {\n    __m256 manhattan = _mm256_setzero_ps();\n    __m256 minus_zero = _mm256_set1_ps(-0.0f);\n    for (; i > 7; i -= 8) {\n      const __m256 x_minus_y = _mm256_sub_ps(_mm256_loadu_ps(x), _mm256_loadu_ps(y));\n      const __m256 distance = _mm256_andnot_ps(minus_zero, x_minus_y); // Absolute value of x_minus_y (forces sign bit to zero)\n      manhattan = _mm256_add_ps(manhattan, distance);\n      x += 8;\n      y += 8;\n    }\n    // Sum all floats in manhattan register.\n    result = hsum256_ps_avx(manhattan);\n  }\n  // Don't forget the remaining values.\n  for (; i > 0; i--) {\n    result += fabsf(*x - *y);\n    x++;\n    y++;\n  }\n  return result;\n}\n\ntemplate<>\ninline float euclidean_distance<float>(const float* x, const float* y, int f) {\n  float result=0;\n  if (f > 7) {\n    __m256 d = _mm256_setzero_ps();\n    for (; f > 7; f -= 8) {\n      const __m256 diff = _mm256_sub_ps(_mm256_loadu_ps(x), _mm256_loadu_ps(y));\n      d = _mm256_add_ps(d, _mm256_mul_ps(diff, diff)); // no support for fmadd in AVX...\n      x += 8;\n      y += 8;\n    }\n    // Sum all floats in dot register.\n    result = hsum256_ps_avx(d);\n  }\n  // Don't forget the remaining values.\n  for (; f > 0; f--) {\n    float tmp = *x - *y;\n  
  result += tmp * tmp;\n    x++;\n    y++;\n  }\n  return result;\n}\n\n#endif\n\n#ifdef ANNOYLIB_USE_AVX512\ntemplate<>\ninline float dot<float>(const float* x, const float *y, int f) {\n  float result = 0;\n  if (f > 15) {\n    __m512 d = _mm512_setzero_ps();\n    for (; f > 15; f -= 16) {\n      //AVX512F includes FMA\n      d = _mm512_fmadd_ps(_mm512_loadu_ps(x), _mm512_loadu_ps(y), d);\n      x += 16;\n      y += 16;\n    }\n    // Sum all floats in dot register.\n    result += _mm512_reduce_add_ps(d);\n  }\n  // Don't forget the remaining values.\n  for (; f > 0; f--) {\n    result += *x * *y;\n    x++;\n    y++;\n  }\n  return result;\n}\n\ntemplate<>\ninline float manhattan_distance<float>(const float* x, const float* y, int f) {\n  float result = 0;\n  int i = f;\n  if (f > 15) {\n    __m512 manhattan = _mm512_setzero_ps();\n    for (; i > 15; i -= 16) {\n      const __m512 x_minus_y = _mm512_sub_ps(_mm512_loadu_ps(x), _mm512_loadu_ps(y));\n      manhattan = _mm512_add_ps(manhattan, _mm512_abs_ps(x_minus_y));\n      x += 16;\n      y += 16;\n    }\n    // Sum all floats in manhattan register.\n    result = _mm512_reduce_add_ps(manhattan);\n  }\n  // Don't forget the remaining values.\n  for (; i > 0; i--) {\n    result += fabsf(*x - *y);\n    x++;\n    y++;\n  }\n  return result;\n}\n\ntemplate<>\ninline float euclidean_distance<float>(const float* x, const float* y, int f) {\n  float result=0;\n  if (f > 15) {\n    __m512 d = _mm512_setzero_ps();\n    for (; f > 15; f -= 16) {\n      const __m512 diff = _mm512_sub_ps(_mm512_loadu_ps(x), _mm512_loadu_ps(y));\n      d = _mm512_fmadd_ps(diff, diff, d);\n      x += 16;\n      y += 16;\n    }\n    // Sum all floats in dot register.\n    result = _mm512_reduce_add_ps(d);\n  }\n  // Don't forget the remaining values.\n  for (; f > 0; f--) {\n    float tmp = *x - *y;\n    result += tmp * tmp;\n    x++;\n    y++;\n  }\n  return result;\n}\n\n#endif\n\n\ntemplate<typename T, typename Random, typename Distance, 
typename Node>\ninline void two_means(const vector<Node*>& nodes, int f, Random& random, bool cosine, Node* p, Node* q) {\n  /*\n    This algorithm is a huge heuristic. Empirically it works really well, but I\n    can't motivate it well. The basic idea is to keep two centroids and assign\n    points to either one of them. We weight each centroid by the number of points\n    assigned to it, so as to balance them.\n  */\n  static int iteration_steps = 200;\n  size_t count = nodes.size();\n\n  size_t i = random.index(count);\n  size_t j = random.index(count-1);\n  j += (j >= i); // ensure that i != j\n\n  Distance::template copy_node<T, Node>(p, nodes[i], f);\n  Distance::template copy_node<T, Node>(q, nodes[j], f);\n\n  if (cosine) { Distance::template normalize<T, Node>(p, f); Distance::template normalize<T, Node>(q, f); }\n  Distance::init_node(p, f);\n  Distance::init_node(q, f);\n\n  int ic = 1, jc = 1;\n  for (int l = 0; l < iteration_steps; l++) {\n    size_t k = random.index(count);\n    T di = ic * Distance::distance(p, nodes[k], f),\n      dj = jc * Distance::distance(q, nodes[k], f);\n    T norm = cosine ? 
Distance::template get_norm<T, Node>(nodes[k], f) : 1;\n    if (!(norm > T(0))) {\n      continue;\n    }\n    if (di < dj) {\n      Distance::update_mean(p, nodes[k], norm, ic, f);\n      Distance::init_node(p, f);\n      ic++;\n    } else if (dj < di) {\n      Distance::update_mean(q, nodes[k], norm, jc, f);\n      Distance::init_node(q, f);\n      jc++;\n    }\n  }\n}\n} // namespace\n\nstruct Base {\n  template<typename T, typename S, typename Node>\n  static inline void preprocess(void* nodes, size_t _s, const S node_count, const int f) {\n    // Override this in specific metric structs below if you need to do any pre-processing\n    // on the entire set of nodes passed into this index.\n  }\n\n  template<typename T, typename S, typename Node>\n  static inline void postprocess(void* nodes, size_t _s, const S node_count, const int f) {\n    // Override this in specific metric structs below if you need to do any post-processing\n    // on the entire set of nodes passed into this index.\n  }\n\n  template<typename Node>\n  static inline void zero_value(Node* dest) {\n    // Initialize any fields that require sane defaults within this node.\n  }\n\n  template<typename T, typename Node>\n  static inline void copy_node(Node* dest, const Node* source, const int f) {\n    memcpy(dest->v, source->v, f * sizeof(T));\n  }\n\n  template<typename T, typename Node>\n  static inline T get_norm(Node* node, int f) {\n      return sqrt(dot(node->v, node->v, f));\n  }\n\n  template<typename T, typename Node>\n  static inline void normalize(Node* node, int f) {\n    T norm = Base::get_norm<T, Node>(node, f);\n    if (norm > 0) {\n      for (int z = 0; z < f; z++)\n        node->v[z] /= norm;\n    }\n  }\n\n  template<typename T, typename Node>\n  static inline void update_mean(Node* mean, Node* new_node, T norm, int c, int f) {\n      for (int z = 0; z < f; z++)\n        mean->v[z] = (mean->v[z] * c + new_node->v[z] / norm) / (c + 1);\n  }\n};\n\nstruct Angular : Base {\n  
template<typename S, typename T>\n  struct Node {\n    /*\n     * We store a binary tree where each node has two things\n     * - A vector associated with it\n     * - Two children\n     * All nodes occupy the same amount of memory\n     * All nodes with n_descendants == 1 are leaf nodes.\n     * A memory optimization is that for nodes with 2 <= n_descendants <= K,\n     * we skip the vector. Instead we store a list of all descendants. K is\n     * determined by the number of items that fits in the space of the vector.\n     * For nodes with n_descendants == 1 the vector is a data point.\n     * For nodes with n_descendants > K the vector is the normal of the split plane.\n     * Note that we can't really do sizeof(node<T>) because we cheat and allocate\n     * more memory to be able to fit the vector outside\n     */\n    S n_descendants;\n    union {\n      S children[2]; // Will possibly store more than 2\n      T norm;\n    };\n    T v[ANNOYLIB_V_ARRAY_SIZE];\n  };\n  template<typename S, typename T>\n  static inline T distance(const Node<S, T>* x, const Node<S, T>* y, int f) {\n    // want to calculate (a/|a| - b/|b|)^2\n    // = a^2 / a^2 + b^2 / b^2 - 2ab/|a||b|\n    // = 2 - 2cos\n    T pp = x->norm ? x->norm : dot(x->v, x->v, f); // For backwards compatibility reasons, we need to fall back and compute the norm here\n    T qq = y->norm ? 
y->norm : dot(y->v, y->v, f);\n    T pq = dot(x->v, y->v, f);\n    T ppqq = pp * qq;\n    if (ppqq > 0) return 2.0 - 2.0 * pq / sqrt(ppqq);\n    else return 2.0; // cos is 0\n  }\n  template<typename S, typename T>\n  static inline T margin(const Node<S, T>* n, const T* y, int f) {\n    return dot(n->v, y, f);\n  }\n  template<typename S, typename T, typename Random>\n  static inline bool side(const Node<S, T>* n, const T* y, int f, Random& random) {\n    T dot = margin(n, y, f);\n    if (dot != 0)\n      return (dot > 0);\n    else\n      return (bool)random.flip();\n  }\n  template<typename S, typename T, typename Random>\n  static inline bool side(const Node<S, T>* n, const Node<S, T>* y, int f, Random& random) {\n    return side(n, y->v, f, random);\n  }\n  template<typename S, typename T, typename Random>\n  static inline void create_split(const vector<Node<S, T>*>& nodes, int f, size_t s, Random& random, Node<S, T>* n) {\n    Node<S, T>* p = (Node<S, T>*)alloca(s);\n    Node<S, T>* q = (Node<S, T>*)alloca(s);\n    two_means<T, Random, Angular, Node<S, T> >(nodes, f, random, true, p, q);\n    for (int z = 0; z < f; z++)\n      n->v[z] = p->v[z] - q->v[z];\n    Base::normalize<T, Node<S, T> >(n, f);\n  }\n  template<typename T>\n  static inline T normalized_distance(T distance) {\n    // Used when requesting distances from Python layer\n    // Turns out sometimes the squared distance is -0.0\n    // so we have to make sure it's a positive number.\n    return sqrt(std::max(distance, T(0)));\n  }\n  template<typename T>\n  static inline T pq_distance(T distance, T margin, int child_nr) {\n    if (child_nr == 0)\n      margin = -margin;\n    return std::min(distance, margin);\n  }\n  template<typename T>\n  static inline T pq_initial_value() {\n    return numeric_limits<T>::infinity();\n  }\n  template<typename S, typename T>\n  static inline void init_node(Node<S, T>* n, int f) {\n    n->norm = dot(n->v, n->v, f);\n  }\n  static const char* name() {\n    return 
\"angular\";\n  }\n};\n\n\nstruct DotProduct : Angular {\n  template<typename S, typename T>\n  struct Node {\n    /*\n     * This is an extension of the Angular node with extra attributes for the DotProduct metric.\n     * It has dot_factor which is needed to reduce the task to Angular distance metric (see the preprocess method)\n     * and also a built flag that helps to compute exact dot products when an index is already built.\n     */\n    S n_descendants;\n    S children[2]; // Will possibly store more than 2\n    T dot_factor;\n    T norm;\n    bool built;\n    T v[ANNOYLIB_V_ARRAY_SIZE];\n  };\n\n  static const char* name() {\n    return \"dot\";\n  }\n\n  template<typename T, typename Node>\n  static inline T get_norm(Node* node, int f) {\n      return sqrt(dot(node->v, node->v, f) + node->dot_factor * node->dot_factor);\n  }\n\n  template<typename T, typename Node>\n  static inline void update_mean(Node* mean, Node* new_node, T norm, int c, int f) {\n      for (int z = 0; z < f; z++)\n        mean->v[z] = (mean->v[z] * c + new_node->v[z] / norm) / (c + 1);\n      mean->dot_factor = (mean->dot_factor * c + new_node->dot_factor / norm) / (c + 1);\n  }\n\n  template<typename S, typename T>\n  static inline T distance(const Node<S, T>* x, const Node<S, T>* y, int f) {\n    if (x->built || y->built) {\n      // When index is already built, we don't need angular distances to retrieve NNs\n      // Thus, we can return dot product scores itself\n      return -dot(x->v, y->v, f);\n    }\n\n    // Calculated by analogy with the angular case\n    T pp = x->norm ? x->norm : dot(x->v, x->v, f) + x->dot_factor * x->dot_factor;\n    T qq = y->norm ? 
y->norm : dot(y->v, y->v, f) + y->dot_factor * y->dot_factor;\n    T pq = dot(x->v, y->v, f) + x->dot_factor * y->dot_factor;\n    T ppqq = pp * qq;\n\n    if (ppqq > 0) return 2.0 - 2.0 * pq / sqrt(ppqq);\n    else return 2.0;\n  }\n\n  template<typename Node>\n  static inline void zero_value(Node* dest) {\n    dest->dot_factor = 0;\n  }\n\n  template<typename S, typename T>\n  static inline void init_node(Node<S, T>* n, int f) {\n    n->built = false;\n    n->norm = dot(n->v, n->v, f) + n->dot_factor * n->dot_factor;\n  }\n\n  template<typename T, typename Node>\n  static inline void copy_node(Node* dest, const Node* source, const int f) {\n    memcpy(dest->v, source->v, f * sizeof(T));\n    dest->dot_factor = source->dot_factor;\n  }\n\n  template<typename S, typename T, typename Random>\n  static inline void create_split(const vector<Node<S, T>*>& nodes, int f, size_t s, Random& random, Node<S, T>* n) {\n    Node<S, T>* p = (Node<S, T>*)alloca(s);\n    Node<S, T>* q = (Node<S, T>*)alloca(s);\n    DotProduct::zero_value(p); \n    DotProduct::zero_value(q);\n    two_means<T, Random, DotProduct, Node<S, T> >(nodes, f, random, true, p, q);\n    for (int z = 0; z < f; z++)\n      n->v[z] = p->v[z] - q->v[z];\n    n->dot_factor = p->dot_factor - q->dot_factor;\n    DotProduct::normalize<T, Node<S, T> >(n, f);\n  }\n\n  template<typename T, typename Node>\n  static inline void normalize(Node* node, int f) {\n    T norm = sqrt(dot(node->v, node->v, f) + pow(node->dot_factor, 2));\n    if (norm > 0) {\n      for (int z = 0; z < f; z++)\n        node->v[z] /= norm;\n      node->dot_factor /= norm;\n    }\n  }\n\n  template<typename S, typename T>\n  static inline T margin(const Node<S, T>* n, const T* y, int f) {\n    return dot(n->v, y, f);\n  }\n\n  template<typename S, typename T>\n  static inline T margin(const Node<S, T>* n, const Node<S, T>* y, int f) {\n    return dot(n->v, y->v, f) + n->dot_factor * y->dot_factor;\n  }\n\n  template<typename S, typename T, 
typename Random>\n  static inline bool side(const Node<S, T>* n, const Node<S, T>* y, int f, Random& random) {\n    T dot = margin(n, y, f);\n    if (dot != 0)\n      return (dot > 0);\n    else\n      return (bool)random.flip();\n  }\n\n  template<typename S, typename T, typename Random>\n  static inline bool side(const Node<S, T>* n, const T* y, int f, Random& random) {\n    T dot = margin(n, y, f);\n    if (dot != 0)\n      return (dot > 0);\n    else\n      return (bool)random.flip();\n  }\n\n  template<typename T>\n  static inline T normalized_distance(T distance) {\n    return -distance;\n  }\n\n  template<typename T, typename S, typename Node>\n  static inline void preprocess(void* nodes, size_t _s, const S node_count, const int f) {\n    // This uses a method from Microsoft Research for transforming inner product spaces to cosine/angular-compatible spaces.\n    // (Bachrach et al., 2014, see https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/XboxInnerProduct.pdf)\n\n    // Step one: compute the norm of each vector and store it temporarily in the node's dot_factor field\n    for (S i = 0; i < node_count; i++) {\n      Node* node = get_node_ptr<S, Node>(nodes, _s, i);\n      T d = dot(node->v, node->v, f);\n      T norm = d < 0 ? 0 : sqrt(d);\n      node->dot_factor = norm;\n      node->built = false;\n    }\n\n    // Step two: find the maximum norm\n    T max_norm = 0;\n    for (S i = 0; i < node_count; i++) {\n      Node* node = get_node_ptr<S, Node>(nodes, _s, i);\n      if (node->dot_factor > max_norm) {\n        max_norm = node->dot_factor;\n      }\n    }\n\n    // Step three: set each vector's extra dimension (dot_factor) to sqrt(max_norm^2 - norm^2)\n    for (S i = 0; i < node_count; i++) {\n      Node* node = get_node_ptr<S, Node>(nodes, _s, i);\n      T node_norm = node->dot_factor;\n      T squared_norm_diff = pow(max_norm, static_cast<T>(2.0)) - pow(node_norm, static_cast<T>(2.0));\n      T dot_factor = squared_norm_diff < 0 ? 
0 : sqrt(squared_norm_diff);\n\n      node->norm = pow(max_norm, static_cast<T>(2.0));\n      node->dot_factor = dot_factor;\n    }\n  }\n\n  template<typename T, typename S, typename Node>\n  static inline void postprocess(void* nodes, size_t _s, const S node_count, const int f) {\n    for (S i = 0; i < node_count; i++) {\n      Node* node = get_node_ptr<S, Node>(nodes, _s, i);\n      // When an index is built, we will remember it in index item nodes to compute distances differently\n      node->built = true;\n    }\n  }\n};\n\nstruct Hamming : Base {\n  template<typename S, typename T>\n  struct Node {\n    S n_descendants;\n    S children[2];\n    T v[ANNOYLIB_V_ARRAY_SIZE];\n  };\n\n  static const size_t max_iterations = 20;\n\n  template<typename T>\n  static inline T pq_distance(T distance, T margin, int child_nr) {\n    return distance - (margin != (unsigned int) child_nr);\n  }\n\n  template<typename T>\n  static inline T pq_initial_value() {\n    return numeric_limits<T>::max();\n  }\n  template<typename T>\n  static inline int cole_popcount(T v) {\n    // Note: Only used with MSVC 9, which lacks intrinsics and fails to\n    // calculate std::bitset::count for v > 32bit. 
Uses the generalized\n    // approach by Eric Cole.\n    // See https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSet64\n    v = v - ((v >> 1) & (T)~(T)0/3);\n    v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3);\n    v = (v + (v >> 4)) & (T)~(T)0/255*15;\n    return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * 8;\n  }\n  template<typename S, typename T>\n  static inline T distance(const Node<S, T>* x, const Node<S, T>* y, int f) {\n    size_t dist = 0;\n    for (int i = 0; i < f; i++) {\n      dist += annoylib_popcount(x->v[i] ^ y->v[i]);\n    }\n    return dist;\n  }\n  template<typename S, typename T>\n  static inline bool margin(const Node<S, T>* n, const T* y, int f) {\n    static const size_t n_bits = sizeof(T) * 8;\n    T chunk = n->v[0] / n_bits;\n    return (y[chunk] & (static_cast<T>(1) << (n_bits - 1 - (n->v[0] % n_bits)))) != 0;\n  }\n  template<typename S, typename T, typename Random>\n  static inline bool side(const Node<S, T>* n, const T* y, int f, Random& random) {\n    return margin(n, y, f);\n  }\n  template<typename S, typename T, typename Random>\n  static inline bool side(const Node<S, T>* n, const Node<S, T>* y, int f, Random& random) {\n    return side(n, y->v, f, random);\n  }\n  template<typename S, typename T, typename Random>\n  static inline void create_split(const vector<Node<S, T>*>& nodes, int f, size_t s, Random& random, Node<S, T>* n) {\n    size_t cur_size = 0;\n    size_t i = 0;\n    int dim = f * 8 * sizeof(T);\n    for (; i < max_iterations; i++) {\n      // choose random position to split at\n      n->v[0] = random.index(dim);\n      cur_size = 0;\n      for (typename vector<Node<S, T>*>::const_iterator it = nodes.begin(); it != nodes.end(); ++it) {\n        if (margin(n, (*it)->v, f)) {\n          cur_size++;\n        }\n      }\n      if (cur_size > 0 && cur_size < nodes.size()) {\n        break;\n      }\n    }\n    // brute-force search for splitting coordinate\n    if (i == max_iterations) {\n      int j = 
0;\n      for (; j < dim; j++) {\n        n->v[0] = j;\n        cur_size = 0;\n        for (typename vector<Node<S, T>*>::const_iterator it = nodes.begin(); it != nodes.end(); ++it) {\n          if (margin(n, (*it)->v, f)) {\n            cur_size++;\n          }\n        }\n        if (cur_size > 0 && cur_size < nodes.size()) {\n          break;\n        }\n      }\n    }\n  }\n  template<typename T>\n  static inline T normalized_distance(T distance) {\n    return distance;\n  }\n  template<typename S, typename T>\n  static inline void init_node(Node<S, T>* n, int f) {\n  }\n  static const char* name() {\n    return \"hamming\";\n  }\n};\n\n\nstruct Minkowski : Base {\n  template<typename S, typename T>\n  struct Node {\n    S n_descendants;\n    T a; // need an extra constant term to determine the offset of the plane\n    S children[2];\n    T v[ANNOYLIB_V_ARRAY_SIZE];\n  };\n  template<typename S, typename T>\n  static inline T margin(const Node<S, T>* n, const T* y, int f) {\n    return n->a + dot(n->v, y, f);\n  }\n  template<typename S, typename T, typename Random>\n  static inline bool side(const Node<S, T>* n, const T* y, int f, Random& random) {\n    T dot = margin(n, y, f);\n    if (dot != 0)\n      return (dot > 0);\n    else\n      return (bool)random.flip();\n  }\n  template<typename S, typename T, typename Random>\n  static inline bool side(const Node<S, T>* n, const Node<S, T>* y, int f, Random& random) {\n    return side(n, y->v, f, random);\n  }\n  template<typename T>\n  static inline T pq_distance(T distance, T margin, int child_nr) {\n    if (child_nr == 0)\n      margin = -margin;\n    return std::min(distance, margin);\n  }\n  template<typename T>\n  static inline T pq_initial_value() {\n    return numeric_limits<T>::infinity();\n  }\n};\n\n\nstruct Euclidean : Minkowski {\n  template<typename S, typename T>\n  static inline T distance(const Node<S, T>* x, const Node<S, T>* y, int f) {\n    return euclidean_distance(x->v, y->v, f);    \n  }\n  
template<typename S, typename T, typename Random>\n  static inline void create_split(const vector<Node<S, T>*>& nodes, int f, size_t s, Random& random, Node<S, T>* n) {\n    Node<S, T>* p = (Node<S, T>*)alloca(s);\n    Node<S, T>* q = (Node<S, T>*)alloca(s);\n    two_means<T, Random, Euclidean, Node<S, T> >(nodes, f, random, false, p, q);\n\n    for (int z = 0; z < f; z++)\n      n->v[z] = p->v[z] - q->v[z];\n    Base::normalize<T, Node<S, T> >(n, f);\n    n->a = 0.0;\n    for (int z = 0; z < f; z++)\n      n->a += -n->v[z] * (p->v[z] + q->v[z]) / 2;\n  }\n  template<typename T>\n  static inline T normalized_distance(T distance) {\n    return sqrt(std::max(distance, T(0)));\n  }\n  template<typename S, typename T>\n  static inline void init_node(Node<S, T>* n, int f) {\n  }\n  static const char* name() {\n    return \"euclidean\";\n  }\n\n};\n\nstruct Manhattan : Minkowski {\n  template<typename S, typename T>\n  static inline T distance(const Node<S, T>* x, const Node<S, T>* y, int f) {\n    return manhattan_distance(x->v, y->v, f);\n  }\n  template<typename S, typename T, typename Random>\n  static inline void create_split(const vector<Node<S, T>*>& nodes, int f, size_t s, Random& random, Node<S, T>* n) {\n    Node<S, T>* p = (Node<S, T>*)alloca(s);\n    Node<S, T>* q = (Node<S, T>*)alloca(s);\n    two_means<T, Random, Manhattan, Node<S, T> >(nodes, f, random, false, p, q);\n\n    for (int z = 0; z < f; z++)\n      n->v[z] = p->v[z] - q->v[z];\n    Base::normalize<T, Node<S, T> >(n, f);\n    n->a = 0.0;\n    for (int z = 0; z < f; z++)\n      n->a += -n->v[z] * (p->v[z] + q->v[z]) / 2;\n  }\n  template<typename T>\n  static inline T normalized_distance(T distance) {\n    return std::max(distance, T(0));\n  }\n  template<typename S, typename T>\n  static inline void init_node(Node<S, T>* n, int f) {\n  }\n  static const char* name() {\n    return \"manhattan\";\n  }\n};\n\ntemplate<typename S, typename T, typename R = uint64_t>\nclass AnnoyIndexInterface {\n 
public:\n  // Note that the methods with an **error argument will allocate memory and write the pointer to that string if error is non-NULL\n  virtual ~AnnoyIndexInterface() {};\n  virtual bool add_item(S item, const T* w, char** error=NULL) = 0;\n  virtual bool build(int q, int n_threads=-1, char** error=NULL) = 0;\n  virtual bool unbuild(char** error=NULL) = 0;\n  virtual bool save(const char* filename, bool prefault=false, char** error=NULL) = 0;\n  virtual void unload() = 0;\n  virtual bool load(const char* filename, bool prefault=false, char** error=NULL) = 0;\n  virtual T get_distance(S i, S j) const = 0;\n  virtual void get_nns_by_item(S item, size_t n, int search_k, vector<S>* result, vector<T>* distances) const = 0;\n  virtual void get_nns_by_vector(const T* w, size_t n, int search_k, vector<S>* result, vector<T>* distances) const = 0;\n  virtual S get_n_items() const = 0;\n  virtual S get_n_trees() const = 0;\n  virtual void verbose(bool v) = 0;\n  virtual void get_item(S item, T* v) const = 0;\n  virtual void set_seed(R q) = 0;\n  virtual bool on_disk_build(const char* filename, char** error=NULL) = 0;\n};\n\ntemplate<typename S, typename T, typename Distance, typename Random, class ThreadedBuildPolicy>\n  class AnnoyIndex : public AnnoyIndexInterface<S, T, \n#if __cplusplus >= 201103L\n    typename std::remove_const<decltype(Random::default_seed)>::type\n#else\n    typename Random::seed_type\n#endif\n    > {\n  /*\n   * We use random projection to build a forest of binary trees of all items.\n   * Basically just split the hyperspace into two sides by a hyperplane,\n   * then recursively split each of those subtrees etc.\n   * We create a tree like this q times. 
The default q is determined automatically\n   * in such a way that we at most use 2x as much memory as the vectors take.\n   */\npublic:\n  typedef Distance D;\n  typedef typename D::template Node<S, T> Node;\n#if __cplusplus >= 201103L\n  typedef typename std::remove_const<decltype(Random::default_seed)>::type R;\n#else\n  typedef typename Random::seed_type R;\n#endif\n\nprotected:\n  const int _f;\n  size_t _s;\n  S _n_items;\n  void* _nodes; // Could either be mmapped, or point to a memory buffer that we reallocate\n  S _n_nodes;\n  S _nodes_size;\n  vector<S> _roots;\n  S _K;\n  R _seed;\n  bool _loaded;\n  bool _verbose;\n  int _fd;\n  bool _on_disk;\n  bool _built;\npublic:\n\n   AnnoyIndex(int f) : _f(f), _seed(Random::default_seed) {\n    _s = offsetof(Node, v) + _f * sizeof(T); // Size of each node\n    _verbose = false;\n    _built = false;\n    _K = (S) (((size_t) (_s - offsetof(Node, children))) / sizeof(S)); // Max number of descendants to fit into node\n    reinitialize(); // Reset everything\n  }\n  ~AnnoyIndex() {\n    unload();\n  }\n\n  int get_f() const {\n    return _f;\n  }\n\n  bool add_item(S item, const T* w, char** error=NULL) {\n    return add_item_impl(item, w, error);\n  }\n\n  template<typename W>\n  bool add_item_impl(S item, const W& w, char** error=NULL) {\n    if (_loaded) {\n      set_error_from_string(error, \"You can't add an item to a loaded index\");\n      return false;\n    }\n    _allocate_size(item + 1);\n    Node* n = _get(item);\n\n    D::zero_value(n);\n\n    n->children[0] = 0;\n    n->children[1] = 0;\n    n->n_descendants = 1;\n\n    for (int z = 0; z < _f; z++)\n      n->v[z] = w[z];\n\n    D::init_node(n, _f);\n\n    if (item >= _n_items)\n      _n_items = item + 1;\n\n    return true;\n  }\n    \n  bool on_disk_build(const char* file, char** error=NULL) {\n    _on_disk = true;\n#ifndef _MSC_VER\n    _fd = open(file, O_RDWR | O_CREAT | O_TRUNC, (int) 0600);\n#else\n    _fd = _open(file, _O_RDWR | _O_CREAT | 
_O_TRUNC, (int) 0600);\n#endif\n    if (_fd == -1) {\n      set_error_from_errno(error, \"Unable to open\");\n      _fd = 0;\n      return false;\n    }\n    _nodes_size = 1;\n    if (ftruncate(_fd, ANNOYLIB_FTRUNCATE_SIZE(_s) * ANNOYLIB_FTRUNCATE_SIZE(_nodes_size)) == -1) {\n      set_error_from_errno(error, \"Unable to truncate\");\n      return false;\n    }\n#ifdef MAP_POPULATE\n    _nodes = (Node*) mmap(0, _s * _nodes_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, _fd, 0);\n#else\n    _nodes = (Node*) mmap(0, _s * _nodes_size, PROT_READ | PROT_WRITE, MAP_SHARED, _fd, 0);\n#endif\n    return true;\n  }\n    \n  bool build(int q, int n_threads=-1, char** error=NULL) {\n    if (_loaded) {\n      set_error_from_string(error, \"You can't build a loaded index\");\n      return false;\n    }\n\n    if (_built) {\n      set_error_from_string(error, \"You can't build a built index\");\n      return false;\n    }\n\n    D::template preprocess<T, S, Node>(_nodes, _s, _n_items, _f);\n\n    _n_nodes = _n_items;\n\n    ThreadedBuildPolicy::template build<S, T>(this, q, n_threads);\n\n    // Also, copy the roots into the last segment of the array\n    // This way we can load them faster without reading the whole file\n    _allocate_size(_n_nodes + (S)_roots.size());\n    for (size_t i = 0; i < _roots.size(); i++)\n      memcpy(_get(_n_nodes + (S)i), _get(_roots[i]), _s);\n    _n_nodes += _roots.size();\n\n    if (_verbose) annoylib_showUpdate(\"has %d nodes\\n\", _n_nodes);\n    \n    if (_on_disk) {\n      if (!remap_memory_and_truncate(&_nodes, _fd,\n          static_cast<size_t>(_s) * static_cast<size_t>(_nodes_size),\n          static_cast<size_t>(_s) * static_cast<size_t>(_n_nodes))) {\n        // TODO: this probably creates an index in a corrupt state... 
not sure what to do\n        set_error_from_errno(error, \"Unable to truncate\");\n        return false;\n      }\n      _nodes_size = _n_nodes;\n    }\n\n    D::template postprocess<T, S, Node>(_nodes, _s, _n_items, _f);\n\n    _built = true;\n    return true;\n  }\n  \n  bool unbuild(char** error=NULL) {\n    if (_loaded) {\n      set_error_from_string(error, \"You can't unbuild a loaded index\");\n      return false;\n    }\n\n    _roots.clear();\n    _n_nodes = _n_items;\n    _built = false;\n\n    return true;\n  }\n\n  bool save(const char* filename, bool prefault=false, char** error=NULL) {\n    if (!_built) {\n      set_error_from_string(error, \"You can't save an index that hasn't been built\");\n      return false;\n    }\n    if (_on_disk) {\n      return true;\n    } else {\n      // Delete file if it already exists (See issue #335)\n#ifndef _MSC_VER\n      unlink(filename);\n#else\n      _unlink(filename);\n#endif\n\n      FILE *f = fopen(filename, \"wb\");\n      if (f == NULL) {\n        set_error_from_errno(error, \"Unable to open\");\n        return false;\n      }\n\n      if (fwrite(_nodes, _s, _n_nodes, f) != (size_t) _n_nodes) {\n        set_error_from_errno(error, \"Unable to write\");\n        return false;\n      }\n\n      if (fclose(f) == EOF) {\n        set_error_from_errno(error, \"Unable to close\");\n        return false;\n      }\n\n      unload();\n      return load(filename, prefault, error);\n    }\n  }\n\n  void reinitialize() {\n    _fd = 0;\n    _nodes = NULL;\n    _loaded = false;\n    _n_items = 0;\n    _n_nodes = 0;\n    _nodes_size = 0;\n    _on_disk = false;\n    _seed = Random::default_seed;\n    _roots.clear();\n  }\n\n  void unload() {\n    if (_on_disk && _fd) {\n#ifndef _MSC_VER\n      close(_fd);\n#else\n      _close(_fd);\n#endif\n      munmap(_nodes, _s * _nodes_size);\n    } else {\n      if (_fd) {\n        // we have mmapped data\n#ifndef _MSC_VER\n        close(_fd);\n#else\n        _close(_fd);\n#endif\n        
munmap(_nodes, _n_nodes * _s);\n      } else if (_nodes) {\n        // We have heap allocated data\n        free(_nodes);\n      }\n    }\n    reinitialize();\n    if (_verbose) annoylib_showUpdate(\"unloaded\\n\");\n  }\n\n  bool load(const char* filename, bool prefault=false, char** error=NULL) {\n#ifndef _MSC_VER\n    _fd = open(filename, O_RDONLY, (int)0400);\n#else\n    _fd = _open(filename, _O_RDONLY, (int)0400);\n#endif\n    if (_fd == -1) {\n      set_error_from_errno(error, \"Unable to open\");\n      _fd = 0;\n      return false;\n    }\n    off_t size = lseek_getsize(_fd);\n    if (size == -1) {\n      set_error_from_errno(error, \"Unable to get size\");\n      return false;\n    } else if (size == 0) {\n      set_error_from_errno(error, \"Size of file is zero\");\n      return false;\n    } else if (size % _s) {\n      // Something is fishy with this index!\n      set_error_from_errno(error, \"Index size is not a multiple of vector size. Ensure you are opening using the same metric you used to create the index.\");\n      return false;\n    }\n\n    int flags = MAP_SHARED;\n    if (prefault) {\n#ifdef MAP_POPULATE\n      flags |= MAP_POPULATE;\n#else\n      annoylib_showUpdate(\"prefault is set to true, but MAP_POPULATE is not defined on this platform\");\n#endif\n    }\n    _nodes = (Node*)mmap(0, size, PROT_READ, flags, _fd, 0);\n    _n_nodes = (S)(size / _s);\n\n    // Find the roots by scanning the end of the file and taking the nodes with most descendants\n    _roots.clear();\n    S m = -1;\n    for (S i = _n_nodes - 1; i >= 0; i--) {\n      S k = _get(i)->n_descendants;\n      if (m == -1 || k == m) {\n        _roots.push_back(i);\n        m = k;\n      } else {\n        break;\n      }\n    }\n    // hacky fix: since the last root precedes the copy of all roots, delete it\n    if (_roots.size() > 1 && _get(_roots.front())->children[0] == _get(_roots.back())->children[0])\n      _roots.pop_back();\n    _loaded = true;\n    _built = true;\n    
_n_items = m;\n    if (_verbose) annoylib_showUpdate(\"found %zu roots with degree %d\\n\", _roots.size(), m);\n    return true;\n  }\n\n  T get_distance(S i, S j) const {\n    return D::normalized_distance(D::distance(_get(i), _get(j), _f));\n  }\n\n  void get_nns_by_item(S item, size_t n, int search_k, vector<S>* result, vector<T>* distances) const {\n    // TODO: handle OOB\n    const Node* m = _get(item);\n    _get_all_nns(m->v, n, search_k, result, distances);\n  }\n\n  void get_nns_by_vector(const T* w, size_t n, int search_k, vector<S>* result, vector<T>* distances) const {\n    _get_all_nns(w, n, search_k, result, distances);\n  }\n\n  S get_n_items() const {\n    return _n_items;\n  }\n\n  S get_n_trees() const {\n    return (S)_roots.size();\n  }\n\n  void verbose(bool v) {\n    _verbose = v;\n  }\n\n  void get_item(S item, T* v) const {\n    // TODO: handle OOB\n    Node* m = _get(item);\n    memcpy(v, m->v, (_f) * sizeof(T));\n  }\n\n  void set_seed(R seed) {\n    _seed = seed;\n  }\n\n  void thread_build(int q, int thread_idx, ThreadedBuildPolicy& threaded_build_policy) {\n    // Each thread needs its own seed, otherwise each thread would be building the same tree(s)\n    Random _random(_seed + thread_idx);\n\n    vector<S> thread_roots;\n    while (1) {\n      if (q == -1) {\n        threaded_build_policy.lock_n_nodes();\n        if (_n_nodes >= 2 * _n_items) {\n          threaded_build_policy.unlock_n_nodes();\n          break;\n        }\n        threaded_build_policy.unlock_n_nodes();\n      } else {\n        if (thread_roots.size() >= (size_t)q) {\n          break;\n        }\n      }\n\n      if (_verbose) annoylib_showUpdate(\"pass %zd...\\n\", thread_roots.size());\n\n      vector<S> indices;\n      threaded_build_policy.lock_shared_nodes();\n      for (S i = 0; i < _n_items; i++) {\n        if (_get(i)->n_descendants >= 1) { // Issue #223\n          indices.push_back(i);\n        }\n      }\n      
threaded_build_policy.unlock_shared_nodes();\n\n      thread_roots.push_back(_make_tree(indices, true, _random, threaded_build_policy));\n    }\n\n    threaded_build_policy.lock_roots();\n    _roots.insert(_roots.end(), thread_roots.begin(), thread_roots.end());\n    threaded_build_policy.unlock_roots();\n  }\n\nprotected:\n  void _reallocate_nodes(S n) {\n    const double reallocation_factor = 1.3;\n    S new_nodes_size = std::max(n, (S) ((_nodes_size + 1) * reallocation_factor));\n    void *old = _nodes;\n    \n    if (_on_disk) {\n      if (!remap_memory_and_truncate(&_nodes, _fd, \n          static_cast<size_t>(_s) * static_cast<size_t>(_nodes_size), \n          static_cast<size_t>(_s) * static_cast<size_t>(new_nodes_size)) && \n          _verbose)\n          annoylib_showUpdate(\"File truncation error\\n\");\n    } else {\n      _nodes = realloc(_nodes, _s * new_nodes_size);\n      memset((char *) _nodes + (_nodes_size * _s) / sizeof(char), 0, (new_nodes_size - _nodes_size) * _s);\n    }\n    \n    _nodes_size = new_nodes_size;\n    if (_verbose) annoylib_showUpdate(\"Reallocating to %d nodes: old_address=%p, new_address=%p\\n\", new_nodes_size, old, _nodes);\n  }\n\n  void _allocate_size(S n, ThreadedBuildPolicy& threaded_build_policy) {\n    if (n > _nodes_size) {\n      threaded_build_policy.lock_nodes();\n      _reallocate_nodes(n);\n      threaded_build_policy.unlock_nodes();\n    }\n  }\n\n  void _allocate_size(S n) {\n    if (n > _nodes_size) {\n      _reallocate_nodes(n);\n    }\n  }\n\n  Node* _get(const S i) const {\n    return get_node_ptr<S, Node>(_nodes, _s, i);\n  }\n\n  double _split_imbalance(const vector<S>& left_indices, const vector<S>& right_indices) {\n    double ls = (float)left_indices.size();\n    double rs = (float)right_indices.size();\n    float f = ls / (ls + rs + 1e-9);  // Avoid 0/0\n    return std::max(f, 1-f);\n  }\n\n  S _make_tree(const vector<S>& indices, bool is_root, Random& _random, ThreadedBuildPolicy& 
threaded_build_policy) {\n    // The basic rule is that if we have <= _K items, then it's a leaf node, otherwise it's a split node.\n    // There's some regrettable complications caused by the problem that root nodes have to be \"special\":\n    // 1. We identify root nodes by the arguable logic that _n_items == n->n_descendants, regardless of how many descendants they actually have\n    // 2. Root nodes with only 1 child need to be a \"dummy\" parent\n    // 3. Due to the _n_items \"hack\", we need to be careful with the cases where _n_items <= _K or _n_items > _K\n    if (indices.size() == 1 && !is_root)\n      return indices[0];\n\n    if (indices.size() <= (size_t)_K && (!is_root || (size_t)_n_items <= (size_t)_K || indices.size() == 1)) {\n      threaded_build_policy.lock_n_nodes();\n      _allocate_size(_n_nodes + 1, threaded_build_policy);\n      S item = _n_nodes++;\n      threaded_build_policy.unlock_n_nodes();\n\n      threaded_build_policy.lock_shared_nodes();\n      Node* m = _get(item);\n      m->n_descendants = is_root ? _n_items : (S)indices.size();\n\n      // Using std::copy instead of a loop seems to resolve issues #3 and #13,\n      // probably because gcc 4.8 goes overboard with optimizations.\n      // Using memcpy instead of std::copy for MSVC compatibility. #235\n      // Only copy when necessary to avoid crash in MSVC 9. 
#293\n      if (!indices.empty())\n        memcpy(m->children, &indices[0], indices.size() * sizeof(S));\n\n      threaded_build_policy.unlock_shared_nodes();\n      return item;\n    }\n\n    threaded_build_policy.lock_shared_nodes();\n    vector<Node*> children;\n    for (size_t i = 0; i < indices.size(); i++) {\n      S j = indices[i];\n      Node* n = _get(j);\n      if (n)\n        children.push_back(n);\n    }\n\n    vector<S> children_indices[2];\n    Node* m = (Node*)alloca(_s);\n\n    for (int attempt = 0; attempt < 3; attempt++) {\n      children_indices[0].clear();\n      children_indices[1].clear();\n      D::create_split(children, _f, _s, _random, m);\n\n      for (size_t i = 0; i < indices.size(); i++) {\n        S j = indices[i];\n        Node* n = _get(j);\n        if (n) {\n          bool side = D::side(m, n, _f, _random);\n          children_indices[side].push_back(j);\n        } else {\n          annoylib_showUpdate(\"No node for index %d?\\n\", j);\n        }\n      }\n\n      if (_split_imbalance(children_indices[0], children_indices[1]) < 0.95)\n        break;\n    }\n    threaded_build_policy.unlock_shared_nodes();\n\n    // If we didn't find a hyperplane, just randomize sides as a last option\n    while (_split_imbalance(children_indices[0], children_indices[1]) > 0.99) {\n      if (_verbose)\n        annoylib_showUpdate(\"\\tNo hyperplane found (left has %zu children, right has %zu children)\\n\",\n          children_indices[0].size(), children_indices[1].size());\n\n      children_indices[0].clear();\n      children_indices[1].clear();\n\n      // Set the vector to 0.0\n      for (int z = 0; z < _f; z++)\n        m->v[z] = 0;\n\n      for (size_t i = 0; i < indices.size(); i++) {\n        S j = indices[i];\n        // Just randomize...\n        children_indices[_random.flip()].push_back(j);\n      }\n    }\n\n    int flip = (children_indices[0].size() > children_indices[1].size());\n\n    m->n_descendants = is_root ? 
_n_items : (S)indices.size();\n    for (int side = 0; side < 2; side++) {\n      // run _make_tree for the smallest child first (for cache locality)\n      m->children[side^flip] = _make_tree(children_indices[side^flip], false, _random, threaded_build_policy);\n    }\n\n    threaded_build_policy.lock_n_nodes();\n    _allocate_size(_n_nodes + 1, threaded_build_policy);\n    S item = _n_nodes++;\n    threaded_build_policy.unlock_n_nodes();\n\n    threaded_build_policy.lock_shared_nodes();\n    memcpy(_get(item), m, _s);\n    threaded_build_policy.unlock_shared_nodes();\n\n    return item;\n  }\n\n  void _get_all_nns(const T* v, size_t n, int search_k, vector<S>* result, vector<T>* distances) const {\n    Node* v_node = (Node *)alloca(_s);\n    D::template zero_value<Node>(v_node);\n    memcpy(v_node->v, v, sizeof(T) * _f);\n    D::init_node(v_node, _f);\n\n    std::priority_queue<pair<T, S> > q;\n\n    if (search_k == -1) {\n      search_k = n * _roots.size();\n    }\n\n    for (size_t i = 0; i < _roots.size(); i++) {\n      q.push(make_pair(Distance::template pq_initial_value<T>(), _roots[i]));\n    }\n\n    std::vector<S> nns;\n    while (nns.size() < (size_t)search_k && !q.empty()) {\n      const pair<T, S>& top = q.top();\n      T d = top.first;\n      S i = top.second;\n      Node* nd = _get(i);\n      q.pop();\n      if (nd->n_descendants == 1 && i < _n_items) {\n        nns.push_back(i);\n      } else if (nd->n_descendants <= _K) {\n        const S* dst = nd->children;\n        nns.insert(nns.end(), dst, &dst[nd->n_descendants]);\n      } else {\n        T margin = D::margin(nd, v, _f);\n        q.push(make_pair(D::pq_distance(d, margin, 1), static_cast<S>(nd->children[1])));\n        q.push(make_pair(D::pq_distance(d, margin, 0), static_cast<S>(nd->children[0])));\n      }\n    }\n\n    // Get distances for all items\n    // To avoid calculating distance multiple times for any items, sort by id\n    std::sort(nns.begin(), nns.end());\n    vector<pair<T, S> > 
nns_dist;\n    S last = -1;\n    for (size_t i = 0; i < nns.size(); i++) {\n      S j = nns[i]; \n      if (j == last)\n        continue;\n      last = j;\n      if (_get(j)->n_descendants == 1)  // This is only to guard a really obscure case, #284\n        nns_dist.push_back(make_pair(D::distance(v_node, _get(j), _f), j));\n    }\n\n    size_t m = nns_dist.size();\n    size_t p = n < m ? n : m; // Return this many items\n    std::partial_sort(nns_dist.begin(), nns_dist.begin() + p, nns_dist.end());\n    for (size_t i = 0; i < p; i++) {\n      if (distances)\n        distances->push_back(D::normalized_distance(nns_dist[i].first));\n      result->push_back(nns_dist[i].second);\n    }\n  }\n};\n\nclass AnnoyIndexSingleThreadedBuildPolicy {\npublic:\n  template<typename S, typename T, typename D, typename Random>\n  static void build(AnnoyIndex<S, T, D, Random, AnnoyIndexSingleThreadedBuildPolicy>* annoy, int q, int n_threads) {\n    AnnoyIndexSingleThreadedBuildPolicy threaded_build_policy;\n    annoy->thread_build(q, 0, threaded_build_policy);\n  }\n\n  void lock_n_nodes() {}\n  void unlock_n_nodes() {}\n\n  void lock_nodes() {}\n  void unlock_nodes() {}\n\n  void lock_shared_nodes() {}\n  void unlock_shared_nodes() {}\n\n  void lock_roots() {}\n  void unlock_roots() {}\n};\n\n#ifdef ANNOYLIB_MULTITHREADED_BUILD\nclass AnnoyIndexMultiThreadedBuildPolicy {\nprivate:\n  std::shared_timed_mutex nodes_mutex;\n  std::mutex n_nodes_mutex;\n  std::mutex roots_mutex;\n\npublic:\n  template<typename S, typename T, typename D, typename Random>\n  static void build(AnnoyIndex<S, T, D, Random, AnnoyIndexMultiThreadedBuildPolicy>* annoy, int q, int n_threads) {\n    AnnoyIndexMultiThreadedBuildPolicy threaded_build_policy;\n    if (n_threads == -1) {\n      // If the hardware_concurrency() value is not well defined or not computable, it returns 0.\n      // We guard against this by using at least 1 thread.\n      n_threads = std::max(1, 
(int)std::thread::hardware_concurrency());\n    }\n\n    vector<std::thread> threads(n_threads);\n\n    for (int thread_idx = 0; thread_idx < n_threads; thread_idx++) {\n      int trees_per_thread = q == -1 ? -1 : (int)floor((q + thread_idx) / n_threads);\n\n      threads[thread_idx] = std::thread(\n        &AnnoyIndex<S, T, D, Random, AnnoyIndexMultiThreadedBuildPolicy>::thread_build,\n        annoy,\n        trees_per_thread,\n        thread_idx,\n        std::ref(threaded_build_policy)\n      );\n    }\n\n    for (auto& thread : threads) {\n      thread.join();\n    }\n  }\n\n  void lock_n_nodes() {\n    n_nodes_mutex.lock();\n  }\n  void unlock_n_nodes() {\n    n_nodes_mutex.unlock();\n  }\n\n  void lock_nodes() {\n    nodes_mutex.lock();\n  }\n  void unlock_nodes() {\n    nodes_mutex.unlock();\n  }\n\n  void lock_shared_nodes() {\n    nodes_mutex.lock_shared();\n  }\n  void unlock_shared_nodes() {\n    nodes_mutex.unlock_shared();\n  }\n\n  void lock_roots() {\n    roots_mutex.lock();\n  }\n  void unlock_roots() {\n    roots_mutex.unlock();\n  }\n};\n#endif\n\n}\n\n#endif\n// vim: tabstop=2 shiftwidth=2\n"
  },
  {
    "path": "src/annoyluamodule.cc",
    "content": "// Copyright (c) 2016 Boris Nagaev\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n// use this file except in compliance with the License. You may obtain a copy of\n// the License at\n//\n// http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n// License for the specific language governing permissions and limitations under\n// the License.\n\n#include <cstring>\n#include <typeinfo>\n\n#include <lua.hpp>\n\n#include \"annoylib.h\"\n#include \"kissrandom.h\"\n\n#if LUA_VERSION_NUM == 501\n#define compat_setfuncs(L, funcs) luaL_register(L, NULL, funcs)\n#define compat_rawlen lua_objlen\n#else\n#define compat_setfuncs(L, funcs) luaL_setfuncs(L, funcs, 0)\n#define compat_rawlen lua_rawlen\n#endif\n\nusing namespace Annoy;\n\ntemplate<typename Distance>\nclass LuaAnnoy {\npublic:\n  typedef int32_t AnnoyS;\n  typedef float AnnoyT;\n  typedef AnnoyIndex<AnnoyS, AnnoyT, Distance, Kiss64Random, AnnoyIndexSingleThreadedBuildPolicy> Impl;\n  typedef LuaAnnoy<Distance> ThisClass;\n\n  class LuaArrayProxy {\n  public:\n    LuaArrayProxy(lua_State* L, int object, int f)\n      : L_(L)\n      , object_(object)\n    {\n      luaL_checktype(L, object, LUA_TTABLE);\n      int v_len = compat_rawlen(L, object);\n      luaL_argcheck(L, v_len == f, object, \"Length of v != f\");\n    }\n\n    double operator[](int index) const {\n      lua_rawgeti(L_, object_, index + 1);\n      double result = lua_tonumber(L_, -1);\n      lua_pop(L_, 1);\n      return result;\n    }\n\n  private:\n    lua_State* L_;\n    int object_;\n  };\n\n  static void toVector(lua_State* L, int object, int f, AnnoyT* dst) {\n    LuaArrayProxy proxy(L, object, f);\n    for (int i = 0; i < f; i++) {\n      dst[i] = proxy[i];\n    }\n  }\n\n  
template <typename Vector>\n  static void pushVector(lua_State* L, const Vector& v) {\n    lua_createtable(L, v.size(), 0);\n    for (int j = 0; j < v.size(); j++) {\n      lua_pushnumber(L, v[j]);\n      lua_rawseti(L, -2, j + 1);\n    }\n  }\n\n  static const char* typeAsString() {\n    return typeid(Impl).name();\n  }\n\n  static Impl* getAnnoy(lua_State* L, int object) {\n    return reinterpret_cast<Impl*>(\n      luaL_checkudata(L, object, typeAsString())\n    );\n  }\n\n  static int getItemIndex(lua_State* L, int object, int size = -1) {\n    int item = luaL_checkinteger(L, object);\n    luaL_argcheck(L, item >= 0, object, \"Index must be >= 0\");\n    if (size != -1) {\n      luaL_argcheck(L, item < size, object, \"Index must be < size\");\n    }\n    return item;\n  }\n\n  static int gc(lua_State* L) {\n    Impl* self = getAnnoy(L, 1);\n    self->~Impl();\n    return 0;\n  }\n\n  static int tostring(lua_State* L) {\n    Impl* self = getAnnoy(L, 1);\n    lua_pushfstring(\n      L,\n      \"annoy.AnnoyIndex object (%dx%d, %s distance)\",\n      self->get_n_items(), self->get_f(), Distance::name()\n    );\n    return 1;\n  }\n\n  static int add_item(lua_State* L) {\n    Impl* self = getAnnoy(L, 1);\n    int item = getItemIndex(L, 2);\n    self->add_item_impl(item, LuaArrayProxy(L, 3, self->get_f()));\n    return 0;\n  }\n\n  static int build(lua_State* L) {\n    int nargs = lua_gettop(L);\n    Impl* self = getAnnoy(L, 1);\n    int n_trees = luaL_checkinteger(L, 2);\n    self->build(n_trees, 1);\n    lua_pushboolean(L, true);\n    return 1;\n  }\n\n  static int on_disk_build(lua_State* L) {\n    Impl* self = getAnnoy(L, 1);\n    const char* filename = luaL_checkstring(L, 2);\n    self->on_disk_build(filename);\n    lua_pushboolean(L, true);\n    return 1;\n  }\n\n  static int save(lua_State* L) {\n    int nargs = lua_gettop(L);\n    Impl* self = getAnnoy(L, 1);\n    const char* filename = luaL_checkstring(L, 2);\n    bool prefault = true;\n    if (nargs >= 3) 
{\n      prefault = lua_toboolean(L, 3);\n    }\n    self->save(filename, prefault);\n    lua_pushboolean(L, true);\n    return 1;\n  }\n\n  static int load(lua_State* L) {\n    Impl* self = getAnnoy(L, 1);\n    int nargs = lua_gettop(L);\n    const char* filename = luaL_checkstring(L, 2);\n    bool prefault = true;\n    if (nargs >= 3) {\n      prefault = lua_toboolean(L, 3);\n    }\n    if (!self->load(filename, prefault)) {\n      return luaL_error(L, \"Can't load file: %s\", filename);\n    }\n    lua_pushboolean(L, true);\n    return 1;\n  }\n\n  static int unload(lua_State* L) {\n    Impl* self = getAnnoy(L, 1);\n    self->unload();\n    lua_pushboolean(L, true);\n    return 1;\n  }\n\n  struct Searcher {\n    std::vector<AnnoyS> result;\n    std::vector<AnnoyT> distances;\n    Impl* self;\n    int n;\n    int search_k;\n    bool include_distances;\n\n    Searcher(lua_State* L) {\n      int nargs = lua_gettop(L);\n      self = getAnnoy(L, 1);\n      n = luaL_checkinteger(L, 3);\n      search_k = -1;\n      if (nargs >= 4) {\n        search_k = luaL_checkinteger(L, 4);\n      }\n      include_distances = false;\n      if (nargs >= 5) {\n        include_distances = lua_toboolean(L, 5);\n      }\n    }\n\n    int pushResults(lua_State* L) {\n      pushVector(L, result);\n      if (include_distances) {\n        pushVector(L, distances);\n      }\n      return include_distances ? 2 : 1;\n    }\n  };\n\n  static int get_nns_by_item(lua_State* L) {\n    Searcher s(L);\n    int item = getItemIndex(L, 2, s.self->get_n_items());\n    s.self->get_nns_by_item(item, s.n, s.search_k, &s.result,\n        s.include_distances ? &s.distances : NULL);\n    return s.pushResults(L);\n  }\n\n  static int get_nns_by_vector(lua_State* L) {\n    Searcher s(L);\n    std::vector<AnnoyT> _vec(s.self->get_f());\n    AnnoyT* vec = &(_vec[0]);\n    toVector(L, 2, s.self->get_f(), vec);\n    s.self->get_nns_by_vector(vec, s.n, s.search_k, &s.result,\n        s.include_distances ? 
&s.distances : NULL);\n    return s.pushResults(L);\n  }\n\n  static int get_item_vector(lua_State* L) {\n    Impl* self = getAnnoy(L, 1);\n    int item = getItemIndex(L, 2, self->get_n_items());\n    std::vector<AnnoyT> _vec(self->get_f());\n    AnnoyT* vec = &(_vec[0]);\n    self->get_item(item, vec);\n    pushVector(L, _vec);\n    return 1;\n  }\n\n  static int get_distance(lua_State* L) {\n    Impl* self = getAnnoy(L, 1);\n    int i = getItemIndex(L, 2, self->get_n_items());\n    int j = getItemIndex(L, 3, self->get_n_items());\n    AnnoyT distance = self->get_distance(i, j);\n    lua_pushnumber(L, distance);\n    return 1;\n  }\n\n  static int get_n_items(lua_State* L) {\n    Impl* self = getAnnoy(L, 1);\n    lua_pushnumber(L, self->get_n_items());\n    return 1;\n  }\n\n  static const luaL_Reg* getMetatable() {\n    static const luaL_Reg funcs[] = {\n      {\"__gc\", &ThisClass::gc},\n      {\"__tostring\", &ThisClass::tostring},\n      {NULL, NULL},\n    };\n    return funcs;\n  }\n\n  static const luaL_Reg* getMethods() {\n    static const luaL_Reg funcs[] = {\n      {\"add_item\", &ThisClass::add_item},\n      {\"build\", &ThisClass::build},\n      {\"save\", &ThisClass::save},\n      {\"load\", &ThisClass::load},\n      {\"unload\", &ThisClass::unload},\n      {\"get_nns_by_item\", &ThisClass::get_nns_by_item},\n      {\"get_nns_by_vector\", &ThisClass::get_nns_by_vector},\n      {\"get_item_vector\", &ThisClass::get_item_vector},\n      {\"get_distance\", &ThisClass::get_distance},\n      {\"get_n_items\", &ThisClass::get_n_items},\n      {\"on_disk_build\", &ThisClass::on_disk_build},\n      {NULL, NULL},\n    };\n    return funcs;\n  }\n\n  static void createNew(lua_State* L, int f) {\n    void* self = lua_newuserdata(L, sizeof(Impl));\n    if (luaL_newmetatable(L, typeAsString())) {\n      compat_setfuncs(L, getMetatable());\n      lua_newtable(L);\n      compat_setfuncs(L, getMethods());\n      lua_setfield(L, -2, \"__index\");\n    }\n    new (self) 
Impl(f);\n    lua_setmetatable(L, -2);\n  }\n};\n\nstatic int lua_an_make(lua_State* L) {\n  int f = luaL_checkinteger(L, 1);\n  const char* metric = \"angular\";\n  if (lua_gettop(L) >= 2) {\n      metric = luaL_checkstring(L, 2);\n  }\n  if (strcmp(metric, \"angular\") == 0) {\n    LuaAnnoy<Angular>::createNew(L, f);\n    return 1;\n  } else if (strcmp(metric, \"euclidean\") == 0) {\n    LuaAnnoy<Euclidean>::createNew(L, f);\n    return 1;\n  } else if (strcmp(metric, \"manhattan\") == 0) {\n    LuaAnnoy<Manhattan>::createNew(L, f);\n    return 1;\n  } else {\n    return luaL_error(L, \"Unknown metric: %s\", metric);\n  }\n}\n\nstatic const luaL_Reg LUA_ANNOY_FUNCS[] = {\n  {\"AnnoyIndex\", lua_an_make},\n  {NULL, NULL},\n};\n\nextern \"C\" {\nint luaopen_annoy(lua_State* L) {\n  lua_newtable(L);\n  compat_setfuncs(L, LUA_ANNOY_FUNCS);\n  return 1;\n}\n}\n\n// vim: tabstop=2 shiftwidth=2\n"
  },
  {
    "path": "src/annoymodule.cc",
    "content": "// Copyright (c) 2013 Spotify AB\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n// use this file except in compliance with the License. You may obtain a copy of\n// the License at\n//\n// http://www.apache.org/licenses/LICENSE-2.0\n//\n// Unless required by applicable law or agreed to in writing, software\n// distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n// License for the specific language governing permissions and limitations under\n// the License.\n\n#include \"annoylib.h\"\n#include \"kissrandom.h\"\n#include \"Python.h\"\n#include \"structmember.h\"\n#include <exception>\n#if defined(_MSC_VER) && _MSC_VER == 1500\ntypedef signed __int32    int32_t;\n#else\n#include <stdint.h>\n#endif\n\n\n#if defined(ANNOYLIB_USE_AVX512)\n#define AVX_INFO \"Using 512-bit AVX instructions\"\n#elif defined(ANNOYLIB_USE_AVX128)\n#define AVX_INFO \"Using 128-bit AVX instructions\"\n#else\n#define AVX_INFO \"Not using AVX instructions\"\n#endif\n\n#if defined(_MSC_VER)\n#define COMPILER_INFO \"Compiled using MSC\"\n#elif defined(__GNUC__)\n#define COMPILER_INFO \"Compiled on GCC\"\n#else\n#define COMPILER_INFO \"Compiled on unknown platform\"\n#endif\n\n#define ANNOY_DOC (COMPILER_INFO \". 
\" AVX_INFO \".\")\n\n#if PY_MAJOR_VERSION >= 3\n#define IS_PY3K\n#endif\n\n#ifndef Py_TYPE\n    #define Py_TYPE(ob) (((PyObject*)(ob))->ob_type)\n#endif\n\n#ifdef IS_PY3K\n    #define PyInt_FromLong PyLong_FromLong \n#endif\n\nusing namespace Annoy;\n\n#ifdef ANNOYLIB_MULTITHREADED_BUILD\n  typedef AnnoyIndexMultiThreadedBuildPolicy AnnoyIndexThreadedBuildPolicy;\n#else\n  typedef AnnoyIndexSingleThreadedBuildPolicy AnnoyIndexThreadedBuildPolicy;\n#endif\n\ntemplate class Annoy::AnnoyIndexInterface<int32_t, float>;\n\nclass HammingWrapper : public AnnoyIndexInterface<int32_t, float> {\n  // Wrapper class for Hamming distance, using composition.\n  // This translates binary (float) vectors into packed uint64_t vectors.\n  // This is questionable from a performance point of view. Should reconsider this solution.\nprivate:\n  int32_t _f_external, _f_internal;\n  AnnoyIndex<int32_t, uint64_t, Hamming, Kiss64Random, AnnoyIndexThreadedBuildPolicy> _index;\n  void _pack(const float* src, uint64_t* dst) const {\n    for (int32_t i = 0; i < _f_internal; i++) {\n      dst[i] = 0;\n      for (int32_t j = 0; j < 64 && i*64+j < _f_external; j++) {\n\tdst[i] |= (uint64_t)(src[i * 64 + j] > 0.5) << j;\n      }\n    }\n  };\n  void _unpack(const uint64_t* src, float* dst) const {\n    for (int32_t i = 0; i < _f_external; i++) {\n      dst[i] = (src[i / 64] >> (i % 64)) & 1;\n    }\n  };\npublic:\n  HammingWrapper(int f) : _f_external(f), _f_internal((f + 63) / 64), _index((f + 63) / 64) {};\n  bool add_item(int32_t item, const float* w, char**error) {\n    vector<uint64_t> w_internal(_f_internal, 0);\n    _pack(w, &w_internal[0]);\n    return _index.add_item(item, &w_internal[0], error);\n  };\n  bool build(int q, int n_threads, char** error) { return _index.build(q, n_threads, error); };\n  bool unbuild(char** error) { return _index.unbuild(error); };\n  bool save(const char* filename, bool prefault, char** error) { return _index.save(filename, prefault, error); };\n  void 
unload() { _index.unload(); };\n  bool load(const char* filename, bool prefault, char** error) { return _index.load(filename, prefault, error); };\n  float get_distance(int32_t i, int32_t j) const { return _index.get_distance(i, j); };\n  void get_nns_by_item(int32_t item, size_t n, int search_k, vector<int32_t>* result, vector<float>* distances) const {\n    if (distances) {\n      vector<uint64_t> distances_internal;\n      _index.get_nns_by_item(item, n, search_k, result, &distances_internal);\n      distances->insert(distances->begin(), distances_internal.begin(), distances_internal.end());\n    } else {\n      _index.get_nns_by_item(item, n, search_k, result, NULL);\n    }\n  };\n  void get_nns_by_vector(const float* w, size_t n, int search_k, vector<int32_t>* result, vector<float>* distances) const {\n    vector<uint64_t> w_internal(_f_internal, 0);\n    _pack(w, &w_internal[0]);\n    if (distances) {\n      vector<uint64_t> distances_internal;\n      _index.get_nns_by_vector(&w_internal[0], n, search_k, result, &distances_internal);\n      distances->insert(distances->begin(), distances_internal.begin(), distances_internal.end());\n    } else {\n      _index.get_nns_by_vector(&w_internal[0], n, search_k, result, NULL);\n    }\n  };\n  int32_t get_n_items() const { return _index.get_n_items(); };\n  int32_t get_n_trees() const { return _index.get_n_trees(); };\n  void verbose(bool v) { _index.verbose(v); };\n  void get_item(int32_t item, float* v) const {\n    vector<uint64_t> v_internal(_f_internal, 0);\n    _index.get_item(item, &v_internal[0]);\n    _unpack(&v_internal[0], v);\n  };\n  void set_seed(uint64_t q) { _index.set_seed(q); };\n  bool on_disk_build(const char* filename, char** error) { return _index.on_disk_build(filename, error); };\n};\n\n// annoy python object\ntypedef struct {\n  PyObject_HEAD\n  int f;\n  AnnoyIndexInterface<int32_t, float>* ptr;\n} py_annoy;\n\n\nstatic PyObject *\npy_an_new(PyTypeObject *type, PyObject *args, PyObject 
*kwargs) {\n  py_annoy *self = (py_annoy *)type->tp_alloc(type, 0);\n  if (self == NULL) {\n    return NULL;\n  }\n  const char *metric = NULL;\n\n  static char const * kwlist[] = {\"f\", \"metric\", NULL};\n  if (!PyArg_ParseTupleAndKeywords(args, kwargs, \"i|s\", (char**)kwlist, &self->f, &metric))\n    return NULL;\n  if (!metric) {\n    // This keeps coming up, see #368 etc\n    PyErr_WarnEx(PyExc_FutureWarning, \"The default argument for metric will be removed \"\n\t\t \"in future version of Annoy. Please pass metric='angular' explicitly.\", 1);\n    self->ptr = new AnnoyIndex<int32_t, float, Angular, Kiss64Random, AnnoyIndexThreadedBuildPolicy>(self->f);\n  } else if (!strcmp(metric, \"angular\")) {\n   self->ptr = new AnnoyIndex<int32_t, float, Angular, Kiss64Random, AnnoyIndexThreadedBuildPolicy>(self->f);\n  } else if (!strcmp(metric, \"euclidean\")) {\n    self->ptr = new AnnoyIndex<int32_t, float, Euclidean, Kiss64Random, AnnoyIndexThreadedBuildPolicy>(self->f);\n  } else if (!strcmp(metric, \"manhattan\")) {\n    self->ptr = new AnnoyIndex<int32_t, float, Manhattan, Kiss64Random, AnnoyIndexThreadedBuildPolicy>(self->f);\n  } else if (!strcmp(metric, \"hamming\")) {\n    self->ptr = new HammingWrapper(self->f);\n  } else if (!strcmp(metric, \"dot\")) {\n    self->ptr = new AnnoyIndex<int32_t, float, DotProduct, Kiss64Random, AnnoyIndexThreadedBuildPolicy>(self->f);\n  } else {\n    PyErr_SetString(PyExc_ValueError, \"No such metric\");\n    return NULL;\n  }\n\n  return (PyObject *)self;\n}\n\n\nstatic int \npy_an_init(py_annoy *self, PyObject *args, PyObject *kwargs) {\n  // Seems to be needed for Python 3\n  const char *metric = NULL;\n  int f;\n  static char const * kwlist[] = {\"f\", \"metric\", NULL};\n  if (!PyArg_ParseTupleAndKeywords(args, kwargs, \"i|s\", (char**)kwlist, &f, &metric))\n    return (int) NULL;\n  return 0;\n}\n\n\nstatic void \npy_an_dealloc(py_annoy* self) {\n  delete self->ptr;\n  
Py_TYPE(self)->tp_free((PyObject*)self);\n}\n\n\nstatic PyMemberDef py_annoy_members[] = {\n  {(char*)\"f\", T_INT, offsetof(py_annoy, f), 0,\n   (char*)\"\"},\n  {NULL}\t/* Sentinel */\n};\n\n\nstatic PyObject *\npy_an_load(py_annoy *self, PyObject *args, PyObject *kwargs) {\n  char *filename, *error;\n  bool prefault = false;\n  if (!self->ptr) \n    return NULL;\n  static char const * kwlist[] = {\"fn\", \"prefault\", NULL};\n  if (!PyArg_ParseTupleAndKeywords(args, kwargs, \"s|b\", (char**)kwlist, &filename, &prefault))\n    return NULL;\n\n  if (!self->ptr->load(filename, prefault, &error)) {\n    PyErr_SetString(PyExc_IOError, error);\n    free(error);\n    return NULL;\n  }\n  Py_RETURN_TRUE;\n}\n\n\nstatic PyObject *\npy_an_save(py_annoy *self, PyObject *args, PyObject *kwargs) {\n  char *filename, *error;\n  bool prefault = false;\n  if (!self->ptr) \n    return NULL;\n  static char const * kwlist[] = {\"fn\", \"prefault\", NULL};\n  if (!PyArg_ParseTupleAndKeywords(args, kwargs, \"s|b\", (char**)kwlist, &filename, &prefault))\n    return NULL;\n\n  if (!self->ptr->save(filename, prefault, &error)) {\n    PyErr_SetString(PyExc_IOError, error);\n    free(error);\n    return NULL;\n  }\n  Py_RETURN_TRUE;\n}\n\n\nPyObject*\nget_nns_to_python(const vector<int32_t>& result, const vector<float>& distances, int include_distances) {\n  PyObject* l = NULL;\n  PyObject* d = NULL;\n  PyObject* t = NULL;\n\n  if ((l = PyList_New(result.size())) == NULL) {\n    goto error;\n  }\n  for (size_t i = 0; i < result.size(); i++) {\n    PyObject* res = PyInt_FromLong(result[i]);\n    if (res == NULL) {\n      goto error;\n    }\n    PyList_SetItem(l, i, res);\n  }\n  if (!include_distances)\n    return l;\n\n  if ((d = PyList_New(distances.size())) == NULL) {\n    goto error;\n  }\n\n  for (size_t i = 0; i < distances.size(); i++) {\n    PyObject* dist = PyFloat_FromDouble(distances[i]);\n    if (dist == NULL) {\n      goto error;\n    }\n    PyList_SetItem(d, i, dist);\n  
}\n\n  if ((t = PyTuple_Pack(2, l, d)) == NULL) {\n    goto error;\n  }\n  Py_XDECREF(l);\n  Py_XDECREF(d);\n\n  return t;\n\n  error:\n    Py_XDECREF(l);\n    Py_XDECREF(d);\n    Py_XDECREF(t);\n    return NULL;\n}\n\n\nbool check_constraints(py_annoy *self, int32_t item, bool building) {\n  if (item < 0) {\n    PyErr_SetString(PyExc_IndexError, \"Item index can not be negative\");\n    return false;\n  } else if (!building && item >= self->ptr->get_n_items()) {\n    PyErr_SetString(PyExc_IndexError, \"Item index larger than the largest item index\");\n    return false;\n  } else {\n    return true;\n  }\n}\n\nstatic PyObject* \npy_an_get_nns_by_item(py_annoy *self, PyObject *args, PyObject *kwargs) {\n  int32_t item, n, search_k=-1, include_distances=0;\n  if (!self->ptr) \n    return NULL;\n\n  static char const * kwlist[] = {\"i\", \"n\", \"search_k\", \"include_distances\", NULL};\n  if (!PyArg_ParseTupleAndKeywords(args, kwargs, \"ii|ii\", (char**)kwlist, &item, &n, &search_k, &include_distances))\n    return NULL;\n\n  if (!check_constraints(self, item, false)) {\n    return NULL;\n  }\n\n  vector<int32_t> result;\n  vector<float> distances;\n\n  Py_BEGIN_ALLOW_THREADS;\n  self->ptr->get_nns_by_item(item, n, search_k, &result, include_distances ? 
&distances : NULL);\n  Py_END_ALLOW_THREADS;\n\n  return get_nns_to_python(result, distances, include_distances);\n}\n\n\nbool\nconvert_list_to_vector(PyObject* v, int f, vector<float>* w) {\n  Py_ssize_t length = PyObject_Size(v);\n  if (length == -1) {\n    return false;\n  }\n  if (length != f) {\n    PyErr_Format(PyExc_IndexError, \"Vector has wrong length (expected %d, got %ld)\", f, length);\n    return false;\n  }\n\n  for (int z = 0; z < f; z++) {\n    PyObject *key = PyInt_FromLong(z);\n    if (key == NULL) {\n      return false;\n    }\n    PyObject *pf = PyObject_GetItem(v, key);\n    Py_DECREF(key);\n    if (pf == NULL) {\n      return false;\n    }\n    double value = PyFloat_AsDouble(pf);\n    Py_DECREF(pf);\n    if (value == -1.0 && PyErr_Occurred()) {\n      return false;\n    }\n    (*w)[z] = value;\n  }\n  return true;\n}\n\nstatic PyObject* \npy_an_get_nns_by_vector(py_annoy *self, PyObject *args, PyObject *kwargs) {\n  PyObject* v;\n  int32_t n, search_k=-1, include_distances=0;\n  if (!self->ptr) \n    return NULL;\n\n  static char const * kwlist[] = {\"vector\", \"n\", \"search_k\", \"include_distances\", NULL};\n  if (!PyArg_ParseTupleAndKeywords(args, kwargs, \"Oi|ii\", (char**)kwlist, &v, &n, &search_k, &include_distances))\n    return NULL;\n\n  vector<float> w(self->f);\n  if (!convert_list_to_vector(v, self->f, &w)) {\n    return NULL;\n  }\n\n  vector<int32_t> result;\n  vector<float> distances;\n\n  Py_BEGIN_ALLOW_THREADS;\n  self->ptr->get_nns_by_vector(&w[0], n, search_k, &result, include_distances ? 
&distances : NULL);\n  Py_END_ALLOW_THREADS;\n\n  return get_nns_to_python(result, distances, include_distances);\n}\n\n\nstatic PyObject* \npy_an_get_item_vector(py_annoy *self, PyObject *args) {\n  int32_t item;\n  if (!self->ptr) \n    return NULL;\n  if (!PyArg_ParseTuple(args, \"i\", &item))\n    return NULL;\n\n  if (!check_constraints(self, item, false)) {\n    return NULL;\n  }\n\n  vector<float> v(self->f);\n  self->ptr->get_item(item, &v[0]);\n  PyObject* l = PyList_New(self->f);\n  if (l == NULL) {\n    return NULL;\n  }\n  for (int z = 0; z < self->f; z++) {\n    PyObject* dist = PyFloat_FromDouble(v[z]);\n    if (dist == NULL) {\n      goto error;\n    }\n    PyList_SetItem(l, z, dist);\n  }\n\n  return l;\n\n  error:\n    Py_XDECREF(l);\n    return NULL;\n}\n\n\nstatic PyObject* \npy_an_add_item(py_annoy *self, PyObject *args, PyObject* kwargs) {\n  PyObject* v;\n  int32_t item;\n  if (!self->ptr) \n    return NULL;\n  static char const * kwlist[] = {\"i\", \"vector\", NULL};\n  if (!PyArg_ParseTupleAndKeywords(args, kwargs, \"iO\", (char**)kwlist, &item, &v))\n    return NULL;\n\n  if (!check_constraints(self, item, true)) {\n    return NULL;\n  }\n\n  vector<float> w(self->f);\n  if (!convert_list_to_vector(v, self->f, &w)) {\n    return NULL;\n  }\n  char* error;\n  if (!self->ptr->add_item(item, &w[0], &error)) {\n    PyErr_SetString(PyExc_Exception, error);\n    free(error);\n    return NULL;\n  }\n\n  Py_RETURN_NONE;\n}\n\nstatic PyObject *\npy_an_on_disk_build(py_annoy *self, PyObject *args, PyObject *kwargs) {\n  char *filename, *error;\n  if (!self->ptr)\n    return NULL;\n  static char const * kwlist[] = {\"fn\", NULL};\n  if (!PyArg_ParseTupleAndKeywords(args, kwargs, \"s\", (char**)kwlist, &filename))\n    return NULL;\n\n  if (!self->ptr->on_disk_build(filename, &error)) {\n    PyErr_SetString(PyExc_IOError, error);\n    free(error);\n    return NULL;\n  }\n  Py_RETURN_TRUE;\n}\n\nstatic PyObject *\npy_an_build(py_annoy *self, PyObject 
*args, PyObject *kwargs) {\n  int q;\n  int n_jobs = -1;\n  if (!self->ptr) \n    return NULL;\n  static char const * kwlist[] = {\"n_trees\", \"n_jobs\", NULL};\n  if (!PyArg_ParseTupleAndKeywords(args, kwargs, \"i|i\", (char**)kwlist, &q, &n_jobs))\n    return NULL;\n\n  bool res;\n  char* error;\n  Py_BEGIN_ALLOW_THREADS;\n  res = self->ptr->build(q, n_jobs, &error);\n  Py_END_ALLOW_THREADS;\n  if (!res) {\n    PyErr_SetString(PyExc_Exception, error);\n    free(error);\n    return NULL;\n  }\n\n  Py_RETURN_TRUE;\n}\n\n\nstatic PyObject *\npy_an_unbuild(py_annoy *self) {\n  if (!self->ptr) \n    return NULL;\n\n  char* error;\n  if (!self->ptr->unbuild(&error)) {\n    PyErr_SetString(PyExc_Exception, error);\n    free(error);\n    return NULL;\n  }\n\n  Py_RETURN_TRUE;\n}\n\n\nstatic PyObject *\npy_an_unload(py_annoy *self) {\n  if (!self->ptr) \n    return NULL;\n\n  self->ptr->unload();\n\n  Py_RETURN_TRUE;\n}\n\n\nstatic PyObject *\npy_an_get_distance(py_annoy *self, PyObject *args) {\n  int32_t i, j;\n  if (!self->ptr) \n    return NULL;\n  if (!PyArg_ParseTuple(args, \"ii\", &i, &j))\n    return NULL;\n\n  if (!check_constraints(self, i, false) || !check_constraints(self, j, false)) {\n    return NULL;\n  }\n\n  double d = self->ptr->get_distance(i,j);\n  return PyFloat_FromDouble(d);\n}\n\n\nstatic PyObject *\npy_an_get_n_items(py_annoy *self) {\n  if (!self->ptr) \n    return NULL;\n\n  int32_t n = self->ptr->get_n_items();\n  return PyInt_FromLong(n);\n}\n\nstatic PyObject *\npy_an_get_n_trees(py_annoy *self) {\n  if (!self->ptr) \n    return NULL;\n\n  int32_t n = self->ptr->get_n_trees();\n  return PyInt_FromLong(n);\n}\n\nstatic PyObject *\npy_an_verbose(py_annoy *self, PyObject *args) {\n  int verbose;\n  if (!self->ptr) \n    return NULL;\n  if (!PyArg_ParseTuple(args, \"i\", &verbose))\n    return NULL;\n\n  self->ptr->verbose((bool)verbose);\n\n  Py_RETURN_TRUE;\n}\n\n\nstatic PyObject *\npy_an_set_seed(py_annoy *self, PyObject *args) {\n  int q;\n 
 if (!self->ptr)\n    return NULL;\n  if (!PyArg_ParseTuple(args, \"i\", &q))\n    return NULL;\n\n  self->ptr->set_seed(q);\n\n  Py_RETURN_NONE;\n}\n\n\nstatic PyMethodDef AnnoyMethods[] = {\n  {\"load\",\t(PyCFunction)py_an_load, METH_VARARGS | METH_KEYWORDS, \"Loads (mmaps) an index from disk.\"},\n  {\"save\",\t(PyCFunction)py_an_save, METH_VARARGS | METH_KEYWORDS, \"Saves the index to disk.\"},\n  {\"get_nns_by_item\",(PyCFunction)py_an_get_nns_by_item, METH_VARARGS | METH_KEYWORDS, \"Returns the `n` closest items to item `i`.\\n\\n:param search_k: the query will inspect up to `search_k` nodes.\\n`search_k` gives you a run-time tradeoff between better accuracy and speed.\\n`search_k` defaults to `n_trees * n` if not provided.\\n\\n:param include_distances: If `True`, this function will return a\\n2 element tuple of lists. The first list contains the `n` closest items.\\nThe second list contains the corresponding distances.\"},\n  {\"get_nns_by_vector\",(PyCFunction)py_an_get_nns_by_vector, METH_VARARGS | METH_KEYWORDS, \"Returns the `n` closest items to vector `vector`.\\n\\n:param search_k: the query will inspect up to `search_k` nodes.\\n`search_k` gives you a run-time tradeoff between better accuracy and speed.\\n`search_k` defaults to `n_trees * n` if not provided.\\n\\n:param include_distances: If `True`, this function will return a\\n2 element tuple of lists. 
The first list contains the `n` closest items.\\nThe second list contains the corresponding distances.\"},\n  {\"get_item_vector\",(PyCFunction)py_an_get_item_vector, METH_VARARGS, \"Returns the vector for item `i` that was previously added.\"},\n  {\"add_item\",(PyCFunction)py_an_add_item, METH_VARARGS | METH_KEYWORDS, \"Adds item `i` (any nonnegative integer) with vector `v`.\\n\\nNote that it will allocate memory for `max(i)+1` items.\"},\n  {\"on_disk_build\",(PyCFunction)py_an_on_disk_build, METH_VARARGS | METH_KEYWORDS, \"Build will be performed with storage on disk instead of RAM.\"},\n  {\"build\",(PyCFunction)py_an_build, METH_VARARGS | METH_KEYWORDS, \"Builds a forest of `n_trees` trees.\\n\\nMore trees give higher precision when querying. After calling `build`,\\nno more items can be added. `n_jobs` specifies the number of threads used to build the trees. `n_jobs=-1` uses all available CPU cores.\"},\n  {\"unbuild\",(PyCFunction)py_an_unbuild, METH_NOARGS, \"Unbuilds the tree in order to allows adding new items.\\n\\nbuild() has to be called again afterwards in order to\\nrun queries.\"},\n  {\"unload\",(PyCFunction)py_an_unload, METH_NOARGS, \"Unloads an index from disk.\"},\n  {\"get_distance\",(PyCFunction)py_an_get_distance, METH_VARARGS, \"Returns the distance between items `i` and `j`.\"},\n  {\"get_n_items\",(PyCFunction)py_an_get_n_items, METH_NOARGS, \"Returns the number of items in the index.\"},\n  {\"get_n_trees\",(PyCFunction)py_an_get_n_trees, METH_NOARGS, \"Returns the number of trees in the index.\"},\n  {\"verbose\",(PyCFunction)py_an_verbose, METH_VARARGS, \"\"},\n  {\"set_seed\",(PyCFunction)py_an_set_seed, METH_VARARGS, \"Sets the seed of Annoy's random number generator.\"},\n  {NULL, NULL, 0, NULL}\t\t /* Sentinel */\n};\n\n\nstatic PyTypeObject PyAnnoyType = {\n  PyVarObject_HEAD_INIT(NULL, 0)\n  \"annoy.Annoy\",          /*tp_name*/\n  sizeof(py_annoy),       /*tp_basicsize*/\n  0,                      /*tp_itemsize*/\n  
(destructor)py_an_dealloc, /*tp_dealloc*/\n  0,                      /*tp_print*/\n  0,                      /*tp_getattr*/\n  0,                      /*tp_setattr*/\n  0,                      /*tp_compare*/\n  0,                      /*tp_repr*/\n  0,                      /*tp_as_number*/\n  0,                      /*tp_as_sequence*/\n  0,                      /*tp_as_mapping*/\n  0,                      /*tp_hash */\n  0,                      /*tp_call*/\n  0,                      /*tp_str*/\n  0,                      /*tp_getattro*/\n  0,                      /*tp_setattro*/\n  0,                      /*tp_as_buffer*/\n  Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/\n  ANNOY_DOC,              /* tp_doc */\n  0,                      /* tp_traverse */\n  0,                      /* tp_clear */\n  0,                      /* tp_richcompare */\n  0,                      /* tp_weaklistoffset */\n  0,                      /* tp_iter */\n  0,                      /* tp_iternext */\n  AnnoyMethods,           /* tp_methods */\n  py_annoy_members,       /* tp_members */\n  0,                      /* tp_getset */\n  0,                      /* tp_base */\n  0,                      /* tp_dict */\n  0,                      /* tp_descr_get */\n  0,                      /* tp_descr_set */\n  0,                      /* tp_dictoffset */\n  (initproc)py_an_init,   /* tp_init */\n  0,                      /* tp_alloc */\n  py_an_new,              /* tp_new */\n};\n\nstatic PyMethodDef module_methods[] = {\n  {NULL}\t/* Sentinel */\n};\n\n#if PY_MAJOR_VERSION >= 3\n  static struct PyModuleDef moduledef = {\n    PyModuleDef_HEAD_INIT,\n    \"annoylib\",          /* m_name */\n    ANNOY_DOC,           /* m_doc */\n    -1,                  /* m_size */\n    module_methods,      /* m_methods */\n    NULL,                /* m_reload */\n    NULL,                /* m_traverse */\n    NULL,                /* m_clear */\n    NULL,                /* m_free */\n  };\n#endif\n\nPyObject 
*create_module(void) {\n  PyObject *m;\n\n  if (PyType_Ready(&PyAnnoyType) < 0)\n    return NULL;\n\n#if PY_MAJOR_VERSION >= 3\n  m = PyModule_Create(&moduledef);\n#else\n  m = Py_InitModule(\"annoylib\", module_methods);\n#endif\n\n  if (m == NULL)\n    return NULL;\n\n  Py_INCREF(&PyAnnoyType);\n  PyModule_AddObject(m, \"Annoy\", (PyObject *)&PyAnnoyType);\n  return m;\n}\n\n#if PY_MAJOR_VERSION >= 3\n  PyMODINIT_FUNC PyInit_annoylib(void) {\n    return create_module();      // it should return the module object in py3\n  }\n#else\n  PyMODINIT_FUNC initannoylib(void) {\n    create_module();\n  }\n#endif\n\n\n// vim: tabstop=2 shiftwidth=2\n"
  },
  {
    "path": "src/kissrandom.h",
    "content": "#ifndef ANNOY_KISSRANDOM_H\n#define ANNOY_KISSRANDOM_H\n\n#if defined(_MSC_VER) && _MSC_VER == 1500\ntypedef unsigned __int32    uint32_t;\ntypedef unsigned __int64    uint64_t;\n#else\n#include <stdint.h>\n#endif\n\nnamespace Annoy {\n\n// KISS = \"keep it simple, stupid\", but high quality random number generator\n// http://www0.cs.ucl.ac.uk/staff/d.jones/GoodPracticeRNG.pdf -> \"Use a good RNG and build it into your code\"\n// http://mathforum.org/kb/message.jspa?messageID=6627731\n// https://de.wikipedia.org/wiki/KISS_(Zufallszahlengenerator)\n\n// 32 bit KISS\nstruct Kiss32Random {\n  uint32_t x;\n  uint32_t y;\n  uint32_t z;\n  uint32_t c;\n\n  static const uint32_t default_seed = 123456789;\n#if __cplusplus < 201103L\n  typedef uint32_t seed_type;\n#endif\n\n  // seed must be != 0\n  Kiss32Random(uint32_t seed = default_seed) {\n    x = seed;\n    y = 362436000;\n    z = 521288629;\n    c = 7654321;\n  }\n\n  uint32_t kiss() {\n    // Linear congruence generator\n    x = 69069 * x + 12345;\n\n    // Xor shift\n    y ^= y << 13;\n    y ^= y >> 17;\n    y ^= y << 5;\n\n    // Multiply-with-carry\n    uint64_t t = 698769069ULL * z + c;\n    c = t >> 32;\n    z = (uint32_t) t;\n\n    return x + y + z;\n  }\n  inline int flip() {\n    // Draw random 0 or 1\n    return kiss() & 1;\n  }\n  inline size_t index(size_t n) {\n    // Draw random integer between 0 and n-1 where n is at most the number of data points you have\n    return kiss() % n;\n  }\n  inline void set_seed(uint32_t seed) {\n    x = seed;\n  }\n};\n\n// 64 bit KISS. 
Use this if you have more than about 2^24 data points (\"big data\" ;) )\nstruct Kiss64Random {\n  uint64_t x;\n  uint64_t y;\n  uint64_t z;\n  uint64_t c;\n\n  static const uint64_t default_seed = 1234567890987654321ULL;\n#if __cplusplus < 201103L\n  typedef uint64_t seed_type;\n#endif\n\n  // seed must be != 0\n  Kiss64Random(uint64_t seed = default_seed) {\n    x = seed;\n    y = 362436362436362436ULL;\n    z = 1066149217761810ULL;\n    c = 123456123456123456ULL;\n  }\n\n  uint64_t kiss() {\n    // Linear congruence generator\n    z = 6906969069LL*z+1234567;\n\n    // Xor shift\n    y ^= (y<<13);\n    y ^= (y>>17);\n    y ^= (y<<43);\n\n    // Multiply-with-carry (uint128_t t = (2^58 + 1) * x + c; c = t >> 64; x = (uint64_t) t)\n    uint64_t t = (x<<58)+c;\n    c = (x>>6);\n    x += t;\n    c += (x<t);\n\n    return x + y + z;\n  }\n  inline int flip() {\n    // Draw random 0 or 1\n    return kiss() & 1;\n  }\n  inline size_t index(size_t n) {\n    // Draw random integer between 0 and n-1 where n is at most the number of data points you have\n    return kiss() % n;\n  }\n  inline void set_seed(uint64_t seed) {\n    x = seed;\n  }\n};\n\n}\n\n#endif\n// vim: tabstop=2 shiftwidth=2\n"
  },
  {
    "path": "src/mman.h",
    "content": "\n// This is from https://code.google.com/p/mman-win32/\n// \n// Licensed under MIT\n\n#ifndef _MMAN_WIN32_H\n#define _MMAN_WIN32_H\n\n#ifndef _WIN32_WINNT\t\t// Allow use of features specific to Windows XP or later.                   \n#define _WIN32_WINNT 0x0501\t// Change this to the appropriate value to target other versions of Windows.\n#endif\t\t\t\t\t\t\n\n#include <sys/types.h>\n#include <windows.h>\n#include <errno.h>\n#include <io.h>\n\n#define PROT_NONE       0\n#define PROT_READ       1\n#define PROT_WRITE      2\n#define PROT_EXEC       4\n\n#define MAP_FILE        0\n#define MAP_SHARED      1\n#define MAP_PRIVATE     2\n#define MAP_TYPE        0xf\n#define MAP_FIXED       0x10\n#define MAP_ANONYMOUS   0x20\n#define MAP_ANON        MAP_ANONYMOUS\n\n#define MAP_FAILED      ((void *)-1)\n\n/* Flags for msync. */\n#define MS_ASYNC        1\n#define MS_SYNC         2\n#define MS_INVALIDATE   4\n\n#ifndef FILE_MAP_EXECUTE\n#define FILE_MAP_EXECUTE    0x0020\n#endif\n\nstatic int __map_mman_error(const DWORD err, const int deferr)\n{\n    if (err == 0)\n        return 0;\n    //TODO: implement\n    return err;\n}\n\nstatic DWORD __map_mmap_prot_page(const int prot)\n{\n    DWORD protect = 0;\n    \n    if (prot == PROT_NONE)\n        return protect;\n        \n    if ((prot & PROT_EXEC) != 0)\n    {\n        protect = ((prot & PROT_WRITE) != 0) ? 
\n                    PAGE_EXECUTE_READWRITE : PAGE_EXECUTE_READ;\n    }\n    else\n    {\n        protect = ((prot & PROT_WRITE) != 0) ?\n                    PAGE_READWRITE : PAGE_READONLY;\n    }\n    \n    return protect;\n}\n\nstatic DWORD __map_mmap_prot_file(const int prot)\n{\n    DWORD desiredAccess = 0;\n    \n    if (prot == PROT_NONE)\n        return desiredAccess;\n        \n    if ((prot & PROT_READ) != 0)\n        desiredAccess |= FILE_MAP_READ;\n    if ((prot & PROT_WRITE) != 0)\n        desiredAccess |= FILE_MAP_WRITE;\n    if ((prot & PROT_EXEC) != 0)\n        desiredAccess |= FILE_MAP_EXECUTE;\n    \n    return desiredAccess;\n}\n\ninline void* mmap(void *addr, size_t len, int prot, int flags, int fildes, off_t off)\n{\n    HANDLE fm, h;\n    \n    void * map = MAP_FAILED;\n    \n#ifdef _MSC_VER\n#pragma warning(push)\n#pragma warning(disable: 4293)\n#endif\n\n    const DWORD dwFileOffsetLow = (sizeof(off_t) <= sizeof(DWORD)) ? \n                    (DWORD)off : (DWORD)(off & 0xFFFFFFFFL);\n    const DWORD dwFileOffsetHigh = (sizeof(off_t) <= sizeof(DWORD)) ?\n                    (DWORD)0 : (DWORD)((off >> 32) & 0xFFFFFFFFL);\n    const DWORD protect = __map_mmap_prot_page(prot);\n    const DWORD desiredAccess = __map_mmap_prot_file(prot);\n\n    const off_t maxSize = off + (off_t)len;\n\n    const DWORD dwMaxSizeLow = (sizeof(off_t) <= sizeof(DWORD)) ? \n                    (DWORD)maxSize : (DWORD)(maxSize & 0xFFFFFFFFL);\n    const DWORD dwMaxSizeHigh = (sizeof(off_t) <= sizeof(DWORD)) ?\n                    (DWORD)0 : (DWORD)((maxSize >> 32) & 0xFFFFFFFFL);\n\n#ifdef _MSC_VER\n#pragma warning(pop)\n#endif\n\n    errno = 0;\n    \n    if (len == 0 \n        /* Unsupported flag combinations */\n        || (flags & MAP_FIXED) != 0\n        /* Unsupported protection combinations */\n        || prot == PROT_EXEC)\n    {\n        errno = EINVAL;\n        return MAP_FAILED;\n    }\n    \n    h = ((flags & MAP_ANONYMOUS) == 0) ? 
\n                    (HANDLE)_get_osfhandle(fildes) : INVALID_HANDLE_VALUE;\n\n    if ((flags & MAP_ANONYMOUS) == 0 && h == INVALID_HANDLE_VALUE)\n    {\n        errno = EBADF;\n        return MAP_FAILED;\n    }\n\n    fm = CreateFileMapping(h, NULL, protect, dwMaxSizeHigh, dwMaxSizeLow, NULL);\n\n    if (fm == NULL)\n    {\n        errno = __map_mman_error(GetLastError(), EPERM);\n        return MAP_FAILED;\n    }\n  \n    map = MapViewOfFile(fm, desiredAccess, dwFileOffsetHigh, dwFileOffsetLow, len);\n\n    CloseHandle(fm);\n  \n    if (map == NULL)\n    {\n        errno = __map_mman_error(GetLastError(), EPERM);\n        return MAP_FAILED;\n    }\n\n    return map;\n}\n\ninline int munmap(void *addr, size_t len)\n{\n    if (UnmapViewOfFile(addr))\n        return 0;\n        \n    errno =  __map_mman_error(GetLastError(), EPERM);\n    \n    return -1;\n}\n\ninline int mprotect(void *addr, size_t len, int prot)\n{\n    DWORD newProtect = __map_mmap_prot_page(prot);\n    DWORD oldProtect = 0;\n    \n    if (VirtualProtect(addr, len, newProtect, &oldProtect))\n        return 0;\n    \n    errno =  __map_mman_error(GetLastError(), EPERM);\n    \n    return -1;\n}\n\ninline int msync(void *addr, size_t len, int flags)\n{\n    if (FlushViewOfFile(addr, len))\n        return 0;\n    \n    errno =  __map_mman_error(GetLastError(), EPERM);\n    \n    return -1;\n}\n\ninline int mlock(const void *addr, size_t len)\n{\n    if (VirtualLock((LPVOID)addr, len))\n        return 0;\n        \n    errno =  __map_mman_error(GetLastError(), EPERM);\n    \n    return -1;\n}\n\ninline int munlock(const void *addr, size_t len)\n{\n    if (VirtualUnlock((LPVOID)addr, len))\n        return 0;\n        \n    errno =  __map_mman_error(GetLastError(), EPERM);\n    \n    return -1;\n}\n\n#if !defined(__MINGW32__)\ninline int ftruncate(const int fd, const int64_t size) {\n    if (fd < 0) {\n        errno = EBADF;\n        return -1;\n    }\n\n    HANDLE h = 
reinterpret_cast<HANDLE>(_get_osfhandle(fd));\n    LARGE_INTEGER li_start, li_size;\n    li_start.QuadPart = static_cast<int64_t>(0);\n    li_size.QuadPart = size;\n    if (SetFilePointerEx(h, li_start, NULL, FILE_CURRENT) == ~0 ||\n        SetFilePointerEx(h, li_size, NULL, FILE_BEGIN) == ~0 ||\n        !SetEndOfFile(h)) {\n        unsigned long error = GetLastError();\n        fprintf(stderr, \"I/O error while truncating: %lu\\n\", error);\n        switch (error) {\n            case ERROR_INVALID_HANDLE:\n                errno = EBADF;\n                break;\n            default:\n                errno = EIO;\n                break;\n        }\n        return -1;\n    }        \n    return 0;\n}\n#endif\n\n#endif \n"
  },
  {
    "path": "test/accuracy_test.py",
    "content": "# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance with the License. You may obtain a copy of\n# the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n# License for the specific language governing permissions and limitations under\n# the License.\n\nfrom __future__ import print_function\n\nimport os\n\nimport h5py\n\nfrom annoy import AnnoyIndex\n\ntry:\n    from urllib import urlretrieve\nexcept ImportError:\n    from urllib.request import urlretrieve  # Python 3\n\n\ndef _get_index(dataset, custom_distance=None, custom_dim=None):\n    url = 'http://ann-benchmarks.com/%s.hdf5' % dataset\n    vectors_fn = os.path.join(\"test\", dataset + \".hdf5\")\n    index_fn = os.path.join(\"test\", dataset + \".annoy\")\n\n    if not os.path.exists(vectors_fn):\n        print(\"downloading\", url, \"->\", vectors_fn)\n        urlretrieve(url, vectors_fn)\n\n    dataset_f = h5py.File(vectors_fn, \"r\")\n    distance = dataset_f.attrs[\"distance\"]\n    if custom_distance is not None:\n         distance = custom_distance\n    f = dataset_f[\"train\"].shape[1]\n    if custom_dim:\n         f = custom_dim\n    if custom_distance:\n        dataset = dataset.rsplit('-', 2)[0] + \"-%d-%s\" % (f, custom_distance)\n        index_fn = os.path.join('test', dataset + '.annoy')\n\n\n    annoy = AnnoyIndex(f, distance)\n\n    if not os.path.exists(index_fn):\n        print(\"adding items\", distance, f)\n        for i, v in enumerate(dataset_f[\"train\"]):\n            if len(v) > f:\n                v = v[:f]\n            annoy.add_item(i, v)\n\n        print(\"building index\")\n        annoy.build(10)\n        annoy.save(index_fn)\n    
else:\n        annoy.load(index_fn)\n    return annoy, dataset_f, dataset\n\n\ndef _test_index(dataset, exp_accuracy, custom_metric=None, custom_dim=None):\n    annoy, dataset_f, dataset = _get_index(dataset, custom_metric, custom_dim)\n\n    n, k = 0, 0\n\n    for i, v in enumerate(dataset_f[\"test\"]):\n        if custom_dim:\n            v = v[:custom_dim]\n        js_fast = annoy.get_nns_by_vector(v, 10, 10000)\n        js_real = dataset_f[\"neighbors\"][i][:10]\n        assert len(js_fast) == 10\n        assert len(js_real) == 10\n\n        n += 10\n        k += len(set(js_fast).intersection(js_real))\n\n    accuracy = 100.0 * k / n\n    print(\n        \"%50s accuracy: %5.2f%% (expected %5.2f%%)\" % (dataset, accuracy, exp_accuracy)\n    )\n\n\n    assert accuracy > exp_accuracy - 1.0  # should be within 1%\n\n\ndef test_glove_25():\n    _test_index(\"glove-25-angular\", 69.00)\n\n\ndef test_nytimes_16():\n    _test_index(\"nytimes-16-angular\", 80.00)\n\n\ndef test_lastfm_dot():\n    _test_index('lastfm-64-dot', 60.00, 'dot', 64)\n\n\ndef test_lastfm_angular():\n    _test_index('lastfm-64-dot', 60.00, 'angular', 65)\n"
  },
  {
    "path": "test/angular_index_test.py",
    "content": "# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance with the License. You may obtain a copy of\n# the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n# License for the specific language governing permissions and limitations under\n# the License.\n\nimport random\n\nimport numpy\nimport pytest\n\nfrom annoy import AnnoyIndex\n\n\ndef test_get_nns_by_vector():\n    f = 3\n    i = AnnoyIndex(f, \"angular\")\n    i.add_item(0, [0, 0, 1])\n    i.add_item(1, [0, 1, 0])\n    i.add_item(2, [1, 0, 0])\n    i.build(10)\n\n    assert i.get_nns_by_vector([3, 2, 1], 3) == [2, 1, 0]\n    assert i.get_nns_by_vector([1, 2, 3], 3) == [0, 1, 2]\n    assert i.get_nns_by_vector([2, 0, 1], 3) == [2, 0, 1]\n\n\ndef test_get_nns_by_item():\n    f = 3\n    i = AnnoyIndex(f, \"angular\")\n    i.add_item(0, [2, 1, 0])\n    i.add_item(1, [1, 2, 0])\n    i.add_item(2, [0, 0, 1])\n    i.build(10)\n\n    assert i.get_nns_by_item(0, 3) == [0, 1, 2]\n    assert i.get_nns_by_item(1, 3) == [1, 0, 2]\n    assert i.get_nns_by_item(2, 3) in [[2, 0, 1], [2, 1, 0]]  # could be either\n\n\ndef test_dist():\n    f = 2\n    i = AnnoyIndex(f, \"angular\")\n    i.add_item(0, [0, 1])\n    i.add_item(1, [1, 1])\n\n    assert i.get_distance(0, 1) == pytest.approx((2 * (1.0 - 2**-0.5)) ** 0.5)\n\n\ndef test_dist_2():\n    f = 2\n    i = AnnoyIndex(f, \"angular\")\n    i.add_item(0, [1000, 0])\n    i.add_item(1, [10, 0])\n\n    assert i.get_distance(0, 1) == pytest.approx(0)\n\n\ndef test_dist_3():\n    f = 2\n    i = AnnoyIndex(f, \"angular\")\n    i.add_item(0, [97, 0])\n    i.add_item(1, [42, 42])\n\n    dist = ((1 - 2**-0.5) ** 2 + (2**-0.5) ** 2) ** 
0.5\n\n    assert i.get_distance(0, 1) == pytest.approx(dist)\n\n\ndef test_dist_degen():\n    f = 2\n    i = AnnoyIndex(f, \"angular\")\n    i.add_item(0, [1, 0])\n    i.add_item(1, [0, 0])\n\n    assert i.get_distance(0, 1) == pytest.approx(2.0**0.5)\n\n\ndef test_large_index():\n    # Generate pairs of random points where the pair is super close\n    f = 10\n    i = AnnoyIndex(f, \"angular\")\n    for j in range(0, 10000, 2):\n        p = [random.gauss(0, 1) for z in range(f)]\n        f1 = random.random() + 1\n        f2 = random.random() + 1\n        x = [f1 * pi + random.gauss(0, 1e-2) for pi in p]\n        y = [f2 * pi + random.gauss(0, 1e-2) for pi in p]\n        i.add_item(j, x)\n        i.add_item(j + 1, y)\n\n    i.build(10)\n    for j in range(0, 10000, 2):\n        assert i.get_nns_by_item(j, 2) == [j, j + 1]\n        assert i.get_nns_by_item(j + 1, 2) == [j + 1, j]\n\n\ndef precision(n, n_trees=10, n_points=10000, n_rounds=10, search_k=100000):\n    found = 0\n    for r in range(n_rounds):\n        # create random points at distance x from (1000, 0, 0, ...)\n        f = 10\n        i = AnnoyIndex(f, \"angular\")\n        for j in range(n_points):\n            p = [random.gauss(0, 1) for z in range(f - 1)]\n            norm = sum([pi**2 for pi in p]) ** 0.5\n            x = [1000] + [pi / norm * j for pi in p]\n            i.add_item(j, x)\n\n        i.build(n_trees)\n\n        nns = i.get_nns_by_vector([1000] + [0] * (f - 1), n, search_k)\n        assert nns == sorted(nns)  # should be in order\n        # The number of gaps should be equal to the last item minus n-1\n        found += len([x for x in nns if x < n])\n\n    return 1.0 * found / (n * n_rounds)\n\n\ndef test_precision_1():\n    assert precision(1) >= 0.98\n\n\ndef test_precision_10():\n    assert precision(10) >= 0.98\n\n\ndef test_precision_100():\n    assert precision(100) >= 0.98\n\n\ndef test_precision_1000():\n    assert precision(1000) >= 0.98\n\n\ndef 
test_load_save_get_item_vector():\n    f = 3\n    i = AnnoyIndex(f, \"angular\")\n    i.add_item(0, [1.1, 2.2, 3.3])\n    i.add_item(1, [4.4, 5.5, 6.6])\n    i.add_item(2, [7.7, 8.8, 9.9])\n\n    numpy.testing.assert_array_almost_equal(i.get_item_vector(0), [1.1, 2.2, 3.3])\n    assert i.build(10)\n    assert i.save(\"blah.ann\")\n    numpy.testing.assert_array_almost_equal(i.get_item_vector(1), [4.4, 5.5, 6.6])\n    j = AnnoyIndex(f, \"angular\")\n    assert j.load(\"blah.ann\")\n    numpy.testing.assert_array_almost_equal(j.get_item_vector(2), [7.7, 8.8, 9.9])\n\n\ndef test_get_nns_search_k():\n    f = 3\n    i = AnnoyIndex(f, \"angular\")\n    i.add_item(0, [0, 0, 1])\n    i.add_item(1, [0, 1, 0])\n    i.add_item(2, [1, 0, 0])\n    i.build(10)\n\n    assert i.get_nns_by_item(0, 3, 10) == [0, 1, 2]\n    assert i.get_nns_by_vector([3, 2, 1], 3, 10) == [2, 1, 0]\n\n\ndef test_include_dists():\n    # Double checking issue 112\n    f = 40\n    i = AnnoyIndex(f, \"angular\")\n    v = numpy.random.normal(size=f)\n    i.add_item(0, v)\n    i.add_item(1, -v)\n    i.build(10)\n\n    indices, dists = i.get_nns_by_item(0, 2, 10, True)\n    assert indices == [0, 1]\n    assert dists[0] == pytest.approx(0.0)\n    assert dists[1] == pytest.approx(2.0)\n\n\ndef test_include_dists_check_ranges():\n    f = 3\n    i = AnnoyIndex(f, \"angular\")\n    for j in range(100000):\n        i.add_item(j, numpy.random.normal(size=f))\n    i.build(10)\n    indices, dists = i.get_nns_by_item(0, 100000, include_distances=True)\n    assert max(dists) <= 2.0\n    assert min(dists) == pytest.approx(0.0)\n\n\ndef test_distance_consistency():\n    n, f = 1000, 3\n    i = AnnoyIndex(f, \"angular\")\n    for j in range(n):\n        while True:\n            v = numpy.random.normal(size=f)\n            if numpy.dot(v, v) > 0.1:\n                break\n        i.add_item(j, v)\n    i.build(10)\n    for a in random.sample(range(n), 100):\n        indices, dists = i.get_nns_by_item(a, 100, 
include_distances=True)\n        for b, dist in zip(indices, dists):\n            u = i.get_item_vector(a)\n            v = i.get_item_vector(b)\n            assert dist == pytest.approx(i.get_distance(a, b), rel=1e-3, abs=1e-3)\n            u_norm = numpy.array(u) * numpy.dot(u, u) ** -0.5\n            v_norm = numpy.array(v) * numpy.dot(v, v) ** -0.5\n            # cos = numpy.clip(1 - cosine(u, v), -1, 1) # scipy returns 1 - cos\n            assert dist**2 == pytest.approx(\n                numpy.dot(u_norm - v_norm, u_norm - v_norm), rel=1e-3, abs=1e-3\n            )\n            # self.assertAlmostEqual(dist, (2*(1 - cos))**0.5)\n            assert dist**2 == pytest.approx(\n                sum([(x - y) ** 2 for x, y in zip(u_norm, v_norm)]),\n                rel=1e-3,\n                abs=1e-3,\n            )\n\n\ndef test_only_one_item():\n    # reported to annoy-user by Kireet Reddy\n    idx = AnnoyIndex(100, \"angular\")\n    idx.add_item(0, numpy.random.randn(100))\n    idx.build(n_trees=10)\n    idx.save(\"foo.idx\")\n    idx = AnnoyIndex(100, \"angular\")\n    idx.load(\"foo.idx\")\n    assert idx.get_n_items() == 1\n    assert idx.get_nns_by_vector(\n        vector=numpy.random.randn(100), n=50, include_distances=False\n    ) == [0]\n\n\ndef test_no_items():\n    idx = AnnoyIndex(100, \"angular\")\n    idx.build(n_trees=10)\n    idx.save(\"foo.idx\")\n    idx = AnnoyIndex(100, \"angular\")\n    idx.load(\"foo.idx\")\n    assert idx.get_n_items() == 0\n    assert (\n        idx.get_nns_by_vector(\n            vector=numpy.random.randn(100), n=50, include_distances=False\n        )\n        == []\n    )\n\n\ndef test_single_vector():\n    # https://github.com/spotify/annoy/issues/194\n    a = AnnoyIndex(3, \"angular\")\n    a.add_item(0, [1, 0, 0])\n    a.build(10)\n    a.save(\"1.ann\")\n    indices, dists = a.get_nns_by_vector([1, 0, 0], 3, include_distances=True)\n    assert indices == [0]\n    assert dists[0] ** 2 == pytest.approx(0.0)\n"
  },
  {
    "path": "test/annoy_test.go",
    "content": "/*\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance with the License. You may obtain a copy of\n# the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n# License for the specific language governing permissions and limitations under\n# the License.\n*/\n\npackage annoy_test\n\nimport (\n\t\"math\"\n\t\"math/rand\"\n\t\"os\"\n\t\"testing\"\n\n\t\"github.com/spotify/annoy\"\n\t\"github.com/stretchr/testify/assert\"\n\t\"github.com/stretchr/testify/require\"\n\t\"github.com/stretchr/testify/suite\"\n)\n\ntype AnnoyTestSuite struct {\n\tsuite.Suite\n}\n\nfunc Round(f float64) float64 {\n\treturn math.Floor(f + 0.5)\n}\n\nfunc RoundPlus(f float64, places int) float64 {\n\tshift := math.Pow(10, float64(places))\n\treturn Round(f*shift) / shift\n}\n\nfunc (suite *AnnoyTestSuite) SetupTest() {\n}\n\nfunc (suite *AnnoyTestSuite) TestFileHandling() {\n\tindex := annoy.NewAnnoyIndexAngular(3)\n\tindex.AddItem(0, []float32{0, 0, 1})\n\tindex.AddItem(1, []float32{0, 1, 0})\n\tindex.AddItem(2, []float32{1, 0, 0})\n\tindex.Build(10)\n\n\tindex.Save(\"go_test.ann\")\n\n\tinfo, err := os.Stat(\"go_test.ann\")\n\tif err != nil {\n\t\tassert.Fail(suite.T(), \"Failed to create file, file not found\")\n\t}\n\tif info.Size() == 0 {\n\t\tassert.Fail(suite.T(), \"Failed to create file, file size zero\")\n\t}\n\n\tannoy.DeleteAnnoyIndexAngular(index)\n\n\tindex = annoy.NewAnnoyIndexAngular(3)\n\tif ret := index.Load(\"go_test.ann\"); ret == false {\n\t\tassert.Fail(suite.T(), \"Failed to load file\")\n\t}\n\n\tos.Remove(\"go_test.ann\")\n\tindex.Save(\"go_test2.ann\", false)\n\n\tinfo, err = os.Stat(\"go_test2.ann\")\n\tif err != nil {\n\t\tassert.Fail(suite.T(), 
\"Failed to create file without prefault, file not found\")\n\t}\n\tif info.Size() == 0 {\n\t\tassert.Fail(suite.T(), \"Failed to create file without prefault, file size zero\")\n\t}\n\n\tannoy.DeleteAnnoyIndexAngular(index)\n\n\tindex = annoy.NewAnnoyIndexAngular(3)\n\tif ret := index.Load(\"go_test2.ann\", false); ret == false {\n\t\tassert.Fail(suite.T(), \"Failed to load file without prefault\")\n\t}\n\n\tos.Remove(\"go_test2.ann\")\n\tindex.Save(\"go_test3.ann\", true)\n\n\tinfo, err = os.Stat(\"go_test3.ann\")\n\tif err != nil {\n\t\tassert.Fail(suite.T(), \"Failed to create file allowing prefault, file not found\")\n\t}\n\tif info.Size() == 0 {\n\t\tassert.Fail(suite.T(), \"Failed to create file allowing prefault, file size zero\")\n\t}\n\n\tannoy.DeleteAnnoyIndexAngular(index)\n\n\tindex = annoy.NewAnnoyIndexAngular(3)\n\tif ret := index.Load(\"go_test3.ann\", true); ret == false {\n\t\tassert.Fail(suite.T(), \"Failed to load file allowing prefault\")\n\t}\n\tannoy.DeleteAnnoyIndexAngular(index)\n\n\tos.Remove(\"go_test3.ann\")\n}\n\nfunc (suite *AnnoyTestSuite) TestOnDiskBuild() {\n\tindex := annoy.NewAnnoyIndexAngular(3)\n\tindex.OnDiskBuild(\"go_test.ann\")\n\n\tinfo, err := os.Stat(\"go_test.ann\")\n\tif err != nil {\n\t\tassert.Fail(suite.T(), \"Failed to create file, file not found\")\n\t}\n\tif info.Size() == 0 {\n\t\tassert.Fail(suite.T(), \"Failed to create file, file size zero\")\n\t}\n\n\tindex.AddItem(0, []float32{0, 0, 1})\n\tindex.AddItem(1, []float32{0, 1, 0})\n\tindex.AddItem(2, []float32{1, 0, 0})\n\tindex.Build(10)\n\n\tindex.Unload()\n\tindex.Load(\"go_test.ann\")\n\n\tresult := annoy.NewAnnoyVectorInt()\n\tdefer result.Free()\n\n\tindex.GetNnsByVector([]float32{3, 2, 1}, 3, -1, result)\n\tassert.Equal(suite.T(), []int32{2, 1, 0}, result.ToSlice())\n\n\tindex.GetNnsByVector([]float32{1, 2, 3}, 3, -1, result)\n\tassert.Equal(suite.T(), []int32{0, 1, 2}, result.ToSlice())\n\n\tindex.GetNnsByVector([]float32{2, 0, 1}, 3, -1, 
result)\n\tassert.Equal(suite.T(), []int32{2, 0, 1}, result.ToSlice())\n\n\tannoy.DeleteAnnoyIndexAngular(index)\n\n\tos.Remove(\"go_test.ann\")\n}\n\nfunc (suite *AnnoyTestSuite) TestGetNnsByVector() {\n\tt := suite.T()\n\tindex := annoy.NewAnnoyIndexAngular(3)\n\tindex.AddItem(0, []float32{0, 0, 1})\n\tindex.AddItem(1, []float32{0, 1, 0})\n\tindex.AddItem(2, []float32{1, 0, 0})\n\tindex.Build(10)\n\n\tt.Run(\"regular\", func(t *testing.T) {\n\t\tresult := annoy.NewAnnoyVectorInt()\n\t\tdefer result.Free()\n\n\t\tindex.GetNnsByVector([]float32{3, 2, 1}, 3, -1, result)\n\t\tassert.Equal(t, []int32{2, 1, 0}, result.ToSlice())\n\n\t\tindex.GetNnsByVector([]float32{1, 2, 3}, 3, -1, result)\n\t\tassert.Equal(t, []int32{0, 1, 2}, result.ToSlice())\n\n\t\tindex.GetNnsByVector([]float32{2, 0, 1}, 3, -1, result)\n\t\tassert.Equal(t, []int32{2, 0, 1}, result.ToSlice())\n\t})\n\n\tt.Run(\"with copying\", func(t *testing.T) {\n\t\tresult := annoy.NewAnnoyVectorInt()\n\t\tdefer result.Free()\n\n\t\tvar notAllocated []int32\n\t\tindex.GetNnsByVector([]float32{3, 2, 1}, 3, -1, result)\n\t\tresult.Copy(&notAllocated)\n\t\tassert.Equal(t, []int32{2, 1, 0}, notAllocated)\n\n\t\t// to make sure it will be overwritten\n\t\tvar alreadyAllocated = make([]int32, 10)\n\t\tfor i := 0; i < len(alreadyAllocated); i++ {\n\t\t\talreadyAllocated[i] = -1\n\t\t}\n\t\tindex.GetNnsByVector([]float32{3, 2, 1}, 3, -1, result)\n\t\tresult.Copy(&alreadyAllocated)\n\t\tassert.Equal(t, []int32{2, 1, 0}, alreadyAllocated)\n\n\t\tvar alreadyAllocatedCap = make([]int32, 0, 00)\n\t\tindex.GetNnsByVector([]float32{3, 2, 1}, 3, -1, result)\n\t\tresult.Copy(&alreadyAllocatedCap)\n\t\tassert.Equal(t, []int32{2, 1, 0}, alreadyAllocatedCap)\n\t})\n\n\tt.Run(\"with inner array\", func(t *testing.T) {\n\t\tresult := annoy.NewAnnoyVectorInt()\n\t\tdefer result.Free()\n\n\t\tindex.GetNnsByVector([]float32{3, 2, 1}, 3, -1, result)\n\t\tassert.Equal(t, []int32{2, 1, 0}, 
result.InnerArray())\n\t})\n\n\tannoy.DeleteAnnoyIndexAngular(index)\n}\n\nfunc (suite *AnnoyTestSuite) TestGetNnsByItem() {\n\tindex := annoy.NewAnnoyIndexAngular(3)\n\tindex.AddItem(0, []float32{2, 1, 0})\n\tindex.AddItem(1, []float32{1, 2, 0})\n\tindex.AddItem(2, []float32{0, 0, 1})\n\tindex.Build(10)\n\n\tvar result = annoy.NewAnnoyVectorInt()\n\tdefer result.Free()\n\n\tindex.GetNnsByItem(0, 3, -1, result)\n\tassert.Equal(suite.T(), []int32{0, 1, 2}, result.ToSlice())\n\n\tindex.GetNnsByItem(1, 3, -1, result)\n\tassert.Equal(suite.T(), []int32{1, 0, 2}, result.ToSlice())\n\n\tannoy.DeleteAnnoyIndexAngular(index)\n}\n\nfunc (suite *AnnoyTestSuite) TestGetItem() {\n\tindex := annoy.NewAnnoyIndexAngular(3)\n\tindex.AddItem(0, []float32{2, 1, 0})\n\tindex.AddItem(1, []float32{1, 2, 0})\n\tindex.AddItem(2, []float32{0, 0, 1})\n\tindex.Build(10)\n\n\tvar result = annoy.NewAnnoyVectorFloat()\n\tdefer result.Free()\n\n\tindex.GetItem(0, result)\n\tassert.Equal(suite.T(), []float32{2, 1, 0}, result.ToSlice())\n\n\tindex.GetItem(1, result)\n\tassert.Equal(suite.T(), []float32{1, 2, 0}, result.ToSlice())\n\n\tindex.GetItem(2, result)\n\tassert.Equal(suite.T(), []float32{0, 0, 1}, result.ToSlice())\n\n\tannoy.DeleteAnnoyIndexAngular(index)\n}\n\nfunc (suite *AnnoyTestSuite) TestGetDistance() {\n\tindex := annoy.NewAnnoyIndexAngular(2)\n\tindex.AddItem(0, []float32{0, 1})\n\tindex.AddItem(1, []float32{1, 1})\n\tindex.Build(10)\n\n\tassert.Equal(suite.T(), RoundPlus(math.Pow(2*(1.0-math.Pow(2, -0.5)), 0.5), 3), RoundPlus(float64(index.GetDistance(0, 1)), 3))\n\n\tannoy.DeleteAnnoyIndexAngular(index)\n}\n\nfunc (suite *AnnoyTestSuite) TestGetDotProductDistance() {\n\tindex := annoy.NewAnnoyIndexDotProduct(2)\n\tindex.AddItem(0, []float32{0, 1})\n\tindex.AddItem(1, []float32{1, 1})\n\tindex.Build(10)\n\n\tassert.True(suite.T(),\n\t\tmath.Abs(1.0-float64(index.GetDistance(0, 1))) < 0.00001)\n\n\tannoy.DeleteAnnoyIndexDotProduct(index)\n}\n\nfunc (suite *AnnoyTestSuite) 
TestLargeEuclideanIndex() {\n\tindex := annoy.NewAnnoyIndexEuclidean(10)\n\n\tfor j := 0; j < 10000; j += 2 {\n\t\tp := make([]float32, 0, 10)\n\t\tfor i := 0; i < 10; i++ {\n\t\t\tp = append(p, rand.Float32())\n\t\t}\n\t\tx := make([]float32, 0, 10)\n\t\tfor i := 0; i < 10; i++ {\n\t\t\tx = append(x, 1+p[i]+rand.Float32()*1e-2)\n\t\t}\n\t\ty := make([]float32, 0, 10)\n\t\tfor i := 0; i < 10; i++ {\n\t\t\ty = append(y, 1+p[i]+rand.Float32()*1e-2)\n\t\t}\n\t\tindex.AddItem(j, x)\n\t\tindex.AddItem(j+1, y)\n\t}\n\tindex.Build(10)\n\tresult := annoy.NewAnnoyVectorInt()\n\tdefer result.Free()\n\tfor j := 0; j < 10000; j += 2 {\n\t\tindex.GetNnsByItem(j, 2, -1, result)\n\n\t\trequire.Equal(suite.T(), result.ToSlice(), []int32{int32(j), int32(j + 1)})\n\n\t\tindex.GetNnsByItem(j+1, 2, -1, result)\n\t\trequire.Equal(suite.T(), result.ToSlice(), []int32{int32(j) + 1, int32(j)})\n\t}\n\tannoy.DeleteAnnoyIndexEuclidean(index)\n}\n\nfunc TestAnnoyTestSuite(t *testing.T) {\n\tsuite.Run(t, new(AnnoyTestSuite))\n}\n"
  },
  {
    "path": "test/annoy_test.lua",
    "content": "-- Copyright (c) 2016 Boris Nagaev\n--\n-- Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n-- use this file except in compliance with the License. You may obtain a copy of\n-- the License at\n--\n-- http://www.apache.org/licenses/LICENSE-2.0\n--\n-- Unless required by applicable law or agreed to in writing, software\n-- distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n-- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n-- License for the specific language governing permissions and limitations under\n-- the License.\n\nlocal AnnoyIndex = require 'annoy'.AnnoyIndex\n\nlocal function gauss(mu, sigma)\n  local sum = -6\n  for _ = 1, 12 do\n    sum = sum + math.random()\n  end\n  return mu + sum * sigma\nend\n\nlocal function randomVector(f, mu, sigma)\n  local v = {}\n  for i = 1, f do\n    v[i] = gauss(mu, sigma)\n  end\n  return v\nend\n\nlocal function round(x)\n    return (\"%.3f\"):format(x)\nend\n\nlocal function roundArray(array)\n    local rounded_array = {}\n    for k, v in ipairs(array) do\n        rounded_array[k] = round(v)\n    end\n    return rounded_array\nend\n\nlocal function isSorted(v)\n    for i = 2, #v do\n        if v[i-1] > v[i] then\n            return false\n        end\n    end\n    return true\nend\n\nlocal function max(array)\n    local ans = assert(array[1])\n    for _, v in ipairs(array) do\n        ans = math.max(ans, v)\n    end\n    return ans\nend\n\nlocal function min(array)\n    local ans = assert(array[1])\n    for _, v in ipairs(array) do\n        ans = math.min(ans, v)\n    end\n    return ans\nend\n\nlocal function precision(first1000, n, n_trees, n_points, n_rounds)\n    if not n_trees then\n        n_trees = 10\n    end\n    if not n_points then\n        n_points = 10000\n    end\n    if not n_rounds then\n        n_rounds = 10\n    end\n    local found = 0\n    for _ = 1, n_rounds do\n        local f = 10\n        local p_size\n 
       if first1000 then\n            -- create random points at distance x from (1000, 0, 0, ...)\n            p_size = f - 1\n        else\n            -- create random points at distance x\n            p_size = f\n        end\n        local i = AnnoyIndex(f, 'euclidean')\n        for j = 0, n_points - 1 do\n            local p = randomVector(p_size, 0, 1)\n            local norm\n            do\n                norm = 0\n                for _, pi in ipairs(p) do\n                    norm = norm + pi ^ 2\n                end\n                norm = norm ^ 0.5\n            end\n            local x = {}\n            do\n                if first1000 then\n                    x[1] = 1000\n                end\n                for _, pi in ipairs(p) do\n                    table.insert(x, pi / norm * j)\n                end\n            end\n            i:add_item(j, x)\n        end\n        i:build(n_trees)\n        local v = {}\n        do\n            for k = 1, f do\n                v[k] = 0\n            end\n            if first1000 then\n                v[1] = 1000\n            end\n        end\n        local nns = i:get_nns_by_vector(v, n)\n        assert(isSorted(nns))\n        -- The number of gaps should be equal to the last item minus n-1\n        for _, x in ipairs(nns) do\n            if x < n then\n                found = found + 1\n            end\n        end\n    end\n    return 1.0 * found / (n * n_rounds)\nend\n\ndescribe(\"angular annoy test\", function()\n\n    it(\"get_nns_by_vector\", function()\n        local f = 3\n        local i = AnnoyIndex(f)\n        i:add_item(0, {0, 0, 1})\n        i:add_item(1, {0, 1, 0})\n        i:add_item(2, {1, 0, 0})\n        i:build(10)\n        assert.same({2, 1, 0}, i:get_nns_by_vector({3, 2, 1}, 3))\n        assert.same({0, 1, 2}, i:get_nns_by_vector({1, 2, 3}, 3))\n        assert.same({2, 0, 1}, i:get_nns_by_vector({2, 0, 1}, 3))\n    end)\n\n    it(\"get_nns_by_item\", function()\n        local f = 3\n        
local i = AnnoyIndex(f)\n        i:add_item(0, {2, 1, 0})\n        i:add_item(1, {1, 2, 0})\n        i:add_item(2, {0, 0, 1})\n        i:build(10)\n        assert.same({0, 1, 2}, i:get_nns_by_item(0, 3))\n        assert.same({1, 0, 2}, i:get_nns_by_item(1, 3))\n        do\n            local close_to_2 = i:get_nns_by_item(2, 3)\n            assert.equal(close_to_2[1], 2)\n            assert.truthy(\n                (close_to_2[2] == 0 and close_to_2[3] == 1)\n                or\n                (close_to_2[2] == 1 and close_to_2[3] == 0)\n            )\n        end\n    end)\n\n    it(\"dist\", function()\n        local f = 2\n        local i = AnnoyIndex(f)\n        i:add_item(0, {0, 1})\n        i:add_item(1, {1, 1})\n        assert.equal(round((2 * (1.0 - 2 ^ -0.5)) ^ 0.5), round(i:get_distance(0, 1)))\n    end)\n\n    it(\"dist_2\", function()\n        local f = 2\n        local i = AnnoyIndex(f)\n        i:add_item(0, {1000, 0})\n        i:add_item(1, {10, 0})\n        assert.equal(round(0), round(i:get_distance(0, 1)))\n    end)\n\n    it(\"dist_3\", function()\n        local f = 2\n        local i = AnnoyIndex(f)\n        i:add_item(0, {97, 0})\n        i:add_item(1, {42, 42})\n        local dist = ((1 - 2 ^ -0.5) ^ 2 + (2 ^ -0.5) ^ 2) ^ 0.5\n        assert.equal(round(dist), round(i:get_distance(0, 1)))\n    end)\n\n    it(\"dist_degen\", function()\n        local f = 2\n        local i = AnnoyIndex(f)\n        i:add_item(0, {1, 0})\n        i:add_item(1, {0, 0})\n        assert.equal(round(2.0 ^ 0.5), round(i:get_distance(0, 1)))\n    end)\n\n    it(\"large_index\", function()\n        -- Generate pairs of random points where the pair is super close\n        local f = 10\n        local i = AnnoyIndex(f)\n        for j = 0, 10000 - 1, 2 do\n            local p = randomVector(f, 0, 1)\n            local f1 = math.random() + 1\n            local f2 = math.random() + 1\n            local x = {}\n            local y = {}\n            for k, pi in ipairs(p) do\n  
              x[k] = f1 * pi + gauss(0, 1e-2)\n                y[k] = f2 * pi + gauss(0, 1e-2)\n            end\n            i:add_item(j, x)\n            i:add_item(j+1, y)\n        end\n        i:build(10)\n        for j = 0, 10000 - 1, 2 do\n            assert.same({j, j+1}, i:get_nns_by_item(j, 2))\n            assert.same({j+1, j}, i:get_nns_by_item(j+1, 2))\n        end\n    end)\n\n    it(\"precision_1\", function()\n        assert.truthy(precision(true, 1) >= 0.98)\n    end)\n\n    it(\"precision_10\", function()\n        assert.truthy(precision(true, 10) >= 0.98)\n    end)\n\n    it(\"precision_100\", function()\n        assert.truthy(precision(true, 100) >= 0.98)\n    end)\n\n    it(\"precision_1000\", function()\n        assert.truthy(precision(true, 1000) >= 0.98)\n    end)\n\n    it(\"load_save_get_item_vector\", function()\n        local f = 3\n        local i = AnnoyIndex(f)\n        i:add_item(0, {1.1, 2.2, 3.3})\n        i:add_item(1, {4.4, 5.5, 6.6})\n        i:add_item(2, {7.7, 8.8, 9.9})\n        assert.same(roundArray({1.1, 2.2, 3.3}), roundArray(i:get_item_vector(0)))\n        assert.truthy(i:build(10))\n        assert.truthy(i:save('blah.ann'))\n        assert.same(roundArray({4.4, 5.5, 6.6}), roundArray(i:get_item_vector(1)))\n        local j = AnnoyIndex(f)\n        assert.truthy(j:load('blah.ann'))\n        assert.same(roundArray({7.7, 8.8, 9.9}), roundArray(i:get_item_vector(2)))\n    end)\n\n    it(\"get_nns_search_k\", function()\n        local f = 3\n        local i = AnnoyIndex(f)\n        i:add_item(0, {0, 0, 1})\n        i:add_item(1, {0, 1, 0})\n        i:add_item(2, {1, 0, 0})\n        i:build(10)\n        assert.same({0, 1, 2}, i:get_nns_by_item(0, 3, 10))\n        assert.same({2, 1, 0}, i:get_nns_by_vector({3, 2, 1}, 3, 10))\n    end)\n\n    it(\"include_dists\", function()\n        -- Double checking issue 112\n        local f = 40\n        local i = AnnoyIndex(f)\n        local v = randomVector(f, 0, 1)\n        i:add_item(0, 
v)\n        local neg_v = {}\n        do\n            for k, value in ipairs(v) do\n                neg_v[k] = -value\n            end\n        end\n        i:add_item(1, neg_v)\n        i:build(10)\n        local indices, dists = i:get_nns_by_item(0, 2, 10, true)\n        assert.same({0, 1}, indices)\n        assert.same(roundArray({0.0, 2.0}), roundArray(dists))\n    end)\n\n\n    it(\"include_dists_check_ranges\", function()\n        local f = 3\n        local i = AnnoyIndex(f)\n        for j = 0, 100000 - 1 do\n            i:add_item(j, randomVector(f, 0, 1))\n        end\n        i:build(10)\n        local include_distances = true\n        local _, dists = i:get_nns_by_item(0, 100000, -1, include_distances)\n        assert.truthy(max(dists) < 2.0)\n        assert.equal(round(0.0), round(min(dists)))\n    end)\n\nend)\n\ndescribe(\"euclidean annoy test\", function()\n\n    it(\"get_nns_by_vector\", function()\n        local f = 2\n        local i = AnnoyIndex(f, 'euclidean')\n        i:add_item(0, {2, 2})\n        i:add_item(1, {3, 2})\n        i:add_item(2, {3, 3})\n        i:build(10)\n        assert.same({2, 1, 0}, i:get_nns_by_vector({4, 4}, 3))\n        assert.same({0, 1, 2}, i:get_nns_by_vector({1, 1}, 3))\n        assert.same({1, 2, 0}, i:get_nns_by_vector({4, 2}, 3))\n    end)\n\n    it(\"get_nns_by_item\", function()\n        local f = 2\n        local i = AnnoyIndex(f, 'euclidean')\n        i:add_item(0, {2, 2})\n        i:add_item(1, {3, 2})\n        i:add_item(2, {3, 3})\n        i:build(10)\n        assert.same({0, 1, 2}, i:get_nns_by_item(0, 3))\n        assert.same({2, 1, 0}, i:get_nns_by_item(2, 3))\n    end)\n\n    it(\"dist\", function()\n        local f = 2\n        local i = AnnoyIndex(f, 'euclidean')\n        i:add_item(0, {0, 1})\n        i:add_item(1, {1, 1})\n        assert.equal(round(1.0), round(i:get_distance(0, 1)))\n    end)\n\n    it(\"large_index\", function()\n        -- Generate pairs of random points where the pair is super 
close\n        local f = 10\n        -- local q = randomVector(f, 0, 10)\n        local i = AnnoyIndex(f, 'euclidean')\n        for j = 0, 10000 - 1, 2 do\n            local p = randomVector(f, 0, 1)\n            local x = {}\n            local y = {}\n            for k, pi in ipairs(p) do\n                x[k] = 1 + pi + gauss(0, 1e-2) -- todo: should be q[i]\n                y[k] = 1 + pi + gauss(0, 1e-2)\n            end\n            i:add_item(j, x)\n            i:add_item(j+1, y)\n        end\n        i:build(10)\n        for j = 0, 10000 - 1, 2 do\n            assert.same({j, j+1}, i:get_nns_by_item(j, 2))\n            assert.same({j+1, j}, i:get_nns_by_item(j+1, 2))\n        end\n    end)\n\n    it(\"precision_1\", function()\n        assert.truthy(precision(false, 1) >= 0.98)\n    end)\n\n    it(\"precision_10\", function()\n        assert.truthy(precision(false, 10) >= 0.98)\n    end)\n\n    it(\"precision_100\", function()\n        assert.truthy(precision(false, 100) >= 0.98)\n    end)\n\n    it(\"precision_1000\", function()\n        assert.truthy(precision(false, 1000) >= 0.98)\n    end)\n\n    it(\"get_nns_with_distances\", function()\n        local f = 3\n        local i = AnnoyIndex(f, 'euclidean')\n        i:add_item(0, {0, 0, 2})\n        i:add_item(1, {0, 1, 1})\n        i:add_item(2, {1, 0, 0})\n        i:build(10)\n        do\n            local l, d = i:get_nns_by_item(0, 3, -1, true)\n            assert.same({0, 1, 2}, l)\n            assert.same(\n                roundArray({0, 2, 5}),\n                roundArray({d[1]^2, d[2]^2, d[3]^2})\n            )\n        end\n        do\n            local l, d = i:get_nns_by_vector({2, 2, 2}, 3, -1, true)\n            assert.same({1, 0, 2}, l)\n            assert.same(\n                roundArray({6, 8, 9}),\n                roundArray({d[1]^2, d[2]^2, d[3]^2})\n            )\n        end\n    end)\n\n    it(\"include_dists\", function()\n        local f = 40\n        local i = AnnoyIndex(f)\n        
local v = randomVector(f, 0, 1)\n        i:add_item(0, v)\n        local neg_v = {}\n        do\n            for k, value in ipairs(v) do\n                neg_v[k] = -value\n            end\n        end\n        i:add_item(1, neg_v)\n        i:build(10)\n        local indices, dists = i:get_nns_by_item(0, 2, 10, true)\n        assert.same({0, 1}, indices)\n        assert.same(round(0.0), round(dists[1]))\n    end)\n\nend)\n\ndescribe(\"index test\", function()\n\n    it(\"not_found_tree\", function()\n        local i = AnnoyIndex(10)\n        assert.has_error(function()\n            i:load('nonexists.tree')\n        end)\n    end)\n\n    it(\"binary_compatibility\", function()\n        local i = AnnoyIndex(10)\n        i:load('test/test.tree')\n\n        -- This might change in the future if we change the search\n        -- algorithm, but in that case let's update the test\n        assert.same(\n            {0, 85, 42, 11, 54, 38, 53, 66, 19, 31},\n            i:get_nns_by_item(0, 10)\n        )\n    end)\n\n    it(\"load_unload\", function()\n        -- Issue #108\n        local i = AnnoyIndex(10)\n        for _ = 1, 100000 do\n            i:load('test/test.tree')\n            i:unload()\n        end\n    end)\n\n    it(\"construct_load_destruct\", function()\n        for x = 1, 100000 do\n            local i = AnnoyIndex(10)\n            i:load('test/test.tree')\n            if x % 100 == 0 then\n                collectgarbage()\n            end\n        end\n    end)\n\n    it(\"construct_destruct\", function()\n        for _ = 1, 100000 do\n            local i = AnnoyIndex(10)\n            i:add_item(1000, randomVector(10, 0, 1))\n        end\n    end)\n\n    it(\"save_twice\", function()\n        -- Issue #100\n        local t = AnnoyIndex(10)\n        t:save(\"t.ann\")\n        t:save(\"t.ann\")\n    end)\n\n    it(\"load_save\", function()\n        -- Issue #61\n        local i = AnnoyIndex(10)\n        i:load('test/test.tree')\n        local u = 
i:get_item_vector(99)\n        i:save('i.tree')\n        local v = i:get_item_vector(99)\n        assert.same(u, v)\n        local j = AnnoyIndex(10)\n        j:load('test/test.tree')\n        local w = i:get_item_vector(99) -- maybe s/i/j/?\n        assert.same(u, w)\n        -- Ensure specifying if prefault is allowed does not impact result\n        j:save('j.tree', true)\n        local k = AnnoyIndex(10)\n        k:load('j.tree', true)\n        local x = k:get_item_vector(99)\n        assert.same(u, x)\n        k:save('k.tree', false)\n        local l = AnnoyIndex(10)\n        l:load('k.tree', false)\n        local y = l:get_item_vector(99)\n        assert.same(u, y)\n    end)\n\n    it(\"on_disk_build\", function()\n        local f = 2\n        local i = AnnoyIndex(f, 'euclidean')\n        i:on_disk_build('x.tree')\n        i:add_item(0, {2, 2})\n        i:add_item(1, {3, 2})\n        i:add_item(2, {3, 3})\n        i:build(10)\n        \n        i:unload()\n        i:load('x.tree')\n        \n        assert.same({2, 1, 0}, i:get_nns_by_vector({4, 4}, 3))\n        assert.same({0, 1, 2}, i:get_nns_by_vector({1, 1}, 3))\n        assert.same({1, 2, 0}, i:get_nns_by_vector({4, 2}, 3))\n    end)\nend)\n\ndescribe(\"types test\", function()\n\n    local n_points = 1000\n    local n_trees = 10\n\n    -- tests \"numpy\" and \"tuple\" are not applicable to Lua\n\n    it(\"wrong_length\", function()\n        local f = 10\n        local i = AnnoyIndex(f, 'euclidean')\n        i:add_item(0, randomVector(f, 0, 1))\n        assert.has_error(function()\n            i:add_item(1, randomVector(f + 1000, 0, 1))\n        end)\n        assert.has_error(function()\n            i:add_item(2, {})\n        end)\n        i:build(n_trees)\n    end)\n\n    it(\"range_errors\", function()\n        local f = 10\n        local i = AnnoyIndex(f, 'euclidean')\n        for j = 0, n_points - 1 do\n            i:add_item(j, randomVector(f, 0, 1))\n        end\n        
assert.has_error(function()\n            i:add_item(-1, randomVector(f))\n        end)\n        i:build(n_trees)\n        for _, bad_index in ipairs({-1000, -1, n_points, n_points + 1000}) do\n            assert.has_error(function()\n                i:get_distance(0, bad_index)\n            end)\n            assert.has_error(function()\n                i:get_nns_by_item(bad_index, 1)\n            end)\n            assert.has_error(function()\n                i:get_item_vector(bad_index)\n            end)\n        end\n    end)\n\nend)\n\ndescribe(\"memory leaks\", function()\n\n    it(\"get_item_vector\", function()\n        local f = 10\n        local i = AnnoyIndex(f, 'euclidean')\n        i:add_item(0, randomVector(f, 0, 1))\n        for j = 0, 100 - 1 do\n            print(j, '...')\n            for _ = 1, 1000 * 1000 do\n                i:get_item_vector(0)\n            end\n        end\n    end)\n\n    it(\"get_lots_of_nns\", function()\n        local f = 10\n        local i = AnnoyIndex(f, 'euclidean')\n        i:add_item(0, randomVector(f, 0, 1))\n        i:build(10)\n        for _ = 1, 100 do\n            assert.same({0}, i:get_nns_by_item(0, 999999999))\n        end\n    end)\n\nend)\n"
  },
  {
    "path": "test/dot_index_test.py",
    "content": "# Copyright (c) 2018 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance with the License. You may obtain a copy of\n# the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n# License for the specific language governing permissions and limitations under\n# the License.\n\nimport random\n\nimport numpy\nimport pytest\n\nfrom annoy import AnnoyIndex\n\n\ndef dot_metric(a, b):\n    return -numpy.dot(a, b)\n\n\ndef recall(retrieved, relevant):\n    return float(len(set(relevant) & set(retrieved))) / float(len(set(relevant)))\n\n\ndef test_get_nns_by_vector():\n    f = 2\n    i = AnnoyIndex(f, \"dot\")\n    i.add_item(0, [2, 2])\n    i.add_item(1, [3, 2])\n    i.add_item(2, [3, 3])\n    i.build(10)\n\n    assert i.get_nns_by_vector([4, 4], 3) == [2, 1, 0]\n    assert i.get_nns_by_vector([1, 1], 3) == [2, 1, 0]\n    assert i.get_nns_by_vector([4, 2], 3) == [2, 1, 0]\n\n\ndef test_get_nns_by_item():\n    f = 2\n    i = AnnoyIndex(f, \"dot\")\n    i.add_item(0, [2, 2])\n    i.add_item(1, [3, 2])\n    i.add_item(2, [3, 3])\n    i.build(10)\n\n    assert i.get_nns_by_item(0, 3) == [2, 1, 0]\n    assert i.get_nns_by_item(2, 3) == [2, 1, 0]\n\n\ndef test_dist():\n    f = 2\n    i = AnnoyIndex(f, \"dot\")\n    i.add_item(0, [0, 1])\n    i.add_item(1, [1, 1])\n    i.add_item(2, [0, 0])\n    i.build(10)\n\n    assert i.get_distance(0, 1) == pytest.approx(1.0)\n    assert i.get_distance(1, 2) == pytest.approx(0.0)\n\n\ndef recall_at(n, n_trees=10, n_points=1000, n_rounds=5):\n    # the best movie/variable name\n    total_recall = 0.0\n\n    for r in range(n_rounds):\n        # create random points at distance x\n        f = 10\n        idx = 
AnnoyIndex(f, \"dot\")\n\n        data = numpy.array(\n            [[random.gauss(0, 1) for z in range(f)] for j in range(n_points)]\n        )\n\n        expected_results = [\n            sorted(range(n_points), key=lambda j: dot_metric(data[i], data[j]))[:n]\n            for i in range(n_points)\n        ]\n\n        for i, vec in enumerate(data):\n            idx.add_item(i, vec)\n\n        idx.build(n_trees)\n\n        for i in range(n_points):\n            nns = idx.get_nns_by_vector(data[i], n)\n            total_recall += recall(nns, expected_results[i])\n\n    return total_recall / float(n_rounds * n_points)\n\n\ndef test_recall_at_10():\n    value = recall_at(10)\n    assert value >= 0.65\n\n\ndef test_recall_at_100():\n    value = recall_at(100)\n    assert value >= 0.95\n\n\ndef test_recall_at_1000():\n    value = recall_at(1000)\n    assert value >= 0.99\n\n\ndef test_recall_at_1000_fewer_trees():\n    value = recall_at(1000, n_trees=4)\n    assert value >= 0.99\n\n\ndef test_get_nns_with_distances():\n    f = 3\n    i = AnnoyIndex(f, \"dot\")\n    i.add_item(0, [0, 0, 2])\n    i.add_item(1, [0, 1, 1])\n    i.add_item(2, [1, 0, 0])\n    i.build(10)\n\n    l, d = i.get_nns_by_item(0, 3, -1, True)\n    assert l == [0, 1, 2]\n    assert d[0] == pytest.approx(4)\n    assert d[1] == pytest.approx(2)\n    assert d[2] == pytest.approx(0)\n\n    l, d = i.get_nns_by_vector([2, 2, 2], 3, -1, True)\n    assert l == [0, 1, 2]\n    assert d[0] == pytest.approx(4)\n    assert d[1] == pytest.approx(4)\n    assert d[2] == pytest.approx(2)\n\n\ndef test_include_dists():\n    f = 40\n    i = AnnoyIndex(f, \"dot\")\n    v = numpy.random.normal(size=f)\n    i.add_item(0, v)\n    i.add_item(1, -v)\n    i.build(10)\n\n    indices, dists = i.get_nns_by_item(0, 2, 10, True)\n    assert indices == [0, 1]\n    assert dists[0] == pytest.approx(numpy.dot(v, v))\n\n\ndef test_distance_consistency():\n    n, f = 1000, 3\n    i = AnnoyIndex(f, \"dot\")\n    for j in range(n):\n       
 i.add_item(j, numpy.random.normal(size=f))\n    i.build(10)\n    for a in random.sample(range(n), 100):\n        indices, dists = i.get_nns_by_item(a, 100, include_distances=True)\n        for b, dist in zip(indices, dists):\n            assert dist == pytest.approx(\n                numpy.dot(i.get_item_vector(a), i.get_item_vector(b))\n            )\n        assert dist == pytest.approx(i.get_distance(a, b))\n\n"
  },
  {
    "path": "test/euclidean_index_test.py",
    "content": "# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance with the License. You may obtain a copy of\n# the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n# License for the specific language governing permissions and limitations under\n# the License.\n\nimport random\n\nimport numpy\nimport pytest\n\nfrom annoy import AnnoyIndex\n\n\ndef test_get_nns_by_vector():\n    f = 2\n    i = AnnoyIndex(f, \"euclidean\")\n    i.add_item(0, [2, 2])\n    i.add_item(1, [3, 2])\n    i.add_item(2, [3, 3])\n    i.build(10)\n\n    assert i.get_nns_by_vector([4, 4], 3) == [2, 1, 0]\n    assert i.get_nns_by_vector([1, 1], 3) == [0, 1, 2]\n    assert i.get_nns_by_vector([4, 2], 3) == [1, 2, 0]\n\n\ndef test_get_nns_by_item():\n    f = 2\n    i = AnnoyIndex(f, \"euclidean\")\n    i.add_item(0, [2, 2])\n    i.add_item(1, [3, 2])\n    i.add_item(2, [3, 3])\n    i.build(10)\n\n    assert i.get_nns_by_item(0, 3) == [0, 1, 2]\n    assert i.get_nns_by_item(2, 3) == [2, 1, 0]\n\n\ndef test_dist():\n    f = 2\n    i = AnnoyIndex(f, \"euclidean\")\n    i.add_item(0, [0, 1])\n    i.add_item(1, [1, 1])\n    i.add_item(2, [0, 0])\n\n    assert i.get_distance(0, 1) == pytest.approx(1.0**0.5)\n    assert i.get_distance(1, 2) == pytest.approx(2.0**0.5)\n\n\ndef test_large_index():\n    # Generate pairs of random points where the pair is super close\n    f = 10\n    [random.gauss(0, 10) for z in range(f)]\n    i = AnnoyIndex(f, \"euclidean\")\n    for j in range(0, 10000, 2):\n        p = [random.gauss(0, 1) for z in range(f)]\n        x = [1 + pi + random.gauss(0, 1e-2) for pi in p]  # todo: should be q[i]\n        y = [1 + pi + random.gauss(0, 1e-2) 
for pi in p]\n        i.add_item(j, x)\n        i.add_item(j + 1, y)\n\n    i.build(10)\n    for j in range(0, 10000, 2):\n        assert i.get_nns_by_item(j, 2) == [j, j + 1]\n        assert i.get_nns_by_item(j + 1, 2) == [j + 1, j]\n\n\ndef precision(n, n_trees=10, n_points=10000, n_rounds=10):\n    found = 0\n    for r in range(n_rounds):\n        # create random points at distance x\n        f = 10\n        i = AnnoyIndex(f, \"euclidean\")\n        for j in range(n_points):\n            p = [random.gauss(0, 1) for z in range(f)]\n            norm = sum([pi**2 for pi in p]) ** 0.5\n            x = [pi / norm * j for pi in p]\n            i.add_item(j, x)\n\n        i.build(n_trees)\n\n        nns = i.get_nns_by_vector([0] * f, n)\n        assert nns == sorted(nns)  # should be in order\n        # The number of gaps should be equal to the last item minus n-1\n        found += len([x for x in nns if x < n])\n\n    return 1.0 * found / (n * n_rounds)\n\n\ndef test_precision_1():\n    assert precision(1) >= 0.98\n\n\ndef test_precision_10():\n    assert precision(10) >= 0.98\n\n\ndef test_precision_100():\n    assert precision(100) >= 0.98\n\n\ndef test_precision_1000():\n    assert precision(1000) >= 0.98\n\n\ndef test_get_nns_with_distances():\n    f = 3\n    i = AnnoyIndex(f, \"euclidean\")\n    i.add_item(0, [0, 0, 2])\n    i.add_item(1, [0, 1, 1])\n    i.add_item(2, [1, 0, 0])\n    i.build(10)\n\n    l, d = i.get_nns_by_item(0, 3, -1, True)\n    assert l == [0, 1, 2]\n    assert d[0] ** 2 == pytest.approx(0)\n    assert d[1] ** 2 == pytest.approx(2)\n    assert d[2] ** 2 == pytest.approx(5)\n\n    l, d = i.get_nns_by_vector([2, 2, 2], 3, -1, True)\n    assert l == [1, 0, 2]\n    assert d[0] ** 2 == pytest.approx(6)\n    assert d[1] ** 2 == pytest.approx(8)\n    assert d[2] ** 2 == pytest.approx(9)\n\n\ndef test_include_dists():\n    f = 40\n    i = AnnoyIndex(f, \"euclidean\")\n    v = numpy.random.normal(size=f)\n    i.add_item(0, v)\n    i.add_item(1, -v)\n   
 i.build(10)\n\n    indices, dists = i.get_nns_by_item(0, 2, 10, True)\n    assert indices == [0, 1]\n    assert dists[0] == pytest.approx(0)\n\n\ndef test_distance_consistency():\n    n, f = 1000, 3\n    i = AnnoyIndex(f, \"euclidean\")\n    for j in range(n):\n        i.add_item(j, numpy.random.normal(size=f))\n    i.build(10)\n    for a in random.sample(range(n), 100):\n        indices, dists = i.get_nns_by_item(a, 100, include_distances=True)\n        for b, dist in zip(indices, dists):\n            assert dist == pytest.approx(i.get_distance(a, b))\n            u = numpy.array(i.get_item_vector(a))\n            v = numpy.array(i.get_item_vector(b))\n            assert dist == pytest.approx(numpy.dot(u - v, u - v) ** 0.5)\n            assert dist == pytest.approx(\n                sum([(x - y) ** 2 for x, y in zip(u, v)]) ** 0.5\n            )\n\n\ndef test_rounding_error():\n    # https://github.com/spotify/annoy/issues/314\n    i = AnnoyIndex(1, \"euclidean\")\n    i.add_item(0, [0.7125930])\n    i.add_item(1, [0.7123166])\n    assert i.get_distance(0, 1) >= 0.0\n"
  },
  {
    "path": "test/examples_test.py",
    "content": "def execfile(fn):\n    with open(fn) as f:\n        exec(f.read())\n\n\ndef simple_test():\n    execfile(\"examples/simple_test.py\")\n\n\ndef mmap_test():\n    execfile(\"examples/mmap_test.py\")\n\n\ndef precision_test():\n    execfile(\"examples/precision_test.py\")\n"
  },
  {
    "path": "test/hamming_index_test.py",
    "content": "# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance with the License. You may obtain a copy of\n# the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n# License for the specific language governing permissions and limitations under\n# the License.\n\n\nimport numpy\nimport pytest\n\nfrom annoy import AnnoyIndex\n\n\ndef test_basic_conversion():\n    f = 100\n    i = AnnoyIndex(f, \"hamming\")\n    u = numpy.random.binomial(1, 0.5, f)\n    v = numpy.random.binomial(1, 0.5, f)\n    i.add_item(0, u)\n    i.add_item(1, v)\n    u2 = i.get_item_vector(0)\n    v2 = i.get_item_vector(1)\n    assert numpy.dot(u - u2, u - u2) == pytest.approx(0.0)\n    assert numpy.dot(v - v2, v - v2) == pytest.approx(0.0)\n    assert i.get_distance(0, 0) == pytest.approx(0.0)\n    assert i.get_distance(1, 1) == pytest.approx(0.0)\n    assert i.get_distance(0, 1) == pytest.approx(numpy.dot(u - v, u - v))\n    assert i.get_distance(1, 0) == pytest.approx(numpy.dot(u - v, u - v))\n\n\ndef test_basic_nns():\n    f = 100\n    i = AnnoyIndex(f, \"hamming\")\n    u = numpy.random.binomial(1, 0.5, f)\n    v = numpy.random.binomial(1, 0.5, f)\n    i.add_item(0, u)\n    i.add_item(1, v)\n    i.build(10)\n    assert i.get_nns_by_item(0, 99) == [0, 1]\n    assert i.get_nns_by_item(1, 99) == [1, 0]\n    rs, ds = i.get_nns_by_item(0, 99, include_distances=True)\n    assert rs == [0, 1]\n    assert ds[0] == pytest.approx(0)\n    assert ds[1] == pytest.approx(numpy.dot(u - v, u - v))\n\n\ndef test_save_load():\n    f = 100\n    i = AnnoyIndex(f, \"hamming\")\n    u = numpy.random.binomial(1, 0.5, f)\n    v = numpy.random.binomial(1, 0.5, f)\n    
i.add_item(0, u)\n    i.add_item(1, v)\n    i.build(10)\n    i.save(\"blah.ann\")\n    j = AnnoyIndex(f, \"hamming\")\n    j.load(\"blah.ann\")\n    rs, ds = j.get_nns_by_item(0, 99, include_distances=True)\n    assert rs == [0, 1]\n    assert ds[0] == pytest.approx(0)\n    assert ds[1] == pytest.approx(numpy.dot(u - v, u - v))\n\n\ndef test_many_vectors():\n    f = 10\n    i = AnnoyIndex(f, \"hamming\")\n    for x in range(100000):\n        i.add_item(x, numpy.random.binomial(1, 0.5, f))\n    i.build(10)\n\n    rs, ds = i.get_nns_by_vector([0] * f, 10000, include_distances=True)\n    assert min(ds) >= 0\n    assert max(ds) <= f\n\n    dists = []\n    for x in range(1000):\n        rs, ds = i.get_nns_by_vector(\n            numpy.random.binomial(1, 0.5, f), 1, search_k=1000, include_distances=True\n        )\n        dists.append(ds[0])\n    avg_dist = 1.0 * sum(dists) / len(dists)\n    assert avg_dist <= 0.42\n\n\n@pytest.mark.skip  # will fix later\ndef test_zero_vectors():\n    # Mentioned on the annoy-user list\n    bitstrings = [\n        \"0000000000011000001110000011111000101110111110000100000100000000\",\n        \"0000000000011000001110000011111000101110111110000100000100000001\",\n        \"0000000000011000001110000011111000101110111110000100000100000010\",\n        \"0010010100011001001000010001100101011110000000110000011110001100\",\n        \"1001011010000110100101101001111010001110100001101000111000001110\",\n        \"0111100101111001011110010010001100010111000111100001101100011111\",\n        \"0011000010011101000011010010111000101110100101111000011101001011\",\n        \"0011000010011100000011010010111000101110100101111000011101001011\",\n        \"1001100000111010001010000010110000111100100101001001010000000111\",\n        \"0000000000111101010100010001000101101001000000011000001101000000\",\n        \"1000101001010001011100010111001100110011001100110011001111001100\",\n        
\"1110011001001111100110010001100100001011000011010010111100100111\",\n    ]\n    vectors = [[int(bit) for bit in bitstring] for bitstring in bitstrings]\n\n    f = 64\n    idx = AnnoyIndex(f, \"hamming\")\n    for i, v in enumerate(vectors):\n        idx.add_item(i, v)\n\n    idx.build(10)\n    idx.save(\"idx.ann\")\n    idx = AnnoyIndex(f, \"hamming\")\n    idx.load(\"idx.ann\")\n    js, ds = idx.get_nns_by_item(0, 5, include_distances=True)\n    assert js[0] == 0\n    assert ds[:4] == [0, 1, 1, 22]\n"
  },
  {
    "path": "test/holes_test.py",
    "content": "# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance with the License. You may obtain a copy of\n# the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n# License for the specific language governing permissions and limitations under\n# the License.\n\nimport random\n\nimport numpy\n\nfrom annoy import AnnoyIndex\n\n\ndef test_random_holes():\n    f = 10\n    index = AnnoyIndex(f, \"angular\")\n    valid_indices = random.sample(range(2000), 1000)  # leave holes\n    for i in valid_indices:\n        v = numpy.random.normal(size=(f,))\n        index.add_item(i, v)\n    index.build(10)\n    for i in valid_indices:\n        js = index.get_nns_by_item(i, 10000)\n        for j in js:\n            assert j in valid_indices\n    for i in range(1000):\n        v = numpy.random.normal(size=(f,))\n        js = index.get_nns_by_vector(v, 10000)\n        for j in js:\n            assert j in valid_indices\n\n\ndef _test_holes_base(n, f=100, base_i=100000):\n    annoy = AnnoyIndex(f, \"angular\")\n    for i in range(n):\n        annoy.add_item(base_i + i, numpy.random.normal(size=(f,)))\n    annoy.build(100)\n    res = annoy.get_nns_by_item(base_i, n)\n    assert set(res) == set([base_i + i for i in range(n)])\n\n\ndef test_root_one_child():\n    # See https://github.com/spotify/annoy/issues/223\n    _test_holes_base(1)\n\n\ndef test_root_two_children():\n    _test_holes_base(2)\n\n\ndef test_root_some_children():\n    # See https://github.com/spotify/annoy/issues/295\n    _test_holes_base(10)\n\n\ndef test_root_many_children():\n    _test_holes_base(1000)\n"
  },
  {
    "path": "test/index_test.py",
    "content": "# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance with the License. You may obtain a copy of\n# the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n# License for the specific language governing permissions and limitations under\n# the License.\n\nimport os\nimport random\n\nimport pytest\n\nfrom annoy import AnnoyIndex\n\n\ndef test_not_found_tree():\n    i = AnnoyIndex(10, \"angular\")\n    with pytest.raises(IOError):\n        i.load(\"nonexists.tree\")\n\n\ndef test_binary_compatibility():\n    i = AnnoyIndex(10, \"angular\")\n    i.load(\"test/test.tree\")\n\n    # This might change in the future if we change the search algorithm, but in that case let's update the test\n    assert i.get_nns_by_item(0, 10) == [0, 85, 42, 11, 54, 38, 53, 66, 19, 31]\n\n\ndef test_load_unload():\n    # Issue #108\n    i = AnnoyIndex(10, \"angular\")\n    for x in range(100000):\n        i.load(\"test/test.tree\")\n        i.unload()\n\n\ndef test_construct_load_destruct():\n    for x in range(100000):\n        i = AnnoyIndex(10, \"angular\")\n        i.load(\"test/test.tree\")\n\n\ndef test_construct_destruct():\n    for x in range(100000):\n        i = AnnoyIndex(10, \"angular\")\n        i.add_item(1000, [random.gauss(0, 1) for z in range(10)])\n\n\ndef test_save_twice():\n    # Issue #100\n    t = AnnoyIndex(10, \"angular\")\n    for i in range(100):\n        t.add_item(i, [random.gauss(0, 1) for z in range(10)])\n    t.build(10)\n    t.save(\"t1.ann\")\n    t.save(\"t2.ann\")\n\n\ndef test_load_save():\n    # Issue #61\n    i = AnnoyIndex(10, \"angular\")\n    i.load(\"test/test.tree\")\n    u = i.get_item_vector(99)\n    
i.save(\"i.tree\")\n    v = i.get_item_vector(99)\n    assert u == v\n    j = AnnoyIndex(10, \"angular\")\n    j.load(\"test/test.tree\")\n    w = i.get_item_vector(99)\n    assert u == w\n    # Ensure specifying if prefault is allowed does not impact result\n    j.save(\"j.tree\", True)\n    k = AnnoyIndex(10, \"angular\")\n    k.load(\"j.tree\", True)\n    x = k.get_item_vector(99)\n    assert u == x\n    k.save(\"k.tree\", False)\n    l = AnnoyIndex(10, \"angular\")\n    l.load(\"k.tree\", False)\n    y = l.get_item_vector(99)\n    assert u == y\n\n\ndef test_save_without_build():\n    t = AnnoyIndex(10, \"angular\")\n    for i in range(100):\n        t.add_item(i, [random.gauss(0, 1) for z in range(10)])\n    # Note: in earlier version, this was allowed (see eg #61)\n    with pytest.raises(Exception):\n        t.save(\"x.tree\")\n\n\ndef test_unbuild_with_loaded_tree():\n    i = AnnoyIndex(10, \"angular\")\n    i.load(\"test/test.tree\")\n    with pytest.raises(Exception):\n        i.unbuild()\n\n\ndef test_seed():\n    i = AnnoyIndex(10, \"angular\")\n    i.load(\"test/test.tree\")\n    i.set_seed(42)\n\n\ndef test_unknown_distance():\n    with pytest.raises(Exception):\n        AnnoyIndex(10, \"banana\")\n\n\ndef test_metric_kwarg():\n    # Issue 211\n    i = AnnoyIndex(2, metric=\"euclidean\")\n    i.add_item(0, [1, 0])\n    i.add_item(1, [9, 0])\n    assert i.get_distance(0, 1) == pytest.approx(8)\n    assert i.f == 2\n\n\ndef test_metric_f_kwargs():\n    AnnoyIndex(f=3, metric=\"euclidean\")\n\n\ndef test_item_vector_after_save():\n    # Issue #279\n    a = AnnoyIndex(3, \"angular\")\n    a.verbose(True)\n    a.add_item(1, [1, 0, 0])\n    a.add_item(2, [0, 1, 0])\n    a.add_item(3, [0, 0, 1])\n    a.build(-1)\n    assert a.get_n_items() == 4\n    assert a.get_item_vector(3) == [0, 0, 1]\n    assert set(a.get_nns_by_item(1, 999)) == set([1, 2, 3])\n    a.save(\"something.annoy\")\n    assert a.get_n_items() == 4\n    assert a.get_item_vector(3) == [0, 0, 
1]\n    assert set(a.get_nns_by_item(1, 999)) == set([1, 2, 3])\n\n\ndef test_prefault():\n    i = AnnoyIndex(10, \"angular\")\n    i.load(\"test/test.tree\", prefault=True)\n    assert i.get_nns_by_item(0, 10) == [0, 85, 42, 11, 54, 38, 53, 66, 19, 31]\n\n\ndef test_fail_save():\n    t = AnnoyIndex(40, \"angular\")\n    with pytest.raises(IOError):\n        t.save(\"\")\n\n\ndef test_overwrite_index():\n    # Issue #335\n    f = 40\n\n    # Build the initial index\n    t = AnnoyIndex(f, \"angular\")\n    for i in range(1000):\n        v = [random.gauss(0, 1) for z in range(f)]\n        t.add_item(i, v)\n    t.build(10)\n    t.save(\"test.ann\")\n\n    # Load index file\n    t2 = AnnoyIndex(f, \"angular\")\n    t2.load(\"test.ann\")\n\n    # Overwrite index file\n    t3 = AnnoyIndex(f, \"angular\")\n    for i in range(500):\n        v = [random.gauss(0, 1) for z in range(f)]\n        t3.add_item(i, v)\n    t3.build(10)\n    if os.name == \"nt\":\n        # Can't overwrite on Windows\n        with pytest.raises(IOError):\n            t3.save(\"test.ann\")\n    else:\n        t3.save(\"test.ann\")\n        # Get nearest neighbors\n        v = [random.gauss(0, 1) for z in range(f)]\n        t2.get_nns_by_vector(v, 1000)  # Should not crash\n\n\ndef test_get_n_trees():\n    i = AnnoyIndex(10, \"angular\")\n    i.load(\"test/test.tree\")\n    assert i.get_n_trees() == 10\n\n\ndef test_write_failed():\n    f = 40\n\n    # Build the initial index\n    t = AnnoyIndex(f, \"angular\")\n    t.verbose(True)\n    for i in range(1000):\n        v = [random.gauss(0, 1) for z in range(f)]\n        t.add_item(i, v)\n    t.build(10)\n\n    if os.name == \"nt\":\n        path = \"Z:\\\\xyz.annoy\"\n    else:\n        path = \"/x/y/z.annoy\"\n    with pytest.raises(Exception):\n        t.save(path)\n\n\ndef test_dimension_mismatch():\n    t = AnnoyIndex(100, \"angular\")\n    for i in range(1000):\n        t.add_item(i, [random.gauss(0, 1) for z in range(100)])\n    t.build(10)\n    
t.save(\"test.annoy\")\n\n    u = AnnoyIndex(200, \"angular\")\n    with pytest.raises(IOError):\n        u.load(\"test.annoy\")\n    u = AnnoyIndex(50, \"angular\")\n    with pytest.raises(IOError):\n        u.load(\"test.annoy\")\n\n\ndef test_add_after_save():\n    # 398\n    t = AnnoyIndex(100, \"angular\")\n    for i in range(1000):\n        t.add_item(i, [random.gauss(0, 1) for z in range(100)])\n    t.build(10)\n    t.save(\"test.annoy\")\n\n    # Used to segfault:\n    v = [random.gauss(0, 1) for z in range(100)]\n    with pytest.raises(Exception):\n        t.add_item(i, v)\n\n\ndef test_build_twice():\n    # 420\n    t = AnnoyIndex(100, \"angular\")\n    for i in range(1000):\n        t.add_item(i, [random.gauss(0, 1) for z in range(100)])\n    t.build(10)\n    # Used to segfault:\n    with pytest.raises(Exception):\n        t.build(10)\n\n\ndef test_very_large_index():\n    # 388\n    f = 3\n    dangerous_size = 2**31\n    size_per_vector = 4 * (f + 3)\n    n_vectors = int(dangerous_size / size_per_vector)\n    m = AnnoyIndex(3, \"angular\")\n    m.verbose(True)\n    for i in range(100):\n        m.add_item(n_vectors + i, [random.gauss(0, 1) for z in range(f)])\n    n_trees = 10\n    m.build(n_trees)\n    path = \"test_big.annoy\"\n    m.save(path)  # Raises on Windows\n\n    # Sanity check size of index\n    assert os.path.getsize(path) >= dangerous_size\n    assert os.path.getsize(path) < dangerous_size + 100e3\n\n    # Sanity check number of trees\n    assert m.get_n_trees() == n_trees\n"
  },
  {
    "path": "test/manhattan_index_test.py",
    "content": "# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance with the License. You may obtain a copy of\n# the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n# License for the specific language governing permissions and limitations under\n# the License.\n\nimport random\n\nimport numpy\nimport pytest\n\nfrom annoy import AnnoyIndex\n\n\ndef test_get_nns_by_vector():\n    f = 2\n    i = AnnoyIndex(f, \"manhattan\")\n    i.add_item(0, [2, 2])\n    i.add_item(1, [3, 2])\n    i.add_item(2, [3, 3])\n    i.build(10)\n\n    assert i.get_nns_by_vector([4, 4], 3) == [2, 1, 0]\n    assert i.get_nns_by_vector([1, 1], 3) == [0, 1, 2]\n    assert i.get_nns_by_vector([5, 3], 3) == [2, 1, 0]\n\n\ndef test_get_nns_by_item():\n    f = 2\n    i = AnnoyIndex(f, \"manhattan\")\n    i.add_item(0, [2, 2])\n    i.add_item(1, [3, 2])\n    i.add_item(2, [3, 3])\n    i.build(10)\n\n    assert i.get_nns_by_item(0, 3) == [0, 1, 2]\n    assert i.get_nns_by_item(2, 3) == [2, 1, 0]\n\n\ndef test_dist():\n    f = 2\n    i = AnnoyIndex(f, \"manhattan\")\n    i.add_item(0, [0, 1])\n    i.add_item(1, [1, 1])\n    i.add_item(2, [0, 0])\n\n    assert i.get_distance(0, 1) == pytest.approx(1.0)\n    assert i.get_distance(1, 2) == pytest.approx(2.0)\n\n\ndef test_large_index():\n    # Generate pairs of random points where the pair is super close\n    f = 10\n    i = AnnoyIndex(f, \"manhattan\")\n    for j in range(0, 10000, 2):\n        p = [random.gauss(0, 1) for z in range(f)]\n        x = [1 + pi + random.gauss(0, 1e-2) for pi in p]\n        y = [1 + pi + random.gauss(0, 1e-2) for pi in p]\n        i.add_item(j, x)\n        i.add_item(j + 1, y)\n\n    
i.build(10)\n    for j in range(0, 10000, 2):\n        assert i.get_nns_by_item(j, 2) == [j, j + 1]\n        assert i.get_nns_by_item(j + 1, 2) == [j + 1, j]\n\n\ndef precision(n, n_trees=10, n_points=10000, n_rounds=10):\n    found = 0\n    for r in range(n_rounds):\n        # create random points at distance x\n        f = 10\n        i = AnnoyIndex(f, \"manhattan\")\n        for j in range(n_points):\n            p = [random.gauss(0, 1) for z in range(f)]\n            norm = sum([pi**2 for pi in p]) ** 0.5\n            x = [pi / norm + j for pi in p]\n            i.add_item(j, x)\n\n        i.build(n_trees)\n\n        nns = i.get_nns_by_vector([0] * f, n)\n        assert nns == sorted(nns)  # should be in order\n        # The number of gaps should be equal to the last item minus n-1\n        found += len([x for x in nns if x < n])\n\n    return 1.0 * found / (n * n_rounds)\n\n\ndef test_precision_1():\n    assert precision(1) >= 0.98\n\n\ndef test_precision_10():\n    assert precision(10) >= 0.98\n\n\ndef test_precision_100():\n    assert precision(100) >= 0.98\n\n\ndef test_precision_1000():\n    assert precision(1000) >= 0.98\n\n\ndef test_get_nns_with_distances():\n    f = 3\n    i = AnnoyIndex(f, \"manhattan\")\n    i.add_item(0, [0, 0, 2])\n    i.add_item(1, [0, 1, 1])\n    i.add_item(2, [1, 0, 0])\n    i.build(10)\n\n    l, d = i.get_nns_by_item(0, 3, -1, True)\n    assert l == [0, 1, 2]\n    assert d[0] == pytest.approx(0)\n    assert d[1] == pytest.approx(2)\n    assert d[2] == pytest.approx(3)\n\n    l, d = i.get_nns_by_vector([2, 2, 1], 3, -1, True)\n    assert l == [1, 2, 0]\n    assert d[0] == pytest.approx(3)\n    assert d[1] == pytest.approx(4)\n    assert d[2] == pytest.approx(5)\n\n\ndef test_include_dists():\n    f = 40\n    i = AnnoyIndex(f, \"manhattan\")\n    v = numpy.random.normal(size=f)\n    i.add_item(0, v)\n    i.add_item(1, -v)\n    i.build(10)\n\n    indices, dists = i.get_nns_by_item(0, 2, 10, True)\n    assert indices == [0, 1]\n    
assert dists[0] == pytest.approx(0)\n\n\ndef test_distance_consistency():\n    n, f = 1000, 3\n    i = AnnoyIndex(f, \"manhattan\")\n    for j in range(n):\n        i.add_item(j, numpy.random.normal(size=f))\n    i.build(10)\n    for a in random.sample(range(n), 100):\n        indices, dists = i.get_nns_by_item(a, 100, include_distances=True)\n        for b, dist in zip(indices, dists):\n            assert dist == pytest.approx(i.get_distance(a, b))\n            u = numpy.array(i.get_item_vector(a))\n            v = numpy.array(i.get_item_vector(b))\n            assert dist == pytest.approx(numpy.sum(numpy.fabs(u - v)))\n            assert dist == pytest.approx(\n                sum([abs(float(x) - float(y)) for x, y in zip(u, v)])\n            )\n"
  },
  {
    "path": "test/memory_leak_test.py",
    "content": "# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance with the License. You may obtain a copy of\n# the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n# License for the specific language governing permissions and limitations under\n# the License.\n\nimport pytest\nimport random\n\nfrom annoy import AnnoyIndex\n\n\ndef test_get_item_vector():\n    f = 10\n    i = AnnoyIndex(f, \"euclidean\")\n    i.add_item(0, [random.gauss(0, 1) for x in range(f)])\n    for j in range(100):\n        print(j, \"...\")\n        for k in range(1000 * 1000):\n            i.get_item_vector(0)\n\n\ndef test_get_lots_of_nns():\n    f = 10\n    i = AnnoyIndex(f, \"euclidean\")\n    i.add_item(0, [random.gauss(0, 1) for x in range(f)])\n    i.build(10)\n    for j in range(100):\n        assert i.get_nns_by_item(0, 999999999) == [0]\n\n\ndef test_build_unbuid():\n    f = 10\n    i = AnnoyIndex(f, \"euclidean\")\n    for j in range(1000):\n        i.add_item(j, [random.gauss(0, 1) for x in range(f)])\n    i.build(10)\n\n    for j in range(100):\n        i.unbuild()\n        i.build(10)\n\n    assert i.get_n_items() == 1000\n\n\ndef test_include_distances():\n    # See #633\n    # (Not able to repro it though)\n    f = 10\n    i = AnnoyIndex(f, \"euclidean\")\n    for j in range(10000):\n        i.add_item(j, [random.gauss(0, 1) for x in range(f)])\n    i.build(10)\n\n    v = [random.gauss(0, 1) for x in range(f)]\n    for _ in range(10000000):\n        indices, distances = i.get_nns_by_vector(v, 1, include_distances=True)\n"
  },
  {
    "path": "test/multithreaded_build_test.py",
    "content": "import numpy\n\nfrom annoy import AnnoyIndex\n\n\ndef _test_building_with_threads(n_jobs):\n    n, f = 10000, 10\n    n_trees = 31\n    i = AnnoyIndex(f, \"euclidean\")\n    for j in range(n):\n        i.add_item(j, numpy.random.normal(size=f))\n    assert i.build(n_trees, n_jobs=n_jobs)\n    assert n_trees == i.get_n_trees()\n\n\ndef test_one_thread():\n    _test_building_with_threads(1)\n\n\ndef test_two_threads():\n    _test_building_with_threads(2)\n\n\ndef test_four_threads():\n    _test_building_with_threads(4)\n\n\ndef test_eight_threads():\n    _test_building_with_threads(8)\n"
  },
  {
    "path": "test/on_disk_build_test.py",
    "content": "# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance with the License. You may obtain a copy of\n# the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n# License for the specific language governing permissions and limitations under\n# the License.\n\nimport os\n\nimport pytest\n\nfrom annoy import AnnoyIndex\n\n\n@pytest.fixture(scope=\"module\", autouse=True)\ndef setUp():\n    if os.path.exists(\"on_disk.ann\"):\n        os.remove(\"on_disk.ann\")\n\n\ndef add_items(i):\n    i.add_item(0, [2, 2])\n    i.add_item(1, [3, 2])\n    i.add_item(2, [3, 3])\n\n\ndef check_nns(i):\n    assert i.get_nns_by_vector([4, 4], 3) == [2, 1, 0]\n    assert i.get_nns_by_vector([1, 1], 3) == [0, 1, 2]\n    assert i.get_nns_by_vector([4, 2], 3) == [1, 2, 0]\n\n\ndef test_on_disk():\n    f = 2\n    i = AnnoyIndex(f, \"euclidean\")\n    i.on_disk_build(\"on_disk.ann\")\n    add_items(i)\n    i.build(10)\n    check_nns(i)\n    i.unload()\n    i.load(\"on_disk.ann\")\n    check_nns(i)\n    j = AnnoyIndex(f, \"euclidean\")\n    j.load(\"on_disk.ann\")\n    check_nns(j)\n"
  },
  {
    "path": "test/seed_test.py",
    "content": "# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance with the License. You may obtain a copy of\n# the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n# License for the specific language governing permissions and limitations under\n# the License.\n\nimport numpy\n\nfrom annoy import AnnoyIndex\n\n\ndef test_seeding():\n    f = 10\n    X = numpy.random.rand(1000, f)\n    Y = numpy.random.rand(50, f)\n\n    indexes = []\n    for i in range(2):\n        index = AnnoyIndex(f, \"angular\")\n        index.set_seed(42)\n        for j in range(X.shape[0]):\n            index.add_item(j, X[j])\n\n        index.build(10)\n        indexes.append(index)\n\n    for k in range(Y.shape[0]):\n        assert indexes[0].get_nns_by_vector(Y[k], 100) == indexes[1].get_nns_by_vector(\n            Y[k], 100\n        )\n"
  },
  {
    "path": "test/threading_test.py",
    "content": "# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance with the License. You may obtain a copy of\n# the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n# License for the specific language governing permissions and limitations under\n# the License.\n\nimport multiprocessing.pool\n\nimport numpy\n\nfrom annoy import AnnoyIndex\n\n\ndef test_threads():\n    n, f = 10000, 10\n    i = AnnoyIndex(f, \"euclidean\")\n    for j in range(n):\n        i.add_item(j, numpy.random.normal(size=f))\n    i.build(10)\n\n    pool = multiprocessing.pool.ThreadPool()\n\n    def query_f(j):\n        i.get_nns_by_item(1, 1000)\n\n    pool.map(query_f, range(n))\n"
  },
  {
    "path": "test/types_test.py",
    "content": "# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance with the License. You may obtain a copy of\n# the License at\n#\n# http://www.apache.org/licenses/LICENSE-2.0\n#\n# Unless required by applicable law or agreed to in writing, software\n# distributed under the License is distributed on an \"AS IS\" BASIS, WITHOUT\n# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the\n# License for the specific language governing permissions and limitations under\n# the License.\n\nimport random\n\nimport numpy\nimport pytest\n\nfrom annoy import AnnoyIndex\n\n\ndef test_numpy(n_points=1000, n_trees=10):\n    f = 10\n    i = AnnoyIndex(f, \"euclidean\")\n    for j in range(n_points):\n        a = numpy.random.normal(size=f)\n        a = a.astype(\n            random.choice([numpy.float64, numpy.float32, numpy.uint8, numpy.int16])\n        )\n        i.add_item(j, a)\n\n    i.build(n_trees)\n\n\ndef test_tuple(n_points=1000, n_trees=10):\n    f = 10\n    i = AnnoyIndex(f, \"euclidean\")\n    for j in range(n_points):\n        i.add_item(j, tuple(random.gauss(0, 1) for x in range(f)))\n\n    i.build(n_trees)\n\n\ndef test_wrong_length(n_points=1000, n_trees=10):\n    f = 10\n    i = AnnoyIndex(f, \"euclidean\")\n    i.add_item(0, [random.gauss(0, 1) for x in range(f)])\n    with pytest.raises(IndexError):\n        i.add_item(1, [random.gauss(0, 1) for x in range(f + 1000)])\n    with pytest.raises(IndexError):\n        i.add_item(2, [])\n\n    i.build(n_trees)\n\n\ndef test_range_errors(n_points=1000, n_trees=10):\n    f = 10\n    i = AnnoyIndex(f, \"euclidean\")\n    for j in range(n_points):\n        i.add_item(j, [random.gauss(0, 1) for x in range(f)])\n    with pytest.raises(IndexError):\n        i.add_item(-1, [random.gauss(0, 1) for x in range(f)])\n    i.build(n_trees)\n    for bad_index in [-1000, -1, n_points, n_points + 1000]:\n        
with pytest.raises(IndexError):\n            i.get_distance(0, bad_index)\n        with pytest.raises(IndexError):\n            i.get_nns_by_item(bad_index, 1)\n        with pytest.raises(IndexError):\n            i.get_item_vector(bad_index)\n\n\ndef test_missing_len():\n    \"\"\"\n    We should get a helpful error message if our vector doesn't have a\n    __len__ method.\n    \"\"\"\n\n    class FakeCollection:\n        pass\n\n    i = AnnoyIndex(10, \"euclidean\")\n    with pytest.raises(TypeError) as excinfo:\n        i.add_item(1, FakeCollection())\n    assert str(excinfo.value) == \"object of type 'FakeCollection' has no len()\"\n\n\ndef test_missing_getitem():\n    \"\"\"\n    We should get a helpful error message if our vector doesn't have a\n    __getitem__ method.\n    \"\"\"\n\n    class FakeCollection:\n        def __len__(self):\n            return 5\n\n    i = AnnoyIndex(5, \"euclidean\")\n    with pytest.raises(TypeError) as excinfo:\n        i.add_item(1, FakeCollection())\n    assert str(excinfo.value) == \"'FakeCollection' object is not subscriptable\"\n\n\ndef test_short():\n    \"\"\"\n    Ensure we handle our vector not being long enough.\n    \"\"\"\n\n    class FakeCollection:\n        def __len__(self):\n            return 3\n\n        def __getitem__(self, i):\n            raise IndexError\n\n    i = AnnoyIndex(3, \"euclidean\")\n    with pytest.raises(IndexError):\n        i.add_item(1, FakeCollection())\n\n\ndef test_non_float():\n    \"\"\"\n    We should error gracefully if non-floats are provided in our vector.\n    \"\"\"\n    array_strings = [\"1\", \"2\", \"3\"]\n\n    i = AnnoyIndex(3, \"euclidean\")\n    with pytest.raises(TypeError) as excinfo:\n        i.add_item(1, array_strings)\n    assert str(excinfo.value) == \"must be real number, not str\"\n"
  },
  {
    "path": "tox.ini",
    "content": "[tox]\nenvlist=py{26,27,33,34,35,36,37,38,39,310,311,312,313}, go, lua\n\n[testenv]\nsetenv =\n  TRAVIS = {env:TRAVIS:}\ncommands =\n  pip install numpy h5py\n  pip install .\n  python setup.py nosetests --verbosity=3\n\n[testenv:go]\nsetenv =\n  GOPATH = {env:HOME:}/gopath\n  GOROOT = /usr/local/go\nwhitelist_externals=*\ncommands =\n  mkdir -p {env:GOPATH:}/src/annoyindex\n  wget https://storage.googleapis.com/golang/go1.5.linux-amd64.tar.gz\n  sudo tar -C /usr/local -xzf go1.5.linux-amd64.tar.gz\n  sudo add-apt-repository -y ppa:timsc/swig-3.0.12\n  sudo apt-get update -qq\n  sudo apt-get install -y swig3.0\n  swig3.0 -go -intgosize 64 -cgo -c++ src/annoygomodule.i\n  cp src/annoygomodule_wrap.cxx src/annoyindex.go src/annoygomodule.h src/annoylib.h src/kissrandom.h {env:GOPATH:}/src/annoyindex\n  {env:GOROOT}/bin/go build annoyindex\n\n[testenv:lua]\nsetenv =\n  HOME = {env:HOME}\nwhitelist_externals=*\ncommands =\n  pip install hererocks\n  hererocks {toxworkdir}/here --{env:LUA:} --luarocks 2.2\n  {toxworkdir}/here/bin/luarocks make\n  {toxworkdir}/here/bin/luarocks install busted\n  {toxworkdir}/here/bin/busted test/annoy_test.lua\n"
  }
]