Repository: spotify/annoy
Branch: main
Commit: 379f744667ab
Files: 50
Total size: 211.3 KB

Directory structure:
gitextract__vrufcg9/

├── .github/
│   └── workflows/
│       ├── ci.yml
│       └── publish.yml
├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── MANIFEST.in
├── README.rst
├── README_GO.rst
├── README_Lua.md
├── RELEASE.md
├── annoy/
│   ├── __init__.py
│   ├── __init__.pyi
│   └── py.typed
├── annoy-dev-1.rockspec
├── debian/
│   ├── changelog
│   ├── compat
│   ├── control
│   └── rules
├── examples/
│   ├── mmap_test.py
│   ├── precision_test.cpp
│   ├── precision_test.py
│   ├── s_compile_cpp.sh
│   └── simple_test.py
├── setup.cfg
├── setup.py
├── src/
│   ├── annoygomodule.h
│   ├── annoygomodule.i
│   ├── annoylib.h
│   ├── annoyluamodule.cc
│   ├── annoymodule.cc
│   ├── kissrandom.h
│   └── mman.h
├── test/
│   ├── accuracy_test.py
│   ├── angular_index_test.py
│   ├── annoy_test.go
│   ├── annoy_test.lua
│   ├── dot_index_test.py
│   ├── euclidean_index_test.py
│   ├── examples_test.py
│   ├── hamming_index_test.py
│   ├── holes_test.py
│   ├── index_test.py
│   ├── manhattan_index_test.py
│   ├── memory_leak_test.py
│   ├── multithreaded_build_test.py
│   ├── on_disk_build_test.py
│   ├── seed_test.py
│   ├── threading_test.py
│   └── types_test.py
└── tox.ini

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/workflows/ci.yml
================================================
# Continuous integration: installs the package and runs the pytest suite for
# every (OS, Python version) combination in the matrix.
name: Annoy

on:
  push:
    branches:
      - main
  pull_request:

jobs:
  unit-tests:
    # Must follow the matrix: with a hard-coded image the `os` axis below is
    # ignored and every job silently runs on the same Linux runner.
    runs-on: ${{ matrix.os }}
    strategy:
      # Let the remaining combinations finish even if one of them fails.
      fail-fast: false
      matrix:
        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
        # ubuntu-22.04 replaces the retired ubuntu-20.04 hosted image.
        # NOTE(review): confirm every python-version above is still available
        # on each of these images.
        os: ["ubuntu-22.04", "macos-latest", "windows-latest"]

    steps:
      - uses: actions/checkout@v3 # Pull the repository
      - uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      # Build and install annoy itself from the checked-out sources.
      - run: pip install .
      # Test-only dependencies.
      - run: pip install h5py numpy pytest
      - run: pytest -v


================================================
FILE: .github/workflows/publish.yml
================================================
# Release pipeline: triggered by pushing a version tag (vX.Y.Z). The `build`
# job produces wheels on every OS plus one sdist and uploads them as artifacts.
name: Publish

on:
  push:
    tags:
      - 'v*.*.*'

jobs:
  build:
    runs-on: ${{ matrix.os }}
    strategy:
      # Keep building on the other platforms if one of them fails.
      fail-fast: false
      matrix:
        os: [ubuntu-latest, windows-latest, macos-latest]

    steps:
      - name: Checkout code
        # Kept on a current major version, in line with the other actions
        # used in this workflow.
        uses: actions/checkout@v4

      # QEMU emulation is only needed for the aarch64 wheels requested via
      # CIBW_ARCHS_LINUX below.
      - name: Set up QEMU (for Linux aarch64)
        if: runner.os == 'Linux'
        uses: docker/setup-qemu-action@v3
        with:
          platforms: arm64

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.x'

      - name: Install cibuildwheel
        run: python -m pip install cibuildwheel==3.2.1

      - name: Build wheels
        run: python -m cibuildwheel --output-dir dist
        env:
          # Upgrade pip and drop any stale build directory before each build.
          CIBW_BEFORE_BUILD: python -m pip install -U pip && rm -rf build
          # Native architecture(s) plus emulated aarch64 on Linux.
          CIBW_ARCHS_LINUX: auto aarch64

      - name: Upload wheels
        uses: actions/upload-artifact@v4
        with:
          # Artifact names must be unique per job; the `built-` prefix is what
          # the publish job's download pattern matches.
          name: built-wheels-${{ matrix.os }}-${{ strategy.job-index }}
          path: ./dist/*.whl

      # The sdist is platform independent, so it is built on one OS only.
      - name: Build source distribution
        if: matrix.os == 'ubuntu-latest'
        run: python -m pip install build && python -m build --sdist --outdir dist

      - name: Upload sdist
        if: matrix.os == 'ubuntu-latest'
        uses: actions/upload-artifact@v4
        with:
          name: built-sdist
          path: ./dist/*.tar.gz

  # Collects every artifact produced by the build job and uploads it to PyPI.
  publish:
    needs: build
    runs-on: ubuntu-latest
    # pypi trusted publishing via OIDC
    permissions:
      id-token: write
    steps:
      - name: Download all artifacts
        uses: actions/download-artifact@v4
        with:
          # Matches both the per-OS wheel artifacts and the sdist artifact.
          pattern: built-*
          path: dist
          # Flatten all artifacts into a single dist/ directory.
          merge-multiple: true

      # No `password` input on purpose: supplying an API token makes the
      # action authenticate with that token instead of the OIDC identity
      # granted by `id-token: write` above.
      # NOTE(review): requires this workflow to be registered as a trusted
      # publisher for the project on PyPI - confirm before releasing.
      - name: Publish package
        uses: pypa/gh-action-pypi-publish@release/v1
        if: startsWith(github.ref, 'refs/tags/v') && github.event_name == 'push'


================================================
FILE: .gitignore
================================================
# Packaging metadata, compiled objects, build/dist directories,
# editor settings and debug symbols
*.egg-info/
*.egg/
*.so
*.o
build/
dist/
.vscode/
*.pdb

# Generated manifest, Python bytecode and IDE project files
MANIFEST
*.py[cod]
*.idea

# testing: index and dataset files (presumably written by tests and examples)
*.ann
*.tree
*.annoy
*.idx
*.hdf5


================================================
FILE: CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.15...3.25 FATAL_ERROR)

project(Annoy
  DESCRIPTION "Approximate Nearest Neighbors Oh Yeah"
  VERSION 1.17.1
  LANGUAGES CXX)

# Defines CMAKE_INSTALL_INCLUDEDIR and CMAKE_INSTALL_LIBDIR. Loaded before the
# target is configured so the install-time include path below can use the same
# directory the headers are actually installed to.
include(GNUInstallDirs)

# Header-only library: an INTERFACE target that only carries usage
# requirements, plus a namespaced alias for consumers inside the build tree.
add_library(Annoy INTERFACE)
add_library(Annoy::Annoy ALIAS Annoy)

# Stage the public headers under <build>/include/annoy/ so that build-tree
# and installed consumers see the same `annoy/<header>` layout.
foreach(HEADER annoylib.h kissrandom.h mman.h)
  configure_file(
    "${CMAKE_CURRENT_SOURCE_DIR}/src/${HEADER}"
    "${CMAKE_CURRENT_BINARY_DIR}/include/annoy/${HEADER}"
    COPYONLY)
endforeach()

# A relative INSTALL_INTERFACE path is interpreted relative to the install
# prefix. Using CMAKE_INSTALL_INCLUDEDIR (instead of a hard-coded `include`)
# keeps the exported include path in sync with the install(DIRECTORY) rule
# when the include directory is overridden.
target_include_directories(Annoy INTERFACE
  "$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>"
  "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>")

# Install
# The trailing slash installs the directory's contents (annoy/...) rather
# than the `include` directory itself.
install(DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/include/"
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")

install(TARGETS Annoy
  EXPORT AnnoyTargets)

# Installed package file: lets consumers find the package and link
# Annoy::Annoy.
install(EXPORT AnnoyTargets
  FILE AnnoyConfig.cmake
  NAMESPACE Annoy::
  DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/annoy")

# Build-tree equivalent of the file above, for using the project without
# installing it.
export(TARGETS Annoy NAMESPACE Annoy:: FILE AnnoyConfig.cmake)


================================================
FILE: LICENSE
================================================

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright 2021 (c) Spotify and its affiliates.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: MANIFEST.in
================================================
# Extra files to ship in the source distribution: project docs/licence and
# the C++ headers under src/ (presumably needed to compile the extension
# from an sdist - confirm against setup.py).
include README.rst LICENSE ann.png
include src/annoylib.h
include src/kissrandom.h
include src/mman.h


================================================
FILE: README.rst
================================================
Annoy
-----


.. figure:: https://raw.github.com/spotify/annoy/master/ann.png
   :alt: Annoy example
   :align: center

.. image:: https://github.com/spotify/annoy/actions/workflows/ci.yml/badge.svg
   :target: https://github.com/spotify/annoy/actions

Annoy (`Approximate Nearest Neighbors <http://en.wikipedia.org/wiki/Nearest_neighbor_search#Approximate_nearest_neighbor>`__ Oh Yeah) is a C++ library with Python bindings to search for points in space that are close to a given query point. It also creates large read-only file-based data structures that are `mmapped <https://en.wikipedia.org/wiki/Mmap>`__ into memory so that many processes may share the same data.

Install
-------

To install, simply do ``pip install --user annoy`` to pull down the latest version from `PyPI <https://pypi.python.org/pypi/annoy>`_.

For the C++ version, just clone the repo and ``#include "annoylib.h"``.

Background
----------

There are some other libraries to do nearest neighbor search. Annoy is almost as fast as the fastest libraries, (see below), but there is actually another feature that really sets Annoy apart: it has the ability to **use static files as indexes**. In particular, this means you can **share index across processes**. Annoy also decouples creating indexes from loading them, so you can pass around indexes as files and map them into memory quickly. Another nice thing of Annoy is that it tries to minimize memory footprint so the indexes are quite small.

Why is this useful? If you want to find nearest neighbors and you have many CPUs, you only need to build the index once. You can also pass around and distribute static files to use in a production environment, in Hadoop jobs, etc. Any process will be able to load (mmap) the index into memory and will be able to do lookups immediately.

We use it at `Spotify <http://www.spotify.com/>`__ for music recommendations. After running matrix factorization algorithms, every user/item can be represented as a vector in f-dimensional space. This library helps us search for similar users/items. We have many millions of tracks in a high-dimensional space, so memory usage is a prime concern.

Annoy was built by `Erik Bernhardsson <http://www.erikbern.com>`__ in a couple of afternoons during `Hack Week <http://labs.spotify.com/2013/02/15/organizing-a-hack-week/>`__.

Summary of features
-------------------

* `Euclidean distance <https://en.wikipedia.org/wiki/Euclidean_distance>`__, `Manhattan distance <https://en.wikipedia.org/wiki/Taxicab_geometry>`__, `cosine distance <https://en.wikipedia.org/wiki/Cosine_similarity>`__, `Hamming distance <https://en.wikipedia.org/wiki/Hamming_distance>`__, or `Dot (Inner) Product distance <https://en.wikipedia.org/wiki/Dot_product>`__
* Cosine distance is equivalent to Euclidean distance of normalized vectors = sqrt(2-2*cos(u, v))
* Works better if you don't have too many dimensions (like <100) but seems to perform surprisingly well even up to 1,000 dimensions
* Small memory usage
* Lets you share memory between multiple processes
* Index creation is separate from lookup (in particular you can not add more items once the tree has been created)
* Native Python support, tested with 3.7 through 3.13.
* Build index on disk to enable indexing big datasets that won't fit into memory (contributed by `Rene Hollander <https://github.com/ReneHollander>`__)

Python code example
-------------------

.. code-block:: python

  from annoy import AnnoyIndex
  import random

  f = 40  # Length of item vector that will be indexed

  t = AnnoyIndex(f, 'angular')
  for i in range(1000):
      v = [random.gauss(0, 1) for z in range(f)]
      t.add_item(i, v)

  t.build(10) # 10 trees
  t.save('test.ann')

  # ...

  u = AnnoyIndex(f, 'angular')
  u.load('test.ann') # super fast, will just mmap the file
  print(u.get_nns_by_item(0, 1000)) # will find the 1000 nearest neighbors

Right now it only accepts integers as identifiers for items. Note that it will allocate memory for max(id)+1 items because it assumes your items are numbered 0 … n-1. If you need other id's, you will have to keep track of a map yourself.

Full Python API
---------------

* ``AnnoyIndex(f, metric)`` returns a new index that's read-write and stores vector of ``f`` dimensions. Metric can be ``"angular"``, ``"euclidean"``, ``"manhattan"``, ``"hamming"``, or ``"dot"``.
* ``a.add_item(i, v)`` adds item ``i`` (any nonnegative integer) with vector ``v``. Note that it will allocate memory for ``max(i)+1`` items.
* ``a.build(n_trees, n_jobs=-1)`` builds a forest of ``n_trees`` trees. More trees gives higher precision when querying. After calling ``build``, no more items can be added. ``n_jobs`` specifies the number of threads used to build the trees. ``n_jobs=-1`` uses all available CPU cores.
* ``a.save(fn, prefault=False)`` saves the index to disk and loads it (see next function). After saving, no more items can be added.
* ``a.load(fn, prefault=False)`` loads (mmaps) an index from disk. If ``prefault`` is set to ``True``, it will pre-read the entire file into memory (using mmap with ``MAP_POPULATE``). Default is ``False``.
* ``a.unload()`` unloads.
* ``a.get_nns_by_item(i, n, search_k=-1, include_distances=False)`` returns the ``n`` closest items. During the query it will inspect up to ``search_k`` nodes which defaults to ``n_trees * n`` if not provided. ``search_k`` gives you a run-time tradeoff between better accuracy and speed. If you set ``include_distances`` to ``True``, it will return a 2 element tuple with two lists in it: the second one containing all corresponding distances.
* ``a.get_nns_by_vector(v, n, search_k=-1, include_distances=False)`` same but query by vector ``v``.
* ``a.get_item_vector(i)`` returns the vector for item ``i`` that was previously added.
* ``a.get_distance(i, j)`` returns the distance between items ``i`` and ``j``. NOTE: this used to return the *squared* distance, but has been changed as of Aug 2016.
* ``a.get_n_items()`` returns the number of items in the index.
* ``a.get_n_trees()`` returns the number of trees in the index.
* ``a.on_disk_build(fn)`` prepares annoy to build the index in the specified file instead of RAM (execute before adding items, no need to save after build)
* ``a.set_seed(seed)`` will initialize the random number generator with the given seed. Only used for building up the tree, i.e. only necessary to pass this before adding the items. Will have no effect after calling ``a.build(n_trees)`` or ``a.load(fn)``.

Notes:

* There's no bounds checking performed on the values so be careful.
* Annoy uses Euclidean distance of normalized vectors for its angular distance, which for two vectors u,v is equal to ``sqrt(2(1-cos(u,v)))``


The C++ API is very similar: just ``#include "annoylib.h"`` to get access to it.

Tradeoffs
---------

There are just two main parameters needed to tune Annoy: the number of trees ``n_trees`` and the number of nodes to inspect during searching ``search_k``.

* ``n_trees`` is provided during build time and affects the build time and the index size. A larger value will give more accurate results, but larger indexes.
* ``search_k`` is provided in runtime and affects the search performance. A larger value will give more accurate results, but will take longer time to return.

If ``search_k`` is not provided, it will default to ``n * n_trees`` where ``n`` is the number of approximate nearest neighbors. Otherwise, ``search_k`` and ``n_trees`` are roughly independent, i.e. the value of ``n_trees`` will not affect search time if ``search_k`` is held constant and vice versa. Basically it's recommended to set ``n_trees`` as large as possible given the amount of memory you can afford, and it's recommended to set ``search_k`` as large as possible given the time constraints you have for the queries.

You can also trade search speed against loading time, memory usage, and disk IO. On supported platforms, setting ``prefault`` to ``True`` causes the index to be prefaulted during ``load`` and ``save``, i.e. the file is pre-emptively read from disk into memory. With ``prefault`` set to ``False`` (the default), pages of the mmapped index are instead read from disk and cached in memory on-demand, as necessary for a search to complete. This can significantly increase early search times but may be better suited for systems with low memory compared to index size, when few queries are executed against a loaded index, and/or when large areas of the index are unlikely to be relevant to search queries.


How does it work
----------------

Using `random projections <http://en.wikipedia.org/wiki/Locality-sensitive_hashing#Random_projection>`__ and by building up a tree. At every intermediate node in the tree, a random hyperplane is chosen, which divides the space into two subspaces. This hyperplane is chosen by sampling two points from the subset and taking the hyperplane equidistant from them.

We do this k times so that we get a forest of trees. k has to be tuned to your need, by looking at what tradeoff you have between precision and performance.

Hamming distance (contributed by `Martin Aumüller <https://github.com/maumueller>`__) packs the data into 64-bit integers under the hood and uses built-in bit count primitives so it could be quite fast. All splits are axis-aligned.

Dot Product distance (contributed by `Peter Sobot <https://github.com/psobot>`__ and `Pavel Korobov <https://github.com/pkorobov>`__) reduces the provided vectors from dot (or "inner-product") space to a more query-friendly cosine space using `a method by Bachrach et al., at Microsoft Research, published in 2014 <https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/XboxInnerProduct.pdf>`__.


More info
---------

* `Dirk Eddelbuettel <https://github.com/eddelbuettel>`__ provides an `R version of Annoy <http://dirk.eddelbuettel.com/code/rcpp.annoy.html>`__.
* `Andy Sloane <https://github.com/a1k0n>`__ provides a `Java version of Annoy <https://github.com/spotify/annoy-java>`__ although currently limited to cosine and read-only.
* `Pishen Tsai <https://github.com/pishen>`__ provides a `Scala wrapper of Annoy <https://github.com/pishen/annoy4s>`__ which uses JNA to call the C++ library of Annoy.
* `Atsushi Tatsuma <https://github.com/yoshoku>`__ provides `Ruby bindings for Annoy <https://github.com/yoshoku/annoy.rb>`__.
* There is `experimental support for Go <https://github.com/spotify/annoy/blob/master/README_GO.rst>`__ provided by `Taneli Leppä <https://github.com/rosmo>`__.
* `Boris Nagaev <https://github.com/starius>`__ wrote `Lua bindings <https://github.com/spotify/annoy/blob/master/README_Lua.md>`__.
* During part of Spotify Hack Week 2016 (and a bit afterward), `Jim Kang <https://github.com/jimkang>`__ wrote `Node bindings <https://github.com/jimkang/annoy-node>`__ for Annoy.
* `Min-Seok Kim <https://github.com/mskimm>`__ built a `Scala version <https://github.com/mskimm/ann4s>`__ of Annoy.
* `hanabi1224 <https://github.com/hanabi1224>`__ built a read-only `Rust version <https://github.com/hanabi1224/RuAnnoy>`__ of Annoy, together with **dotnet, jvm and dart** read-only bindings.
* `Presentation from New York Machine Learning meetup <http://www.slideshare.net/erikbern/approximate-nearest-neighbor-methods-and-vector-models-nyc-ml-meetup>`__ about Annoy
* Annoy is available as a `conda package <https://anaconda.org/conda-forge/python-annoy>`__ on Linux, OS X, and Windows.
* `ann-benchmarks <https://github.com/erikbern/ann-benchmarks>`__ is a benchmark for several approximate nearest neighbor libraries. Annoy seems to be fairly competitive, especially at higher precisions:

.. figure:: https://raw.githubusercontent.com/erikbern/ann-benchmarks/main/results/glove-100-angular.png
   :alt: ANN benchmarks
   :align: center
   :target: https://github.com/erikbern/ann-benchmarks

Source code
-----------

It's all written in C++ with a handful of ugly optimizations for performance and memory usage. You have been warned :)

The code should support Windows, thanks to `Qiang Kou <https://github.com/thirdwing>`__ and `Timothy Riley <https://github.com/tjrileywisc>`__.

To run the tests, install the test dependencies with ``pip install h5py numpy pytest`` and execute ``pytest``. The test suite includes a big real world dataset that is downloaded from the internet, so it will take a few minutes to execute.

Discuss
-------

Feel free to post any questions or comments to the `annoy-user <https://groups.google.com/group/annoy-user>`__ group. I'm `@fulhack <https://twitter.com/fulhack>`__ on Twitter.


================================================
FILE: README_GO.rst
================================================
Install
-------

To install, you'll need Swig (tested with Swig 4.2.1 on Ubuntu 24.04), and then just::

  swig -go -intgosize 64 -cgo -c++ src/annoygomodule.i
  mkdir -p $(go env GOPATH)/src/annoy
  cp src/annoygomodule_wrap.cxx src/annoy.go src/annoygomodule.h src/annoylib.h src/kissrandom.h test/annoy_test.go $(go env GOPATH)/src/annoy
  cd $(go env GOPATH)/src/annoy
  go mod init github.com/spotify/annoy
  go mod tidy
  go test

Background
----------

See the main README.

Go code example
-------------------

.. code-block:: go

  package main
  
  import (
         "fmt"
         "math/rand"

         "github.com/spotify/annoy"
  )
  
  func main() {
       f := 40
       t := annoy.NewAnnoyIndexAngular(f)
       for i := 0; i < 1000; i++ {
       	 item := make([]float32, 0, f)
       	 for x:= 0; x < f; x++ {
  	     item = append(item, rand.Float32())
  	 }
  	 t.AddItem(i, item)
       }
       t.Build(10)
       t.Save("test.ann")
  
       annoy.DeleteAnnoyIndexAngular(t)
       
       t = annoy.NewAnnoyIndexAngular(f)
       t.Load("test.ann")
       
       result := annoy.NewAnnoyVectorInt()
       defer result.Free()
       t.GetNnsByItem(0, 1000, -1, result)
       fmt.Printf("%v\n", result.ToSlice())
  
  }
  
Right now it only accepts integers as identifiers for items. Note that it will allocate memory for max(id)+1 items because it assumes your items are numbered 0 … n-1. If you need other id's, you will have to keep track of a map yourself.

Full Go API
---------------

See annoygomodule.h. Generally the same as Python API except some arguments are not optional. Go binding does not support multithreaded build.

Tests
-------
A simple test is supplied in test/annoy_test.go.

Discuss
-------

The memory leak in previous versions has been fixed thanks to https://github.com/swig/swig/issues/2292. (The memory leak fix is implemented in https://github.com/Rikanishu/annoy-go.)

Go glue written by Taneli Leppä (@rosmo). You can contact me via email (see https://github.com/rosmo).


================================================
FILE: README_Lua.md
================================================
Install
-------

To install, you'll need Lua (binary + library) and LuaRocks.

If you have Python and Pip, you can get Lua and LuaRocks
using [hererocks](https://github.com/mpeterv/hererocks/),
written by Peter Melnichenko.

```
  pip install hererocks
  hererocks here --lua 5.1 --luarocks 2.2
```

This command installs Lua and LuaRocks locally to directory `here`.
To activate it, add `here/bin` to `PATH`:

```
  export PATH="$(pwd)/here/bin/:$PATH"
```

Then you can use commands `lua`, `luarocks`,
and tools installed by `luarocks`.

To build and install `annoy`, type:

```
  luarocks make
```

Background
----------

See the main README.

Lua code example
----------------

```lua
local annoy = require "annoy"

local f = 3
local t = annoy.AnnoyIndex(f) -- Length of item vector that will be indexed
for i = 0, 999 do
  local v = {math.random(), math.random(), math.random()}
  t:add_item(i, v)
end

t:build(10) -- 10 trees
t:save('test.ann')

-- ...

local u = annoy.AnnoyIndex(f)
u:load('test.ann') -- super fast, will just mmap the file

-- find the 10 nearest neighbors
local neighbors = u:get_nns_by_item(0, 10)
for rank, i in ipairs(neighbors) do
  print("neighbor", rank, "is", i)
end
```

Full Lua API
------------

Lua API closely resembles Python API, see main README. Lua binding does not support multithreaded build.


Tests
-------

File `test/annoy_test.lua` is the literal translation of
`test/annoy_test.py` from Python+Nosetests to Lua+Busted.

To run tests, you need [Busted](http://olivinelabs.com/busted/),
Elegant Lua unit testing. To install it, type:

```
  luarocks install busted
```

To run tests, type:

```
  busted test/annoy_test.lua
```

It will take a few minutes to execute.

Discuss
-------

There might be some memory leaks if inputs are incorrect.
Some functions allocate stack objects calling Lua functions throwing
Lua errors (e.g., `luaL_checkinteger`). A Lua error may omit calling
C++ destructors when unwinding the stack. (If it does, depends on
the Lua implementation and platform being in use.)

Lua binding was written by Boris Nagaev.
You can contact me via email (see https://github.com/starius).


================================================
FILE: RELEASE.md
================================================
How to release
--------------

1. Make sure you're on main. `git checkout main && git fetch && git reset --hard origin/main`
1. Update `setup.py` to the newest version, `git add setup.py && git commit -m "version 1.2.3"`
1. `python setup.py sdist bdist_wheel`
1. `git tag -a v1.2.3 -m "version 1.2.3"`
1. `git push --tags origin main` to push the last version to GitHub
1. Go to https://github.com/spotify/annoy/releases and click "Draft a new release"
1. `twine upload dist/annoy-1.2.3*`

TODO
----

* Wheel


================================================
FILE: annoy/__init__.py
================================================
# Copyright (c) 2013 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.

# This module is a dummy wrapper around the underlying C++ module.
from .annoylib import Annoy as AnnoyIndex


================================================
FILE: annoy/__init__.pyi
================================================

from typing import Sized, overload
from typing_extensions import Literal, Protocol

# Structural type used for vector arguments in this stub: any sized object
# whose integer indexing yields floats.
class _Vector(Protocol, Sized):
    def __getitem__(self, __index: int) -> float: ...

# Type stub for the index class that annoy/__init__.py re-exports from the
# compiled annoylib extension module.
#
# Parameters whose names start with a double underscore are positional-only
# for type checkers.
class AnnoyIndex:
    # Presumably the vector dimensionality passed to __init__ -- TODO confirm.
    f: int
    # `metric` is restricted to the five literal names listed here.
    def __init__(self, f: int, metric: Literal["angular", "euclidean", "manhattan", "hamming", "dot"]) -> None: ...
    def load(self, fn: str, prefault: bool = ...) -> Literal[True]: ...
    def save(self, fn: str, prefault: bool = ...) -> Literal[True]: ...
    # get_nns_by_item overloads: the return type depends on include_distances.
    # 1) include_distances omitted or False -> list of item ids.
    @overload
    def get_nns_by_item(self, i: int, n: int, search_k: int = ..., include_distances: Literal[False] = ...) -> list[int]: ...
    # 2) include_distances=True passed positionally (search_k must be given) -> (ids, distances).
    @overload
    def get_nns_by_item(
        self, i: int, n: int, search_k: int, include_distances: Literal[True]
    ) -> tuple[list[int], list[float]]: ...
    # 3) include_distances=True passed by keyword -> (ids, distances).
    @overload
    def get_nns_by_item(
        self, i: int, n: int, search_k: int = ..., *, include_distances: Literal[True]
    ) -> tuple[list[int], list[float]]: ...
    # get_nns_by_vector mirrors the three get_nns_by_item overloads, taking a
    # query vector instead of an item id.
    @overload
    def get_nns_by_vector(
        self, vector: _Vector, n: int, search_k: int = ..., include_distances: Literal[False] = ...
    ) -> list[int]: ...
    @overload
    def get_nns_by_vector(
        self, vector: _Vector, n: int, search_k: int, include_distances: Literal[True]
    ) -> tuple[list[int], list[float]]: ...
    @overload
    def get_nns_by_vector(
        self, vector: _Vector, n: int, search_k: int = ..., *, include_distances: Literal[True]
    ) -> tuple[list[int], list[float]]: ...
    def get_item_vector(self, __i: int) -> list[float]: ...
    def add_item(self, i: int, vector: _Vector) -> None: ...
    # The methods typed Literal[True] below never return False according to this stub.
    def on_disk_build(self, fn: str) -> Literal[True]: ...
    def build(self, n_trees: int, n_jobs: int = ...) -> Literal[True]: ...
    def unbuild(self) -> Literal[True]: ...
    def unload(self) -> Literal[True]: ...
    def get_distance(self, __i: int, __j: int) -> float: ...
    def get_n_items(self) -> int: ...
    def get_n_trees(self) -> int: ...
    def verbose(self, __v: bool) -> Literal[True]: ...
    def set_seed(self, __s: int) -> None: ...


================================================
FILE: annoy/py.typed
================================================


================================================
FILE: annoy-dev-1.rockspec
================================================
-- Copyright (c) 2016 Boris Nagaev
--
-- Licensed under the Apache License, Version 2.0 (the "License"); you may not
-- use this file except in compliance with the License. You may obtain a copy of
-- the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-- License for the specific language governing permissions and limitations under
-- the License.

-- Development rockspec: builds the Lua binding straight from the git
-- repository using LuaRocks' "builtin" build backend.
package = "annoy"
version = "dev-1"
source = {
    -- GitHub no longer serves the unauthenticated git:// protocol, so fetch
    -- the repository over HTTPS instead.
    url = "git+https://github.com/spotify/annoy.git",
}
description = {
    summary = "Approximate Nearest Neighbors Oh Yeah",
    homepage = "https://github.com/spotify/annoy",
    license = "Apache",
    detailed = [[
Annoy (Approximate Nearest Neighbors Oh Yeah) is a C++ library with Python
Go and Lua bindings to search for points in space that are close to a given
query point. It also creates large read-only file-based data structures
that are mmapped into memory so that many processes may share the same data.
]],
}
dependencies = {
    "lua >= 5.1",
}
build = {
    type = "builtin",
    -- The module is compiled from a single C++ translation unit.
    modules = {
        ['annoy'] = {
            sources = {
                "src/annoyluamodule.cc",
            },
        },
    },
    -- Per-platform overrides: link the C++ standard library explicitly on
    -- unix and mingw32.
    platforms = {
        unix = {
            modules = {
                ['annoy'] = {
                    libraries = {"stdc++"},
                },
            },
        },
        mingw32 = {
            modules = {
                ['annoy'] = {
                    libraries = {"stdc++"},
                },
            },
        },
    },
}


================================================
FILE: debian/changelog
================================================
spotify-annoy (1.0.0) unstable; urgency=low

  * Initial release.

 -- Erik Bernhardsson <erikbern@spotify.com>  Wed, 20 Feb 2013 00:00:00 +0000


================================================
FILE: debian/compat
================================================
7


================================================
FILE: debian/control
================================================
Source: spotify-annoy
Section: non-free/net
Priority: extra
Maintainer: Erik Bernhardsson <erikbern@spotify.com>
Build-Depends: debhelper (>= 7), python-all-dev, python-setuptools
Standards-Version: 3.7.2
XS-Python-Version: >= 2.6

Package: spotify-annoy
Architecture: any
Depends: ${python:Depends}
Description: Python module (written in C++) for high-dimensional approximate nearest neighbor (ANN) queries


================================================
FILE: debian/rules
================================================
#!/usr/bin/make -f

# Catch-all pattern rule: whatever target is requested, run `dh` with that
# target name as its argument.
%:
	dh $@


================================================
FILE: examples/mmap_test.py
================================================
from annoy import AnnoyIndex

# Build a tiny 3-dimensional angular index and write it to disk.
a = AnnoyIndex(3, 'angular')
a.add_item(0, [1, 0, 0])
a.add_item(1, [0, 1, 0])
a.add_item(2, [0, 0, 1])
a.build(-1)
a.save('test.tree')

# Load the saved file into a second index. The metric is passed explicitly so
# it matches the index that produced the file instead of relying on a default.
b = AnnoyIndex(3, 'angular')
b.load('test.tree')

# Query the reloaded index by stored item and by an arbitrary vector.
print(b.get_nns_by_item(0, 100))
print(b.get_nns_by_vector([1.0, 0.5, 0.5], 100))


================================================
FILE: examples/precision_test.cpp
================================================
/*
 * precision_test.cpp

 *
 *  Created on: Jul 13, 2016
 *      Author: Claudio Sanhueza
 *      Contact: csanhuezalobos@gmail.com
 */

#include <iostream>
#include <iomanip>
#include "../src/kissrandom.h"
#include "../src/annoylib.h"
#include <chrono>
#include <algorithm>
#include <map>
#include <random>

using namespace Annoy;
int precision(int f=40, int n=1000000){
	std::chrono::high_resolution_clock::time_point t_start, t_end;

	std::default_random_engine generator;
	std::normal_distribution<double> distribution(0.0, 1.0);

	//******************************************************
	//Building the tree
	AnnoyIndex<int, double, Angular, Kiss32Random, AnnoyIndexMultiThreadedBuildPolicy> t = AnnoyIndex<int, double, Angular, Kiss32Random, AnnoyIndexMultiThreadedBuildPolicy>(f);

	std::cout << "Building index ... be patient !!" << std::endl;
	std::cout << "\"Trees that are slow to grow bear the best fruit\" (Moliere)" << std::endl;


	for(int i=0; i<n; ++i){
		double *vec = (double *) malloc( f * sizeof(double) );

		for(int z=0; z<f; ++z){
			vec[z] = (distribution(generator));
		}

		t.add_item(i, vec);

		std::cout << "Loading objects ...\t object: "<< i+1 << "\tProgress:"<< std::fixed << std::setprecision(2) << (double) i / (double)(n + 1) * 100 << "%\r";

	}
	std::cout << std::endl;
	std::cout << "Building index num_trees = 2 * num_features ...";
	t_start = std::chrono::high_resolution_clock::now();
	t.build(2 * f);
	t_end = std::chrono::high_resolution_clock::now();
	auto duration = std::chrono::duration_cast<std::chrono::seconds>( t_end - t_start ).count();
	std::cout << " Done in "<< duration << " secs." << std::endl;


	std::cout << "Saving index ...";
	t.save("precision.tree");
	std::cout << " Done" << std::endl;


	//******************************************************
	std::vector<int> limits = {10, 100, 1000, 10000};
	int K=10;
	int prec_n = 1000;

	std::map<int, double> prec_sum;
	std::map<int, double> time_sum;
	std::vector<int> closest;

	//init precision and timers map
	for(std::vector<int>::iterator it = limits.begin(); it!=limits.end(); ++it){
		prec_sum[(*it)] = 0.0;
		time_sum[(*it)] = 0.0;
	}

	// doing the work
	for(int i=0; i<prec_n; ++i){

		//select a random node
		int j = rand() % n;

		std::cout << "finding nbs for " << j << std::endl;

		// getting the K closest
		t.get_nns_by_item(j, K, n, &closest, nullptr);

		std::vector<int> toplist;
		std::vector<int> intersection;

		for(std::vector<int>::iterator limit = limits.begin(); limit!=limits.end(); ++limit){

			t_start = std::chrono::high_resolution_clock::now();
			t.get_nns_by_item(j, (*limit), (size_t) -1, &toplist, nullptr); //search_k defaults to "n_trees * n" if not provided.
			t_end = std::chrono::high_resolution_clock::now();
			auto duration = std::chrono::duration_cast<std::chrono::milliseconds>( t_end - t_start ).count();

			//intersecting results
			std::sort(closest.begin(), closest.end(), std::less<int>());
			std::sort(toplist.begin(), toplist.end(), std::less<int>());
			intersection.resize(std::max(closest.size(), toplist.size()));
			std::vector<int>::iterator it_set = std::set_intersection(closest.begin(), closest.end(), toplist.begin(), toplist.end(), intersection.begin());
			intersection.resize(it_set-intersection.begin());

			// storing metrics
			int found = intersection.size();
			double hitrate = found / (double) K;
			prec_sum[(*limit)] += hitrate;

			time_sum[(*limit)] += duration;


			//deallocate memory
			vector<int>().swap(intersection);
			vector<int>().swap(toplist);
		}

		//print resulting metrics
		for(std::vector<int>::iterator limit = limits.begin(); limit!=limits.end(); ++limit){
			std::cout << "limit: " << (*limit) << "\tprecision: "<< std::fixed << std::setprecision(2) << (100.0 * prec_sum[(*limit)] / (i + 1)) << "% \tavg. time: "<< std::fixed<< std::setprecision(6) << (time_sum[(*limit)] / (i + 1)) * 1e-04 << "s" << std::endl;
		}

		closest.clear(); vector<int>().swap(closest);

	}

	std::cout << "\nDone" << std::endl;
	return 0;
}


// Writes the command-line usage summary, followed by a blank line, to
// standard output.
void help(){
	static const char* const usage_lines[] = {
		"Annoy Precision C++ example",
		"Usage:",
		"(default)		./precision",
		"(using parameters)	./precision num_features num_nodes",
		""
	};
	const size_t line_count = sizeof(usage_lines) / sizeof(usage_lines[0]);
	for(size_t idx = 0; idx < line_count; ++idx){
		std::cout << usage_lines[idx] << std::endl;
	}
}

// Echoes the chosen run parameters (feature count f and node count n) to
// standard output, followed by a blank line.
void feedback(int f, int n){
	std::cout << "Runing precision example with:" << std::endl
	          << "num. features: " << f << std::endl
	          << "num. nodes: " << n << std::endl
	          << std::endl;
}


// Entry point.
//   no arguments   -> run with the defaults (40 features, 1000000 nodes)
//   two arguments  -> num_features and num_nodes, both positive integers
//   anything else  -> print usage and fail
int main(int argc, char **argv) {
	int f = 40;
	int n = 1000000;

	if(argc == 3){
		f = atoi(argv[1]);
		n = atoi(argv[2]);

		// atoi yields 0 for non-numeric text; zero or negative sizes cannot
		// be benchmarked, so treat them as a usage error.
		if(f <= 0 || n <= 0){
			help();
			return EXIT_FAILURE;
		}
	}
	else if(argc != 1){
		help();
		return EXIT_FAILURE;
	}

	feedback(f, n);

	// Propagate a failure reported by the benchmark itself.
	return precision(f, n) == 0 ? EXIT_SUCCESS : EXIT_FAILURE;
}


================================================
FILE: examples/precision_test.py
================================================
from __future__ import print_function
import random, time
from annoy import AnnoyIndex

try:
    xrange
except NameError:
    # Python 3 compat
    xrange = range

# Benchmark size: number of indexed vectors and components per vector.
n, f = 100000, 40

# Index n random Gaussian vectors under the angular metric.
t = AnnoyIndex(f, 'angular')
for i in xrange(n):
    t.add_item(i, [random.gauss(0, 1) for _ in xrange(f)])

t.build(2 * f)
t.save('test.tree')

# search_k budgets to compare, neighbours requested per query, query count.
limits = [10, 100, 1000, 10000]
k = 10
prec_n = 1000
prec_sum = dict.fromkeys(limits, 0.0)
time_sum = dict.fromkeys(limits, 0.0)

for i in xrange(prec_n):
    j = random.randrange(0, n)

    # Reference answer: the k neighbours found with search_k = n.
    closest = set(t.get_nns_by_item(j, k, n))
    for limit in limits:
        started = time.time()
        toplist = t.get_nns_by_item(j, k, limit)
        elapsed = time.time() - started

        # Fraction of the reference neighbours recovered at this budget.
        found = len(closest.intersection(toplist))
        prec_sum[limit] += 1.0 * found / k
        time_sum[limit] += elapsed

# i still holds the last query number, so i + 1 is the number of queries run.
for limit in limits:
    avg_precision = 100.0 * prec_sum[limit] / (i + 1)
    avg_time = time_sum[limit] / (i + 1)
    print('limit: %-9d precision: %6.2f%% avg time: %.6fs'
          % (limit, avg_precision, avg_time))


================================================
FILE: examples/s_compile_cpp.sh
================================================
#!/bin/bash


echo "compiling precision example..."
# Invoke the compiler directly (no eval) and report success only when it
# actually succeeded, so callers can rely on the exit status.
if g++ precision_test.cpp -DANNOYLIB_MULTITHREADED_BUILD -o precision_test -std=c++14 -pthread; then
    echo "Done"
else
    echo "Compilation failed" >&2
    exit 1
fi


================================================
FILE: examples/simple_test.py
================================================
from annoy import AnnoyIndex

# Build a three-item angular index in memory from the unit vectors.
a = AnnoyIndex(3, 'angular')
for item_id, unit_vector in enumerate([[1, 0, 0], [0, 1, 0], [0, 0, 1]]):
    a.add_item(item_id, unit_vector)
a.build(-1)

# Query by stored item, then by an arbitrary vector, asking for up to 100 ids.
neighbours_of_item = a.get_nns_by_item(0, 100)
print(neighbours_of_item)
neighbours_of_vector = a.get_nns_by_vector([1.0, 0.5, 0.5], 100)
print(neighbours_of_vector)


================================================
FILE: setup.cfg
================================================
[nosetests]
attr=!slow
nocapture=1


================================================
FILE: setup.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.

from setuptools import setup, Extension
import os
import platform
import sys

# reStructuredText preamble placed in front of the README text to form the
# package's long description.
readme_note = """\
.. note::

   For the latest source, discussion, etc, please visit the
   `GitHub repository <https://github.com/spotify/annoy>`_\n\n

.. image:: https://img.shields.io/github/stars/spotify/annoy.svg
    :target: https://github.com/spotify/annoy

"""

# Read the README as UTF-8 and append it to the preamble.
with open('README.rst', encoding='utf-8') as readme_file:
    readme_text = readme_file.read()
long_description = readme_note + readme_text

# Compiler and linker flags, assembled from the host platform.
machine = platform.machine()
system = platform.system()
on_windows = os.name == 'nt'

extra_compile_args = ['-D_CRT_SECURE_NO_WARNINGS', '-fpermissive']
extra_link_args = []

# CPU tuning: ppc64le uses -mcpu; x86_64 uses -march, except on Intel Darwin.
# Not all CPUs have march as a tuning parameter.
if machine == 'ppc64le':
    extra_compile_args.append('-mcpu=native')
elif machine == 'x86_64' and system != 'Darwin':
    extra_compile_args.append('-march=native')

# Optimisation flags are only added when not on Windows.
if not on_windows:
    extra_compile_args.extend(['-O3', '-ffast-math', '-fno-associative-math'])

# The multithreaded build is enabled on every platform under Python 3 and on
# non-Windows platforms under Python 2.
python_major_version = sys.version_info[0]
if python_major_version == 3 or (python_major_version == 2 and not on_windows):
    extra_compile_args.append('-DANNOYLIB_MULTITHREADED_BUILD')

    if not on_windows:
        extra_compile_args.append('-std=c++14')

# #349: something with OS X Mojave causes libstd not to be found
if system == 'Darwin':
    extra_compile_args.append('-mmacosx-version-min=10.12')
    extra_link_args.extend(['-stdlib=libc++', '-mmacosx-version-min=10.12'])

# Manual configuration, you're on your own here: a comma-separated
# environment variable replaces the computed list entirely.
manual_compiler_args = os.environ.get('ANNOY_COMPILER_ARGS')
if manual_compiler_args:
    extra_compile_args = manual_compiler_args.split(',')
manual_linker_args = os.environ.get('ANNOY_LINKER_ARGS')
if manual_linker_args:
    extra_link_args = manual_linker_args.split(',')

# Interpreter versions advertised through trove classifiers, in listing order.
python_versions = [
    '2.6', '2.7',
    '3.3', '3.4', '3.5', '3.6', '3.7', '3.8', '3.9',
    '3.10', '3.11', '3.12', '3.13',
]
classifiers = [
    'Development Status :: 5 - Production/Stable',
    'Programming Language :: Python',
] + ['Programming Language :: Python :: ' + version for version in python_versions]

# The compiled extension: one C++ source file plus the headers it depends on.
annoy_extension = Extension(
    'annoy.annoylib', ['src/annoymodule.cc'],
    depends=['src/annoylib.h', 'src/kissrandom.h', 'src/mman.h'],
    extra_compile_args=extra_compile_args,
    extra_link_args=extra_link_args,
)

setup(
    name='annoy',
    version='1.17.3',
    description='Approximate Nearest Neighbors in C++/Python optimized for memory usage and loading/saving to disk.',
    packages=['annoy'],
    # Ship the type stub and the py.typed marker with the package.
    package_data={'annoy': ['__init__.pyi', 'py.typed']},
    ext_modules=[annoy_extension],
    long_description=long_description,
    long_description_content_type='text/x-rst',
    author='Erik Bernhardsson',
    author_email='mail@erikbern.com',
    url='https://github.com/spotify/annoy',
    license='Apache License 2.0',
    classifiers=classifiers,
    keywords='nns, approximate nearest neighbor search',
    setup_requires=['nose>=1.0'],
    tests_require=['numpy', 'h5py'],
)


================================================
FILE: src/annoygomodule.h
================================================
#include "annoylib.h"
#include "kissrandom.h"

using namespace Annoy;

namespace GoAnnoy {


// Holds a malloc'd array of floats together with its length; the Go binding
// uses it to receive distances and item vectors.
class AnnoyVectorFloat {
    protected:
        float *ptr;  // malloc'd storage, or NULL when nothing has been stored
        int len;     // number of valid elements reachable through ptr

    public:
      // Start out empty so the destructor and fill_from_vector never see an
      // uninitialised pointer, however the object was created.
      AnnoyVectorFloat() : ptr(NULL), len(0) {}
      ~AnnoyVectorFloat() {
        free(ptr);
      }
      // Raw pointer to the stored elements (may be NULL).
      float* ArrayPtr() {
        return ptr;
      }
      // Number of stored elements.
      int Len() {
        return len;
      }
      // Element i, or 0.0 when i is outside [0, Len()).
      float Get(int i) {
        if (i < 0 || i >= len) {
            return 0.0;
        }
        return ptr[i];
      }
      // Replaces the current contents with a copy of *v. If the allocation
      // fails the vector is left empty.
      void fill_from_vector(vector<float>* v) {
            free(ptr);  // free(NULL) is a no-op
            len = 0;
            const size_t count = v->size();
            ptr = (float*) malloc(count * sizeof(float));
            if (ptr == NULL) {
                return;
            }
            for (size_t i = 0; i < count; i++) {
                ptr[i] = (float)(*v)[i];
            }
            len = (int)count;
      }
};

// Holds a malloc'd array of int32 values together with its length; the Go
// binding uses it to receive item ids.
class AnnoyVectorInt {
    protected:
        int32_t *ptr;  // malloc'd storage, or NULL when nothing has been stored
        int len;       // number of valid elements reachable through ptr

    public:
      // Start out empty so the destructor and fill_from_vector never see an
      // uninitialised pointer, however the object was created.
      AnnoyVectorInt() : ptr(NULL), len(0) {}
      ~AnnoyVectorInt() {
        free(ptr);
      }
      // Raw pointer to the stored elements (may be NULL).
      int32_t* ArrayPtr() {
        return ptr;
      }
      // Number of stored elements.
      int Len() {
        return len;
      }
      // Element i, or 0 when i is outside [0, Len()).
      int32_t Get(int i) {
        if (i < 0 || i >= len) {
            return 0;
        }
        return ptr[i];
      }
      // Replaces the current contents with a copy of *v. If the allocation
      // fails the vector is left empty.
      void fill_from_vector(vector<int32_t>* v) {
            free(ptr);  // free(NULL) is a no-op
            len = 0;
            const size_t count = v->size();
            ptr = (int32_t*) malloc(count * sizeof(int32_t));
            if (ptr == NULL) {
                return;
            }
            for (size_t i = 0; i < count; i++) {
                ptr[i] = (int32_t)(*v)[i];
            }
            len = (int)count;
      }
};

class AnnoyIndex {
 protected:
  ::AnnoyIndexInterface<int32_t, float> *ptr;

  int f;

 public:
  ~AnnoyIndex() {
    delete ptr;
  };
  void addItem(int item, const float* w) {
    ptr->add_item(item, w);
  };
  void build(int q) {
    ptr->build(q, 1);
  };
  bool save(const char* filename, bool prefault) {
    return ptr->save(filename, prefault);
  };
  bool save(const char* filename) {
    return ptr->save(filename, true);
  };
  void unload() {
    ptr->unload();
  };
  bool load(const char* filename, bool prefault) {
    return ptr->load(filename, prefault);
  };
  bool load(const char* filename) {
    return ptr->load(filename, true);
  };
  float getDistance(int i, int j) {
    return ptr->get_distance(i, j);
  };
  void getNnsByItem(int item, int n, int search_k, AnnoyVectorInt* out_result, AnnoyVectorFloat* out_distances) {
    vector<int32_t>* result = new vector<int32_t>();
    vector<float>* distances = new vector<float>();

    ptr->get_nns_by_item(item, n, search_k, result, distances);

    out_result->fill_from_vector(result);
    out_distances->fill_from_vector(distances);
    delete result;
    delete distances;
  };
  void getNnsByVector(const float* w, int n, int search_k, AnnoyVectorInt* out_result, AnnoyVectorFloat* out_distances) {
    vector<int32_t>* result = new vector<int32_t>();
    vector<float>* distances = new vector<float>();

    ptr->get_nns_by_vector(w, n, search_k, result, distances);

    out_result->fill_from_vector(result);
    out_distances->fill_from_vector(distances);
    delete result;
    delete distances;
  };
  void getNnsByItem(int item, int n, int search_k, AnnoyVectorInt* out_result) {
    vector<int32_t>* result = new vector<int32_t>();

    ptr->get_nns_by_item(item, n, search_k, result, NULL);

    out_result->fill_from_vector(result);
    delete result;
  };
  void getNnsByVector(const float* w, int n, int search_k, AnnoyVectorInt* out_result) {
    vector<int32_t>* result = new vector<int32_t>();

    ptr->get_nns_by_vector(w, n, search_k, result, NULL);

    out_result->fill_from_vector(result);
    delete result;
  };

  int getNItems() {
    return (int)ptr->get_n_items();
  };
  void verbose(bool v) {
    ptr->verbose(v);
  };
  void getItem(int item, AnnoyVectorFloat *v) {
    vector<float>* r = new vector<float>();
    r->resize(this->f);
    ptr->get_item(item, &r->front());
    v->fill_from_vector(r);
  };
  bool onDiskBuild(const char* filename) {
    return ptr->on_disk_build(filename);
  };
};

class AnnoyIndexAngular : public AnnoyIndex 
{
 public:
  AnnoyIndexAngular(int f) {
    ptr = new ::AnnoyIndex<int32_t, float, ::Angular, ::Kiss64Random, AnnoyIndexSingleThreadedBuildPolicy>(f);
    this->f = f;
  }
};

class AnnoyIndexEuclidean : public AnnoyIndex {
 public:
  AnnoyIndexEuclidean(int f) {
    ptr = new ::AnnoyIndex<int32_t, float, ::Euclidean, ::Kiss64Random, AnnoyIndexSingleThreadedBuildPolicy>(f);
    this->f = f;
  }
};

class AnnoyIndexManhattan : public AnnoyIndex {
 public:
  AnnoyIndexManhattan(int f) {
    ptr = new ::AnnoyIndex<int32_t, float, ::Manhattan, ::Kiss64Random, AnnoyIndexSingleThreadedBuildPolicy>(f);
    this->f = f;
  }
};

class AnnoyIndexDotProduct : public AnnoyIndex {
 public:
  AnnoyIndexDotProduct(int f) {
    ptr = new ::AnnoyIndex<int32_t, float, ::DotProduct, ::Kiss64Random, AnnoyIndexSingleThreadedBuildPolicy>(f);
    this->f = f;
  }
};
}


================================================
FILE: src/annoygomodule.i
================================================
%module annoy

namespace Annoy {}

%{
#include "annoygomodule.h"
%}


// const float *
%typemap(gotype) (const float *)  "[]float32"
%typemap(gotype) (int32_t)  "int32"

// Input typemap for (const float *) arguments: copies the elements of the
// incoming Go slice one by one into a local vector<float> and passes the
// address of that vector's first element to the wrapped function.
// NOTE(review): for an empty slice the vector stays empty and the address of
// its first element is still taken -- confirm callers never pass empty slices.
%typemap(in) (const float *)
%{
    float *v;
    vector<float> w;
    v = (float *)$input.array;
    for (int i = 0; i < $input.len; i++) {
       w.push_back(v[i]);
    }
    $1 = &w[0];
%}


%typemap(gotype) (const char *) "string"

// Input typemap for (const char *) arguments: allocates a zero-filled buffer
// one byte longer than the Go string and copies the string bytes into it, so
// the wrapped function receives a NUL-terminated copy.
%typemap(in) (const char *)
%{
  $1 = (char *)calloc((((_gostring_)$input).n + 1), sizeof(char));
  strncpy($1, (((_gostring_)$input).p), ((_gostring_)$input).n);
%}

// Cleanup typemap for (const char *) arguments: frees the argument buffer
// after the wrapped call (the input typemap for this type allocates it with
// calloc).
%typemap(freearg) (const char *)
%{
  free($1);
%}


%ignore fill_from_vector;
%rename(X_RawAnnoyVectorInt) AnnoyVectorInt;
%rename(X_RawAnnoyVectorFloat) AnnoyVectorFloat;

%insert(go_wrapper) %{

// AnnoyVectorInt extends the SWIG-generated X_RawAnnoyVectorInt with helpers
// for moving its contents into Go slices.
type AnnoyVectorInt interface {
  X_RawAnnoyVectorInt
  ToSlice() []int32
  Copy(in *[]int32)
  InnerArray() []int32
  Free()
}

// NewAnnoyVectorInt creates a new native vector. Release it with Free.
func NewAnnoyVectorInt() AnnoyVectorInt {
    vec := NewX_RawAnnoyVectorInt()
    return vec.(SwigcptrX_RawAnnoyVectorInt)
}

// ToSlice returns the vector's contents as a newly filled Go slice.
func (p SwigcptrX_RawAnnoyVectorInt) ToSlice() []int32 {
    var out []int32
    p.Copy(&out)
    return out
}

// Copy copies the vector's contents into *in. The existing backing array is
// reused when its capacity suffices; otherwise a new slice is allocated.
func (p SwigcptrX_RawAnnoyVectorInt) Copy(in *[]int32)  {
    out := *in
    inner := p.InnerArray()
    if cap(out) >= len(inner) {
        if len(out) != len(inner) {
          out = out[:len(inner)]
        }
    } else {
        out = make([]int32, len(inner))
    }

    copy(out, inner)
    *in = out
}

// Free destroys the underlying native vector.
func (p SwigcptrX_RawAnnoyVectorInt) Free() {
    DeleteX_RawAnnoyVectorInt(p)
}

// InnerArray returns a slice that aliases the native array without copying,
// or nil when the vector holds no elements.
func (p SwigcptrX_RawAnnoyVectorInt) InnerArray() []int32 {
    length := p.Len()
    ptr := unsafe.Pointer(p.ArrayPtr())
    // Slicing through a nil array pointer panics, so an empty vector is
    // reported as a nil slice instead.
    if ptr == nil || length <= 0 {
        return nil
    }
    return ((*[1 << 30]int32)(ptr))[:length:length]
}

%}

%insert(go_wrapper) %{

// AnnoyVectorFloat extends the SWIG-generated X_RawAnnoyVectorFloat with
// helpers for moving its contents into Go slices.
type AnnoyVectorFloat interface {
  X_RawAnnoyVectorFloat
  ToSlice() []float32
  Copy(in *[]float32)
  InnerArray() []float32
  Free()
}

// NewAnnoyVectorFloat creates a new native vector. Release it with Free.
func NewAnnoyVectorFloat() AnnoyVectorFloat {
    vec := NewX_RawAnnoyVectorFloat()
    return vec.(SwigcptrX_RawAnnoyVectorFloat)
}

// ToSlice returns the vector's contents as a newly filled Go slice.
func (p SwigcptrX_RawAnnoyVectorFloat) ToSlice() []float32 {
    var out []float32
    p.Copy(&out)
    return out
}

// Copy copies the vector's contents into *in. The existing backing array is
// reused when its capacity suffices; otherwise a new slice is allocated.
func (p SwigcptrX_RawAnnoyVectorFloat) Copy(in *[]float32)  {
    out := *in
    inner := p.InnerArray()
    if cap(out) >= len(inner) {
        if len(out) != len(inner) {
          out = out[:len(inner)]
        }
    } else {
        out = make([]float32, len(inner))
    }

    copy(out, inner)
    *in = out
}

// Free destroys the underlying native vector.
func (p SwigcptrX_RawAnnoyVectorFloat) Free() {
    DeleteX_RawAnnoyVectorFloat(p)
}

// InnerArray returns a slice that aliases the native array without copying,
// or nil when the vector holds no elements.
func (p SwigcptrX_RawAnnoyVectorFloat) InnerArray() []float32 {
    length := p.Len()
    ptr := unsafe.Pointer(p.ArrayPtr())
    // Slicing through a nil array pointer panics, so an empty vector is
    // reported as a nil slice instead.
    if ptr == nil || length <= 0 {
        return nil
    }
    return ((*[1 << 30]float32)(ptr))[:length:length]
}

%}

/* Let's just grab the original header file here */
%include "annoygomodule.h"

%feature("notabstract") GoAnnoyIndexAngular;
%feature("notabstract") GoAnnoyIndexEuclidean;
%feature("notabstract") GoAnnoyIndexManhattan;
%feature("notabstract") GoAnnoyIndexDotProduct;

================================================
FILE: src/annoylib.h
================================================
// Copyright (c) 2013 Spotify AB
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.


#ifndef ANNOY_ANNOYLIB_H
#define ANNOY_ANNOYLIB_H

#include <stdio.h>
#include <sys/stat.h>
#ifndef _MSC_VER
#include <unistd.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <fcntl.h>
#include <stddef.h>

#if defined(_MSC_VER) && _MSC_VER == 1500
typedef unsigned char     uint8_t;
typedef signed __int32    int32_t;
typedef unsigned __int64  uint64_t;
typedef signed __int64    int64_t;
#else
#include <stdint.h>
#endif

#if defined(_MSC_VER) || defined(__MINGW32__)
 // a bit hacky, but override some definitions to support 64 bit
 #define off_t int64_t
 #define lseek_getsize(fd) _lseeki64(fd, 0, SEEK_END)
 #ifndef NOMINMAX
  #define NOMINMAX
 #endif
 #include "mman.h"
 #include <windows.h>
#else
 #include <sys/mman.h>
 #define lseek_getsize(fd) lseek(fd, 0, SEEK_END)
#endif

#include <cerrno>
#include <string.h>
#include <math.h>
#include <vector>
#include <algorithm>
#include <queue>
#include <limits>

#if __cplusplus >= 201103L
#include <type_traits>
#endif

#ifdef ANNOYLIB_MULTITHREADED_BUILD
#include <thread>
#include <mutex>
#include <shared_mutex>
#endif

#ifdef _MSC_VER
// Needed for Visual Studio to disable runtime checks for mempcy
#pragma runtime_checks("s", off)
#endif

// This allows others to supply their own logger / error printer without
// requiring Annoy to import their headers. See RcppAnnoy for a use case.
#ifndef __ERROR_PRINTER_OVERRIDE__
  #define annoylib_showUpdate(...) { fprintf(stderr, __VA_ARGS__ ); }
#else
  #define annoylib_showUpdate(...) { __ERROR_PRINTER_OVERRIDE__( __VA_ARGS__ ); }
#endif

// Portable alloc definition, cf Writing R Extensions, Section 1.6.4
#ifdef __GNUC__
  // Includes GCC, clang and Intel compilers
  # undef alloca
  # define alloca(x) __builtin_alloca((x))
#elif defined(__sun) || defined(_AIX)
  // this is necessary (and sufficient) for Solaris 10 and AIX 6:
  # include <alloca.h>
#endif

// We let the v array in the Node struct take whatever space is needed, so this is a mostly insignificant number.
// Compilers need *some* size defined for the v array, and some memory checking tools will flag for buffer overruns if this is set too low.
#define ANNOYLIB_V_ARRAY_SIZE 65536

#ifndef _MSC_VER
#define annoylib_popcount __builtin_popcountll
#else // See #293, #358
#define annoylib_popcount cole_popcount
#endif

#if !defined(NO_MANUAL_VECTORIZATION) && defined(__GNUC__) && (__GNUC__ >6) && defined(__AVX512F__)  // See #402
#define ANNOYLIB_USE_AVX512
#elif !defined(NO_MANUAL_VECTORIZATION) && defined(__AVX__) && defined (__SSE__) && defined(__SSE2__) && defined(__SSE3__)
#define ANNOYLIB_USE_AVX
#else
#endif

#if defined(ANNOYLIB_USE_AVX) || defined(ANNOYLIB_USE_AVX512)
#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__GNUC__)
#include <x86intrin.h>
#endif
#endif

#if !defined(__MINGW32__)
#define ANNOYLIB_FTRUNCATE_SIZE(x) static_cast<int64_t>(x)
#else
#define ANNOYLIB_FTRUNCATE_SIZE(x) (x)
#endif

namespace Annoy {

// Reports a failure described by the current value of errno.
//
// The message "<msg>: <strerror(errno)> (<errno>)" is sent to the update
// printer and, when `error` is non-NULL, also written into a freshly
// malloc'ed buffer whose address is stored in *error. If that allocation
// fails, *error is set to NULL instead of being written through.
inline void set_error_from_errno(char **error, const char* msg) {
  // Snapshot errno before doing anything else: the printer call below is free
  // to modify it, which would make the stored message disagree with the log.
  const int saved_errno = errno;
  annoylib_showUpdate("%s: %s (%d)\n", msg, strerror(saved_errno), saved_errno);
  if (error) {
    *error = (char *)malloc(256);  // TODO: win doesn't support snprintf
    if (*error) {
      snprintf(*error, 255, "%s: %s (%d)", msg, strerror(saved_errno), saved_errno);
    }
  }
}

// Reports a failure described by a fixed message.
//
// The message is sent to the update printer and, when `error` is non-NULL,
// duplicated into a freshly malloc'ed buffer whose address is stored in
// *error. If that allocation fails, *error is set to NULL.
inline void set_error_from_string(char **error, const char* msg) {
  annoylib_showUpdate("%s\n", msg);
  if (error) {
    const size_t n_bytes = strlen(msg) + 1;  // include the terminating NUL
    *error = (char *)malloc(n_bytes);
    if (*error) {
      memcpy(*error, msg, n_bytes);
    }
  }
}


using std::vector;
using std::pair;
using std::numeric_limits;
using std::make_pair;

// Resizes a file-backed mapping from old_size to new_size bytes and resizes
// the backing file to match.
//
// On return *_ptr holds whatever the mapping call returned, which is
// MAP_FAILED when the (re)mapping failed. Returns true only when both the
// file truncation and the mapping succeeded; previously a failed mapping was
// reported as success as long as the truncation worked.
inline bool remap_memory_and_truncate(void** _ptr, int _fd, size_t old_size, size_t new_size) {
#ifdef __linux__
    *_ptr = mremap(*_ptr, old_size, new_size, MREMAP_MAYMOVE);
    bool ok = ftruncate(_fd, new_size) != -1;
#else
    // No mremap available: drop the old mapping, resize the file, map again.
    munmap(*_ptr, old_size);
    bool ok = ftruncate(_fd, ANNOYLIB_FTRUNCATE_SIZE(new_size)) != -1;
#ifdef MAP_POPULATE
    *_ptr = mmap(*_ptr, new_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, _fd, 0);
#else
    *_ptr = mmap(*_ptr, new_size, PROT_READ | PROT_WRITE, MAP_SHARED, _fd, 0);
#endif
#endif
    // A mapping failure must be visible to the caller as well.
    return ok && (*_ptr != MAP_FAILED);
}

namespace {

// Returns the address of the i-th node in a buffer of fixed-size records,
// where every record occupies _s bytes starting at _nodes.
template<typename S, typename Node>
inline Node* get_node_ptr(const void* _nodes, const size_t _s, const S i) {
  uint8_t* first_byte = (uint8_t *)_nodes;
  uint8_t* record = first_byte + (_s * i);
  return (Node*)record;
}

// Inner product of the first f components of x and y, accumulated in index
// order. Returns 0 when f <= 0.
template<typename T>
inline T dot(const T* x, const T* y, int f) {
  T total = 0;
  for (int k = 0; k < f; ++k)
    total += x[k] * y[k];
  return total;
}

// Sum of absolute component-wise differences (L1 distance) over the first f
// components of x and y, accumulated in index order.
template<typename T>
inline T manhattan_distance(const T* x, const T* y, int f) {
  T total = 0.0;
  for (int remaining = f; remaining > 0; --remaining) {
    total += fabs(*x - *y);
    ++x;
    ++y;
  }
  return total;
}

// Squared Euclidean distance over the first f components of x and y. No
// square root is taken here.
template<typename T>
inline T euclidean_distance(const T* x, const T* y, int f) {
  // Sum the squared differences directly rather than expanding into dot
  // products: that expansion suffers catastrophic cancellation (see #314).
  T sum_sq = 0.0;
  for (int k = 0; k < f; ++k) {
    const T delta = x[k] - y[k];
    sum_sq += delta * delta;
  }
  return sum_sq;
}

#ifdef ANNOYLIB_USE_AVX
// Horizontal sum of a 256-bit vector: adds its eight packed floats together
// and returns the total as a scalar.
inline float hsum256_ps_avx(__m256 v) {
  // Add the upper 128-bit half onto the lower half (8 values -> 4 partial sums).
  const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(v, 1), _mm256_castps256_ps128(v));
  // Add the high pair of lanes onto the low pair (4 -> 2 partial sums).
  const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
  // Add lane 1 onto lane 0; the 0x55 shuffle replicates lane 1 into every lane.
  const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
  return _mm_cvtss_f32(x32);
}

// AVX specialization of dot() for float: processes eight components per
// iteration with unaligned loads, then handles the remaining (< 8) components
// with a scalar loop.
template<>
inline float dot<float>(const float* x, const float *y, int f) {
  float result = 0;
  if (f > 7) {
    // Eight running partial sums, one per lane.
    __m256 d = _mm256_setzero_ps();
    for (; f > 7; f -= 8) {
      d = _mm256_add_ps(d, _mm256_mul_ps(_mm256_loadu_ps(x), _mm256_loadu_ps(y)));
      x += 8;
      y += 8;
    }
    // Sum all floats in dot register.
    result += hsum256_ps_avx(d);
  }
  // Don't forget the remaining values.
  for (; f > 0; f--) {
    result += *x * *y;
    x++;
    y++;
  }
  return result;
}

// AVX specialization of manhattan_distance() for float: processes eight
// components per iteration with unaligned loads, then handles the remaining
// (< 8) components with a scalar loop.
template<>
inline float manhattan_distance<float>(const float* x, const float* y, int f) {
  float result = 0;
  int i = f;  // number of components still to be processed
  if (f > 7) {
    __m256 manhattan = _mm256_setzero_ps();
    // -0.0f has only the sign bit set, so it serves as a sign-bit mask.
    __m256 minus_zero = _mm256_set1_ps(-0.0f);
    for (; i > 7; i -= 8) {
      const __m256 x_minus_y = _mm256_sub_ps(_mm256_loadu_ps(x), _mm256_loadu_ps(y));
      const __m256 distance = _mm256_andnot_ps(minus_zero, x_minus_y); // Absolute value of x_minus_y (forces sign bit to zero)
      manhattan = _mm256_add_ps(manhattan, distance);
      x += 8;
      y += 8;
    }
    // Sum all floats in manhattan register.
    result = hsum256_ps_avx(manhattan);
  }
  // Don't forget the remaining values.
  for (; i > 0; i--) {
    result += fabsf(*x - *y);
    x++;
    y++;
  }
  return result;
}

// AVX specialization of euclidean_distance() for float. Returns the squared
// distance (no square root is taken here). Processes eight components per
// iteration with unaligned loads, then handles the remaining (< 8) components
// with a scalar loop.
template<>
inline float euclidean_distance<float>(const float* x, const float* y, int f) {
  float result=0;
  if (f > 7) {
    // Eight running partial sums of squared differences, one per lane.
    __m256 d = _mm256_setzero_ps();
    for (; f > 7; f -= 8) {
      const __m256 diff = _mm256_sub_ps(_mm256_loadu_ps(x), _mm256_loadu_ps(y));
      d = _mm256_add_ps(d, _mm256_mul_ps(diff, diff)); // no support for fmadd in AVX...
      x += 8;
      y += 8;
    }
    // Sum all floats in the accumulator register.
    result = hsum256_ps_avx(d);
  }
  // Don't forget the remaining values.
  for (; f > 0; f--) {
    float tmp = *x - *y;
    result += tmp * tmp;
    x++;
    y++;
  }
  return result;
}

#endif

#ifdef ANNOYLIB_USE_AVX512
// AVX-512 specialization of dot() for float: processes sixteen components per
// iteration using fused multiply-add, then handles the remaining (< 16)
// components with a scalar loop.
template<>
inline float dot<float>(const float* x, const float *y, int f) {
  float result = 0;
  if (f > 15) {
    // Sixteen running partial sums, one per lane.
    __m512 d = _mm512_setzero_ps();
    for (; f > 15; f -= 16) {
      //AVX512F includes FMA
      d = _mm512_fmadd_ps(_mm512_loadu_ps(x), _mm512_loadu_ps(y), d);
      x += 16;
      y += 16;
    }
    // Sum all floats in dot register.
    result += _mm512_reduce_add_ps(d);
  }
  // Don't forget the remaining values.
  for (; f > 0; f--) {
    result += *x * *y;
    x++;
    y++;
  }
  return result;
}

// AVX-512 specialization of manhattan_distance() for float: processes sixteen
// components per iteration, then handles the remaining (< 16) components with
// a scalar loop.
template<>
inline float manhattan_distance<float>(const float* x, const float* y, int f) {
  float result = 0;
  int i = f;  // number of components still to be processed
  if (f > 15) {
    // Sixteen running partial sums of absolute differences, one per lane.
    __m512 manhattan = _mm512_setzero_ps();
    for (; i > 15; i -= 16) {
      const __m512 x_minus_y = _mm512_sub_ps(_mm512_loadu_ps(x), _mm512_loadu_ps(y));
      manhattan = _mm512_add_ps(manhattan, _mm512_abs_ps(x_minus_y));
      x += 16;
      y += 16;
    }
    // Sum all floats in manhattan register.
    result = _mm512_reduce_add_ps(manhattan);
  }
  // Don't forget the remaining values.
  for (; i > 0; i--) {
    result += fabsf(*x - *y);
    x++;
    y++;
  }
  return result;
}

// AVX-512 specialization of euclidean_distance() for float. Returns the
// squared distance (no square root is taken here). Processes sixteen
// components per iteration using fused multiply-add, then handles the
// remaining (< 16) components with a scalar loop.
template<>
inline float euclidean_distance<float>(const float* x, const float* y, int f) {
  float result=0;
  if (f > 15) {
    // Sixteen running partial sums of squared differences, one per lane.
    __m512 d = _mm512_setzero_ps();
    for (; f > 15; f -= 16) {
      const __m512 diff = _mm512_sub_ps(_mm512_loadu_ps(x), _mm512_loadu_ps(y));
      d = _mm512_fmadd_ps(diff, diff, d);
      x += 16;
      y += 16;
    }
    // Sum all floats in the accumulator register.
    result = _mm512_reduce_add_ps(d);
  }
  // Don't forget the remaining values.
  for (; f > 0; f--) {
    float tmp = *x - *y;
    result += tmp * tmp;
    x++;
    y++;
  }
  return result;
}

#endif


// Heuristic two-centroid clustering used to pick a split.
//
// Two distinct input nodes are drawn at random and copied into p and q. Then
// a fixed number of further random samples are each folded into whichever
// centroid is closer. Distances are scaled by the number of points a centroid
// has already absorbed, which keeps the two sides balanced. When `cosine` is
// set, the initial centroids are normalized and every sample is divided by
// its norm as it is folded in; samples whose norm is not positive are skipped.
// Empirically this works well, although it is hard to motivate formally.
template<typename T, typename Random, typename Distance, typename Node>
inline void two_means(const vector<Node*>& nodes, int f, Random& random, bool cosine, Node* p, Node* q) {
  static int iteration_steps = 200;
  const size_t count = nodes.size();

  // Pick two different seed nodes: the second draw is taken from a range one
  // shorter and shifted past the first pick when necessary.
  const size_t first = random.index(count);
  size_t second = random.index(count-1);
  if (second >= first)
    second++;

  Distance::template copy_node<T, Node>(p, nodes[first], f);
  Distance::template copy_node<T, Node>(q, nodes[second], f);

  if (cosine) {
    Distance::template normalize<T, Node>(p, f);
    Distance::template normalize<T, Node>(q, f);
  }
  Distance::init_node(p, f);
  Distance::init_node(q, f);

  int p_weight = 1;
  int q_weight = 1;
  for (int step = 0; step < iteration_steps; step++) {
    const size_t pick = random.index(count);
    Node* sample = nodes[pick];

    const T dist_p = p_weight * Distance::distance(p, sample, f);
    const T dist_q = q_weight * Distance::distance(q, sample, f);
    const T norm = cosine ? Distance::template get_norm<T, Node>(sample, f) : 1;

    if (norm > T(0)) {
      if (dist_p < dist_q) {
        Distance::update_mean(p, sample, norm, p_weight, f);
        Distance::init_node(p, f);
        p_weight++;
      } else if (dist_q < dist_p) {
        Distance::update_mean(q, sample, norm, q_weight, f);
        Distance::init_node(q, f);
        q_weight++;
      }
      // Exact ties leave both centroids untouched.
    }
  }
}
} // namespace

// Default implementations of the per-metric hooks. Metric structs derive from
// Base and redefine whichever of these they need.
struct Base {
  // Hook that receives the whole node buffer; does nothing by default.
  // Metrics that need a pass over all nodes redefine it.
  template<typename T, typename S, typename Node>
  static inline void preprocess(void* nodes, size_t _s, const S node_count, const int f) {
  }

  // Counterpart hook to preprocess(); does nothing by default.
  template<typename T, typename S, typename Node>
  static inline void postprocess(void* nodes, size_t _s, const S node_count, const int f) {
  }

  // Hook for giving metric-specific node fields a sane default; no-op here.
  template<typename Node>
  static inline void zero_value(Node* dest) {
  }

  // Copies the f vector components of `source` into `dest`.
  template<typename T, typename Node>
  static inline void copy_node(Node* dest, const Node* source, const int f) {
    const size_t n_bytes = f * sizeof(T);
    memcpy(dest->v, source->v, n_bytes);
  }

  // Euclidean length of the node's vector.
  template<typename T, typename Node>
  static inline T get_norm(Node* node, int f) {
    return sqrt(dot(node->v, node->v, f));
  }

  // Scales the node's vector to unit length; vectors whose norm is not
  // positive are left unchanged.
  template<typename T, typename Node>
  static inline void normalize(Node* node, int f) {
    const T length = Base::get_norm<T, Node>(node, f);
    if (!(length > 0))
      return;
    for (int k = 0; k < f; k++)
      node->v[k] /= length;
  }

  // Folds new_node (divided by `norm`) into a running mean that currently
  // averages c points.
  template<typename T, typename Node>
  static inline void update_mean(Node* mean, Node* new_node, T norm, int c, int f) {
    for (int k = 0; k < f; k++) {
      mean->v[k] = (mean->v[k] * c + new_node->v[k] / norm) / (c + 1);
    }
  }
};

// Angular (cosine-based) metric.
struct Angular : Base {
  template<typename S, typename T>
  struct Node {
    /*
     * The index is a forest of binary trees; every node carries
     * - a vector, and
     * - its two children.
     * Every node takes up the same number of bytes.
     * n_descendants == 1 marks a leaf, whose vector is a data point.
     * For 2 <= n_descendants <= K the vector is dropped and the space is used
     * to list all descendants instead (K is however many ids fit in the space
     * the vector would take).
     * For n_descendants > K the vector is the normal of the split plane.
     * sizeof(Node) is not meaningful: nodes are over-allocated so that the
     * vector can extend past the declared struct.
     */
    S n_descendants;
    union {
      S children[2]; // Will possibly store more than 2
      T norm;
    };
    T v[ANNOYLIB_V_ARRAY_SIZE];
  };

  // Squared distance between the two normalized vectors:
  //   (a/|a| - b/|b|)^2 = a^2/a^2 + b^2/b^2 - 2ab/(|a||b|) = 2 - 2cos
  template<typename S, typename T>
  static inline T distance(const Node<S, T>* x, const Node<S, T>* y, int f) {
    // A zero cached norm means "not stored" (kept for backwards
    // compatibility), so it is recomputed from the vector in that case.
    const T x_sq = x->norm ? x->norm : dot(x->v, x->v, f);
    const T y_sq = y->norm ? y->norm : dot(y->v, y->v, f);
    const T cross = dot(x->v, y->v, f);
    const T norm_product_sq = x_sq * y_sq;
    if (!(norm_product_sq > 0))
      return 2.0; // cos is 0
    return 2.0 - 2.0 * cross / sqrt(norm_product_sq);
  }

  // Signed projection of y onto the split normal stored in n.
  template<typename S, typename T>
  static inline T margin(const Node<S, T>* n, const T* y, int f) {
    return dot(n->v, y, f);
  }

  // Which side of the split plane y lies on; a point exactly on the plane is
  // assigned by a random coin flip.
  template<typename S, typename T, typename Random>
  static inline bool side(const Node<S, T>* n, const T* y, int f, Random& random) {
    const T projection = margin(n, y, f);
    if (projection == 0)
      return (bool)random.flip();
    return (projection > 0);
  }

  // Node overload: forwards the node's vector to the raw-vector overload.
  template<typename S, typename T, typename Random>
  static inline bool side(const Node<S, T>* n, const Node<S, T>* y, int f, Random& random) {
    return side(n, y->v, f, random);
  }

  // Fills n->v with the unit normal of a split plane: the normalized
  // difference of the two centroids found by two_means().
  template<typename S, typename T, typename Random>
  static inline void create_split(const vector<Node<S, T>*>& nodes, int f, size_t s, Random& random, Node<S, T>* n) {
    // Scratch centroids live on the stack, each s bytes long.
    Node<S, T>* p = (Node<S, T>*)alloca(s);
    Node<S, T>* q = (Node<S, T>*)alloca(s);
    two_means<T, Random, Angular, Node<S, T> >(nodes, f, random, true, p, q);
    for (int k = 0; k < f; k++) {
      n->v[k] = p->v[k] - q->v[k];
    }
    Base::normalize<T, Node<S, T> >(n, f);
  }

  // Converts the internal squared distance into the value handed back to
  // callers (e.g. the Python layer). The squared distance occasionally comes
  // out as -0.0, hence the clamp before the square root.
  template<typename T>
  static inline T normalized_distance(T distance) {
    const T clamped = std::max(distance, T(0));
    return sqrt(clamped);
  }

  // Priority for a child in the search queue: the smaller of the parent's
  // priority and the margin, with the margin negated for child 0.
  template<typename T>
  static inline T pq_distance(T distance, T margin, int child_nr) {
    const T signed_margin = (child_nr == 0) ? -margin : margin;
    return std::min(distance, signed_margin);
  }

  // Starting priority for the search queue.
  template<typename T>
  static inline T pq_initial_value() {
    return numeric_limits<T>::infinity();
  }

  // Caches the squared length of the node's vector in n->norm.
  template<typename S, typename T>
  static inline void init_node(Node<S, T>* n, int f) {
    n->norm = dot(n->v, n->v, f);
  }

  static const char* name() {
    return "angular";
  }
};


// Inner-product metric. While the index is being built, every vector is
// treated as having one extra component (dot_factor) so that the problem
// reduces to the angular case; once nodes are flagged as built, distance()
// returns the negated plain dot product instead.
struct DotProduct : Angular {
  template<typename S, typename T>
  struct Node {
    /*
     * This is an extension of the Angular node with extra attributes for the DotProduct metric.
     * It has dot_factor which is needed to reduce the task to Angular distance metric (see the preprocess method)
     * and also a built flag that helps to compute exact dot products when an index is already built.
     */
    S n_descendants;
    S children[2]; // Will possibly store more than 2
    T dot_factor;
    T norm;
    bool built;
    T v[ANNOYLIB_V_ARRAY_SIZE];
  };

  static const char* name() {
    return "dot";
  }

  // Length of the node's vector extended by its dot_factor component.
  template<typename T, typename Node>
  static inline T get_norm(Node* node, int f) {
      return sqrt(dot(node->v, node->v, f) + node->dot_factor * node->dot_factor);
  }

  // Folds new_node (divided by `norm`) into a running mean of c points,
  // treating dot_factor as one more component of the vector.
  template<typename T, typename Node>
  static inline void update_mean(Node* mean, Node* new_node, T norm, int c, int f) {
      for (int z = 0; z < f; z++)
        mean->v[z] = (mean->v[z] * c + new_node->v[z] / norm) / (c + 1);
      mean->dot_factor = (mean->dot_factor * c + new_node->dot_factor / norm) / (c + 1);
  }

  // Returns the negated dot product when either node is flagged as built;
  // otherwise the angular-style value 2 - 2cos over the extended vectors.
  template<typename S, typename T>
  static inline T distance(const Node<S, T>* x, const Node<S, T>* y, int f) {
    if (x->built || y->built) {
      // When index is already built, we don't need angular distances to retrieve NNs
      // Thus, we can return dot product scores itself
      return -dot(x->v, y->v, f);
    }

    // Calculated by analogy with the angular case
    // A zero cached norm is treated as "not stored" and recomputed.
    T pp = x->norm ? x->norm : dot(x->v, x->v, f) + x->dot_factor * x->dot_factor;
    T qq = y->norm ? y->norm : dot(y->v, y->v, f) + y->dot_factor * y->dot_factor;
    T pq = dot(x->v, y->v, f) + x->dot_factor * y->dot_factor;
    T ppqq = pp * qq;

    if (ppqq > 0) return 2.0 - 2.0 * pq / sqrt(ppqq);
    else return 2.0;
  }

  // Resets the extra component of a node.
  template<typename Node>
  static inline void zero_value(Node* dest) {
    dest->dot_factor = 0;
  }

  // Marks the node as not built and caches the squared length of its
  // extended vector in n->norm.
  template<typename S, typename T>
  static inline void init_node(Node<S, T>* n, int f) {
    n->built = false;
    n->norm = dot(n->v, n->v, f) + n->dot_factor * n->dot_factor;
  }

  // Copies the f vector components and the dot_factor of `source` into `dest`.
  template<typename T, typename Node>
  static inline void copy_node(Node* dest, const Node* source, const int f) {
    memcpy(dest->v, source->v, f * sizeof(T));
    dest->dot_factor = source->dot_factor;
  }

  // Fills n with the unit normal (including its dot_factor component) of a
  // split plane: the normalized difference of the two centroids found by
  // two_means().
  template<typename S, typename T, typename Random>
  static inline void create_split(const vector<Node<S, T>*>& nodes, int f, size_t s, Random& random, Node<S, T>* n) {
    // Scratch centroids live on the stack, each s bytes long.
    Node<S, T>* p = (Node<S, T>*)alloca(s);
    Node<S, T>* q = (Node<S, T>*)alloca(s);
    DotProduct::zero_value(p); 
    DotProduct::zero_value(q);
    two_means<T, Random, DotProduct, Node<S, T> >(nodes, f, random, true, p, q);
    for (int z = 0; z < f; z++)
      n->v[z] = p->v[z] - q->v[z];
    n->dot_factor = p->dot_factor - q->dot_factor;
    DotProduct::normalize<T, Node<S, T> >(n, f);
  }

  // Scales the extended vector (v plus dot_factor) to unit length; nodes
  // whose norm is not positive are left unchanged.
  template<typename T, typename Node>
  static inline void normalize(Node* node, int f) {
    T norm = sqrt(dot(node->v, node->v, f) + pow(node->dot_factor, 2));
    if (norm > 0) {
      for (int z = 0; z < f; z++)
        node->v[z] /= norm;
      node->dot_factor /= norm;
    }
  }

  // Margin against a raw vector: only the f stored components take part,
  // since a raw vector carries no dot_factor.
  template<typename S, typename T>
  static inline T margin(const Node<S, T>* n, const T* y, int f) {
    return dot(n->v, y, f);
  }

  // Margin against another node: includes the dot_factor component.
  template<typename S, typename T>
  static inline T margin(const Node<S, T>* n, const Node<S, T>* y, int f) {
    return dot(n->v, y->v, f) + n->dot_factor * y->dot_factor;
  }

  // Which side of the split plane node y lies on; a zero margin is resolved
  // by a random coin flip.
  template<typename S, typename T, typename Random>
  static inline bool side(const Node<S, T>* n, const Node<S, T>* y, int f, Random& random) {
    T dot = margin(n, y, f);
    if (dot != 0)
      return (dot > 0);
    else
      return (bool)random.flip();
  }

  // Same as above for a raw vector.
  template<typename S, typename T, typename Random>
  static inline bool side(const Node<S, T>* n, const T* y, int f, Random& random) {
    T dot = margin(n, y, f);
    if (dot != 0)
      return (dot > 0);
    else
      return (bool)random.flip();
  }

  // Negates the internal distance, which undoes the negation applied in
  // distance() for built nodes.
  template<typename T>
  static inline T normalized_distance(T distance) {
    return -distance;
  }

  template<typename T, typename S, typename Node>
  static inline void preprocess(void* nodes, size_t _s, const S node_count, const int f) {
    // This uses a method from Microsoft Research for transforming inner product spaces to cosine/angular-compatible spaces.
    // (Bachrach et al., 2014, see https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/XboxInnerProduct.pdf)

    // Step one: compute the norm of each vector and stash it temporarily in the node's dot_factor field
    for (S i = 0; i < node_count; i++) {
      Node* node = get_node_ptr<S, Node>(nodes, _s, i);
      T d = dot(node->v, node->v, f);
      T norm = d < 0 ? 0 : sqrt(d);
      node->dot_factor = norm;
      node->built = false;
    }

    // Step two: find the maximum norm
    T max_norm = 0;
    for (S i = 0; i < node_count; i++) {
      Node* node = get_node_ptr<S, Node>(nodes, _s, i);
      if (node->dot_factor > max_norm) {
        max_norm = node->dot_factor;
      }
    }

    // Step three: set each vector's extra dimension to sqrt(max_norm^2 - norm^2)
    for (S i = 0; i < node_count; i++) {
      Node* node = get_node_ptr<S, Node>(nodes, _s, i);
      T node_norm = node->dot_factor;
      T squared_norm_diff = pow(max_norm, static_cast<T>(2.0)) - pow(node_norm, static_cast<T>(2.0));
      // Clamp at zero before the square root so a slightly negative difference cannot produce NaN
      T dot_factor = squared_norm_diff < 0 ? 0 : sqrt(squared_norm_diff);

      // Every extended vector now has the same squared length, max_norm^2
      node->norm = pow(max_norm, static_cast<T>(2.0));
      node->dot_factor = dot_factor;
    }
  }

  // Flags the first node_count nodes as built, which switches distance()
  // over to plain dot products for them.
  template<typename T, typename S, typename Node>
  static inline void postprocess(void* nodes, size_t _s, const S node_count, const int f) {
    for (S i = 0; i < node_count; i++) {
      Node* node = get_node_ptr<S, Node>(nodes, _s, i);
      // When an index is built, we will remember it in index item nodes to compute distances differently
      node->built = true;
    }
  }
};

// Hamming metric over bit vectors packed into words of type T.
struct Hamming : Base {
  template<typename S, typename T>
  struct Node {
    S n_descendants;
    S children[2];
    T v[ANNOYLIB_V_ARRAY_SIZE];
  };

  // Number of random bit positions tried by create_split() before it falls
  // back to scanning every position.
  static const size_t max_iterations = 20;

  // Priority for a child in the search queue: the parent's priority, reduced
  // by one when the margin bit differs from the child number.
  template<typename T>
  static inline T pq_distance(T distance, T margin, int child_nr) {
    return distance - (margin != (unsigned int) child_nr);
  }

  // Starting priority for the search queue.
  template<typename T>
  static inline T pq_initial_value() {
    return numeric_limits<T>::max();
  }

  // Portable population count (generalized bit-twiddling approach by Eric
  // Cole). Only used with MSVC 9, which lacks intrinsics and fails to
  // calculate std::bitset::count for values wider than 32 bits.
  // See https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSet64
  template<typename T>
  static inline int cole_popcount(T v) {
    v = v - ((v >> 1) & (T)~(T)0/3);
    v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3);
    v = (v + (v >> 4)) & (T)~(T)0/255*15;
    return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * 8;
  }

  // Number of bit positions in which the first f words of x and y differ.
  template<typename S, typename T>
  static inline T distance(const Node<S, T>* x, const Node<S, T>* y, int f) {
    size_t differing_bits = 0;
    for (int word = 0; word < f; word++)
      differing_bits += annoylib_popcount(x->v[word] ^ y->v[word]);
    return differing_bits;
  }

  // Tests the single bit of y selected by the split node. n->v[0] holds a bit
  // position counted from the most significant bit of the first word.
  template<typename S, typename T>
  static inline bool margin(const Node<S, T>* n, const T* y, int f) {
    static const size_t n_bits = sizeof(T) * 8;
    T word_index = n->v[0] / n_bits;
    return (y[word_index] & (static_cast<T>(1) << (n_bits - 1 - (n->v[0] % n_bits)))) != 0;
  }

  // The side of a split is simply the value of the selected bit.
  template<typename S, typename T, typename Random>
  static inline bool side(const Node<S, T>* n, const T* y, int f, Random& random) {
    return margin(n, y, f);
  }

  // Node overload: forwards the node's words to the raw-vector overload.
  template<typename S, typename T, typename Random>
  static inline bool side(const Node<S, T>* n, const Node<S, T>* y, int f, Random& random) {
    return side(n, y->v, f, random);
  }

  // Counts how many of `nodes` have the bit currently selected by n set.
  template<typename S, typename T>
  static inline size_t count_selected(const Node<S, T>* n, const vector<Node<S, T>*>& nodes, int f) {
    size_t hits = 0;
    for (size_t k = 0; k < nodes.size(); k++) {
      if (margin(n, nodes[k]->v, f))
        hits++;
    }
    return hits;
  }

  // Chooses a bit position that separates `nodes` into two non-empty groups
  // and stores it in n->v[0]. Random positions are tried first; if none of
  // them works, every position is tried in order. When no position separates
  // the nodes, n->v[0] is left at the last position tried.
  template<typename S, typename T, typename Random>
  static inline void create_split(const vector<Node<S, T>*>& nodes, int f, size_t s, Random& random, Node<S, T>* n) {
    const int dim = f * 8 * sizeof(T);
    const size_t total = nodes.size();

    // choose random positions to split at
    for (size_t attempt = 0; attempt < max_iterations; attempt++) {
      n->v[0] = random.index(dim);
      const size_t hits = count_selected(n, nodes, f);
      if (hits > 0 && hits < total)
        return;
    }

    // brute-force search for splitting coordinate
    for (int position = 0; position < dim; position++) {
      n->v[0] = position;
      const size_t hits = count_selected(n, nodes, f);
      if (hits > 0 && hits < total)
        return;
    }
  }

  // Hamming distances are reported as they are.
  template<typename T>
  static inline T normalized_distance(T distance) {
    return distance;
  }

  // Nothing is cached per node for this metric.
  template<typename S, typename T>
  static inline void init_node(Node<S, T>* n, int f) {
  }

  static const char* name() {
    return "hamming";
  }
};


// Shared pieces of the metrics that split space with an offset hyperplane
// (normal vector v plus constant term a).
struct Minkowski : Base {
  template<typename S, typename T>
  struct Node {
    S n_descendants;
    T a; // need an extra constant term to determine the offset of the plane
    S children[2];
    T v[ANNOYLIB_V_ARRAY_SIZE];
  };

  // Signed value of the plane equation at y: a + <v, y>.
  template<typename S, typename T>
  static inline T margin(const Node<S, T>* n, const T* y, int f) {
    return n->a + dot(n->v, y, f);
  }

  // Which side of the split plane y lies on; a point exactly on the plane is
  // assigned by a random coin flip.
  template<typename S, typename T, typename Random>
  static inline bool side(const Node<S, T>* n, const T* y, int f, Random& random) {
    const T plane_value = margin(n, y, f);
    if (plane_value == 0)
      return (bool)random.flip();
    return (plane_value > 0);
  }

  // Node overload: forwards the node's vector to the raw-vector overload.
  template<typename S, typename T, typename Random>
  static inline bool side(const Node<S, T>* n, const Node<S, T>* y, int f, Random& random) {
    return side(n, y->v, f, random);
  }

  // Priority for a child in the search queue: the smaller of the parent's
  // priority and the margin, with the margin negated for child 0.
  template<typename T>
  static inline T pq_distance(T distance, T margin, int child_nr) {
    const T signed_margin = (child_nr == 0) ? -margin : margin;
    return std::min(distance, signed_margin);
  }

  // Starting priority for the search queue.
  template<typename T>
  static inline T pq_initial_value() {
    return numeric_limits<T>::infinity();
  }
};


// Euclidean (L2) metric. Distances are kept squared internally and only
// square-rooted in normalized_distance().
struct Euclidean : Minkowski {
  static const char* name() {
    return "euclidean";
  }

  // Squared Euclidean distance between the two nodes' vectors.
  template<typename S, typename T>
  static inline T distance(const Node<S, T>* x, const Node<S, T>* y, int f) {
    return euclidean_distance(x->v, y->v, f);
  }

  // Converts the internal squared distance to the reported one, clamping at
  // zero before taking the square root.
  template<typename T>
  static inline T normalized_distance(T distance) {
    const T clamped = std::max(distance, T(0));
    return sqrt(clamped);
  }

  // Nothing is cached per node for this metric.
  template<typename S, typename T>
  static inline void init_node(Node<S, T>* n, int f) {
  }

  // Builds a split plane from the two centroids found by two_means(): the
  // normal is their normalized difference, and the offset places the plane
  // through their midpoint.
  template<typename S, typename T, typename Random>
  static inline void create_split(const vector<Node<S, T>*>& nodes, int f, size_t s, Random& random, Node<S, T>* n) {
    // Scratch centroids live on the stack, each s bytes long.
    Node<S, T>* p = (Node<S, T>*)alloca(s);
    Node<S, T>* q = (Node<S, T>*)alloca(s);
    two_means<T, Random, Euclidean, Node<S, T> >(nodes, f, random, false, p, q);

    for (int k = 0; k < f; k++) {
      n->v[k] = p->v[k] - q->v[k];
    }
    Base::normalize<T, Node<S, T> >(n, f);

    // a = -<v, (p + q) / 2>
    n->a = 0.0;
    for (int k = 0; k < f; k++) {
      n->a += -n->v[k] * (p->v[k] + q->v[k]) / 2;
    }
  }
};

// Manhattan (L1) metric.
struct Manhattan : Minkowski {
  static const char* name() {
    return "manhattan";
  }

  // L1 distance between the two nodes' vectors.
  template<typename S, typename T>
  static inline T distance(const Node<S, T>* x, const Node<S, T>* y, int f) {
    return manhattan_distance(x->v, y->v, f);
  }

  // Reported distance is the internal one, clamped at zero.
  template<typename T>
  static inline T normalized_distance(T distance) {
    return std::max(distance, T(0));
  }

  // Nothing is cached per node for this metric.
  template<typename S, typename T>
  static inline void init_node(Node<S, T>* n, int f) {
  }

  // Builds a split plane from the two centroids found by two_means(): the
  // normal is their normalized difference, and the offset places the plane
  // through their midpoint.
  template<typename S, typename T, typename Random>
  static inline void create_split(const vector<Node<S, T>*>& nodes, int f, size_t s, Random& random, Node<S, T>* n) {
    // Scratch centroids live on the stack, each s bytes long.
    Node<S, T>* p = (Node<S, T>*)alloca(s);
    Node<S, T>* q = (Node<S, T>*)alloca(s);
    two_means<T, Random, Manhattan, Node<S, T> >(nodes, f, random, false, p, q);

    for (int k = 0; k < f; k++) {
      n->v[k] = p->v[k] - q->v[k];
    }
    Base::normalize<T, Node<S, T> >(n, f);

    // a = -<v, (p + q) / 2>
    n->a = 0.0;
    for (int k = 0; k < f; k++) {
      n->a += -n->v[k] * (p->v[k] + q->v[k]) / 2;
    }
  }
};

// Abstract interface of an index.
//   S - type used for item ids and counts
//   T - type of a vector component and of returned distances
//   R - type of the random seed
template<typename S, typename T, typename R = uint64_t>
class AnnoyIndexInterface {
 public:
  // Note that the methods with an **error argument will allocate memory and write the pointer to that string if error is non-NULL
  virtual ~AnnoyIndexInterface() {};
  // Stores vector w under id `item`. Returns false on failure.
  virtual bool add_item(S item, const T* w, char** error=NULL) = 0;
  // Builds the index; q is the tree-count argument and n_threads the thread-count argument. Returns false on failure.
  virtual bool build(int q, int n_threads=-1, char** error=NULL) = 0;
  // Discards the built trees so the index can be built again. Returns false on failure.
  virtual bool unbuild(char** error=NULL) = 0;
  // Writes the index to `filename`. Returns false on failure.
  virtual bool save(const char* filename, bool prefault=false, char** error=NULL) = 0;
  // Releases whatever storage the index currently holds.
  virtual void unload() = 0;
  // Loads an index from `filename`. Returns false on failure.
  virtual bool load(const char* filename, bool prefault=false, char** error=NULL) = 0;
  // Distance between items i and j.
  virtual T get_distance(S i, S j) const = 0;
  // Nearest-neighbour query relative to a stored item; ids go to `result` and distances to `distances`.
  virtual void get_nns_by_item(S item, size_t n, int search_k, vector<S>* result, vector<T>* distances) const = 0;
  // Nearest-neighbour query relative to an arbitrary vector w.
  virtual void get_nns_by_vector(const T* w, size_t n, int search_k, vector<S>* result, vector<T>* distances) const = 0;
  virtual S get_n_items() const = 0;
  virtual S get_n_trees() const = 0;
  // Turns diagnostic output on or off.
  virtual void verbose(bool v) = 0;
  // Copies the vector stored under `item` into v.
  virtual void get_item(S item, T* v) const = 0;
  virtual void set_seed(R q) = 0;
  // Prepares the index to be built directly in the file `filename`. Returns false on failure.
  virtual bool on_disk_build(const char* filename, char** error=NULL) = 0;
};

template<typename S, typename T, typename Distance, typename Random, class ThreadedBuildPolicy>
  class AnnoyIndex : public AnnoyIndexInterface<S, T, 
#if __cplusplus >= 201103L
    typename std::remove_const<decltype(Random::default_seed)>::type
#else
    typename Random::seed_type
#endif
    > {
  /*
   * We use random projection to build a forest of binary trees of all items.
   * Basically just split the hyperspace into two sides by a hyperplane,
   * then recursively split each of those subtrees etc.
   * We create a tree like this q times. The default q is determined automatically
   * in such a way that we at most use 2x as much memory as the vectors take.
   */
public:
  typedef Distance D;
  typedef typename D::template Node<S, T> Node;
#if __cplusplus >= 201103L
  typedef typename std::remove_const<decltype(Random::default_seed)>::type R;
#else
  typedef typename Random::seed_type R;
#endif

protected:
  const int _f;
  size_t _s;
  S _n_items;
  void* _nodes; // Could either be mmapped, or point to a memory buffer that we reallocate
  S _n_nodes;
  S _nodes_size;
  vector<S> _roots;
  S _K;
  R _seed;
  bool _loaded;
  bool _verbose;
  int _fd;
  bool _on_disk;
  bool _built;
public:

   // Creates an empty index for vectors with f components.
   AnnoyIndex(int f) : _f(f), _seed(Random::default_seed) {
    _verbose = false;
    _built = false;

    // Bytes taken by one node: the fixed header plus exactly _f components.
    _s = offsetof(Node, v) + _f * sizeof(T);

    // Max number of descendant ids that fit into a node when the space from
    // `children` onwards is used as an id list.
    const size_t id_list_bytes = (size_t) (_s - offsetof(Node, children));
    _K = (S) (id_list_bytes / sizeof(S));

    reinitialize(); // Reset everything
  }
  // Releases the node storage (and any open file) held by the index.
  ~AnnoyIndex() {
    this->unload();
  }

  int get_f() const {
    return _f;
  }

  // Stores the _f components pointed to by w under id `item`.
  // Thin wrapper around add_item_impl(); returns its result.
  bool add_item(S item, const T* w, char** error=NULL) {
    const bool added = add_item_impl(item, w, error);
    return added;
  }

  // Stores a vector under id `item`.
  //
  // W is anything indexable with operator[] that yields the _f components.
  // Fails (returns false and reports through `error`) when the index was
  // loaded from disk or when `item` is negative. On success the node is
  // initialised as a leaf and the item count grows to cover `item`.
  template<typename W>
  bool add_item_impl(S item, const W& w, char** error=NULL) {
    if (_loaded) {
      set_error_from_string(error, "You can't add an item to a loaded index");
      return false;
    }
    // Reject negative ids up front: nothing below validates `item` before
    // the node for it is fetched and written to.
    if (item < 0) {
      set_error_from_string(error, "Item index can not be negative");
      return false;
    }
    _allocate_size(item + 1);
    Node* n = _get(item);

    D::zero_value(n);

    // A freshly added item is a leaf: no children, one descendant (itself).
    n->children[0] = 0;
    n->children[1] = 0;
    n->n_descendants = 1;

    for (int z = 0; z < _f; z++)
      n->v[z] = w[z];

    // Let the metric cache whatever it derives from the vector.
    D::init_node(n, _f);

    if (item >= _n_items)
      _n_items = item + 1;

    return true;
  }
    
  bool on_disk_build(const char* file, char** error=NULL) {
    _on_disk = true;
#ifndef _MSC_VER
    _fd = open(file, O_RDWR | O_CREAT | O_TRUNC, (int) 0600);
#else
    _fd = _open(file, _O_RDWR | _O_CREAT | _O_TRUNC, (int) 0600);
#endif
    if (_fd == -1) {
      set_error_from_errno(error, "Unable to open");
      _fd = 0;
      return false;
    }
    _nodes_size = 1;
    if (ftruncate(_fd, ANNOYLIB_FTRUNCATE_SIZE(_s) * ANNOYLIB_FTRUNCATE_SIZE(_nodes_size)) == -1) {
      set_error_from_errno(error, "Unable to truncate");
      return false;
    }
#ifdef MAP_POPULATE
    _nodes = (Node*) mmap(0, _s * _nodes_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, _fd, 0);
#else
    _nodes = (Node*) mmap(0, _s * _nodes_size, PROT_READ | PROT_WRITE, MAP_SHARED, _fd, 0);
#endif
    return true;
  }
    
  bool build(int q, int n_threads=-1, char** error=NULL) {
    if (_loaded) {
      set_error_from_string(error, "You can't build a loaded index");
      return false;
    }

    if (_built) {
      set_error_from_string(error, "You can't build a built index");
      return false;
    }

    D::template preprocess<T, S, Node>(_nodes, _s, _n_items, _f);

    _n_nodes = _n_items;

    ThreadedBuildPolicy::template build<S, T>(this, q, n_threads);

    // Also, copy the roots into the last segment of the array
    // This way we can load them faster without reading the whole file
    _allocate_size(_n_nodes + (S)_roots.size());
    for (size_t i = 0; i < _roots.size(); i++)
      memcpy(_get(_n_nodes + (S)i), _get(_roots[i]), _s);
    _n_nodes += _roots.size();

    if (_verbose) annoylib_showUpdate("has %d nodes\n", _n_nodes);
    
    if (_on_disk) {
      if (!remap_memory_and_truncate(&_nodes, _fd,
          static_cast<size_t>(_s) * static_cast<size_t>(_nodes_size),
          static_cast<size_t>(_s) * static_cast<size_t>(_n_nodes))) {
        // TODO: this probably creates an index in a corrupt state... not sure what to do
        set_error_from_errno(error, "Unable to truncate");
        return false;
      }
      _nodes_size = _n_nodes;
    }

    D::template postprocess<T, S, Node>(_nodes, _s, _n_items, _f);

    _built = true;
    return true;
  }
  
  // Throws away the built trees while keeping the items, so that build() can
  // be run again. Refuses (returns false) for an index loaded from disk.
  bool unbuild(char** error=NULL) {
    if (_loaded) {
      set_error_from_string(error, "You can't unbuild a loaded index");
      return false;
    }

    _built = false;
    // Only the item nodes stay in use; everything after them was tree data.
    _n_nodes = _n_items;
    _roots.clear();

    return true;
  }

  // Writes the built index to `filename` and then reloads it from there.
  //
  // Returns false and reports through `error` when the index has not been
  // built or when opening, writing or closing the file fails. For an on-disk
  // build nothing is written here and true is returned straight away.
  // `prefault` is forwarded to load().
  bool save(const char* filename, bool prefault=false, char** error=NULL) {
    if (!_built) {
      set_error_from_string(error, "You can't save an index that hasn't been built");
      return false;
    }
    if (_on_disk) {
      return true;
    } else {
      // Delete file if it already exists (See issue #335)
#ifndef _MSC_VER
      unlink(filename);
#else
      _unlink(filename);
#endif

      FILE *f = fopen(filename, "wb");
      if (f == NULL) {
        set_error_from_errno(error, "Unable to open");
        return false;
      }

      if (fwrite(_nodes, _s, _n_nodes, f) != (size_t) _n_nodes) {
        // Record the write error first, then release the stream so the
        // handle is not leaked on this path.
        set_error_from_errno(error, "Unable to write");
        fclose(f);
        return false;
      }

      if (fclose(f) == EOF) {
        set_error_from_errno(error, "Unable to close");
        return false;
      }

      // Swap the in-memory copy for the file that was just written.
      unload();
      return load(filename, prefault, error);
    }
  }

  // Puts the bookkeeping members back to their "empty index" values. Does not
  // release anything: callers free or unmap the node storage beforehand.
  void reinitialize() {
    // Storage
    _nodes = NULL;
    _nodes_size = 0;
    _fd = 0;
    _on_disk = false;
    _loaded = false;

    // Contents
    _n_items = 0;
    _n_nodes = 0;
    _roots.clear();

    _seed = Random::default_seed;
  }

  // Releases the node storage and returns the index to its empty state.
  //
  // With an open file descriptor the nodes are a mapping: the descriptor is
  // closed and the mapping removed. Otherwise any node buffer is heap memory
  // and is freed.
  void unload() {
    if (_fd) {
#ifndef _MSC_VER
      close(_fd);
#else
      _close(_fd);
#endif
      // An on-disk build maps the whole allocated capacity, while a loaded
      // index maps exactly the nodes it contains.
      const size_t mapped_bytes = _on_disk ? _s * _nodes_size : _n_nodes * _s;
      munmap(_nodes, mapped_bytes);
    } else if (_nodes) {
      // We have heap allocated data
      free(_nodes);
    }
    reinitialize();
    if (_verbose) annoylib_showUpdate("unloaded\n");
  }

  bool load(const char* filename, bool prefault=false, char** error=NULL) {
#ifndef _MSC_VER
    _fd = open(filename, O_RDONLY, (int)0400);
#else
    _fd = _open(filename, _O_RDONLY, (int)0400);
#endif
    if (_fd == -1) {
      set_error_from_errno(error, "Unable to open");
      _fd = 0;
      return false;
    }
    off_t size = lseek_getsize(_fd);
    if (size == -1) {
      set_error_from_errno(error, "Unable to get size");
      return false;
    } else if (size == 0) {
      set_error_from_errno(error, "Size of file is zero");
      return false;
    } else if (size % _s) {
      // Something is fishy with this index!
      set_error_from_errno(error, "Index size is not a multiple of vector size. Ensure you are opening using the same metric you used to create the index.");
      return false;
    }

    int flags = MAP_SHARED;
    if (prefault) {
#ifdef MAP_POPULATE
      flags |= MAP_POPULATE;
#else
      annoylib_showUpdate("prefault is set to true, but MAP_POPULATE is not defined on this platform");
#endif
    }
    _nodes = (Node*)mmap(0, size, PROT_READ, flags, _fd, 0);
    _n_nodes = (S)(size / _s);

    // Find the roots by scanning the end of the file and taking the nodes with most descendants
    _roots.clear();
    S m = -1;
    for (S i = _n_nodes - 1; i >= 0; i--) {
      S k = _get(i)->n_descendants;
      if (m == -1 || k == m) {
        _roots.push_back(i);
        m = k;
      } else {
        break;
      }
    }
    // hacky fix: since the last root precedes the copy of all roots, delete it
    if (_roots.size() > 1 && _get(_roots.front())->children[0] == _get(_roots.back())->children[0])
      _roots.pop_back();
    _loaded = true;
    _built = true;
    _n_items = m;
    if (_verbose) annoylib_showUpdate("found %zu roots with degree %d\n", _roots.size(), m);
    return true;
  }

  // Returns the normalized distance, under metric D, between the stored
  // nodes with indices i and j. The indices are not range-checked here.
  T get_distance(S i, S j) const {
    Node* first = _get(i);
    Node* second = _get(j);
    const T raw = D::distance(first, second, _f);
    return D::normalized_distance(raw);
  }

  // Searches for the n nearest neighbours of the stored item `item` by
  // running the generic search on that item's vector. Ids are appended to
  // `result` and, when `distances` is non-null, distances are appended there.
  void get_nns_by_item(S item, size_t n, int search_k, vector<S>* result, vector<T>* distances) const {
    // TODO: handle OOB
    _get_all_nns(_get(item)->v, n, search_k, result, distances);
  }

  // Searches for the n nearest neighbours of the query vector `w`.
  // Forwards every argument unchanged to _get_all_nns(), which appends ids to
  // `result` and, when `distances` is non-null, distances to `distances`.
  void get_nns_by_vector(const T* w, size_t n, int search_k, vector<S>* result, vector<T>* distances) const {
    _get_all_nns(w, n, search_k, result, distances);
  }

  // Returns the current value of the item counter (_n_items).
  S get_n_items() const {
    return _n_items;
  }

  // Returns the number of trees, i.e. the number of entries in the root list.
  S get_n_trees() const {
    return static_cast<S>(_roots.size());
  }

  // Stores `v` in the _verbose flag, which gates the progress/diagnostic
  // messages printed through annoylib_showUpdate elsewhere in this class.
  void verbose(bool v) {
    _verbose = v;
  }

  // Copies the _f components of the stored item `item` into the
  // caller-provided buffer `v`. The index is not range-checked here.
  void get_item(S item, T* v) const {
    // TODO: handle OOB
    const size_t n_bytes = (_f) * sizeof(T);
    memcpy(v, _get(item)->v, n_bytes);
  }

  // Stores `seed` in _seed; thread_build() derives each thread's random
  // generator from this value.
  void set_seed(R seed) {
    _seed = seed;
  }

  // Builds trees on behalf of one build thread and appends their roots to the
  // shared root list.
  //
  // q:          number of trees this call should build, or -1 to keep building
  //             until the total node count reaches twice the item count.
  // thread_idx: added to the seed so that each caller gets its own generator.
  // threaded_build_policy: supplies the lock/unlock hooks used around shared
  //             state (node count, node storage, root list).
  void thread_build(int q, int thread_idx, ThreadedBuildPolicy& threaded_build_policy) {
    // Each thread needs its own seed, otherwise each thread would be building the same tree(s)
    Random _random(_seed + thread_idx);

    vector<S> thread_roots;
    while (1) {
      if (q == -1) {
        // Open-ended mode: the stop condition reads the shared node counter,
        // so it is evaluated while holding the node-count lock.
        threaded_build_policy.lock_n_nodes();
        if (_n_nodes >= 2 * _n_items) {
          threaded_build_policy.unlock_n_nodes();
          break;
        }
        threaded_build_policy.unlock_n_nodes();
      } else {
        // Fixed mode: stop once this caller has produced q roots.
        if (thread_roots.size() >= (size_t)q) {
          break;
        }
      }

      if (_verbose) annoylib_showUpdate("pass %zd...\n", thread_roots.size());

      // Collect every item slot that actually holds an item (slots with
      // n_descendants < 1 are skipped); read under the shared nodes lock.
      vector<S> indices;
      threaded_build_policy.lock_shared_nodes();
      for (S i = 0; i < _n_items; i++) {
        if (_get(i)->n_descendants >= 1) { // Issue #223
          indices.push_back(i);
        }
      }
      threaded_build_policy.unlock_shared_nodes();

      thread_roots.push_back(_make_tree(indices, true, _random, threaded_build_policy));
    }

    // Publish the locally collected roots in one step under the roots lock.
    threaded_build_policy.lock_roots();
    _roots.insert(_roots.end(), thread_roots.begin(), thread_roots.end());
    threaded_build_policy.unlock_roots();
  }

protected:
  // Grows the node storage so that it can hold at least `n` nodes.
  //
  // The new capacity is the larger of `n` and (current capacity + 1) * 1.3.
  // An on-disk index is grown by remapping/truncating the backing file (a
  // failure is only reported when verbose); a heap index is grown with
  // realloc and the newly added tail is zero-filled.
  void _reallocate_nodes(S n) {
    void* const previous = _nodes;
    const double reallocation_factor = 1.3;
    const S grown = (S) ((_nodes_size + 1) * reallocation_factor);
    const S target = std::max(n, grown);

    if (!_on_disk) {
      _nodes = realloc(_nodes, _s * target);
      // Zero only the region that did not exist before the reallocation.
      char* const fresh_region = (char *) _nodes + _nodes_size * _s;
      memset(fresh_region, 0, (target - _nodes_size) * _s);
    } else {
      const size_t old_bytes = static_cast<size_t>(_s) * static_cast<size_t>(_nodes_size);
      const size_t new_bytes = static_cast<size_t>(_s) * static_cast<size_t>(target);
      const bool remapped = remap_memory_and_truncate(&_nodes, _fd, old_bytes, new_bytes);
      if (!remapped && _verbose)
        annoylib_showUpdate("File truncation error\n");
    }

    _nodes_size = target;
    if (_verbose) annoylib_showUpdate("Reallocating to %d nodes: old_address=%p, new_address=%p\n", target, previous, _nodes);
  }

  // Ensures capacity for at least `n` nodes, growing the storage under the
  // policy's exclusive nodes lock when the current capacity is too small.
  void _allocate_size(S n, ThreadedBuildPolicy& threaded_build_policy) {
    if (n <= _nodes_size)
      return;

    threaded_build_policy.lock_nodes();
    _reallocate_nodes(n);
    threaded_build_policy.unlock_nodes();
  }

  // Ensures capacity for at least `n` nodes without taking any lock; the
  // storage is grown only when the current capacity is too small.
  void _allocate_size(S n) {
    if (n <= _nodes_size)
      return;

    _reallocate_nodes(n);
  }

  // Returns a pointer to node `i` by delegating to get_node_ptr with the node
  // storage and the per-node size _s. No bounds check is performed here.
  Node* _get(const S i) const {
    return get_node_ptr<S, Node>(_nodes, _s, i);
  }

  // Measures how lopsided a split is: the fraction of items that ended up on
  // the larger side. A tiny constant in the denominator avoids 0/0 when both
  // sides are empty. The fraction is deliberately held in single precision.
  double _split_imbalance(const vector<S>& left_indices, const vector<S>& right_indices) {
    const double n_left = (float)left_indices.size();
    const double n_right = (float)right_indices.size();
    const float left_share = n_left / (n_left + n_right + 1e-9);  // Avoid 0/0
    const float right_share = 1 - left_share;
    if (left_share < right_share)
      return right_share;
    return left_share;
  }

  // Recursively builds a (sub)tree over the items in `indices` and returns
  // the index of the node that represents it.
  //
  // indices:  item indices to place under this node.
  // is_root:  true for the top-level call of a tree; root nodes always store
  //           _n_items as their descendant count.
  // _random:  generator used for split creation and the random fallback.
  // threaded_build_policy: lock hooks guarding the node counter and storage.
  S _make_tree(const vector<S>& indices, bool is_root, Random& _random, ThreadedBuildPolicy& threaded_build_policy) {
    // The basic rule is that if we have <= _K items, then it's a leaf node, otherwise it's a split node.
    // There's some regrettable complications caused by the problem that root nodes have to be "special":
    // 1. We identify root nodes by the arguable logic that _n_items == n->n_descendants, regardless of how many descendants they actually have
    // 2. Root nodes with only 1 child need to be a "dummy" parent
    // 3. Due to the _n_items "hack", we need to be careful with the cases where _n_items <= _K or _n_items > _K
    // A single non-root item is represented by the item node itself.
    if (indices.size() == 1 && !is_root)
      return indices[0];

    // Leaf case: allocate one new node and store the item indices directly in
    // its children array.
    if (indices.size() <= (size_t)_K && (!is_root || (size_t)_n_items <= (size_t)_K || indices.size() == 1)) {
      // Reserve the next node slot while holding the node-count lock.
      threaded_build_policy.lock_n_nodes();
      _allocate_size(_n_nodes + 1, threaded_build_policy);
      S item = _n_nodes++;
      threaded_build_policy.unlock_n_nodes();

      threaded_build_policy.lock_shared_nodes();
      Node* m = _get(item);
      m->n_descendants = is_root ? _n_items : (S)indices.size();

      // Using std::copy instead of a loop seems to resolve issues #3 and #13,
      // probably because gcc 4.8 goes overboard with optimizations.
      // Using memcpy instead of std::copy for MSVC compatibility. #235
      // Only copy when necessary to avoid crash in MSVC 9. #293
      if (!indices.empty())
        memcpy(m->children, &indices[0], indices.size() * sizeof(S));

      threaded_build_policy.unlock_shared_nodes();
      return item;
    }

    // Split case: gather the node pointers of all items (under the shared
    // nodes lock) so that the metric can propose a split.
    threaded_build_policy.lock_shared_nodes();
    vector<Node*> children;
    for (size_t i = 0; i < indices.size(); i++) {
      S j = indices[i];
      Node* n = _get(j);
      if (n)
        children.push_back(n);
    }

    vector<S> children_indices[2];
    // Scratch split node of _s bytes on the stack; copied into the node
    // storage at the end of this function.
    Node* m = (Node*)alloca(_s);

    // Try up to three splits and keep the first whose imbalance is < 0.95.
    for (int attempt = 0; attempt < 3; attempt++) {
      children_indices[0].clear();
      children_indices[1].clear();
      D::create_split(children, _f, _s, _random, m);

      for (size_t i = 0; i < indices.size(); i++) {
        S j = indices[i];
        Node* n = _get(j);
        if (n) {
          bool side = D::side(m, n, _f, _random);
          children_indices[side].push_back(j);
        } else {
          annoylib_showUpdate("No node for index %d?\n", j);
        }
      }

      if (_split_imbalance(children_indices[0], children_indices[1]) < 0.95)
        break;
    }
    threaded_build_policy.unlock_shared_nodes();

    // If we didn't find a hyperplane, just randomize sides as a last option
    while (_split_imbalance(children_indices[0], children_indices[1]) > 0.99) {
      if (_verbose)
        annoylib_showUpdate("\tNo hyperplane found (left has %zu children, right has %zu children)\n",
          children_indices[0].size(), children_indices[1].size());

      children_indices[0].clear();
      children_indices[1].clear();

      // Set the vector to 0.0
      for (int z = 0; z < _f; z++)
        m->v[z] = 0;

      for (size_t i = 0; i < indices.size(); i++) {
        S j = indices[i];
        // Just randomize...
        children_indices[_random.flip()].push_back(j);
      }
    }

    // flip is 1 when side 0 is the larger one, so that side^flip visits the
    // smaller side first in the loop below.
    int flip = (children_indices[0].size() > children_indices[1].size());

    m->n_descendants = is_root ? _n_items : (S)indices.size();
    for (int side = 0; side < 2; side++) {
      // run _make_tree for the smallest child first (for cache locality)
      m->children[side^flip] = _make_tree(children_indices[side^flip], false, _random, threaded_build_policy);
    }

    // Children are built first, so this split node is allocated after (and
    // therefore gets a higher index than) the nodes created for its subtrees.
    threaded_build_policy.lock_n_nodes();
    _allocate_size(_n_nodes + 1, threaded_build_policy);
    S item = _n_nodes++;
    threaded_build_policy.unlock_n_nodes();

    threaded_build_policy.lock_shared_nodes();
    memcpy(_get(item), m, _s);
    threaded_build_policy.unlock_shared_nodes();

    return item;
  }

  // Core nearest-neighbour search shared by the public query methods.
  //
  // v:         query vector with _f components.
  // n:         maximum number of neighbours to return.
  // search_k:  number of candidates to collect before stopping; -1 means
  //            n * number of trees.
  // result:    receives the ids of the neighbours, nearest first (appended).
  // distances: when non-null, receives the matching normalized distances.
  void _get_all_nns(const T* v, size_t n, int search_k, vector<S>* result, vector<T>* distances) const {
    // Wrap the query vector in a stack-allocated scratch node so that the
    // metric's node-based distance function can be applied to it.
    Node* v_node = (Node *)alloca(_s);
    D::template zero_value<Node>(v_node);
    memcpy(v_node->v, v, sizeof(T) * _f);
    D::init_node(v_node, _f);

    std::priority_queue<pair<T, S> > q;

    if (search_k == -1) {
      search_k = n * _roots.size();
    }

    // Seed the queue with every root at the metric's initial priority.
    for (size_t i = 0; i < _roots.size(); i++) {
      q.push(make_pair(Distance::template pq_initial_value<T>(), _roots[i]));
    }

    // Repeatedly expand the highest-priority node until enough candidates
    // have been gathered or the queue runs dry.
    std::vector<S> nns;
    while (nns.size() < (size_t)search_k && !q.empty()) {
      const pair<T, S>& top = q.top();
      // Copy both fields out before q.pop() invalidates the reference.
      T d = top.first;
      S i = top.second;
      Node* nd = _get(i);
      q.pop();
      if (nd->n_descendants == 1 && i < _n_items) {
        // An item node: it is a candidate itself.
        nns.push_back(i);
      } else if (nd->n_descendants <= _K) {
        // A leaf bucket: its children array holds item ids directly.
        const S* dst = nd->children;
        nns.insert(nns.end(), dst, &dst[nd->n_descendants]);
      } else {
        // A split node: queue both children with priorities derived from the
        // query's margin to the split.
        T margin = D::margin(nd, v, _f);
        q.push(make_pair(D::pq_distance(d, margin, 1), static_cast<S>(nd->children[1])));
        q.push(make_pair(D::pq_distance(d, margin, 0), static_cast<S>(nd->children[0])));
      }
    }

    // Get distances for all items
    // To avoid calculating distance multiple times for any items, sort by id
    std::sort(nns.begin(), nns.end());
    vector<pair<T, S> > nns_dist;
    S last = -1;
    for (size_t i = 0; i < nns.size(); i++) {
      S j = nns[i]; 
      if (j == last)
        continue;
      last = j;
      if (_get(j)->n_descendants == 1)  // This is only to guard a really obscure case, #284
        nns_dist.push_back(make_pair(D::distance(v_node, _get(j), _f), j));
    }

    // Only the first p entries need to be in order, hence partial_sort.
    size_t m = nns_dist.size();
    size_t p = n < m ? n : m; // Return this many items
    std::partial_sort(nns_dist.begin(), nns_dist.begin() + p, nns_dist.end());
    for (size_t i = 0; i < p; i++) {
      if (distances)
        distances->push_back(D::normalized_distance(nns_dist[i].first));
      result->push_back(nns_dist[i].second);
    }
  }
};

// Build policy that constructs all trees on the calling thread.
// Every lock/unlock hook is an empty function, and build() simply runs
// thread_build() once with thread index 0. The n_threads argument is ignored.
class AnnoyIndexSingleThreadedBuildPolicy {
public:
  template<typename S, typename T, typename D, typename Random>
  static void build(AnnoyIndex<S, T, D, Random, AnnoyIndexSingleThreadedBuildPolicy>* annoy, int q, int n_threads) {
    AnnoyIndexSingleThreadedBuildPolicy policy;
    const int only_thread = 0;
    annoy->thread_build(q, only_thread, policy);
  }

  // Node counter: no synchronization needed with a single thread.
  void lock_n_nodes() {}
  void unlock_n_nodes() {}

  // Exclusive access to the node storage.
  void lock_nodes() {}
  void unlock_nodes() {}

  // Shared access to the node storage.
  void lock_shared_nodes() {}
  void unlock_shared_nodes() {}

  // Root list.
  void lock_roots() {}
  void unlock_roots() {}
};

#ifdef ANNOYLIB_MULTITHREADED_BUILD
// Build policy that spreads tree construction over several std::threads.
//
// Three locks guard the state shared between the build threads:
//   nodes_mutex   - node storage; exclusive while it is reallocated, shared
//                   while nodes are read or written in place.
//   n_nodes_mutex - the node counter.
//   roots_mutex   - the root list.
class AnnoyIndexMultiThreadedBuildPolicy {
private:
  std::shared_timed_mutex nodes_mutex;
  std::mutex n_nodes_mutex;
  std::mutex roots_mutex;

public:
  // Runs thread_build() on n_threads threads and waits for all of them.
  //
  // q:         total number of trees to build, or -1 for the open-ended mode
  //            (each thread is then started with -1 as well).
  // n_threads: number of threads; any value below 1 (including the
  //            conventional -1) selects the hardware concurrency. Previously
  //            0 silently built nothing and values below -1 made the vector
  //            constructor below throw.
  template<typename S, typename T, typename D, typename Random>
  static void build(AnnoyIndex<S, T, D, Random, AnnoyIndexMultiThreadedBuildPolicy>* annoy, int q, int n_threads) {
    AnnoyIndexMultiThreadedBuildPolicy threaded_build_policy;
    if (n_threads < 1) {
      // If the hardware_concurrency() value is not well defined or not computable, it returns 0.
      // We guard against this by using at least 1 thread.
      n_threads = std::max(1, (int)std::thread::hardware_concurrency());
    }

    vector<std::thread> threads(n_threads);

    for (int thread_idx = 0; thread_idx < n_threads; thread_idx++) {
      // Integer division distributes q trees so that the per-thread counts
      // add up to exactly q.
      int trees_per_thread = q == -1 ? -1 : (q + thread_idx) / n_threads;

      threads[thread_idx] = std::thread(
        &AnnoyIndex<S, T, D, Random, AnnoyIndexMultiThreadedBuildPolicy>::thread_build,
        annoy,
        trees_per_thread,
        thread_idx,
        std::ref(threaded_build_policy)
      );
    }

    for (auto& thread : threads) {
      thread.join();
    }
  }

  void lock_n_nodes() {
    n_nodes_mutex.lock();
  }
  void unlock_n_nodes() {
    n_nodes_mutex.unlock();
  }

  void lock_nodes() {
    nodes_mutex.lock();
  }
  void unlock_nodes() {
    nodes_mutex.unlock();
  }

  void lock_shared_nodes() {
    nodes_mutex.lock_shared();
  }
  void unlock_shared_nodes() {
    nodes_mutex.unlock_shared();
  }

  void lock_roots() {
    roots_mutex.lock();
  }
  void unlock_roots() {
    roots_mutex.unlock();
  }
};
#endif

}

#endif
// vim: tabstop=2 shiftwidth=2


================================================
FILE: src/annoyluamodule.cc
================================================
// Copyright (c) 2016 Boris Nagaev
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.

#include <cstring>
#include <typeinfo>

#include <lua.hpp>

#include "annoylib.h"
#include "kissrandom.h"

#if LUA_VERSION_NUM == 501
#define compat_setfuncs(L, funcs) luaL_register(L, NULL, funcs)
#define compat_rawlen lua_objlen
#else
#define compat_setfuncs(L, funcs) luaL_setfuncs(L, funcs, 0)
#define compat_rawlen lua_rawlen
#endif

using namespace Annoy;

// Lua binding for one Annoy index type, parameterized by the distance metric.
//
// Every static member with the signature int(lua_State*) is a Lua C function;
// argument 1 is always the index userdata created by createNew(). Operations
// that can fail in the underlying index (add_item, build, on_disk_build,
// save, load) raise a Lua error instead of reporting success.
template<typename Distance>
class LuaAnnoy {
public:
  typedef int32_t AnnoyS;
  typedef float AnnoyT;
  typedef AnnoyIndex<AnnoyS, AnnoyT, Distance, Kiss64Random, AnnoyIndexSingleThreadedBuildPolicy> Impl;
  typedef LuaAnnoy<Distance> ThisClass;

  // Read-only, zero-based view of a Lua array table living at stack position
  // `object`. The constructor raises a Lua argument error unless the value is
  // a table whose raw length equals `f`.
  class LuaArrayProxy {
  public:
    LuaArrayProxy(lua_State* L, int object, int f)
      : L_(L)
      , object_(object)
    {
      luaL_checktype(L, object, LUA_TTABLE);
      int v_len = compat_rawlen(L, object);
      luaL_argcheck(L, v_len == f, object, "Length of v != f");
    }

    // Fetches element `index` (zero-based; Lua tables are one-based) and
    // leaves the Lua stack as it found it.
    double operator[](int index) const {
      lua_rawgeti(L_, object_, index + 1);
      double result = lua_tonumber(L_, -1);
      lua_pop(L_, 1);
      return result;
    }

  private:
    lua_State* L_;
    int object_;
  };

  // Copies the `f` numbers of the Lua table at `object` into `dst`.
  static void toVector(lua_State* L, int object, int f, AnnoyT* dst) {
    LuaArrayProxy proxy(L, object, f);
    for (int i = 0; i < f; i++) {
      dst[i] = proxy[i];
    }
  }

  // Pushes a new Lua array table holding the elements of `v` (one-based).
  template <typename Vector>
  static void pushVector(lua_State* L, const Vector& v) {
    lua_createtable(L, v.size(), 0);
    // size_t index avoids comparing a signed counter with v.size().
    for (size_t j = 0; j < v.size(); j++) {
      lua_pushnumber(L, v[j]);
      lua_rawseti(L, -2, static_cast<int>(j + 1));
    }
  }

  // Name under which the metatable for this index type is registered.
  static const char* typeAsString() {
    return typeid(Impl).name();
  }

  // Returns the index stored in the userdata at `object`, raising a Lua
  // error when the value is not a userdata of this type.
  static Impl* getAnnoy(lua_State* L, int object) {
    return reinterpret_cast<Impl*>(
      luaL_checkudata(L, object, typeAsString())
    );
  }

  // Reads an item index from argument `object`, requiring it to be >= 0 and,
  // when `size` is not -1, strictly below `size`.
  static int getItemIndex(lua_State* L, int object, int size = -1) {
    int item = luaL_checkinteger(L, object);
    luaL_argcheck(L, item >= 0, object, "Index must be >= 0");
    if (size != -1) {
      luaL_argcheck(L, item < size, object, "Index must be < size");
    }
    return item;
  }

  // __gc metamethod: runs the destructor of the index that createNew()
  // constructed in place inside the userdata.
  static int gc(lua_State* L) {
    Impl* self = getAnnoy(L, 1);
    self->~Impl();
    return 0;
  }

  // __tostring metamethod.
  static int tostring(lua_State* L) {
    Impl* self = getAnnoy(L, 1);
    lua_pushfstring(
      L,
      "annoy.AnnoyIndex object (%dx%d, %s distance)",
      self->get_n_items(), self->get_f(), Distance::name()
    );
    return 1;
  }

  // index:add_item(item, vector). Raises a Lua error when the index rejects
  // the item instead of silently dropping it.
  static int add_item(lua_State* L) {
    Impl* self = getAnnoy(L, 1);
    int item = getItemIndex(L, 2);
    if (!self->add_item_impl(item, LuaArrayProxy(L, 3, self->get_f()))) {
      return luaL_error(L, "Can't add item: %d", item);
    }
    return 0;
  }

  // index:build(n_trees). Builds with a single thread; returns true, or
  // raises a Lua error when the index refuses to build.
  static int build(lua_State* L) {
    Impl* self = getAnnoy(L, 1);
    int n_trees = luaL_checkinteger(L, 2);
    if (!self->build(n_trees, 1)) {
      return luaL_error(L, "Can't build index");
    }
    lua_pushboolean(L, true);
    return 1;
  }

  // index:on_disk_build(filename). Returns true, or raises a Lua error when
  // the index reports failure.
  static int on_disk_build(lua_State* L) {
    Impl* self = getAnnoy(L, 1);
    const char* filename = luaL_checkstring(L, 2);
    if (!self->on_disk_build(filename)) {
      return luaL_error(L, "Can't build on disk: %s", filename);
    }
    lua_pushboolean(L, true);
    return 1;
  }

  // index:save(filename [, prefault]). prefault defaults to true. Returns
  // true, or raises a Lua error when the index could not be saved.
  static int save(lua_State* L) {
    int nargs = lua_gettop(L);
    Impl* self = getAnnoy(L, 1);
    const char* filename = luaL_checkstring(L, 2);
    bool prefault = true;
    if (nargs >= 3) {
      prefault = lua_toboolean(L, 3);
    }
    if (!self->save(filename, prefault)) {
      return luaL_error(L, "Can't save file: %s", filename);
    }
    lua_pushboolean(L, true);
    return 1;
  }

  // index:load(filename [, prefault]). prefault defaults to true. Returns
  // true, or raises a Lua error when the file could not be loaded.
  static int load(lua_State* L) {
    Impl* self = getAnnoy(L, 1);
    int nargs = lua_gettop(L);
    const char* filename = luaL_checkstring(L, 2);
    bool prefault = true;
    if (nargs >= 3) {
      prefault = lua_toboolean(L, 3);
    }
    if (!self->load(filename, prefault)) {
      return luaL_error(L, "Can't load file: %s", filename);
    }
    lua_pushboolean(L, true);
    return 1;
  }

  // index:unload(). Always returns true.
  static int unload(lua_State* L) {
    Impl* self = getAnnoy(L, 1);
    self->unload();
    lua_pushboolean(L, true);
    return 1;
  }

  // Parses the arguments shared by both query functions
  // (self, <item or vector>, n [, search_k [, include_distances]]) and owns
  // the output buffers of the search.
  struct Searcher {
    std::vector<AnnoyS> result;
    std::vector<AnnoyT> distances;
    Impl* self;
    int n;
    int search_k;
    bool include_distances;

    Searcher(lua_State* L) {
      int nargs = lua_gettop(L);
      self = getAnnoy(L, 1);
      n = luaL_checkinteger(L, 3);
      search_k = -1;
      if (nargs >= 4) {
        search_k = luaL_checkinteger(L, 4);
      }
      include_distances = false;
      if (nargs >= 5) {
        include_distances = lua_toboolean(L, 5);
      }
    }

    // Pushes the id table and, when requested, the distance table; returns
    // the number of values pushed.
    int pushResults(lua_State* L) {
      pushVector(L, result);
      if (include_distances) {
        pushVector(L, distances);
      }
      return include_distances ? 2 : 1;
    }
  };

  // index:get_nns_by_item(item, n [, search_k [, include_distances]]).
  static int get_nns_by_item(lua_State* L) {
    Searcher s(L);
    int item = getItemIndex(L, 2, s.self->get_n_items());
    s.self->get_nns_by_item(item, s.n, s.search_k, &s.result,
        s.include_distances ? &s.distances : NULL);
    return s.pushResults(L);
  }

  // index:get_nns_by_vector(vector, n [, search_k [, include_distances]]).
  static int get_nns_by_vector(lua_State* L) {
    Searcher s(L);
    std::vector<AnnoyT> _vec(s.self->get_f());
    AnnoyT* vec = &(_vec[0]);
    toVector(L, 2, s.self->get_f(), vec);
    s.self->get_nns_by_vector(vec, s.n, s.search_k, &s.result,
        s.include_distances ? &s.distances : NULL);
    return s.pushResults(L);
  }

  // index:get_item_vector(item). Returns the stored vector as a table.
  static int get_item_vector(lua_State* L) {
    Impl* self = getAnnoy(L, 1);
    int item = getItemIndex(L, 2, self->get_n_items());
    std::vector<AnnoyT> _vec(self->get_f());
    AnnoyT* vec = &(_vec[0]);
    self->get_item(item, vec);
    pushVector(L, _vec);
    return 1;
  }

  // index:get_distance(i, j). Both indices must be below get_n_items().
  static int get_distance(lua_State* L) {
    Impl* self = getAnnoy(L, 1);
    int i = getItemIndex(L, 2, self->get_n_items());
    int j = getItemIndex(L, 3, self->get_n_items());
    AnnoyT distance = self->get_distance(i, j);
    lua_pushnumber(L, distance);
    return 1;
  }

  // index:get_n_items().
  static int get_n_items(lua_State* L) {
    Impl* self = getAnnoy(L, 1);
    lua_pushnumber(L, self->get_n_items());
    return 1;
  }

  // Metamethods installed directly on the metatable.
  static const luaL_Reg* getMetatable() {
    static const luaL_Reg funcs[] = {
      {"__gc", &ThisClass::gc},
      {"__tostring", &ThisClass::tostring},
      {NULL, NULL},
    };
    return funcs;
  }

  // Methods exposed through the metatable's __index table.
  static const luaL_Reg* getMethods() {
    static const luaL_Reg funcs[] = {
      {"add_item", &ThisClass::add_item},
      {"build", &ThisClass::build},
      {"save", &ThisClass::save},
      {"load", &ThisClass::load},
      {"unload", &ThisClass::unload},
      {"get_nns_by_item", &ThisClass::get_nns_by_item},
      {"get_nns_by_vector", &ThisClass::get_nns_by_vector},
      {"get_item_vector", &ThisClass::get_item_vector},
      {"get_distance", &ThisClass::get_distance},
      {"get_n_items", &ThisClass::get_n_items},
      {"on_disk_build", &ThisClass::on_disk_build},
      {NULL, NULL},
    };
    return funcs;
  }

  // Pushes a new index userdata of dimension `f`. The metatable is populated
  // only the first time it is created for this index type; the index itself
  // is constructed in place inside the userdata block.
  static void createNew(lua_State* L, int f) {
    void* self = lua_newuserdata(L, sizeof(Impl));
    if (luaL_newmetatable(L, typeAsString())) {
      compat_setfuncs(L, getMetatable());
      lua_newtable(L);
      compat_setfuncs(L, getMethods());
      lua_setfield(L, -2, "__index");
    }
    new (self) Impl(f);
    lua_setmetatable(L, -2);
  }
};

// annoy.AnnoyIndex(f [, metric]): pushes a new index userdata of dimension f.
// The metric defaults to "angular"; "euclidean" and "manhattan" are the other
// accepted names. Any other name raises a Lua error.
static int lua_an_make(lua_State* L) {
  const int f = luaL_checkinteger(L, 1);
  const bool has_metric = lua_gettop(L) >= 2;
  const char* metric = has_metric ? luaL_checkstring(L, 2) : "angular";

  if (!strcmp(metric, "angular")) {
    LuaAnnoy<Angular>::createNew(L, f);
  } else if (!strcmp(metric, "euclidean")) {
    LuaAnnoy<Euclidean>::createNew(L, f);
  } else if (!strcmp(metric, "manhattan")) {
    LuaAnnoy<Manhattan>::createNew(L, f);
  } else {
    return luaL_error(L, "Unknown metric: %s", metric);
  }
  return 1;
}

// Functions exported by the module table: a single constructor,
// AnnoyIndex, implemented by lua_an_make. The {NULL, NULL} entry terminates
// the list.
static const luaL_Reg LUA_ANNOY_FUNCS[] = {
  {"AnnoyIndex", lua_an_make},
  {NULL, NULL},
};

// Module entry point with C linkage: creates the module table, registers
// the functions of LUA_ANNOY_FUNCS in it and returns it to Lua.
extern "C" int luaopen_annoy(lua_State* L) {
  lua_newtable(L);
  compat_setfuncs(L, LUA_ANNOY_FUNCS);
  return 1;
}

// vim: tabstop=2 shiftwidth=2


================================================
FILE: src/annoymodule.cc
================================================
// Copyright (c) 2013 Spotify AB
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.

#include "annoylib.h"
#include "kissrandom.h"
#include "Python.h"
#include "structmember.h"
#include <exception>
#if defined(_MSC_VER) && _MSC_VER == 1500
typedef signed __int32    int32_t;
#else
#include <stdint.h>
#endif


// AVX_INFO: human-readable description of which ANNOYLIB_USE_AVX* macro is
// defined at compile time (512-bit, 128-bit, or none of the two).
#if defined(ANNOYLIB_USE_AVX512)
#define AVX_INFO "Using 512-bit AVX instructions"
#elif defined(ANNOYLIB_USE_AVX128)
#define AVX_INFO "Using 128-bit AVX instructions"
#else
#define AVX_INFO "Not using AVX instructions"
#endif

// COMPILER_INFO: human-readable description of the compiler.
// NOTE(review): any compiler that defines __GNUC__ is reported as GCC here.
#if defined(_MSC_VER)
#define COMPILER_INFO "Compiled using MSC"
#elif defined(__GNUC__)
#define COMPILER_INFO "Compiled on GCC"
#else
#define COMPILER_INFO "Compiled on unknown platform"
#endif

// Combined string built from the two descriptions above via literal
// concatenation.
#define ANNOY_DOC (COMPILER_INFO ". " AVX_INFO ".")

// Python 2 / Python 3 compatibility shims.
#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
#endif

// Fallback for Python headers that do not provide Py_TYPE.
#ifndef Py_TYPE
    #define Py_TYPE(ob) (((PyObject*)(ob))->ob_type)
#endif

// Under Python 3 the PyInt_* name used in this file maps to PyLong_*.
#ifdef IS_PY3K
    #define PyInt_FromLong PyLong_FromLong 
#endif

using namespace Annoy;

// Select the build policy used by every index type created in this module:
// multi-threaded when ANNOYLIB_MULTITHREADED_BUILD is defined, otherwise
// single-threaded.
#ifdef ANNOYLIB_MULTITHREADED_BUILD
  typedef AnnoyIndexMultiThreadedBuildPolicy AnnoyIndexThreadedBuildPolicy;
#else
  typedef AnnoyIndexSingleThreadedBuildPolicy AnnoyIndexThreadedBuildPolicy;
#endif

// Explicit instantiation of the interface used by the Python object below.
template class Annoy::AnnoyIndexInterface<int32_t, float>;

// Adapter that exposes a Hamming-distance index through the float-based
// AnnoyIndexInterface, using composition.
//
// Callers pass binary vectors as floats (a component counts as 1 when it is
// greater than 0.5). Internally the bits are packed into uint64_t words, 64
// components per word, and stored in an index over uint64_t.
// This is questionable from a performance point of view. Should reconsider this solution.
class HammingWrapper : public AnnoyIndexInterface<int32_t, float> {
private:
  // _f_external: number of float components seen by callers.
  // _f_internal: number of 64-bit words needed to hold them.
  int32_t _f_external, _f_internal;
  AnnoyIndex<int32_t, uint64_t, Hamming, Kiss64Random, AnnoyIndexThreadedBuildPolicy> _index;

  // Packs _f_external floats from `src` into _f_internal words in `dst`.
  void _pack(const float* src, uint64_t* dst) const {
    for (int32_t word = 0; word < _f_internal; word++) {
      uint64_t packed = 0;
      for (int32_t bit = 0; bit < 64 && word * 64 + bit < _f_external; bit++) {
        packed |= (uint64_t)(src[word * 64 + bit] > 0.5) << bit;
      }
      dst[word] = packed;
    }
  }

  // Expands the packed words in `src` back into _f_external 0/1 floats.
  void _unpack(const uint64_t* src, float* dst) const {
    for (int32_t component = 0; component < _f_external; component++) {
      const uint64_t word = src[component / 64];
      dst[component] = (word >> (component % 64)) & 1;
    }
  }

public:
  HammingWrapper(int f) : _f_external(f), _f_internal((f + 63) / 64), _index((f + 63) / 64) {}

  bool add_item(int32_t item, const float* w, char**error) {
    vector<uint64_t> packed(_f_internal, 0);
    _pack(w, &packed[0]);
    return _index.add_item(item, &packed[0], error);
  }

  bool build(int q, int n_threads, char** error) {
    return _index.build(q, n_threads, error);
  }

  bool unbuild(char** error) {
    return _index.unbuild(error);
  }

  bool save(const char* filename, bool prefault, char** error) {
    return _index.save(filename, prefault, error);
  }

  void unload() {
    _index.unload();
  }

  bool load(const char* filename, bool prefault, char** error) {
    return _index.load(filename, prefault, error);
  }

  float get_distance(int32_t i, int32_t j) const {
    return _index.get_distance(i, j);
  }

  // The inner index reports distances as uint64_t; they are converted to
  // float and inserted at the front of `distances` when it is requested.
  void get_nns_by_item(int32_t item, size_t n, int search_k, vector<int32_t>* result, vector<float>* distances) const {
    if (!distances) {
      _index.get_nns_by_item(item, n, search_k, result, NULL);
      return;
    }
    vector<uint64_t> raw_distances;
    _index.get_nns_by_item(item, n, search_k, result, &raw_distances);
    distances->insert(distances->begin(), raw_distances.begin(), raw_distances.end());
  }

  // Same as get_nns_by_item, but the query is a float vector that is packed
  // before it is handed to the inner index.
  void get_nns_by_vector(const float* w, size_t n, int search_k, vector<int32_t>* result, vector<float>* distances) const {
    vector<uint64_t> packed(_f_internal, 0);
    _pack(w, &packed[0]);
    if (!distances) {
      _index.get_nns_by_vector(&packed[0], n, search_k, result, NULL);
      return;
    }
    vector<uint64_t> raw_distances;
    _index.get_nns_by_vector(&packed[0], n, search_k, result, &raw_distances);
    distances->insert(distances->begin(), raw_distances.begin(), raw_distances.end());
  }

  int32_t get_n_items() const {
    return _index.get_n_items();
  }

  int32_t get_n_trees() const {
    return _index.get_n_trees();
  }

  void verbose(bool v) {
    _index.verbose(v);
  }

  void get_item(int32_t item, float* v) const {
    vector<uint64_t> packed(_f_internal, 0);
    _index.get_item(item, &packed[0]);
    _unpack(&packed[0], v);
  }

  void set_seed(uint64_t q) {
    _index.set_seed(q);
  }

  bool on_disk_build(const char* filename, char** error) {
    return _index.on_disk_build(filename, error);
  }
};

// annoy python object
// Instance layout of the Python-level index object.
typedef struct {
  PyObject_HEAD
  // Vector dimension passed to the constructor; exposed read/write to Python
  // as the member "f" (see py_annoy_members).
  int f;
  // Owned index implementation, chosen by metric in py_an_new and deleted in
  // py_an_dealloc.
  AnnoyIndexInterface<int32_t, float>* ptr;
} py_annoy;


// tp_new: allocates the Python object and creates the index implementation
// that matches the requested metric.
//
// Python signature: AnnoyIndex(f, metric=None)
//   f      - vector dimension.
//   metric - "angular", "euclidean", "manhattan", "hamming" or "dot". When
//            omitted, "angular" is used and a FutureWarning is issued.
//
// Returns a new reference, or NULL with an exception set. On every failure
// path the freshly allocated object is released again.
static PyObject *
py_an_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) {
  py_annoy *self = (py_annoy *)type->tp_alloc(type, 0);
  if (self == NULL) {
    return NULL;
  }
  // Make the failure paths below safe: deallocation deletes self->ptr.
  self->ptr = NULL;
  const char *metric = NULL;

  static char const * kwlist[] = {"f", "metric", NULL};
  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "i|s", (char**)kwlist, &self->f, &metric)) {
    Py_DECREF((PyObject *)self);
    return NULL;
  }
  if (!metric) {
    // This keeps coming up, see #368 etc
    // PyErr_WarnEx returns -1 when the warning is turned into an exception;
    // in that case the exception must be propagated.
    if (PyErr_WarnEx(PyExc_FutureWarning, "The default argument for metric will be removed "
		 "in future version of Annoy. Please pass metric='angular' explicitly.", 1) < 0) {
      Py_DECREF((PyObject *)self);
      return NULL;
    }
    self->ptr = new AnnoyIndex<int32_t, float, Angular, Kiss64Random, AnnoyIndexThreadedBuildPolicy>(self->f);
  } else if (!strcmp(metric, "angular")) {
    self->ptr = new AnnoyIndex<int32_t, float, Angular, Kiss64Random, AnnoyIndexThreadedBuildPolicy>(self->f);
  } else if (!strcmp(metric, "euclidean")) {
    self->ptr = new AnnoyIndex<int32_t, float, Euclidean, Kiss64Random, AnnoyIndexThreadedBuildPolicy>(self->f);
  } else if (!strcmp(metric, "manhattan")) {
    self->ptr = new AnnoyIndex<int32_t, float, Manhattan, Kiss64Random, AnnoyIndexThreadedBuildPolicy>(self->f);
  } else if (!strcmp(metric, "hamming")) {
    self->ptr = new HammingWrapper(self->f);
  } else if (!strcmp(metric, "dot")) {
    self->ptr = new AnnoyIndex<int32_t, float, DotProduct, Kiss64Random, AnnoyIndexThreadedBuildPolicy>(self->f);
  } else {
    PyErr_SetString(PyExc_ValueError, "No such metric");
    Py_DECREF((PyObject *)self);
    return NULL;
  }

  return (PyObject *)self;
}


// tp_init: only validates that the arguments have the expected shape
// (f, metric=None); the actual construction happens in py_an_new.
// Returns 0 on success and -1 with an exception set when parsing fails.
static int 
py_an_init(py_annoy *self, PyObject *args, PyObject *kwargs) {
  // Seems to be needed for Python 3
  const char *metric = NULL;
  int f;
  static char const * kwlist[] = {"f", "metric", NULL};
  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "i|s", (char**)kwlist, &f, &metric))
    return -1;  // tp_init reports failure with -1; 0 would mean success
  return 0;
}


// tp_dealloc: destroys the owned index implementation and then hands the
// object's memory back through the type's tp_free slot.
static void 
py_an_dealloc(py_annoy* self) {
  PyObject* as_object = (PyObject*)self;
  delete self->ptr;
  Py_TYPE(self)->tp_free(as_object);
}


// Member table of the Python type: exposes the int field `f` of py_annoy
// under the name "f" with flags 0 and an empty docstring.
static PyMemberDef py_annoy_members[] = {
  {(char*)"f", T_INT, offsetof(py_annoy, f), 0,
   (char*)""},
  {NULL}	/* Sentinel */
};


// Python method load(fn, prefault=False): loads an index from file `fn`.
// Returns True, or raises IOError carrying the message reported by the index.
static PyObject *
py_an_load(py_annoy *self, PyObject *args, PyObject *kwargs) {
  char *filename;
  char *error = NULL;
  // The "b" format unit stores into an unsigned char, so parse into one and
  // convert to bool afterwards instead of writing raw bytes into a bool.
  unsigned char prefault = 0;
  if (!self->ptr) {
    // Returning NULL requires an exception to be set.
    PyErr_SetString(PyExc_RuntimeError, "Annoy index is not initialized");
    return NULL;
  }
  static char const * kwlist[] = {"fn", "prefault", NULL};
  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|b", (char**)kwlist, &filename, &prefault))
    return NULL;

  if (!self->ptr->load(filename, prefault != 0, &error)) {
    PyErr_SetString(PyExc_IOError, error ? error : "Unable to load index");
    free(error);
    return NULL;
  }
  Py_RETURN_TRUE;
}


// Python method save(fn, prefault=False): writes the index to file `fn`.
// Returns True, or raises IOError carrying the message reported by the index.
static PyObject *
py_an_save(py_annoy *self, PyObject *args, PyObject *kwargs) {
  char *filename;
  char *error = NULL;
  // The "b" format unit stores into an unsigned char, so parse into one and
  // convert to bool afterwards instead of writing raw bytes into a bool.
  unsigned char prefault = 0;
  if (!self->ptr) {
    // Returning NULL requires an exception to be set.
    PyErr_SetString(PyExc_RuntimeError, "Annoy index is not initialized");
    return NULL;
  }
  static char const * kwlist[] = {"fn", "prefault", NULL};
  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|b", (char**)kwlist, &filename, &prefault))
    return NULL;

  if (!self->ptr->save(filename, prefault != 0, &error)) {
    PyErr_SetString(PyExc_IOError, error ? error : "Unable to save index");
    free(error);
    return NULL;
  }
  Py_RETURN_TRUE;
}


// Converts search results into Python objects.
//
// Without include_distances the return value is a list of item ids. With it,
// the return value is a 2-tuple (ids, distances) of two lists.
// Returns a new reference, or NULL when any allocation fails; every object
// created up to that point is released before returning.
PyObject*
get_nns_to_python(const vector<int32_t>& result, const vector<float>& distances, int include_distances) {
  PyObject* ids = PyList_New(result.size());
  if (ids == NULL) {
    return NULL;
  }
  for (size_t i = 0; i < result.size(); i++) {
    PyObject* id = PyInt_FromLong(result[i]);
    if (id == NULL) {
      Py_DECREF(ids);
      return NULL;
    }
    // PyList_SetItem takes over the reference to `id`.
    PyList_SetItem(ids, i, id);
  }
  if (!include_distances) {
    return ids;
  }

  PyObject* dists = PyList_New(distances.size());
  if (dists == NULL) {
    Py_DECREF(ids);
    return NULL;
  }
  for (size_t i = 0; i < distances.size(); i++) {
    PyObject* dist = PyFloat_FromDouble(distances[i]);
    if (dist == NULL) {
      Py_DECREF(ids);
      Py_DECREF(dists);
      return NULL;
    }
    PyList_SetItem(dists, i, dist);
  }

  // The tuple holds its own references, so ours are dropped whether or not
  // packing succeeded.
  PyObject* packed = PyTuple_Pack(2, ids, dists);
  Py_DECREF(ids);
  Py_DECREF(dists);
  return packed;
}


// Validates an item index coming from Python.
// Negative indices are always rejected. Unless `building` is true, the index
// must also be smaller than the current item count. On rejection an
// IndexError is set and false is returned.
bool check_constraints(py_annoy *self, int32_t item, bool building) {
  if (item < 0) {
    PyErr_SetString(PyExc_IndexError, "Item index can not be negative");
    return false;
  }

  // Short-circuit: the item count is not consulted while building.
  if (building || item < self->ptr->get_n_items())
    return true;

  PyErr_SetString(PyExc_IndexError, "Item index larger than the largest item index");
  return false;
}

// Python method `get_nns_by_item(i, n, search_k=-1, include_distances=0)`.
// Validates `i`, runs the query with the GIL released, and converts the
// result via get_nns_to_python().
static PyObject*
py_an_get_nns_by_item(py_annoy *self, PyObject *args, PyObject *kwargs) {
  int32_t item, n, search_k=-1, include_distances=0;
  if (!self->ptr)
    return NULL;

  static char const * kwlist[] = {"i", "n", "search_k", "include_distances", NULL};
  const int parsed = PyArg_ParseTupleAndKeywords(args, kwargs, "ii|ii", (char**)kwlist, &item, &n, &search_k, &include_distances);
  if (!parsed)
    return NULL;

  if (!check_constraints(self, item, false))
    return NULL;

  vector<int32_t> neighbours;
  vector<float> dists;
  // Distances are only collected when the caller asked for them.
  vector<float>* dists_out = include_distances ? &dists : NULL;

  Py_BEGIN_ALLOW_THREADS;
  self->ptr->get_nns_by_item(item, n, search_k, &neighbours, dists_out);
  Py_END_ALLOW_THREADS;

  return get_nns_to_python(neighbours, dists, include_distances);
}


// Copies a Python sequence of numbers into a pre-sized float vector.
//   v  Python object supporting len() and integer indexing
//   f  required length
//   w  destination; element z receives the value of v[z] for z in [0, f)
// Returns true on success. Returns false with a Python exception set when the
// length cannot be obtained, the length differs from `f` (IndexError), an
// element cannot be fetched, or an element cannot be converted to a float.
bool
convert_list_to_vector(PyObject* v, int f, vector<float>* w) {
  Py_ssize_t length = PyObject_Size(v);
  if (length == -1) {
    return false;
  }
  if (length != f) {
    // `length` is a Py_ssize_t, so it must be formatted with %zd; %ld is
    // wrong wherever long is narrower than Py_ssize_t (e.g. 64-bit Windows).
    PyErr_Format(PyExc_IndexError, "Vector has wrong length (expected %d, got %zd)", f, length);
    return false;
  }

  for (int z = 0; z < f; z++) {
    PyObject *key = PyInt_FromLong(z);
    if (key == NULL) {
      return false;
    }
    PyObject *pf = PyObject_GetItem(v, key);
    Py_DECREF(key);
    if (pf == NULL) {
      return false;
    }
    double value = PyFloat_AsDouble(pf);
    Py_DECREF(pf);
    // -1.0 is also a legitimate value, so it only signals failure when an
    // exception is pending.
    if (value == -1.0 && PyErr_Occurred()) {
      return false;
    }
    (*w)[z] = value;
  }
  return true;
}

// Python method `get_nns_by_vector(vector, n, search_k=-1, include_distances=0)`.
// Converts `vector` to floats (it must have exactly self->f elements), runs
// the query with the GIL released, and converts the result via
// get_nns_to_python().
static PyObject*
py_an_get_nns_by_vector(py_annoy *self, PyObject *args, PyObject *kwargs) {
  PyObject* v;
  int32_t n, search_k=-1, include_distances=0;
  if (!self->ptr)
    return NULL;

  static char const * kwlist[] = {"vector", "n", "search_k", "include_distances", NULL};
  const int parsed = PyArg_ParseTupleAndKeywords(args, kwargs, "Oi|ii", (char**)kwlist, &v, &n, &search_k, &include_distances);
  if (!parsed)
    return NULL;

  vector<float> query(self->f);
  if (!convert_list_to_vector(v, self->f, &query))
    return NULL;

  vector<int32_t> neighbours;
  vector<float> dists;
  // Distances are only collected when the caller asked for them.
  vector<float>* dists_out = include_distances ? &dists : NULL;

  Py_BEGIN_ALLOW_THREADS;
  self->ptr->get_nns_by_vector(&query[0], n, search_k, &neighbours, dists_out);
  Py_END_ALLOW_THREADS;

  return get_nns_to_python(neighbours, dists, include_distances);
}


// Python method `get_item_vector(i)`.
// Validates `i`, reads the stored vector (self->f components) and returns it
// as a new Python list of floats. Returns NULL if validation or any
// allocation fails.
static PyObject*
py_an_get_item_vector(py_annoy *self, PyObject *args) {
  int32_t item;
  if (!self->ptr)
    return NULL;
  if (!PyArg_ParseTuple(args, "i", &item))
    return NULL;

  if (!check_constraints(self, item, false))
    return NULL;

  vector<float> components(self->f);
  self->ptr->get_item(item, &components[0]);

  PyObject* out = PyList_New(self->f);
  if (out == NULL)
    return NULL;

  for (int z = 0; z < self->f; z++) {
    PyObject* boxed = PyFloat_FromDouble(components[z]);
    if (boxed == NULL) {
      // Drop the partially filled list before reporting the failure.
      Py_DECREF(out);
      return NULL;
    }
    PyList_SetItem(out, z, boxed);
  }

  return out;
}


// Python method `add_item(i, vector)`.
// Validates `i` (non-negative), converts `vector` to self->f floats and hands
// it to the index. Returns None on success; on failure the message handed
// back through `error` is raised as Exception and then freed.
static PyObject*
py_an_add_item(py_annoy *self, PyObject *args, PyObject* kwargs) {
  PyObject* v;
  int32_t item;
  if (!self->ptr)
    return NULL;

  static char const * kwlist[] = {"i", "vector", NULL};
  const int parsed = PyArg_ParseTupleAndKeywords(args, kwargs, "iO", (char**)kwlist, &item, &v);
  if (!parsed)
    return NULL;

  if (!check_constraints(self, item, true))
    return NULL;

  vector<float> values(self->f);
  if (!convert_list_to_vector(v, self->f, &values))
    return NULL;

  char* error;
  const bool added = self->ptr->add_item(item, &values[0], &error);
  if (added) {
    Py_RETURN_NONE;
  }

  PyErr_SetString(PyExc_Exception, error);
  free(error);
  return NULL;
}

// Python method `on_disk_build(fn)`.
// Forwards to the index's on_disk_build(); returns True on success. On
// failure the message handed back through `error` is raised as IOError and
// then freed.
static PyObject *
py_an_on_disk_build(py_annoy *self, PyObject *args, PyObject *kwargs) {
  char *filename, *error;
  if (!self->ptr)
    return NULL;

  static char const * kwlist[] = {"fn", NULL};
  const int parsed = PyArg_ParseTupleAndKeywords(args, kwargs, "s", (char**)kwlist, &filename);
  if (!parsed)
    return NULL;

  const bool prepared = self->ptr->on_disk_build(filename, &error);
  if (prepared) {
    Py_RETURN_TRUE;
  }

  PyErr_SetString(PyExc_IOError, error);
  free(error);
  return NULL;
}

// Python method `build(n_trees, n_jobs=-1)`.
// Runs the index build with the GIL released. Returns True on success; on
// failure the message handed back through `error` is raised as Exception and
// then freed.
static PyObject *
py_an_build(py_annoy *self, PyObject *args, PyObject *kwargs) {
  int q;
  int n_jobs = -1;
  if (!self->ptr)
    return NULL;

  static char const * kwlist[] = {"n_trees", "n_jobs", NULL};
  const int parsed = PyArg_ParseTupleAndKeywords(args, kwargs, "i|i", (char**)kwlist, &q, &n_jobs);
  if (!parsed)
    return NULL;

  // Declared outside the GIL-released region so they are visible afterwards.
  bool built;
  char* error;
  Py_BEGIN_ALLOW_THREADS;
  built = self->ptr->build(q, n_jobs, &error);
  Py_END_ALLOW_THREADS;

  if (built) {
    Py_RETURN_TRUE;
  }

  PyErr_SetString(PyExc_Exception, error);
  free(error);
  return NULL;
}


// Python method `unbuild()`.
// Forwards to the index's unbuild(); returns True on success. On failure the
// message handed back through `error` is raised as Exception and then freed.
static PyObject *
py_an_unbuild(py_annoy *self) {
  if (!self->ptr)
    return NULL;

  char* error;
  const bool unbuilt = self->ptr->unbuild(&error);
  if (unbuilt) {
    Py_RETURN_TRUE;
  }

  PyErr_SetString(PyExc_Exception, error);
  free(error);
  return NULL;
}


// Python method `unload()`: forwards to the index's unload() and returns True.
static PyObject *
py_an_unload(py_annoy *self) {
  if (self->ptr) {
    self->ptr->unload();
    Py_RETURN_TRUE;
  }
  return NULL;
}


// Python method `get_distance(i, j)`.
// Both indices are validated against the current item count (`i` first; `j`
// is only checked if `i` passed). Returns the distance as a Python float.
static PyObject *
py_an_get_distance(py_annoy *self, PyObject *args) {
  int32_t i, j;
  if (!self->ptr)
    return NULL;
  if (!PyArg_ParseTuple(args, "ii", &i, &j))
    return NULL;

  const bool both_valid = check_constraints(self, i, false) && check_constraints(self, j, false);
  if (!both_valid)
    return NULL;

  const double distance = self->ptr->get_distance(i,j);
  return PyFloat_FromDouble(distance);
}


// Python method `get_n_items()`: returns the index's item count as an int.
static PyObject *
py_an_get_n_items(py_annoy *self) {
  if (!self->ptr)
    return NULL;

  const int32_t item_count = self->ptr->get_n_items();
  return PyInt_FromLong(item_count);
}

// Python method `get_n_trees()`: returns the index's tree count as an int.
static PyObject *
py_an_get_n_trees(py_annoy *self) {
  if (!self->ptr)
    return NULL;

  const int32_t tree_count = self->ptr->get_n_trees();
  return PyInt_FromLong(tree_count);
}

// Python method `verbose(v)`: parses one integer, passes its truth value to
// the index's verbose() and returns True.
static PyObject *
py_an_verbose(py_annoy *self, PyObject *args) {
  int verbose;
  if (!self->ptr)
    return NULL;
  if (!PyArg_ParseTuple(args, "i", &verbose))
    return NULL;

  const bool enabled = (verbose != 0);
  self->ptr->verbose(enabled);

  Py_RETURN_TRUE;
}


// Python method `set_seed(seed)`: parses one integer, forwards it to the
// index's set_seed() and returns None.
static PyObject *
py_an_set_seed(py_annoy *self, PyObject *args) {
  int seed;
  if (!self->ptr)
    return NULL;

  const int parsed = PyArg_ParseTuple(args, "i", &seed);
  if (!parsed)
    return NULL;

  self->ptr->set_seed(seed);

  Py_RETURN_NONE;
}


// Method table for the Annoy type (referenced from PyAnnoyType's tp_methods).
// Each row is {Python name, C implementation, calling-convention flags,
// docstring}; the flags must match the C function's signature
// (METH_VARARGS | METH_KEYWORDS for functions taking kwargs, METH_NOARGS for
// functions taking only self). The all-NULL row terminates the table.
static PyMethodDef AnnoyMethods[] = {
  {"load",	(PyCFunction)py_an_load, METH_VARARGS | METH_KEYWORDS, "Loads (mmaps) an index from disk."},
  {"save",	(PyCFunction)py_an_save, METH_VARARGS | METH_KEYWORDS, "Saves the index to disk."},
  {"get_nns_by_item",(PyCFunction)py_an_get_nns_by_item, METH_VARARGS | METH_KEYWORDS, "Returns the `n` closest items to item `i`.\n\n:param search_k: the query will inspect up to `search_k` nodes.\n`search_k` gives you a run-time tradeoff between better accuracy and speed.\n`search_k` defaults to `n_trees * n` if not provided.\n\n:param include_distances: If `True`, this function will return a\n2 element tuple of lists. The first list contains the `n` closest items.\nThe second list contains the corresponding distances."},
  {"get_nns_by_vector",(PyCFunction)py_an_get_nns_by_vector, METH_VARARGS | METH_KEYWORDS, "Returns the `n` closest items to vector `vector`.\n\n:param search_k: the query will inspect up to `search_k` nodes.\n`search_k` gives you a run-time tradeoff between better accuracy and speed.\n`search_k` defaults to `n_trees * n` if not provided.\n\n:param include_distances: If `True`, this function will return a\n2 element tuple of lists. The first list contains the `n` closest items.\nThe second list contains the corresponding distances."},
  {"get_item_vector",(PyCFunction)py_an_get_item_vector, METH_VARARGS, "Returns the vector for item `i` that was previously added."},
  {"add_item",(PyCFunction)py_an_add_item, METH_VARARGS | METH_KEYWORDS, "Adds item `i` (any nonnegative integer) with vector `v`.\n\nNote that it will allocate memory for `max(i)+1` items."},
  {"on_disk_build",(PyCFunction)py_an_on_disk_build, METH_VARARGS | METH_KEYWORDS, "Build will be performed with storage on disk instead of RAM."},
  {"build",(PyCFunction)py_an_build, METH_VARARGS | METH_KEYWORDS, "Builds a forest of `n_trees` trees.\n\nMore trees give higher precision when querying. After calling `build`,\nno more items can be added. `n_jobs` specifies the number of threads used to build the trees. `n_jobs=-1` uses all available CPU cores."},
  {"unbuild",(PyCFunction)py_an_unbuild, METH_NOARGS, "Unbuilds the tree in order to allows adding new items.\n\nbuild() has to be called again afterwards in order to\nrun queries."},
  {"unload",(PyCFunction)py_an_unload, METH_NOARGS, "Unloads an index from disk."},
  {"get_distance",(PyCFunction)py_an_get_distance, METH_VARARGS, "Returns the distance between items `i` and `j`."},
  {"get_n_items",(PyCFunction)py_an_get_n_items, METH_NOARGS, "Returns the number of items in the index."},
  {"get_n_trees",(PyCFunction)py_an_get_n_trees, METH_NOARGS, "Returns the number of trees in the index."},
  {"verbose",(PyCFunction)py_an_verbose, METH_VARARGS, ""},
  {"set_seed",(PyCFunction)py_an_set_seed, METH_VARARGS, "Sets the seed of Annoy's random number generator."},
  {NULL, NULL, 0, NULL}		 /* Sentinel */
};


// Type object for the Python class exposed as "annoy.Annoy".
// The initializer is positional: every slot must stay in this exact order,
// with unused slots left as 0. The slots that are filled in wire up the
// deallocator, flags (subclassable via Py_TPFLAGS_BASETYPE), docstring,
// method table, member table, and the init/new functions.
static PyTypeObject PyAnnoyType = {
  PyVarObject_HEAD_INIT(NULL, 0)
  "annoy.Annoy",          /*tp_name*/
  sizeof(py_annoy),       /*tp_basicsize*/
  0,                      /*tp_itemsize*/
  (destructor)py_an_dealloc, /*tp_dealloc*/
  0,                      /*tp_print*/
  0,                      /*tp_getattr*/
  0,                      /*tp_setattr*/
  0,                      /*tp_compare*/
  0,                      /*tp_repr*/
  0,                      /*tp_as_number*/
  0,                      /*tp_as_sequence*/
  0,                      /*tp_as_mapping*/
  0,                      /*tp_hash */
  0,                      /*tp_call*/
  0,                      /*tp_str*/
  0,                      /*tp_getattro*/
  0,                      /*tp_setattro*/
  0,                      /*tp_as_buffer*/
  Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/
  ANNOY_DOC,              /* tp_doc */
  0,                      /* tp_traverse */
  0,                      /* tp_clear */
  0,                      /* tp_richcompare */
  0,                      /* tp_weaklistoffset */
  0,                      /* tp_iter */
  0,                      /* tp_iternext */
  AnnoyMethods,           /* tp_methods */
  py_annoy_members,       /* tp_members */
  0,                      /* tp_getset */
  0,                      /* tp_base */
  0,                      /* tp_dict */
  0,                      /* tp_descr_get */
  0,                      /* tp_descr_set */
  0,                      /* tp_dictoffset */
  (initproc)py_an_init,   /* tp_init */
  0,                      /* tp_alloc */
  py_an_new,              /* tp_new */
};

// Module-level function table. It is intentionally empty (sentinel only):
// the module's only export is the Annoy type added in create_module().
static PyMethodDef module_methods[] = {
  {NULL}	/* Sentinel */
};

// Python 3 module definition for "annoylib"; passed to PyModule_Create() in
// create_module(). Not compiled for Python 2, which uses Py_InitModule().
#if PY_MAJOR_VERSION >= 3
  static struct PyModuleDef moduledef = {
    PyModuleDef_HEAD_INIT,
    "annoylib",          /* m_name */
    ANNOY_DOC,           /* m_doc */
    -1,                  /* m_size */
    module_methods,      /* m_methods */
    NULL,                /* m_reload */
    NULL,                /* m_traverse */
    NULL,                /* m_clear */
    NULL,                /* m_free */
  };
#endif

// Builds the "annoylib" extension module and registers the Annoy type in it.
// Returns the module object, or NULL (with a Python exception set by the
// failing C-API call) if the type cannot be finalised, the module cannot be
// created, or the type cannot be added to the module.
PyObject *create_module(void) {
  PyObject *m;

  if (PyType_Ready(&PyAnnoyType) < 0)
    return NULL;

#if PY_MAJOR_VERSION >= 3
  m = PyModule_Create(&moduledef);
#else
  m = Py_InitModule("annoylib", module_methods);
#endif

  if (m == NULL)
    return NULL;

  Py_INCREF(&PyAnnoyType);
  if (PyModule_AddObject(m, "Annoy", (PyObject *)&PyAnnoyType) < 0) {
    // PyModule_AddObject only takes over our reference on success, so undo
    // the Py_INCREF above when it fails.
    Py_DECREF(&PyAnnoyType);
#if PY_MAJOR_VERSION >= 3
    // PyModule_Create returned a new reference; Py_InitModule's result is
    // borrowed, so the Python 2 path has nothing to release.
    Py_DECREF(m);
#endif
    return NULL;
  }
  return m;
}

// Module entry points looked up by the interpreter when importing "annoylib".
// Both delegate to create_module(); only the Python 3 variant returns the
// module object.
#if PY_MAJOR_VERSION >= 3
  PyMODINIT_FUNC PyInit_annoylib(void) {
    return create_module();      // Python 3 expects the module object to be returned
  }
#else
  PyMODINIT_FUNC initannoylib(void) {
    create_module();
  }
#endif


// vim: tabstop=2 shiftwidth=2


================================================
FILE: src/kissrandom.h
================================================
#ifndef ANNOY_KISSRANDOM_H
#define ANNOY_KISSRANDOM_H

#if defined(_MSC_VER) && _MSC_VER == 1500
typedef unsigned __int32    uint32_t;
typedef unsigned __int64    uint64_t;
#else
#include <stdint.h>
#endif

namespace Annoy {

// KISS = "keep it simple, stupid", but high quality random number generator
// http://www0.cs.ucl.ac.uk/staff/d.jones/GoodPracticeRNG.pdf -> "Use a good RNG and build it into your code"
// http://mathforum.org/kb/message.jspa?messageID=6627731
// https://de.wikipedia.org/wiki/KISS_(Zufallszahlengenerator)

// 32 bit KISS
// Combines three sub-generators whose states live in x (linear congruential),
// y (xor-shift) and z/c (multiply-with-carry); each draw returns x + y + z.
struct Kiss32Random {
  uint32_t x;
  uint32_t y;
  uint32_t z;
  uint32_t c;

  static const uint32_t default_seed = 123456789;
#if __cplusplus < 201103L
  typedef uint32_t seed_type;
#endif

  // seed must be != 0
  // Only x takes the seed; y, z and c always start from fixed constants.
  Kiss32Random(uint32_t seed = default_seed)
    : x(seed), y(362436000), z(521288629), c(7654321) {
  }

  // Advances all three sub-generators and returns their sum (mod 2^32).
  uint32_t kiss() {
    // Linear congruence generator
    x = 69069 * x + 12345;

    // Xor shift
    y ^= y << 13;
    y ^= y >> 17;
    y ^= y << 5;

    // Multiply-with-carry: low word becomes z, high word becomes the carry
    const uint64_t product = 698769069ULL * z + c;
    c = static_cast<uint32_t>(product >> 32);
    z = static_cast<uint32_t>(product);

    return x + y + z;
  }

  // Draw random 0 or 1 (lowest bit of the next draw)
  inline int flip() {
    return kiss() & 1;
  }

  // Draw random integer between 0 and n-1 where n is at most the number of data points you have
  inline size_t index(size_t n) {
    return kiss() % n;
  }

  // Replaces x only; y, z and c keep their current state.
  inline void set_seed(uint32_t seed) {
    x = seed;
  }
};

// 64 bit KISS. Use this if you have more than about 2^24 data points ("big data" ;) )
// Combines three sub-generators whose states live in z (linear congruential),
// y (xor-shift) and x/c (multiply-with-carry); each draw returns x + y + z.
struct Kiss64Random {
  uint64_t x;
  uint64_t y;
  uint64_t z;
  uint64_t c;

  static const uint64_t default_seed = 1234567890987654321ULL;
#if __cplusplus < 201103L
  typedef uint64_t seed_type;
#endif

  // seed must be != 0
  // Only x takes the seed; y, z and c always start from fixed constants.
  Kiss64Random(uint64_t seed = default_seed)
    : x(seed),
      y(362436362436362436ULL),
      z(1066149217761810ULL),
      c(123456123456123456ULL) {
  }

  // Advances all three sub-generators and returns their sum (mod 2^64).
  uint64_t kiss() {
    // Linear congruence generator
    z = 6906969069ULL * z + 1234567;

    // Xor shift
    y ^= (y << 13);
    y ^= (y >> 17);
    y ^= (y << 43);

    // Multiply-with-carry (uint128_t t = (2^58 + 1) * x + c; c = t >> 64; x = (uint64_t) t)
    const uint64_t low = (x << 58) + c;
    c = (x >> 6);
    x += low;
    // Unsigned wrap-around in the addition above is detected by x < low.
    c += (x < low);

    return x + y + z;
  }

  // Draw random 0 or 1 (lowest bit of the next draw)
  inline int flip() {
    return kiss() & 1;
  }

  // Draw random integer between 0 and n-1 where n is at most the number of data points you have
  inline size_t index(size_t n) {
    return kiss() % n;
  }

  // Replaces x only; y, z and c keep their current state.
  inline void set_seed(uint64_t seed) {
    x = seed;
  }
};

}

#endif
// vim: tabstop=2 shiftwidth=2


================================================
FILE: src/mman.h
================================================

// This is from https://code.google.com/p/mman-win32/
// 
// Licensed under MIT

#ifndef _MMAN_WIN32_H
#define _MMAN_WIN32_H

#ifndef _WIN32_WINNT		// Allow use of features specific to Windows XP or later.                   
#define _WIN32_WINNT 0x0501	// Change this to the appropriate value to target other versions of Windows.
#endif						

#include <sys/types.h>
#include <windows.h>
#include <errno.h>
#include <io.h>

#define PROT_NONE       0
#define PROT_READ       1
#define PROT_WRITE      2
#define PROT_EXEC       4

#define MAP_FILE        0
#define MAP_SHARED      1
#define MAP_PRIVATE     2
#define MAP_TYPE        0xf
#define MAP_FIXED       0x10
#define MAP_ANONYMOUS   0x20
#define MAP_ANON        MAP_ANONYMOUS

#define MAP_FAILED      ((void *)-1)

/* Flags for msync. */
#define MS_ASYNC        1
#define MS_SYNC         2
#define MS_INVALIDATE   4

#ifndef FILE_MAP_EXECUTE
#define FILE_MAP_EXECUTE    0x0020
#endif

// Translates a Win32 error code into the value stored in errno.
// Currently a pass-through: the Win32 code is returned unchanged (0 stays 0)
// and `deferr` is not used.
static int __map_mman_error(const DWORD err, const int deferr)
{
    //TODO: implement
    if (err != 0)
        return err;
    return 0;
}

// Maps POSIX-style PROT_* bits to a Win32 page-protection constant.
// PROT_NONE yields 0; otherwise the result depends only on whether PROT_EXEC
// and PROT_WRITE are set.
static DWORD __map_mmap_prot_page(const int prot)
{
    if (prot == PROT_NONE)
        return 0;

    const int writable = (prot & PROT_WRITE) != 0;

    if ((prot & PROT_EXEC) != 0)
        return writable ? PAGE_EXECUTE_READWRITE : PAGE_EXECUTE_READ;

    return writable ? PAGE_READWRITE : PAGE_READONLY;
}

// Maps POSIX-style PROT_* bits to the FILE_MAP_* access mask used when
// mapping a view. PROT_NONE yields 0; otherwise each PROT_* bit contributes
// its FILE_MAP_* counterpart.
static DWORD __map_mmap_prot_file(const int prot)
{
    if (prot == PROT_NONE)
        return 0;

    DWORD access = 0;
    if (prot & PROT_READ)
        access |= FILE_MAP_READ;
    if (prot & PROT_WRITE)
        access |= FILE_MAP_WRITE;
    if (prot & PROT_EXEC)
        access |= FILE_MAP_EXECUTE;

    return access;
}

inline void* mmap(void *addr, size_t len, int prot, int flags, int fildes, off_t off)
{
    HANDLE fm, h;
    
    void * map = MAP_FAILED;
    
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable: 4293)
#endif

    const DWORD dwFileOffsetLow = (sizeof(off_t) <= sizeof(DWORD)) ? 
                    (DWORD)off : (DWORD)(off & 0xFFFFFFFFL);
    const DWORD dwFileOffsetHigh = (sizeof(off_t) <= sizeof(DWORD)) ?
                    (DWORD)0 : (DWORD)((off >> 32) & 0xFFFFFFFFL);
    const DWORD protect = __map_mmap_prot_page(prot);
    const DWORD desiredAccess = __map_mmap_prot_file(prot);

    const off_t maxSize = off + (off_t)len;

    const DWORD dwMaxSizeLow = (sizeof(off_t) <= sizeof(DWORD)) ? 
                    (DWORD)maxSize : (DWORD)(maxSize & 0xFFFFFFFFL);
    const DWORD dwMaxSizeHigh = (sizeof(off_t) <= sizeof(DWORD)) ?
                    (DWORD)0 : (DWORD)((maxSize >> 32) & 0xFFFFFFFFL);

#ifdef _MSC_VER
#pragma warning(pop)
#endif

    errno = 0;
    
    if (len == 0 
        /* Unsupported flag combinations */
        || (flags & MAP_FIXED) != 0
        /* Usupported protection combinations */
        || prot == PROT_EXEC)
    {
        errno = EINVAL;
        return MAP_FAILED;
    }
    
    h = ((flags & MAP_ANONYMOUS) == 0) ? 
                    (HANDLE)_get_osfhandle(fildes) : INVALID_HANDLE_VALUE;

    if ((flags & MAP_ANONYMOUS) == 0 && h == INVALID_HANDLE_VALUE)
    {
        errno = EBADF;
        return MAP_FAILED;
    }

    fm = CreateFileMapping(h, NULL, protect, dwMaxSizeHigh, dwMaxSizeLow, NULL);

    if (fm == NULL)
    {
        errno = __map_mman_error(GetLastError(), EPERM);
        return MAP_FAILED;
    }
  
    map = MapViewOfFile(fm, desiredAccess, dwFileOffsetHigh, dwFileOffsetLow, len);

    CloseHandle(fm);
  
    if (map == NULL)
    {
        errno = __map_mman_error(GetLastError(), EPERM);
        return MAP_FAILED;
    }

    return map;
}

// Windows emulation of munmap(): unmaps the view starting at `addr`.
// `len` is not used. Returns 0 on success, -1 with errno set on failure.
inline int munmap(void *addr, size_t len)
{
    if (!UnmapViewOfFile(addr))
    {
        errno = __map_mman_error(GetLastError(), EPERM);
        return -1;
    }

    return 0;
}

inline int mprotect(void *addr, size_t len, int prot)
{
    DWORD newProtect = __map_mmap_prot_page(prot);
    DWORD oldProtect = 0;
    
    if (VirtualProtect(addr, len, newProtect, &oldProtect))
        return 0;
    
    errno =  __map_mman_error(GetLastError(), EPERM);
    
    return -1;
}

// Windows emulation of msync(): flushes [addr, addr + len) via
// FlushViewOfFile. `flags` is not used.
// Returns 0 on success, -1 with errno set on failure.
inline int msync(void *addr, size_t len, int flags)
{
    if (!FlushViewOfFile(addr, len))
    {
        errno = __map_mman_error(GetLastError(), EPERM);
        return -1;
    }

    return 0;
}

// Windows emulation of mlock(): locks [addr, addr + len) via VirtualLock.
// Returns 0 on success, -1 with errno set on failure.
inline int mlock(const void *addr, size_t len)
{
    if (!VirtualLock((LPVOID)addr, len))
    {
        errno = __map_mman_error(GetLastError(), EPERM);
        return -1;
    }

    return 0;
}

// Windows emulation of munlock(): unlocks [addr, addr + len) via
// VirtualUnlock. Returns 0 on success, -1 with errno set on failure.
inline int munlock(const void *addr, size_t len)
{
    if (!VirtualUnlock((LPVOID)addr, len))
    {
        errno = __map_mman_error(GetLastError(), EPERM);
        return -1;
    }

    return 0;
}

#if !defined(__MINGW32__)
// Windows emulation of ftruncate(): sets the size of the file behind C
// runtime descriptor `fd` to `size` bytes by moving the file pointer there
// and calling SetEndOfFile.
// Returns 0 on success. On failure returns -1, prints the Win32 error code to
// stderr, and sets errno to EBADF (negative fd or invalid handle) or EIO
// (any other error).
inline int ftruncate(const int fd, const int64_t size) {
    if (fd < 0) {
        errno = EBADF;
        return -1;
    }

    HANDLE h = reinterpret_cast<HANDLE>(_get_osfhandle(fd));
    LARGE_INTEGER li_start, li_size;
    li_start.QuadPart = static_cast<int64_t>(0);
    li_size.QuadPart = size;
    // SetFilePointerEx and SetEndOfFile both return a BOOL that is zero on
    // failure, so failure must be tested with `!`, not by comparing to ~0.
    if (!SetFilePointerEx(h, li_start, NULL, FILE_CURRENT) ||
        !SetFilePointerEx(h, li_size, NULL, FILE_BEGIN) ||
        !SetEndOfFile(h)) {
        unsigned long error = GetLastError();
        fprintf(stderr, "I/O error while truncating: %lu\n", error);
        switch (error) {
            case ERROR_INVALID_HANDLE:
                errno = EBADF;
                break;
            default:
                errno = EIO;
                break;
        }
        return -1;
    }        
    return 0;
}
#endif

#endif 


================================================
FILE: test/accuracy_test.py
================================================
# Copyright (c) 2013 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.

from __future__ import print_function

import os

import h5py

from annoy import AnnoyIndex

try:
    from urllib import urlretrieve
except ImportError:
    from urllib.request import urlretrieve  # Python 3


def _get_index(dataset, custom_distance=None, custom_dim=None):
    """Return ``(annoy_index, hdf5_file, dataset_name)`` for a benchmark dataset.

    The HDF5 file is downloaded into ``test/`` if it is not there yet. The
    Annoy index is loaded from ``test/<name>.annoy`` when that file exists;
    otherwise it is built from the ``train`` vectors with 10 trees and saved.

    :param dataset: dataset name, used for the download URL and file names.
    :param custom_distance: metric to use instead of the file's ``distance``
        attribute; when truthy it also changes the returned dataset name and
        the index file name.
    :param custom_dim: dimensionality to use instead of the width of
        ``train``; longer training vectors are truncated to it.
    """
    url = 'http://ann-benchmarks.com/%s.hdf5' % dataset
    vectors_fn = os.path.join("test", dataset + ".hdf5")
    index_fn = os.path.join("test", dataset + ".annoy")

    if not os.path.exists(vectors_fn):
        print("downloading", url, "->", vectors_fn)
        urlretrieve(url, vectors_fn)

    dataset_f = h5py.File(vectors_fn, "r")

    distance = dataset_f.attrs["distance"]
    if custom_distance is not None:
        distance = custom_distance

    native_dim = dataset_f["train"].shape[1]
    f = custom_dim if custom_dim else native_dim

    if custom_distance:
        # Keep the dataset's base name but swap in the dimension/metric used.
        base_name = dataset.rsplit('-', 2)[0]
        dataset = base_name + "-%d-%s" % (f, custom_distance)
        index_fn = os.path.join('test', dataset + '.annoy')

    annoy = AnnoyIndex(f, distance)

    if os.path.exists(index_fn):
        annoy.load(index_fn)
    else:
        print("adding items", distance, f)
        for item, vector in enumerate(dataset_f["train"]):
            annoy.add_item(item, vector[:f] if len(vector) > f else vector)

        print("building index")
        annoy.build(10)
        annoy.save(index_fn)

    return annoy, dataset_f, dataset


def _test_index(dataset, exp_accuracy, custom_metric=None, custom_dim=None):
    """Check the index's top-10 recall on a benchmark dataset.

    For every ``test`` vector the 10 approximate neighbours (``search_k`` of
    10000) are compared with the first 10 entries of the dataset's
    ``neighbors`` row. The overall overlap percentage is printed and must be
    greater than ``exp_accuracy - 1.0``.
    """
    annoy, dataset_f, dataset = _get_index(dataset, custom_metric, custom_dim)

    total = 0
    hits = 0

    for row, query in enumerate(dataset_f["test"]):
        if custom_dim:
            query = query[:custom_dim]
        approx = annoy.get_nns_by_vector(query, 10, 10000)
        exact = dataset_f["neighbors"][row][:10]
        assert len(approx) == 10
        assert len(exact) == 10

        total += 10
        hits += len(set(approx).intersection(exact))

    accuracy = 100.0 * hits / total
    report = "%50s accuracy: %5.2f%% (expected %5.2f%%)" % (dataset, accuracy, exp_accuracy)
    print(report)

    # Allow the measured accuracy to fall at most 1 point below the target.
    assert accuracy > exp_accuracy - 1.0


def test_glove_25():
    """glove-25-angular must reach the 69% accuracy target."""
    _test_index(dataset="glove-25-angular", exp_accuracy=69.00)


def test_nytimes_16():
    """nytimes-16-angular must reach the 80% accuracy target."""
    _test_index(dataset="nytimes-16-angular", exp_accuracy=80.00)


def test_lastfm_dot():
    """lastfm-64-dot with the 'dot' metric and 64 dimensions must reach 60%."""
    _test_index(
        dataset='lastfm-64-dot',
        exp_accuracy=60.00,
        custom_metric='dot',
        custom_dim=64,
    )


def test_lastfm_angular():
    """lastfm-64-dot with the 'angular' metric and 65 dimensions must reach 60%."""
    _test_index(
        dataset='lastfm-64-dot',
        exp_accuracy=60.00,
        custom_metric='angular',
        custom_dim=65,
    )


================================================
FILE: test/angular_index_test.py
================================================
# Copyright (c) 2013 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.

import random

import numpy
import pytest

from annoy import AnnoyIndex


def test_get_nns_by_vector():
    """Vector queries against the three unit axes return the expected order."""
    index = AnnoyIndex(3, "angular")
    for item, vector in enumerate([[0, 0, 1], [0, 1, 0], [1, 0, 0]]):
        index.add_item(item, vector)
    index.build(10)

    cases = [
        ([3, 2, 1], [2, 1, 0]),
        ([1, 2, 3], [0, 1, 2]),
        ([2, 0, 1], [2, 0, 1]),
    ]
    for query, expected in cases:
        assert index.get_nns_by_vector(query, 3) == expected


def test_get_nns_by_item():
    """Item queries return the item itself first, then the others by distance."""
    index = AnnoyIndex(3, "angular")
    for item, vector in enumerate([[2, 1, 0], [1, 2, 0], [0, 0, 1]]):
        index.add_item(item, vector)
    index.build(10)

    assert index.get_nns_by_item(0, 3) == [0, 1, 2]
    assert index.get_nns_by_item(1, 3) == [1, 0, 2]
    # Items 0 and 1 may come back in either order for item 2.
    acceptable = [[2, 0, 1], [2, 1, 0]]
    assert index.get_nns_by_item(2, 3) in acceptable


def test_dist():
    """Distance between [0, 1] and [1, 1] equals sqrt(2 * (1 - 2**-0.5))."""
    index = AnnoyIndex(2, "angular")
    index.add_item(0, [0, 1])
    index.add_item(1, [1, 1])

    expected = (2 * (1.0 - 2**-0.5)) ** 0.5
    assert index.get_distance(0, 1) == pytest.approx(expected)


def test_dist_2():
    """Two vectors along the same axis with different lengths are at distance 0."""
    index = AnnoyIndex(2, "angular")
    index.add_item(0, [1000, 0])
    index.add_item(1, [10, 0])

    measured = index.get_distance(0, 1)
    assert measured == pytest.approx(0)


def test_dist_3():
    """Distance between [97, 0] and [42, 42] matches the hand-computed value."""
    index = AnnoyIndex(2, "angular")
    index.add_item(0, [97, 0])
    index.add_item(1, [42, 42])

    expected = ((1 - 2**-0.5) ** 2 + (2**-0.5) ** 2) ** 0.5
    measured = index.get_distance(0, 1)

    assert measured == pytest.approx(expected)


def test_dist_degen():
    """Distance from [1, 0] to the zero vector is sqrt(2)."""
    index = AnnoyIndex(2, "angular")
    index.add_item(0, [1, 0])
    index.add_item(1, [0, 0])

    expected = 2.0**0.5
    assert index.get_distance(0, 1) == pytest.approx(expected)


def test_large_index():
    """Each item's nearest other item is its generated partner.

    Items are added in pairs (even id, next odd id). Both members of a pair
    are noisy, differently scaled copies of the same random direction, so a
    2-neighbour query on either must return itself followed by its partner.
    """
    dim = 10
    index = AnnoyIndex(dim, "angular")
    for even in range(0, 10000, 2):
        base = [random.gauss(0, 1) for _ in range(dim)]
        scale_a = random.random() + 1
        scale_b = random.random() + 1
        first = [scale_a * coord + random.gauss(0, 1e-2) for coord in base]
        second = [scale_b * coord + random.gauss(0, 1e-2) for coord in base]
        index.add_item(even, first)
        index.add_item(even + 1, second)

    index.build(10)
    for even in range(0, 10000, 2):
        odd = even + 1
        assert index.get_nns_by_item(even, 2) == [even, odd]
        assert index.get_nns_by_item(odd, 2) == [odd, even]


def precision(n, n_trees=10, n_points=10000, n_rounds=10, search_k=100000):
    """Return the fraction of true top-``n`` items found, averaged over rounds.

    Each round builds a fresh 10-dimensional angular index. Item ``j`` is
    ``[1000]`` followed by a random direction scaled to length ``j``, and the
    query is ``[1000, 0, ..., 0]``. An item counts as found when its id is
    below ``n``. The returned neighbour ids must also be in ascending order.
    """
    dim = 10
    query = [1000] + [0] * (dim - 1)
    found = 0
    for _round in range(n_rounds):
        # create random points at distance x from (1000, 0, 0, ...)
        index = AnnoyIndex(dim, "angular")
        for j in range(n_points):
            direction = [random.gauss(0, 1) for _z in range(dim - 1)]
            norm = sum([coord**2 for coord in direction]) ** 0.5
            index.add_item(j, [1000] + [coord / norm * j for coord in direction])

        index.build(n_trees)

        nns = index.get_nns_by_vector(query, n, search_k)
        assert nns == sorted(nns)  # should be in order
        found += sum(1 for item in nns if item < n)

    return 1.0 * found / (n * n_rounds)


def test_precision_1():
    """Top-1 precision must be at least 0.98."""
    measured = precision(1)
    assert measured >= 0.98


def test_precision_10():
    """Top-10 precision must be at least 0.98."""
    measured = precision(10)
    assert measured >= 0.98


def test_precision_100():
    """Top-100 precision must be at least 0.98."""
    measured = precision(100)
    assert measured >= 0.98


def test_precision_1000():
    """Top-1000 precision must be at least 0.98."""
    measured = precision(1000)
    assert measured >= 0.98


def test_load_save_get_item_vector():
    """Item vectors are readable before build, after save, and after load."""
    dim = 3
    path = "blah.ann"
    stored = {
        0: [1.1, 2.2, 3.3],
        1: [4.4, 5.5, 6.6],
        2: [7.7, 8.8, 9.9],
    }

    original = AnnoyIndex(dim, "angular")
    for item in (0, 1, 2):
        original.add_item(item, stored[item])

    numpy.testing.assert_array_almost_equal(original.get_item_vector(0), stored[0])
    assert original.build(10)
    assert original.save(path)
    numpy.testing.assert_array_almost_equal(original.get_item_vector(1), stored[1])

    reloaded = AnnoyIndex(dim, "angular")
    assert reloaded.load(path)
    numpy.testing.assert_array_almost_equal(reloaded.get_item_vector(2), stored[2])


def test_get_nns_search_k():
    """Passing search_k positionally works for both query styles."""
    index = AnnoyIndex(3, "angular")
    for item, vector in enumerate([[0, 0, 1], [0, 1, 0], [1, 0, 0]]):
        index.add_item(item, vector)
    index.build(10)

    by_item = index.get_nns_by_item(0, 3, 10)
    assert by_item == [0, 1, 2]
    by_vector = index.get_nns_by_vector([3, 2, 1], 3, 10)
    assert by_vector == [2, 1, 0]


def test_include_dists():
    """A vector and its negation are at distances 0 and 2 from the vector."""
    # Double checking issue 112
    dim = 40
    index = AnnoyIndex(dim, "angular")
    vector = numpy.random.normal(size=dim)
    index.add_item(0, vector)
    index.add_item(1, -vector)
    index.build(10)

    indices, dists = index.get_nns_by_item(0, 2, 10, True)
    assert indices == [0, 1]
    nearest, farthest = dists[0], dists[1]
    assert nearest == pytest.approx(0.0)
    assert farthest == pytest.approx(2.0)


def test_include_dists_check_ranges():
    """Returned distances stay within [0, 2] on a 100000-item index."""
    dim = 3
    n_items = 100000
    index = AnnoyIndex(dim, "angular")
    for item in range(n_items):
        index.add_item(item, numpy.random.normal(size=dim))
    index.build(10)

    indices, dists = index.get_nns_by_item(0, n_items, include_distances=True)
    largest = max(dists)
    smallest = min(dists)
    assert largest <= 2.0
    assert smallest == pytest.approx(0.0)


def test_distance_consistency():
    """Query distances agree with get_distance and with a manual computation.

    For 100 sampled items, every distance returned by a 100-neighbour query
    must match ``get_distance`` and, when squared, the squared Euclidean
    distance between the two unit-normalised item vectors.
    """
    n, f = 1000, 3
    tolerance = dict(rel=1e-3, abs=1e-3)

    index = AnnoyIndex(f, "angular")
    for item in range(n):
        # Redraw until the vector is not too close to the origin.
        vector = numpy.random.normal(size=f)
        while not numpy.dot(vector, vector) > 0.1:
            vector = numpy.random.normal(size=f)
        index.add_item(item, vector)
    index.build(10)

    for a in random.sample(range(n), 100):
        indices, dists = index.get_nns_by_item(a, 100, include_distances=True)
        for b, dist in zip(indices, dists):
            u = index.get_item_vector(a)
            v = index.get_item_vector(b)
            assert dist == pytest.approx(index.get_distance(a, b), **tolerance)

            u_norm = numpy.array(u) * numpy.dot(u, u) ** -0.5
            v_norm = numpy.array(v) * numpy.dot(v, v) ** -0.5
            diff = u_norm - v_norm
            assert dist**2 == pytest.approx(numpy.dot(diff, diff), **tolerance)

            squared = sum([(x - y) ** 2 for x, y in zip(u_norm, v_norm)])
            assert dist**2 == pytest.approx(squared, **tolerance)


def test_only_one_item():
    """A saved and reloaded one-item index returns just that item."""
    # reported to annoy-user by Kireet Reddy
    dim = 100
    path = "foo.idx"

    idx = AnnoyIndex(dim, "angular")
    idx.add_item(0, numpy.random.randn(dim))
    idx.build(n_trees=10)
    idx.save(path)

    idx = AnnoyIndex(dim, "angular")
    idx.load(path)
    assert idx.get_n_items() == 1

    query = numpy.random.randn(dim)
    neighbours = idx.get_nns_by_vector(vector=query, n=50, include_distances=False)
    assert neighbours == [0]


def test_no_items():
    """A saved and reloaded empty index reports 0 items and returns no neighbours."""
    dim = 100
    path = "foo.idx"

    idx = AnnoyIndex(dim, "angular")
    idx.build(n_trees=10)
    idx.save(path)

    idx = AnnoyIndex(dim, "angular")
    idx.load(path)
    assert idx.get_n_items() == 0

    query = numpy.random.randn(dim)
    neighbours = idx.get_nns_by_vector(vector=query, n=50, include_distances=False)
    assert neighbours == []


def test_single_vector():
    """Querying a one-item index with that item's vector gives distance 0."""
    # https://github.com/spotify/annoy/issues/194
    index = AnnoyIndex(3, "angular")
    index.add_item(0, [1, 0, 0])
    index.build(10)
    index.save("1.ann")

    indices, dists = index.get_nns_by_vector([1, 0, 0], 3, include_distances=True)
    assert indices == [0]
    squared = dists[0] ** 2
    assert squared == pytest.approx(0.0)


================================================
FILE: test/annoy_test.go
================================================
/*
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
*/

package annoy_test

import (
	"math"
	"math/rand"
	"os"
	"testing"

	"github.com/spotify/annoy"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"
	"github.com/stretchr/testify/suite"
)

// AnnoyTestSuite groups the Go binding tests in this file; it embeds
// suite.Suite so the Test* methods can be run together via suite.Run.
type AnnoyTestSuite struct {
	suite.Suite
}

// Round returns f rounded to the nearest integer, computed as the floor of
// f + 0.5 (so exact halves round towards positive infinity).
func Round(f float64) float64 {
	shifted := f + 0.5
	return math.Floor(shifted)
}

// RoundPlus rounds f to the given number of decimal places by scaling it up
// by 10^places, rounding with Round, and scaling back down.
func RoundPlus(f float64, places int) float64 {
	scale := math.Pow(10, float64(places))
	scaled := Round(f * scale)
	return scaled / scale
}

// SetupTest currently does nothing: every test in this file creates and
// deletes its own index, so no shared fixture is prepared here.
func (suite *AnnoyTestSuite) SetupTest() {
}

// TestFileHandling checks that a built index can be saved and loaded back
// with each prefault variant: the default (no extra argument), explicitly
// disabled (false) and explicitly allowed (true). After every Save the file
// must exist and be non-empty, and every Load must report success.
func (suite *AnnoyTestSuite) TestFileHandling() {
	t := suite.T()

	index := annoy.NewAnnoyIndexAngular(3)
	index.AddItem(0, []float32{0, 0, 1})
	index.AddItem(1, []float32{0, 1, 0})
	index.AddItem(2, []float32{1, 0, 0})
	index.Build(10)

	// --- default prefault behaviour ---
	index.Save("go_test.ann")

	// require (rather than assert) stops the test when Stat fails, so the
	// info.Size() call below can never dereference a nil FileInfo.
	info, err := os.Stat("go_test.ann")
	require.NoError(t, err, "Failed to create file, file not found")
	if info.Size() == 0 {
		assert.Fail(t, "Failed to create file, file size zero")
	}

	annoy.DeleteAnnoyIndexAngular(index)

	index = annoy.NewAnnoyIndexAngular(3)
	if ret := index.Load("go_test.ann"); !ret {
		assert.Fail(t, "Failed to load file")
	}

	// --- prefault explicitly disabled ---
	os.Remove("go_test.ann")
	index.Save("go_test2.ann", false)

	info, err = os.Stat("go_test2.ann")
	require.NoError(t, err, "Failed to create file without prefault, file not found")
	if info.Size() == 0 {
		assert.Fail(t, "Failed to create file without prefault, file size zero")
	}

	annoy.DeleteAnnoyIndexAngular(index)

	index = annoy.NewAnnoyIndexAngular(3)
	if ret := index.Load("go_test2.ann", false); !ret {
		assert.Fail(t, "Failed to load file without prefault")
	}

	// --- prefault explicitly allowed ---
	os.Remove("go_test2.ann")
	index.Save("go_test3.ann", true)

	info, err = os.Stat("go_test3.ann")
	require.NoError(t, err, "Failed to create file allowing prefault, file not found")
	if info.Size() == 0 {
		assert.Fail(t, "Failed to create file allowing prefault, file size zero")
	}

	annoy.DeleteAnnoyIndexAngular(index)

	index = annoy.NewAnnoyIndexAngular(3)
	if ret := index.Load("go_test3.ann", true); !ret {
		assert.Fail(t, "Failed to load file allowing prefault")
	}
	annoy.DeleteAnnoyIndexAngular(index)

	os.Remove("go_test3.ann")
}

// TestOnDiskBuild builds an index directly into a file via OnDiskBuild,
// reloads it from that file and checks that nearest-neighbour queries return
// the expected item order.
func (suite *AnnoyTestSuite) TestOnDiskBuild() {
	t := suite.T()

	index := annoy.NewAnnoyIndexAngular(3)
	index.OnDiskBuild("go_test.ann")

	// require (rather than assert) stops the test when Stat fails, so the
	// info.Size() call below can never dereference a nil FileInfo.
	info, err := os.Stat("go_test.ann")
	require.NoError(t, err, "Failed to create file, file not found")
	if info.Size() == 0 {
		assert.Fail(t, "Failed to create file, file size zero")
	}

	index.AddItem(0, []float32{0, 0, 1})
	index.AddItem(1, []float32{0, 1, 0})
	index.AddItem(2, []float32{1, 0, 0})
	index.Build(10)

	// Drop the in-memory state and read the index back from the file.
	index.Unload()
	index.Load("go_test.ann")

	result := annoy.NewAnnoyVectorInt()
	defer result.Free()

	index.GetNnsByVector([]float32{3, 2, 1}, 3, -1, result)
	assert.Equal(t, []int32{2, 1, 0}, result.ToSlice())

	index.GetNnsByVector([]float32{1, 2, 3}, 3, -1, result)
	assert.Equal(t, []int32{0, 1, 2}, result.ToSlice())

	index.GetNnsByVector([]float32{2, 0, 1}, 3, -1, result)
	assert.Equal(t, []int32{2, 0, 1}, result.ToSlice())

	annoy.DeleteAnnoyIndexAngular(index)

	os.Remove("go_test.ann")
}

// TestGetNnsByVector queries a three-item angular index by vector and checks
// the returned neighbour order through each way of reading the result
// vector: ToSlice, Copy into caller-owned slices, and InnerArray.
func (suite *AnnoyTestSuite) TestGetNnsByVector() {
	t := suite.T()
	index := annoy.NewAnnoyIndexAngular(3)
	index.AddItem(0, []float32{0, 0, 1})
	index.AddItem(1, []float32{0, 1, 0})
	index.AddItem(2, []float32{1, 0, 0})
	index.Build(10)

	t.Run("regular", func(t *testing.T) {
		result := annoy.NewAnnoyVectorInt()
		defer result.Free()

		index.GetNnsByVector([]float32{3, 2, 1}, 3, -1, result)
		assert.Equal(t, []int32{2, 1, 0}, result.ToSlice())

		index.GetNnsByVector([]float32{1, 2, 3}, 3, -1, result)
		assert.Equal(t, []int32{0, 1, 2}, result.ToSlice())

		index.GetNnsByVector([]float32{2, 0, 1}, 3, -1, result)
		assert.Equal(t, []int32{2, 0, 1}, result.ToSlice())
	})

	t.Run("with copying", func(t *testing.T) {
		result := annoy.NewAnnoyVectorInt()
		defer result.Free()

		// Destination is a nil slice.
		var notAllocated []int32
		index.GetNnsByVector([]float32{3, 2, 1}, 3, -1, result)
		result.Copy(&notAllocated)
		assert.Equal(t, []int32{2, 1, 0}, notAllocated)

		// Destination is longer than the result; fill it with -1 first
		// to make sure it will be overwritten.
		var alreadyAllocated = make([]int32, 10)
		for i := 0; i < len(alreadyAllocated); i++ {
			alreadyAllocated[i] = -1
		}
		index.GetNnsByVector([]float32{3, 2, 1}, 3, -1, result)
		result.Copy(&alreadyAllocated)
		assert.Equal(t, []int32{2, 1, 0}, alreadyAllocated)

		// Destination has length 0 but spare capacity. The capacity used to
		// be written as 00 (i.e. zero), which made this case identical to
		// the nil-slice case above.
		var alreadyAllocatedCap = make([]int32, 0, 10)
		index.GetNnsByVector([]float32{3, 2, 1}, 3, -1, result)
		result.Copy(&alreadyAllocatedCap)
		assert.Equal(t, []int32{2, 1, 0}, alreadyAllocatedCap)
	})

	t.Run("with inner array", func(t *testing.T) {
		result := annoy.NewAnnoyVectorInt()
		defer result.Free()

		index.GetNnsByVector([]float32{3, 2, 1}, 3, -1, result)
		assert.Equal(t, []int32{2, 1, 0}, result.InnerArray())
	})

	annoy.DeleteAnnoyIndexAngular(index)
}

// TestGetNnsByItem looks up the neighbours of stored items in a three-item
// angular index and checks the order in which they are returned.
func (suite *AnnoyTestSuite) TestGetNnsByItem() {
	t := suite.T()

	index := annoy.NewAnnoyIndexAngular(3)
	for id, vec := range [][]float32{{2, 1, 0}, {1, 2, 0}, {0, 0, 1}} {
		index.AddItem(id, vec)
	}
	index.Build(10)

	neighbours := annoy.NewAnnoyVectorInt()
	defer neighbours.Free()

	cases := []struct {
		item int
		want []int32
	}{
		{item: 0, want: []int32{0, 1, 2}},
		{item: 1, want: []int32{1, 0, 2}},
	}
	for _, tc := range cases {
		index.GetNnsByItem(tc.item, 3, -1, neighbours)
		assert.Equal(t, tc.want, neighbours.ToSlice())
	}

	annoy.DeleteAnnoyIndexAngular(index)
}

// TestGetItem stores three vectors and checks that GetItem returns each of
// them unchanged.
func (suite *AnnoyTestSuite) TestGetItem() {
	t := suite.T()
	vectors := [][]float32{
		{2, 1, 0},
		{1, 2, 0},
		{0, 0, 1},
	}

	index := annoy.NewAnnoyIndexAngular(3)
	for id, vec := range vectors {
		index.AddItem(id, vec)
	}
	index.Build(10)

	stored := annoy.NewAnnoyVectorFloat()
	defer stored.Free()

	index.GetItem(0, stored)
	assert.Equal(t, vectors[0], stored.ToSlice())

	index.GetItem(1, stored)
	assert.Equal(t, vectors[1], stored.ToSlice())

	index.GetItem(2, stored)
	assert.Equal(t, vectors[2], stored.ToSlice())

	annoy.DeleteAnnoyIndexAngular(index)
}

// TestGetDistance compares the angular distance between (0,1) and (1,1)
// with the closed-form value sqrt(2*(1 - 2^-0.5)), both rounded to three
// decimal places.
func (suite *AnnoyTestSuite) TestGetDistance() {
	index := annoy.NewAnnoyIndexAngular(2)
	index.AddItem(0, []float32{0, 1})
	index.AddItem(1, []float32{1, 1})
	index.Build(10)

	// 2^-0.5 is the cosine of the angle between the two vectors.
	cosine := math.Pow(2, -0.5)
	want := RoundPlus(math.Pow(2*(1.0-cosine), 0.5), 3)
	got := RoundPlus(float64(index.GetDistance(0, 1)), 3)
	assert.Equal(suite.T(), want, got)

	annoy.DeleteAnnoyIndexAngular(index)
}

// TestGetDotProductDistance checks that the dot-product index reports a
// distance of 1 (within 1e-5) between (0,1) and (1,1).
func (suite *AnnoyTestSuite) TestGetDotProductDistance() {
	index := annoy.NewAnnoyIndexDotProduct(2)
	index.AddItem(0, []float32{0, 1})
	index.AddItem(1, []float32{1, 1})
	index.Build(10)

	reported := float64(index.GetDistance(0, 1))
	deviation := math.Abs(1.0 - reported)
	assert.True(suite.T(), deviation < 0.00001)

	annoy.DeleteAnnoyIndexDotProduct(index)
}

// TestLargeEuclideanIndex fills a 10-dimensional euclidean index with 5000
// pairs of points, where both members of a pair are the same random point
// plus a small random perturbation, and checks that each item's two nearest
// neighbours are itself followed by its partner.
func (suite *AnnoyTestSuite) TestLargeEuclideanIndex() {
	index := annoy.NewAnnoyIndexEuclidean(10)

	for j := 0; j < 10000; j += 2 {
		// p is the shared base point of the pair (j, j+1).
		p := make([]float32, 0, 10)
		for i := 0; i < 10; i++ {
			p = append(p, rand.Float32())
		}
		x := make([]float32, 0, 10)
		for i := 0; i < 10; i++ {
			x = append(x, 1+p[i]+rand.Float32()*1e-2)
		}
		y := make([]float32, 0, 10)
		for i := 0; i < 10; i++ {
			y = append(y, 1+p[i]+rand.Float32()*1e-2)
		}
		index.AddItem(j, x)
		index.AddItem(j+1, y)
	}
	index.Build(10)
	result := annoy.NewAnnoyVectorInt()
	defer result.Free()
	for j := 0; j < 10000; j += 2 {
		// testify takes (expected, actual); passing them in that order
		// keeps failure messages correctly labelled.
		index.GetNnsByItem(j, 2, -1, result)
		require.Equal(suite.T(), []int32{int32(j), int32(j + 1)}, result.ToSlice())

		index.GetNnsByItem(j+1, 2, -1, result)
		require.Equal(suite.T(), []int32{int32(j + 1), int32(j)}, result.ToSlice())
	}
	annoy.DeleteAnnoyIndexEuclidean(index)
}

func TestAnnoyTestSuite(t *testing.T) {
	suite.Run(t, new(AnnoyTestSuite))
}


================================================
FILE: test/annoy_test.lua
================================================
-- Copyright (c) 2016 Boris Nagaev
--
-- Licensed under the Apache License, Version 2.0 (the "License"); you may not
-- use this file except in compliance with the License. You may obtain a copy of
-- the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-- License for the specific language governing permissions and limitations under
-- the License.

local AnnoyIndex = require 'annoy'.AnnoyIndex

-- Returns an approximately normally distributed sample with mean `mu` and
-- standard deviation `sigma`, built from the sum of twelve uniform draws
-- from math.random() shifted by -6.
local function gauss(mu, sigma)
  local deviate = -6
  local remaining = 12
  while remaining > 0 do
    deviate = deviate + math.random()
    remaining = remaining - 1
  end
  return mu + deviate * sigma
end

-- Builds an array of `f` samples, each drawn with gauss(mu, sigma).
local function randomVector(f, mu, sigma)
  local components = {}
  for slot = 1, f do
    components[#components + 1] = gauss(mu, sigma)
  end
  return components
end

-- Formats `x` with three decimal places and returns the resulting string,
-- so values can be compared after rounding.
local function round(x)
    local formatted = string.format("%.3f", x)
    return formatted
end

-- Returns a new array holding round(v) for every element of `array`,
-- in the same order.
local function roundArray(array)
    local formatted = {}
    for _, value in ipairs(array) do
        formatted[#formatted + 1] = round(value)
    end
    return formatted
end

-- Returns true when the elements of `v` are in non-decreasing order
-- (an empty or one-element array counts as sorted), false otherwise.
local function isSorted(v)
    local position = 1
    while position < #v do
        if v[position] > v[position + 1] then
            return false
        end
        position = position + 1
    end
    return true
end

-- Returns the largest element of `array`; raises an assertion error when
-- the array has no first element.
local function max(array)
    local largest = array[1]
    assert(largest)
    for _, candidate in ipairs(array) do
        largest = math.max(largest, candidate)
    end
    return largest
end

-- Returns the smallest element of `array`; raises an assertion error when
-- the array has no first element.
local function min(array)
    local smallest = array[1]
    assert(smallest)
    for _, candidate in ipairs(array) do
        smallest = math.min(smallest, candidate)
    end
    return smallest
end

-- Measures how well a euclidean index recovers the `n` points closest to a
-- query. In every round, point j is a random direction scaled to length j,
-- so the true nearest neighbours of the query are items 0 .. n-1. When
-- `first1000` is set, every point and the query get 1000 as their first
-- coordinate and only the remaining coordinates are random.
-- Returns the fraction of returned ids that are below n, over all rounds.
-- Defaults: n_trees = 10, n_points = 10000, n_rounds = 10.
local function precision(first1000, n, n_trees, n_points, n_rounds)
    n_trees = n_trees or 10
    n_points = n_points or 10000
    n_rounds = n_rounds or 10

    local dims = 10
    -- Number of randomly generated coordinates per point.
    local free_dims = first1000 and (dims - 1) or dims
    local hits = 0

    for _ = 1, n_rounds do
        local index = AnnoyIndex(dims, 'euclidean')
        for item = 0, n_points - 1 do
            local direction = randomVector(free_dims, 0, 1)

            local squared = 0
            for _, component in ipairs(direction) do
                squared = squared + component ^ 2
            end
            local length = squared ^ 0.5

            local point = {}
            if first1000 then
                point[1] = 1000
            end
            for _, component in ipairs(direction) do
                point[#point + 1] = component / length * item
            end
            index:add_item(item, point)
        end
        index:build(n_trees)

        -- The query is the origin (shifted to 1000 on the first axis
        -- when first1000 is set).
        local query = {}
        for k = 1, dims do
            query[k] = 0
        end
        if first1000 then
            query[1] = 1000
        end

        local nns = index:get_nns_by_vector(query, n)
        assert(isSorted(nns))
        for _, id in ipairs(nns) do
            if id < n then
                hits = hits + 1
            end
        end
    end

    return 1.0 * hits / (n * n_rounds)
end

-- Tests for the default (angular) metric of the Lua binding.
describe("angular annoy test", function()

    -- Neighbours of a query vector come back ordered by closeness.
    it("get_nns_by_vector", function()
        local f = 3
        local i = AnnoyIndex(f)
        i:add_item(0, {0, 0, 1})
        i:add_item(1, {0, 1, 0})
        i:add_item(2, {1, 0, 0})
        i:build(10)
        assert.same({2, 1, 0}, i:get_nns_by_vector({3, 2, 1}, 3))
        assert.same({0, 1, 2}, i:get_nns_by_vector({1, 2, 3}, 3))
        assert.same({2, 0, 1}, i:get_nns_by_vector({2, 0, 1}, 3))
    end)

    -- Neighbours of a stored item start with the item itself.
    it("get_nns_by_item", function()
        local f = 3
        local i = AnnoyIndex(f)
        i:add_item(0, {2, 1, 0})
        i:add_item(1, {1, 2, 0})
        i:add_item(2, {0, 0, 1})
        i:build(10)
        assert.same({0, 1, 2}, i:get_nns_by_item(0, 3))
        assert.same({1, 0, 2}, i:get_nns_by_item(1, 3))
        do
            -- Items 0 and 1 may come back in either order after item 2.
            local close_to_2 = i:get_nns_by_item(2, 3)
            assert.equal(close_to_2[1], 2)
            assert.truthy(
                (close_to_2[2] == 0 and close_to_2[3] == 1)
                or
                (close_to_2[2] == 1 and close_to_2[3] == 0)
            )
        end
    end)

    it("dist", function()
        local f = 2
        local i = AnnoyIndex(f)
        i:add_item(0, {0, 1})
        i:add_item(1, {1, 1})
        assert.equal(round((2 * (1.0 - 2 ^ -0.5)) ^ 0.5), round(i:get_distance(0, 1)))
    end)

    -- Vectors pointing the same way are at distance 0 regardless of length.
    it("dist_2", function()
        local f = 2
        local i = AnnoyIndex(f)
        i:add_item(0, {1000, 0})
        i:add_item(1, {10, 0})
        assert.equal(round(0), round(i:get_distance(0, 1)))
    end)

    it("dist_3", function()
        local f = 2
        local i = AnnoyIndex(f)
        i:add_item(0, {97, 0})
        i:add_item(1, {42, 42})
        local dist = ((1 - 2 ^ -0.5) ^ 2 + (2 ^ -0.5) ^ 2) ^ 0.5
        assert.equal(round(dist), round(i:get_distance(0, 1)))
    end)

    -- Distance to the zero vector.
    it("dist_degen", function()
        local f = 2
        local i = AnnoyIndex(f)
        i:add_item(0, {1, 0})
        i:add_item(1, {0, 0})
        assert.equal(round(2.0 ^ 0.5), round(i:get_distance(0, 1)))
    end)

    it("large_index", function()
        -- Generate pairs of random points where the pair is super close
        local f = 10
        local i = AnnoyIndex(f)
        for j = 0, 10000 - 1, 2 do
            local p = randomVector(f, 0, 1)
            local f1 = math.random() + 1
            local f2 = math.random() + 1
            local x = {}
            local y = {}
            for k, pi in ipairs(p) do
                x[k] = f1 * pi + gauss(0, 1e-2)
                y[k] = f2 * pi + gauss(0, 1e-2)
            end
            i:add_item(j, x)
            i:add_item(j+1, y)
        end
        i:build(10)
        for j = 0, 10000 - 1, 2 do
            assert.same({j, j+1}, i:get_nns_by_item(j, 2))
            assert.same({j+1, j}, i:get_nns_by_item(j+1, 2))
        end
    end)

    it("precision_1", function()
        assert.truthy(precision(true, 1) >= 0.98)
    end)

    it("precision_10", function()
        assert.truthy(precision(true, 10) >= 0.98)
    end)

    it("precision_100", function()
        assert.truthy(precision(true, 100) >= 0.98)
    end)

    it("precision_1000", function()
        assert.truthy(precision(true, 1000) >= 0.98)
    end)

    -- Item vectors are readable before build, after save, and from a
    -- second index that loaded the saved file.
    it("load_save_get_item_vector", function()
        local f = 3
        local i = AnnoyIndex(f)
        i:add_item(0, {1.1, 2.2, 3.3})
        i:add_item(1, {4.4, 5.5, 6.6})
        i:add_item(2, {7.7, 8.8, 9.9})
        assert.same(roundArray({1.1, 2.2, 3.3}), roundArray(i:get_item_vector(0)))
        assert.truthy(i:build(10))
        assert.truthy(i:save('blah.ann'))
        assert.same(roundArray({4.4, 5.5, 6.6}), roundArray(i:get_item_vector(1)))
        local j = AnnoyIndex(f)
        assert.truthy(j:load('blah.ann'))
        -- Read from the loaded index `j`; reading from `i` here would leave
        -- the contents of the loaded file unchecked.
        assert.same(roundArray({7.7, 8.8, 9.9}), roundArray(j:get_item_vector(2)))
    end)

    -- Queries accept an explicit search_k as the third argument.
    it("get_nns_search_k", function()
        local f = 3
        local i = AnnoyIndex(f)
        i:add_item(0, {0, 0, 1})
        i:add_item(1, {0, 1, 0})
        i:add_item(2, {1, 0, 0})
        i:build(10)
        assert.same({0, 1, 2}, i:get_nns_by_item(0, 3, 10))
        assert.same({2, 1, 0}, i:get_nns_by_vector({3, 2, 1}, 3, 10))
    end)

    it("include_dists", function()
        -- Double checking issue 112
        local f = 40
        local i = AnnoyIndex(f)
        local v = randomVector(f, 0, 1)
        i:add_item(0, v)
        local neg_v = {}
        do
            for k, value in ipairs(v) do
                neg_v[k] = -value
            end
        end
        i:add_item(1, neg_v)
        i:build(10)
        local indices, dists = i:get_nns_by_item(0, 2, 10, true)
        assert.same({0, 1}, indices)
        assert.same(roundArray({0.0, 2.0}), roundArray(dists))
    end)


    -- All returned distances lie in [0, 2).
    it("include_dists_check_ranges", function()
        local f = 3
        local i = AnnoyIndex(f)
        for j = 0, 100000 - 1 do
            i:add_item(j, randomVector(f, 0, 1))
        end
        i:build(10)
        local include_distances = true
        local _, dists = i:get_nns_by_item(0, 100000, -1, include_distances)
        assert.truthy(max(dists) < 2.0)
        assert.equal(round(0.0), round(min(dists)))
    end)

end)

-- Tests for the 'euclidean' metric of the Lua binding.
describe("euclidean annoy test", function()

    it("get_nns_by_vector", function()
        local f = 2
        local i = AnnoyIndex(f, 'euclidean')
        i:add_item(0, {2, 2})
        i:add_item(1, {3, 2})
        i:add_item(2, {3, 3})
        i:build(10)
        assert.same({2, 1, 0}, i:get_nns_by_vector({4, 4}, 3))
        assert.same({0, 1, 2}, i:get_nns_by_vector({1, 1}, 3))
        assert.same({1, 2, 0}, i:get_nns_by_vector({4, 2}, 3))
    end)

    it("get_nns_by_item", function()
        local f = 2
        local i = AnnoyIndex(f, 'euclidean')
        i:add_item(0, {2, 2})
        i:add_item(1, {3, 2})
        i:add_item(2, {3, 3})
        i:build(10)
        assert.same({0, 1, 2}, i:get_nns_by_item(0, 3))
        assert.same({2, 1, 0}, i:get_nns_by_item(2, 3))
    end)

    it("dist", function()
        local f = 2
        local i = AnnoyIndex(f, 'euclidean')
        i:add_item(0, {0, 1})
        i:add_item(1, {1, 1})
        assert.equal(round(1.0), round(i:get_distance(0, 1)))
    end)

    it("large_index", function()
        -- Generate pairs of random points where the pair is super close
        local f = 10
        -- local q = randomVector(f, 0, 10)
        local i = AnnoyIndex(f, 'euclidean')
        for j = 0, 10000 - 1, 2 do
            local p = randomVector(f, 0, 1)
            local x = {}
            local y = {}
            for k, pi in ipairs(p) do
                x[k] = 1 + pi + gauss(0, 1e-2) -- todo: should be q[i]
                y[k] = 1 + pi + gauss(0, 1e-2)
            end
            i:add_item(j, x)
            i:add_item(j+1, y)
        end
        i:build(10)
        for j = 0, 10000 - 1, 2 do
            assert.same({j, j+1}, i:get_nns_by_item(j, 2))
            assert.same({j+1, j}, i:get_nns_by_item(j+1, 2))
        end
    end)

    it("precision_1", function()
        assert.truthy(precision(false, 1) >= 0.98)
    end)

    it("precision_10", function()
        assert.truthy(precision(false, 10) >= 0.98)
    end)

    it("precision_100", function()
        assert.truthy(precision(false, 100) >= 0.98)
    end)

    it("precision_1000", function()
        assert.truthy(precision(false, 1000) >= 0.98)
    end)

    -- Returned distances are compared through their squares.
    it("get_nns_with_distances", function()
        local f = 3
        local i = AnnoyIndex(f, 'euclidean')
        i:add_item(0, {0, 0, 2})
        i:add_item(1, {0, 1, 1})
        i:add_item(2, {1, 0, 0})
        i:build(10)
        do
            local l, d = i:get_nns_by_item(0, 3, -1, true)
            assert.same({0, 1, 2}, l)
            assert.same(
                roundArray({0, 2, 5}),
                roundArray({d[1]^2, d[2]^2, d[3]^2})
            )
        end
        do
            local l, d = i:get_nns_by_vector({2, 2, 2}, 3, -1, true)
            assert.same({1, 0, 2}, l)
            assert.same(
                roundArray({6, 8, 9}),
                roundArray({d[1]^2, d[2]^2, d[3]^2})
            )
        end
    end)

    it("include_dists", function()
        local f = 40
        -- Pass 'euclidean' explicitly: without it the index uses the default
        -- metric and this suite would not exercise euclidean distances.
        local i = AnnoyIndex(f, 'euclidean')
        local v = randomVector(f, 0, 1)
        i:add_item(0, v)
        local neg_v = {}
        do
            for k, value in ipairs(v) do
                neg_v[k] = -value
            end
        end
        i:add_item(1, neg_v)
        i:build(10)
        local indices, dists = i:get_nns_by_item(0, 2, 10, true)
        assert.same({0, 1}, indices)
        assert.same(round(0.0), round(dists[1]))
    end)

end)

-- Tests for loading, saving and unloading index files.
describe("index test", function()

    -- Loading a file that does not exist raises an error.
    it("not_found_tree", function()
        local i = AnnoyIndex(10)
        assert.has_error(function()
            i:load('nonexists.tree')
        end)
    end)

    it("binary_compatibility", function()
        local i = AnnoyIndex(10)
        i:load('test/test.tree')

        -- This might change in the future if we change the search
        -- algorithm, but in that case let's update the test
        assert.same(
            {0, 85, 42, 11, 54, 38, 53, 66, 19, 31},
            i:get_nns_by_item(0, 10)
        )
    end)

    it("load_unload", function()
        -- Issue #108
        local i = AnnoyIndex(10)
        for _ = 1, 100000 do
            i:load('test/test.tree')
            i:unload()
        end
    end)

    -- Repeatedly constructs and loads indexes, forcing a collection every
    -- 100 iterations.
    it("construct_load_destruct", function()
        for x = 1, 100000 do
            local i = AnnoyIndex(10)
            i:load('test/test.tree')
            if x % 100 == 0 then
                collectgarbage()
            end
        end
    end)

    it("construct_destruct", function()
        for _ = 1, 100000 do
            local i = AnnoyIndex(10)
            i:add_item(1000, randomVector(10, 0, 1))
        end
    end)

    it("save_twice", function()
        -- Issue #100
        local t = AnnoyIndex(10)
        t:save("t.ann")
        t:save("t.ann")
    end)

    it("load_save", function()
        -- Issue #61
        local i = AnnoyIndex(10)
        i:load('test/test.tree')
        local u = i:get_item_vector(99)
        i:save('i.tree')
        local v = i:get_item_vector(99)
        assert.same(u, v)
        local j = AnnoyIndex(10)
        j:load('test/test.tree')
        -- Read from the freshly loaded `j`; reading from `i` again would
        -- only repeat the check made on `v` above.
        local w = j:get_item_vector(99)
        assert.same(u, w)
        -- Ensure specifying if prefault is allowed does not impact result
        j:save('j.tree', true)
        local k = AnnoyIndex(10)
        k:load('j.tree', true)
        local x = k:get_item_vector(99)
        assert.same(u, x)
        k:save('k.tree', false)
        local l = AnnoyIndex(10)
        l:load('k.tree', false)
        local y = l:get_item_vector(99)
        assert.same(u, y)
    end)

    -- Builds straight into a file, then reloads that file and queries it.
    it("on_disk_build", function()
        local f = 2
        local i = AnnoyIndex(f, 'euclidean')
        i:on_disk_build('x.tree')
        i:add_item(0, {2, 2})
        i:add_item(1, {3, 2})
        i:add_item(2, {3, 3})
        i:build(10)

        i:unload()
        i:load('x.tree')

        assert.same({2, 1, 0}, i:get_nns_by_vector({4, 4}, 3))
        assert.same({0, 1, 2}, i:get_nns_by_vector({1, 1}, 3))
        assert.same({1, 2, 0}, i:get_nns_by_vector({4, 2}, 3))
    end)
end)

-- Tests that malformed vectors and out-of-range item ids are rejected.
describe("types test", function()

    local n_points = 1000
    local n_trees = 10

    -- tests "numpy" and "tuple" are not applicable to Lua

    -- Vectors that are too long or empty must be rejected by add_item.
    it("wrong_length", function()
        local f = 10
        local i = AnnoyIndex(f, 'euclidean')
        i:add_item(0, randomVector(f, 0, 1))
        assert.has_error(function()
            i:add_item(1, randomVector(f + 1000, 0, 1))
        end)
        assert.has_error(function()
            i:add_item(2, {})
        end)
        i:build(n_trees)
    end)

    it("range_errors", function()
        local f = 10
        local i = AnnoyIndex(f, 'euclidean')
        for j = 0, n_points - 1 do
            i:add_item(j, randomVector(f, 0, 1))
        end
        assert.has_error(function()
            -- randomVector needs mu and sigma: with them omitted the error
            -- came from arithmetic on nil inside the helper, so add_item
            -- was never called with the negative id.
            i:add_item(-1, randomVector(f, 0, 1))
        end)
        i:build(n_trees)
        -- Ids below zero and ids at or beyond n_points are all invalid.
        for _, bad_index in ipairs({-1000, -1, n_points, n_points + 1000}) do
            assert.has_error(function()
                i:get_distance(0, bad_index)
            end)
            assert.has_error(function()
                i:get_nns_by_item(bad_index, 1)
            end)
            assert.has_error(function()
                i:get_item_vector(bad_index)
            end)
        end
    end)

end)

-- Stress tests that repeat cheap calls many times; they make no assertion
-- about memory use themselves.
describe("memory leaks", function()

    -- Fetches the same item vector 100 x 1,000,000 times, printing the
    -- batch number before each batch.
    it("get_item_vector", function()
        local dims = 10
        local index = AnnoyIndex(dims, 'euclidean')
        index:add_item(0, randomVector(dims, 0, 1))
        for batch = 0, 99 do
            print(batch, '...')
            for _ = 1, 1000000 do
                index:get_item_vector(0)
            end
        end
    end)

    -- Requests far more neighbours than the single stored item, 100 times.
    it("get_lots_of_nns", function()
        local dims = 10
        local index = AnnoyIndex(dims, 'euclidean')
        index:add_item(0, randomVector(dims, 0, 1))
        index:build(10)
        local attempts = 100
        for _ = 1, attempts do
            local neighbours = index:get_nns_by_item(0, 999999999)
            assert.same({0}, neighbours)
        end
    end)

end)


================================================
FILE: test/dot_index_test.py
================================================
# Copyright (c) 2018 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.

import random

import numpy
import pytest

from annoy import AnnoyIndex


def dot_metric(a, b):
    """Return the negated dot product of ``a`` and ``b``.

    Negating lets an ascending sort put the largest dot products first.
    """
    similarity = numpy.dot(a, b)
    return -similarity


def recall(retrieved, relevant):
    """Return the fraction of distinct ``relevant`` items present in ``retrieved``.

    Raises ZeroDivisionError when ``relevant`` is empty.
    """
    wanted = set(relevant)
    hits = wanted.intersection(retrieved)
    return float(len(hits)) / float(len(wanted))


def test_get_nns_by_vector():
    """Every query vector ranks the stored items as 2, 1, 0 under the dot metric."""
    index = AnnoyIndex(2, "dot")
    for item_id, vector in enumerate([[2, 2], [3, 2], [3, 3]]):
        index.add_item(item_id, vector)
    index.build(10)

    expected_order = [2, 1, 0]
    for query in ([4, 4], [1, 1], [4, 2]):
        assert index.get_nns_by_vector(query, 3) == expected_order


def test_get_nns_by_item():
    """Neighbours of items 0 and 2 are both ranked 2, 1, 0 under the dot metric."""
    index = AnnoyIndex(2, "dot")
    for item_id, vector in enumerate([[2, 2], [3, 2], [3, 3]]):
        index.add_item(item_id, vector)
    index.build(10)

    expected_order = [2, 1, 0]
    for item_id in (0, 2):
        assert index.get_nns_by_item(item_id, 3) == expected_order


def test_dist():
    """get_distance on a built dot index returns the dot product of the two items."""
    index = AnnoyIndex(2, "dot")
    for item_id, vector in enumerate([[0, 1], [1, 1], [0, 0]]):
        index.add_item(item_id, vector)
    index.build(10)

    # (first item, second item, expected dot product)
    expectations = [(0, 1, 1.0), (1, 2, 0.0)]
    for left, right, expected in expectations:
        assert index.get_distance(left, right) == pytest.approx(expected)


def recall_at(n, n_trees=10, n_points=1000, n_rounds=5):
    """Average recall of the top-``n`` dot-product neighbours over random data.

    Each round draws ``n_points`` random 10-dimensional Gaussian points,
    computes the exact top-``n`` for every point by brute force, and
    compares that with what a "dot" index built with ``n_trees`` trees
    returns. The result is the mean recall over all points and rounds.
    """
    dims = 10
    recall_sum = 0.0

    for _ in range(n_rounds):
        index = AnnoyIndex(dims, "dot")

        points = numpy.array(
            [[random.gauss(0, 1) for _ in range(dims)] for _ in range(n_points)]
        )

        def exact_top_n(query_id):
            # Brute-force ranking of every point against the query point.
            ranking = sorted(
                range(n_points),
                key=lambda other: dot_metric(points[query_id], points[other]),
            )
            return ranking[:n]

        ground_truth = [exact_top_n(query_id) for query_id in range(n_points)]

        for item_id, vector in enumerate(points):
            index.add_item(item_id, vector)

        index.build(n_trees)

        for query_id in range(n_points):
            found = index.get_nns_by_vector(points[query_id], n)
            recall_sum += recall(found, ground_truth[query_id])

    return recall_sum / float(n_rounds * n_points)


def test_recall_at_10():
    """Top-10 recall with the default settings is at least 0.65."""
    minimum = 0.65
    assert recall_at(10) >= minimum


def test_recall_at_100():
    """Top-100 recall with the default settings is at least 0.95."""
    minimum = 0.95
    assert recall_at(100) >= minimum


def test_recall_at_1000():
    """Top-1000 recall with the default settings is at least 0.99."""
    minimum = 0.99
    assert recall_at(1000) >= minimum


def test_recall_at_1000_fewer_trees():
    """Top-1000 recall stays at least 0.99 with only four trees."""
    minimum = 0.99
    assert recall_at(1000, n_trees=4) >= minimum


def test_get_nns_with_distances():
    """Distances returned with the neighbours are the dot products to the query."""
    index = AnnoyIndex(3, "dot")
    for item_id, vector in enumerate([[0, 0, 2], [0, 1, 1], [1, 0, 0]]):
        index.add_item(item_id, vector)
    index.build(10)

    # Query by stored item.
    ids, scores = index.get_nns_by_item(0, 3, -1, True)
    assert ids == [0, 1, 2]
    for score, expected in zip(scores, (4, 2, 0)):
        assert score == pytest.approx(expected)

    # Query by an arbitrary vector.
    ids, scores = index.get_nns_by_vector([2, 2, 2], 3, -1, True)
    assert ids == [0, 1, 2]
    for score, expected in zip(scores, (4, 4, 2)):
        assert score == pytest.approx(expected)


def test_include_dists():
    """With a vector and its negation stored, the first hit scores v . v."""
    dims = 40
    index = AnnoyIndex(dims, "dot")
    vector = numpy.random.normal(size=dims)
    index.add_item(0, vector)
    index.add_item(1, -vector)
    index.build(10)

    result = index.get_nns_by_item(0, 2, 10, True)
    indices, dists = result
    assert indices == [0, 1]
    self_product = numpy.dot(vector, vector)
    assert dists[0] == pytest.approx(self_product)


def test_distance_consistency():
    """Distances from a neighbour query agree with the other ways to get them.

    For 100 randomly sampled items, every distance returned alongside the
    100 nearest neighbours must match both the dot product of the stored
    item vectors and the value reported by ``get_distance``.
    """
    n, f = 1000, 3
    i = AnnoyIndex(f, "dot")
    for j in range(n):
        i.add_item(j, numpy.random.normal(size=f))
    i.build(10)
    for a in random.sample(range(n), 100):
        indices, dists = i.get_nns_by_item(a, 100, include_distances=True)
        for b, dist in zip(indices, dists):
            assert dist == pytest.approx(
                numpy.dot(i.get_item_vector(a), i.get_item_vector(b))
            )
            # Checked inside the loop so every (a, b) pair is compared with
            # get_distance, not just the last pair of each sample.
            assert dist == pytest.approx(i.get_distance(a, b))


================================================
FILE: test/euclidean_index_test.py
================================================
# Copyright (c) 2013 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.

import random

import numpy
import pytest

from annoy import AnnoyIndex


def test_get_nns_by_vector():
    """Query vectors rank the three stored points by euclidean closeness."""
    index = AnnoyIndex(2, "euclidean")
    for item_id, point in enumerate([[2, 2], [3, 2], [3, 3]]):
        index.add_item(item_id, point)
    index.build(10)

    # (query, expected neighbour order)
    cases = [
        ([4, 4], [2, 1, 0]),
        ([1, 1], [0, 1, 2]),
        ([4, 2], [1, 2, 0]),
    ]
    for query, expected in cases:
        assert index.get_nns_by_vector(query, 3) == expected


def test_get_nns_by_item():
    """Neighbours of a stored item start with the item itself."""
    index = AnnoyIndex(2, "euclidean")
    for item_id, point in enumerate([[2, 2], [3, 2], [3, 3]]):
        index.add_item(item_id, point)
    index.build(10)

    # (item, expected neighbour order)
    cases = [(0, [0, 1, 2]), (2, [2, 1, 0])]
    for item_id, expected in cases:
        assert index.get_nns_by_item(item_id, 3) == expected


def test_dist():
    """get_distance returns the euclidean distance, without building the index."""
    index = AnnoyIndex(2, "euclidean")
    for item_id, point in enumerate([[0, 1], [1, 1], [0, 0]]):
        index.add_item(item_id, point)

    # (first item, second item, squared distance between them)
    expectations = [(0, 1, 1.0), (1, 2, 2.0)]
    for left, right, squared in expectations:
        assert index.get_distance(left, right) == pytest.approx(squared**0.5)


def test_large_index():
    """Each point in a tight pair has itself, then its partner, as nearest neighbours.

    5000 pairs are generated; both members of a pair are the same random
    base point, shifted by 1 on every axis, plus a small random
    perturbation.
    """
    # Generate pairs of random points where the pair is super close
    f = 10
    i = AnnoyIndex(f, "euclidean")
    for j in range(0, 10000, 2):
        p = [random.gauss(0, 1) for z in range(f)]
        # TODO: the constant offset 1 was apparently meant to be a random
        # per-dimension offset q[i]; the list that used to be generated for
        # it was never assigned or used, so it has been removed.
        x = [1 + pi + random.gauss(0, 1e-2) for pi in p]
        y = [1 + pi + random.gauss(0, 1e-2) for pi in p]
        i.add_item(j, x)
        i.add_item(j + 1, y)

    i.build(10)
    for j in range(0, 10000, 2):
        assert i.get_nns_by_item(j, 2) == [j, j + 1]
        assert i.get_nns_by_item(j + 1, 2) == [j + 1, j]


def precision(n, n_trees=10, n_points=10000, n_rounds=10):
    """Fraction of the true ``n`` nearest points that the index returns.

    In every round, point ``j`` is a random direction scaled to length
    ``j``, so the true nearest neighbours of the origin are items
    ``0 .. n-1``. The result is the share of returned ids below ``n``,
    averaged over ``n_rounds`` rounds.
    """
    dims = 10
    origin = [0] * dims
    hits = 0

    for _ in range(n_rounds):
        index = AnnoyIndex(dims, "euclidean")
        for item_id in range(n_points):
            direction = [random.gauss(0, 1) for _ in range(dims)]
            length = sum([c**2 for c in direction]) ** 0.5
            point = [c / length * item_id for c in direction]
            index.add_item(item_id, point)

        index.build(n_trees)

        nns = index.get_nns_by_vector(origin, n)
        # Ids equal distances from the origin, so results must be ascending.
        assert nns == sorted(nns)
        hits += sum(1 for neighbour in nns if neighbour < n)

    return 1.0 * hits / (n * n_rounds)


def test_precision_1():
    """Precision for the single nearest neighbour is at least 0.98."""
    score = precision(1)
    assert score >= 0.98


def test_precision_10():
    """Precision for the 10 nearest neighbours is at least 0.98."""
    score = precision(10)
    assert score >= 0.98


def test_precision_100():
    """Precision for the 100 nearest neighbours is at least 0.98."""
    score = precision(100)
    assert score >= 0.98


def test_precision_1000():
    """Precision for the 1000 nearest neighbours is at least 0.98."""
    score = precision(1000)
    assert score >= 0.98


def test_get_nns_with_distances():
    """Neighbour queries with distances return ids nearest-first plus Euclidean distances."""
    index = AnnoyIndex(3, "euclidean")
    for item_id, point in enumerate([[0, 0, 2], [0, 1, 1], [1, 0, 0]]):
        index.add_item(item_id, point)
    index.build(10)

    # Query by stored item: squared distances from item 0 are 0, 2 and 5.
    ids, dists = index.get_nns_by_item(0, 3, -1, True)
    assert ids == [0, 1, 2]
    for rank, squared in enumerate((0, 2, 5)):
        assert dists[rank] ** 2 == pytest.approx(squared)

    # Query by free vector: squared distances from (2, 2, 2) are 6, 8 and 9.
    ids, dists = index.get_nns_by_vector([2, 2, 2], 3, -1, True)
    assert ids == [1, 0, 2]
    for rank, squared in enumerate((6, 8, 9)):
        assert dists[rank] ** 2 == pytest.approx(squared)


def test_include_dists():
    """Querying an item with distances lists the item itself first, at distance zero."""
    dim = 40
    index = AnnoyIndex(dim, "euclidean")
    vec = numpy.random.normal(size=dim)
    # Store the vector and its negation as the only two items.
    for item_id, stored in enumerate((vec, -vec)):
        index.add_item(item_id, stored)
    index.build(10)

    result = index.get_nns_by_item(0, 2, 10, True)
    indices, dists = result
    assert indices == [0, 1]
    assert dists[0] == pytest.approx(0)


def test_distance_consistency():
    """Distances from neighbour queries agree with get_distance and with direct computation."""
    n_items, dim = 1000, 3
    index = AnnoyIndex(dim, "euclidean")
    for item_id in range(n_items):
        index.add_item(item_id, numpy.random.normal(size=dim))
    index.build(10)
    for a in random.sample(range(n_items), 100):
        neighbours, dists = index.get_nns_by_item(a, 100, include_distances=True)
        for b, dist in zip(neighbours, dists):
            assert dist == pytest.approx(index.get_distance(a, b))
            vec_a = numpy.array(index.get_item_vector(a))
            vec_b = numpy.array(index.get_item_vector(b))
            diff = vec_a - vec_b
            # Once via numpy, once via a plain Python sum of squares.
            assert dist == pytest.approx(numpy.dot(diff, diff) ** 0.5)
            squares = [(x - y) ** 2 for x, y in zip(vec_a, vec_b)]
            assert dist == pytest.approx(sum(squares) ** 0.5)


def test_rounding_error():
    """The distance between two nearly equal 1-d points must never be negative."""
    # https://github.com/spotify/annoy/issues/314
    index = AnnoyIndex(1, "euclidean")
    for item_id, value in enumerate((0.7125930, 0.7123166)):
        index.add_item(item_id, [value])
    distance = index.get_distance(0, 1)
    assert distance >= 0.0


================================================
FILE: test/examples_test.py
================================================
def execfile(fn):
    """Read the file at path ``fn`` and execute its contents with ``exec``.

    The file is opened in text mode and closed when the ``with`` block exits.
    """
    with open(fn) as f:
        exec(f.read())


def simple_test():
    """Execute examples/simple_test.py (path is relative to the working directory)."""
    # NOTE(review): unlike the other test modules, this name lacks the
    # ``test_`` prefix, so pytest's default collection probably skips it — confirm.
    execfile("examples/simple_test.py")


def mmap_test():
    """Execute examples/mmap_test.py (path is relative to the working directory)."""
    # NOTE(review): unlike the other test modules, this name lacks the
    # ``test_`` prefix, so pytest's default collection probably skips it — confirm.
    execfile("examples/mmap_test.py")


def precision_test():
    """Execute examples/precision_test.py (path is relative to the working directory)."""
    # NOTE(review): unlike the other test modules, this name lacks the
    # ``test_`` prefix, so pytest's default collection probably skips it — confirm.
    execfile("examples/precision_test.py")


================================================
FILE: test/hamming_index_test.py
================================================
# Copyright (c) 2013 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.


import numpy
import pytest

from annoy import AnnoyIndex


def test_basic_conversion():
    """Binary vectors round-trip through a hamming index and distances count differing bits."""
    dim = 100
    index = AnnoyIndex(dim, "hamming")
    u = numpy.random.binomial(1, 0.5, dim)
    v = numpy.random.binomial(1, 0.5, dim)
    index.add_item(0, u)
    index.add_item(1, v)

    # Stored vectors must equal what was inserted.
    for item_id, original in ((0, u), (1, v)):
        delta = original - index.get_item_vector(item_id)
        assert numpy.dot(delta, delta) == pytest.approx(0.0)

    # An item is at distance zero from itself.
    for item_id in (0, 1):
        assert index.get_distance(item_id, item_id) == pytest.approx(0.0)

    # For 0/1 vectors the squared difference equals the number of differing bits.
    for a, b in ((0, 1), (1, 0)):
        assert index.get_distance(a, b) == pytest.approx(numpy.dot(u - v, u - v))


def test_basic_nns():
    """With two items, every neighbour query returns both, with the query item first."""
    dim = 100
    index = AnnoyIndex(dim, "hamming")
    u = numpy.random.binomial(1, 0.5, dim)
    v = numpy.random.binomial(1, 0.5, dim)
    index.add_item(0, u)
    index.add_item(1, v)
    index.build(10)

    # Asking for more neighbours than exist just returns everything.
    assert index.get_nns_by_item(0, 99) == [0, 1]
    assert index.get_nns_by_item(1, 99) == [1, 0]

    ids, dists = index.get_nns_by_item(0, 99, include_distances=True)
    assert ids == [0, 1]
    assert dists[0] == pytest.approx(0)
    assert dists[1] == pytest.approx(numpy.dot(u - v, u - v))


def test_save_load():
    """A hamming index saved to disk answers queries identically once loaded again."""
    dim = 100
    writer = AnnoyIndex(dim, "hamming")
    u = numpy.random.binomial(1, 0.5, dim)
    v = numpy.random.binomial(1, 0.5, dim)
    writer.add_item(0, u)
    writer.add_item(1, v)
    writer.build(10)
    writer.save("blah.ann")

    # Query through a fresh index object backed by the saved file.
    reader = AnnoyIndex(dim, "hamming")
    reader.load("blah.ann")
    ids, dists = reader.get_nns_by_item(0, 99, include_distances=True)
    assert ids == [0, 1]
    assert dists[0] == pytest.approx(0)
    assert dists[1] == pytest.approx(numpy.dot(u - v, u - v))


def test_many_vectors():
    """With 100000 random 10-bit items, distances stay in [0, f] and nearest hits are close."""
    dim = 10
    index = AnnoyIndex(dim, "hamming")
    for item_id in range(100000):
        index.add_item(item_id, numpy.random.binomial(1, 0.5, dim))
    index.build(10)

    # Hamming distances are bounded by the number of bits.
    _ids, all_dists = index.get_nns_by_vector([0] * dim, 10000, include_distances=True)
    assert min(all_dists) >= 0
    assert max(all_dists) <= dim

    # Average distance to the single nearest neighbour over 1000 random queries.
    nearest = []
    for _query in range(1000):
        probe = numpy.random.binomial(1, 0.5, dim)
        _ids, found = index.get_nns_by_vector(
            probe, 1, search_k=1000, include_distances=True
        )
        nearest.append(found[0])
    avg_dist = 1.0 * sum(nearest) / len(nearest)
    assert avg_dist <= 0.42


@pytest.mark.skip  # will fix later
def test_zero_vectors():
    """Save/load a small 64-bit hamming index and check the nearest distances (currently skipped)."""
    # Mentioned on the annoy-user list
    bitstrings = [
        "0000000000011000001110000011111000101110111110000100000100000000",
        "0000000000011000001110000011111000101110111110000100000100000001",
        "0000000000011000001110000011111000101110111110000100000100000010",
        "0010010100011001001000010001100101011110000000110000011110001100",
        "1001011010000110100101101001111010001110100001101000111000001110",
        "0111100101111001011110010010001100010111000111100001101100011111",
        "0011000010011101000011010010111000101110100101111000011101001011",
        "0011000010011100000011010010111000101110100101111000011101001011",
        "1001100000111010001010000010110000111100100101001001010000000111",
        "0000000000111101010100010001000101101001000000011000001101000000",
        "1000101001010001011100010111001100110011001100110011001111001100",
        "1110011001001111100110010001100100001011000011010010111100100111",
    ]

    dim = 64
    writer = AnnoyIndex(dim, "hamming")
    for item_id, bitstring in enumerate(bitstrings):
        # One integer (0 or 1) per character of the bit string.
        writer.add_item(item_id, list(map(int, bitstring)))

    writer.build(10)
    writer.save("idx.ann")

    reader = AnnoyIndex(dim, "hamming")
    reader.load("idx.ann")
    ids, dists = reader.get_nns_by_item(0, 5, include_distances=True)
    assert ids[0] == 0
    assert dists[:4] == [0, 1, 1, 22]


================================================
FILE: test/holes_test.py
================================================
# Copyright (c) 2013 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.

import random

import numpy

from annoy import AnnoyIndex


def test_random_holes():
    """Queries on an index with gaps in its id space must only return ids that were added.

    1000 ids are drawn at random from ``range(2000)``, so roughly half of the
    id space is never populated. Both item-based and vector-based queries are
    checked.
    """
    f = 10
    index = AnnoyIndex(f, "angular")
    valid_indices = random.sample(range(2000), 1000)  # leave holes
    for i in valid_indices:
        v = numpy.random.normal(size=(f,))
        index.add_item(i, v)
    index.build(10)

    # Membership is tested for every returned id in two nested loops; a set
    # keeps each check O(1) instead of scanning the 1000-element list.
    valid_set = set(valid_indices)
    for i in valid_indices:
        js = index.get_nns_by_item(i, 10000)
        for j in js:
            assert j in valid_set
    for i in range(1000):
        v = numpy.random.normal(size=(f,))
        js = index.get_nns_by_vector(v, 10000)
        for j in js:
            assert j in valid_set


def _test_holes_base(n, f=100, base_i=100000):
    """Add ``n`` random items with ids starting at ``base_i`` and check a query returns exactly them."""
    index = AnnoyIndex(f, "angular")
    for offset in range(n):
        index.add_item(base_i + offset, numpy.random.normal(size=(f,)))
    index.build(100)
    found = index.get_nns_by_item(base_i, n)
    expected = {base_i + offset for offset in range(n)}
    assert set(found) == expected


def test_root_one_child():
    """Index holding a single item at a large id."""
    # See https://github.com/spotify/annoy/issues/223
    _test_holes_base(n=1)


def test_root_two_children():
    """Index holding two items at large ids."""
    _test_holes_base(n=2)


def test_root_some_children():
    """Index holding ten items at large ids."""
    # See https://github.com/spotify/annoy/issues/295
    _test_holes_base(n=10)


def test_root_many_children():
    """Index holding a thousand items at large ids."""
    _test_holes_base(n=1000)


================================================
FILE: test/index_test.py
================================================
# Copyright (c) 2013 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.

import os
import random

import pytest

from annoy import AnnoyIndex


def test_not_found_tree():
    """Loading a path that does not exist raises IOError."""
    index = AnnoyIndex(10, "angular")
    missing_path = "nonexists.tree"
    with pytest.raises(IOError):
        index.load(missing_path)


def test_binary_compatibility():
    """The checked-in index file still loads and yields the recorded neighbour list."""
    # This might change in the future if we change the search algorithm, but in that case let's update the test
    expected = [0, 85, 42, 11, 54, 38, 53, 66, 19, 31]

    index = AnnoyIndex(10, "angular")
    index.load("test/test.tree")

    assert index.get_nns_by_item(0, 10) == expected


def test_load_unload():
    """Repeatedly load and unload the same file through one index object."""
    # Issue #108
    index = AnnoyIndex(10, "angular")
    for _cycle in range(100000):
        index.load("test/test.tree")
        index.unload()


def test_construct_load_destruct():
    """Create a fresh index and load the file into it, many times over."""
    for _cycle in range(100000):
        # Rebinding the name drops the reference to the previous index.
        index = AnnoyIndex(10, "angular")
        index.load("test/test.tree")


def test_construct_destruct():
    """Create a fresh index and add one item at id 1000, many times over."""
    for _cycle in range(100000):
        index = AnnoyIndex(10, "angular")
        vector = [random.gauss(0, 1) for _axis in range(10)]
        index.add_item(1000, vector)


def test_save_twice():
    """A built index can be saved to two different paths in a row."""
    # Issue #100
    index = AnnoyIndex(10, "angular")
    for item_id in range(100):
        index.add_item(item_id, [random.gauss(0, 1) for _axis in range(10)])
    index.build(10)
    for path in ("t1.ann", "t2.ann"):
        index.save(path)


def test_load_save():
    """Item vectors stay identical across save/load cycles, with and without prefault.

    The reference vector ``u`` is read from the checked-in index; every
    subsequent index object (after saving, after reloading, and after
    save/load with the prefault flag set to True and to False) must return
    the same vector for item 99.
    """
    # Issue #61
    i = AnnoyIndex(10, "angular")
    i.load("test/test.tree")
    u = i.get_item_vector(99)
    i.save("i.tree")
    v = i.get_item_vector(99)
    assert u == v
    j = AnnoyIndex(10, "angular")
    j.load("test/test.tree")
    # Read from the freshly loaded index ``j``; reading from ``i`` here would
    # merely repeat the previous assertion and leave ``j`` unchecked.
    w = j.get_item_vector(99)
    assert u == w
    # Ensure specifying if prefault is allowed does not impact result
    j.save("j.tree", True)
    k = AnnoyIndex(10, "angular")
    k.load("j.tree", True)
    x = k.get_item_vector(99)
    assert u == x
    k.save("k.tree", False)
    l = AnnoyIndex(10, "angular")
    l.load("k.tree", False)
    y = l.get_item_vector(99)
    assert u == y


def test_save_without_build():
    """Saving an index that has items but was never built raises."""
    index = AnnoyIndex(10, "angular")
    for item_id in range(100):
        vector = [random.gauss(0, 1) for _axis in range(10)]
        index.add_item(item_id, vector)
    # Note: in earlier version, this was allowed (see eg #61)
    with pytest.raises(Exception):
        index.save("x.tree")


def test_unbuild_with_loaded_tree():
    """Calling unbuild() on an index loaded from a file raises."""
    loaded = AnnoyIndex(10, "angular")
    loaded.load("test/test.tree")
    with pytest.raises(Exception):
        loaded.unbuild()


def test_seed():
    """set_seed() can be called on an index that was loaded from a file."""
    loaded = AnnoyIndex(10, "angular")
    loaded.load("test/test.tree")
    seed = 42
    loaded.set_seed(seed)


def test_unknown_distance():
    """Constructing an index with an unrecognised metric name raises."""
    bogus_metric = "banana"
    with pytest.raises(Exception):
        AnnoyIndex(10, bogus_metric)


def test_metric_kwarg():
    """The metric may be passed by keyword; the dimension is exposed as ``.f``."""
    # Issue 211
    index = AnnoyIndex(2, metric="euclidean")
    for item_id, point in enumerate(([1, 0], [9, 0])):
        index.add_item(item_id, point)
    # The two points differ by 8 along the first axis only.
    assert index.get_distance(0, 1) == pytest.approx(8)
    assert index.f == 2


def test_metric_f_kwargs():
    """Both ``f`` and ``metric`` may be supplied as keyword arguments."""
    options = {"f": 3, "metric": "euclidean"}
    AnnoyIndex(**options)


def test_item_vector_after_save():
    """Item count, vectors and neighbours are the same before and after save()."""
    # Issue #279
    index = AnnoyIndex(3, "angular")
    index.verbose(True)
    # Ids start at 1, so id 0 is a hole and get_n_items() reports 4.
    for item_id, axis_vector in ((1, [1, 0, 0]), (2, [0, 1, 0]), (3, [0, 0, 1])):
        index.add_item(item_id, axis_vector)
    index.build(-1)

    def check_contents():
        assert index.get_n_items() == 4
        assert index.get_item_vector(3) == [0, 0, 1]
        assert set(index.get_nns_by_item(1, 999)) == set([1, 2, 3])

    check_contents()
    index.save("something.annoy")
    check_contents()


def test_prefault():
    """Loading with prefault=True yields the same recorded neighbour list."""
    expected = [0, 85, 42, 11, 54, 38, 53, 66, 19, 31]
    index = AnnoyIndex(10, "angular")
    index.load("test/test.tree", prefault=True)
    assert index.get_nns_by_item(0, 10) == expected


def test_fail_save():
    """Saving to an empty path raises IOError."""
    index = AnnoyIndex(40, "angular")
    empty_path = ""
    with pytest.raises(IOError):
        index.save(empty_path)


def test_overwrite_index():
    """Overwriting an index file while another index has it loaded must not crash."""
    # Issue #335
    dim = 40
    path = "test.ann"

    def random_vector():
        return [random.gauss(0, 1) for _axis in range(dim)]

    # Build the initial index
    original = AnnoyIndex(dim, "angular")
    for item_id in range(1000):
        original.add_item(item_id, random_vector())
    original.build(10)
    original.save(path)

    # Load index file
    reader = AnnoyIndex(dim, "angular")
    reader.load(path)

    # Overwrite index file
    replacement = AnnoyIndex(dim, "angular")
    for item_id in range(500):
        replacement.add_item(item_id, random_vector())
    replacement.build(10)
    if os.name != "nt":
        replacement.save(path)
        # Get nearest neighbors
        reader.get_nns_by_vector(random_vector(), 1000)  # Should not crash
    else:
        # Can't overwrite on Windows
        with pytest.raises(IOError):
            replacement.save(path)


def test_get_n_trees():
    """The checked-in index file reports ten trees."""
    loaded = AnnoyIndex(10, "angular")
    loaded.load("test/test.tree")
    tree_count = loaded.get_n_trees()
    assert tree_count == 10


def test_write_failed():
    """Saving to a path the test expects to be unwritable raises."""
    dim = 40

    # Build the initial index
    index = AnnoyIndex(dim, "angular")
    index.verbose(True)
    for item_id in range(1000):
        index.add_item(item_id, [random.gauss(0, 1) for _axis in range(dim)])
    index.build(10)

    # Pick a platform-specific destination for the failing save.
    path = "Z:\\xyz.annoy" if os.name == "nt" else "/x/y/z.annoy"
    with pytest.raises(Exception):
        index.save(path)


def test_dimension_mismatch():
    """Loading a 100-dimensional file into an index of another dimension raises IOError."""
    writer = AnnoyIndex(100, "angular")
    for item_id in range(1000):
        writer.add_item(item_id, [random.gauss(0, 1) for _axis in range(100)])
    writer.build(10)
    writer.save("test.annoy")

    # Try one dimension larger and one smaller than the saved index.
    for wrong_dim in (200, 50):
        reader = AnnoyIndex(wrong_dim, "angular")
        with pytest.raises(IOError):
            reader.load("test.annoy")


def test_add_after_save():
    """Adding an item after the index has been saved raises instead of crashing."""
    # 398
    index = AnnoyIndex(100, "angular")
    item_id = None
    for item_id in range(1000):
        index.add_item(item_id, [random.gauss(0, 1) for _axis in range(100)])
    index.build(10)
    index.save("test.annoy")

    # Used to segfault:
    extra = [random.gauss(0, 1) for _axis in range(100)]
    with pytest.raises(Exception):
        # Reuses the last id from the loop above (999).
        index.add_item(item_id, extra)


def test_build_twice():
    """Calling build() a second time raises instead of crashing."""
    # 420
    index = AnnoyIndex(100, "angular")
    for item_id in range(1000):
        vector = [random.gauss(0, 1) for _axis in range(100)]
        index.add_item(item_id, vector)
    index.build(10)
    # Used to segfault:
    with pytest.raises(Exception):
        index.build(10)


def test_very_large_index():
    """An index whose file size crosses 2**31 bytes can be built and saved."""
    # 388
    f = 3
    dangerous_size = 2**31
    # Per-vector size assumed by the test: 4 bytes for each of (f + 3) values.
    size_per_vector = 4 * (f + 3)
    n_vectors = int(dangerous_size / size_per_vector)
    n_trees = 10
    path = "test_big.annoy"

    big = AnnoyIndex(f, "angular")
    big.verbose(True)
    # Only 100 items are added, but at ids high enough to push the file past the limit.
    for offset in range(100):
        big.add_item(n_vectors + offset, [random.gauss(0, 1) for _axis in range(f)])
    big.build(n_trees)
    big.save(path)  # Raises on Windows

    # Sanity check size of index
    assert os.path.getsize(path) >= dangerous_size
    assert os.path.getsize(path) < dangerous_size + 100e3

    # Sanity check number of trees
    assert big.get_n_trees() == n_trees


================================================
FILE: test/manhattan_index_test.py
================================================
# Copyright (c) 2013 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.

import random

import numpy
import pytest

from annoy import AnnoyIndex


def test_get_nns_by_vector():
    """Vector queries on three fixed points return ids ordered by Manhattan distance."""
    index = AnnoyIndex(2, "manhattan")
    for item_id, point in enumerate([[2, 2], [3, 2], [3, 3]]):
        index.add_item(item_id, point)
    index.build(10)

    cases = (
        ([4, 4], [2, 1, 0]),
        ([1, 1], [0, 1, 2]),
        ([5, 3], [2, 1, 0]),
    )
    for query, ranking in cases:
        assert index.get_nns_by_vector(query, 3) == ranking


def test_get_nns_by_item():
    """Item queries on three fixed points return ids ordered by Manhattan distance."""
    index = AnnoyIndex(2, "manhattan")
    for item_id, point in enumerate([[2, 2], [3, 2], [3, 3]]):
        index.add_item(item_id, point)
    index.build(10)

    for query_id, ranking in ((0, [0, 1, 2]), (2, [2, 1, 0])):
        assert index.get_nns_by_item(query_id, 3) == ranking


def test_dist():
    """Pairwise Manhattan distances between stored items match hand-computed values."""
    index = AnnoyIndex(2, "manhattan")
    for item_id, point in enumerate([[0, 1], [1, 1], [0, 0]]):
        index.add_item(item_id, point)

    # Sum of absolute coordinate differences: 1 for items 0-1, 2 for items 1-2.
    assert index.get_distance(0, 1) == pytest.approx(1.0)
    assert index.get_distance(1, 2) == pytest.approx(2.0)


def test_large_index():
    """Each item's closest neighbour (after itself) must be its near-duplicate twin."""
    # Generate pairs of random points where the pair is super close
    dim = 10
    index = AnnoyIndex(dim, "manhattan")
    pair_starts = range(0, 10000, 2)
    for first_id in pair_starts:
        base = [random.gauss(0, 1) for _axis in range(dim)]
        # Both twins share the base point and differ only by small noise.
        twin_a = [1 + coord + random.gauss(0, 1e-2) for coord in base]
        twin_b = [1 + coord + random.gauss(0, 1e-2) for coord in base]
        index.add_item(first_id, twin_a)
        index.add_item(first_id + 1, twin_b)

    index.build(10)
    for first_id in pair_starts:
        second_id = first_id + 1
        assert index.get_nns_by_item(first_id, 2) == [first_id, second_id]
        assert index.get_nns_by_item(second_id, 2) == [second_id, first_id]


def precision(n, n_trees=10, n_points=10000, n_rounds=10):
    """Return the fraction of returned neighbours of the origin whose id is below ``n``.

    For each of ``n_rounds`` rounds, item ``j`` is a random unit-length
    direction with ``j`` added to every coordinate; the index is then queried
    for the ``n`` nearest items to the origin.
    """
    dim = 10
    hits = 0
    for _round in range(n_rounds):
        index = AnnoyIndex(dim, "manhattan")
        for j in range(n_points):
            direction = [random.gauss(0, 1) for _axis in range(dim)]
            length = sum([c**2 for c in direction]) ** 0.5
            # Unit-length direction shifted by j along every axis.
            index.add_item(j, [c / length + j for c in direction])

        index.build(n_trees)

        neighbours = index.get_nns_by_vector([0] * dim, n)
        assert neighbours == sorted(neighbours)  # should be in order
        # Count the returned ids that fall below n.
        hits += sum(1 for item in neighbours if item < n)

    return 1.0 * hits / (n * n_rounds)


def test_precision_1():
    """Recall for the single nearest neighbour must be at least 0.98."""
    recall = precision(1)
    assert recall >= 0.98


def test_precision_10():
    """Recall for the 10 nearest neighbours must be at least 0.98."""
    recall = precision(10)
    assert recall >= 0.98


def test_precision_100():
    """Recall for the 100 nearest neighbours must be at least 0.98."""
    recall = precision(100)
    assert recall >= 0.98


def test_precision_1000():
    """Recall for the 1000 nearest neighbours must be at least 0.98."""
    recall = precision(1000)
    assert recall >= 0.98


def test_get_nns_with_distances():
    """Neighbour queries with distances return ids nearest-first plus Manhattan distances."""
    index = AnnoyIndex(3, "manhattan")
    for item_id, point in enumerate([[0, 0, 2], [0, 1, 1], [1, 0, 0]]):
        index.add_item(item_id, point)
    index.build(10)

    # Query by stored item: distances from item 0 are 0, 2 and 3.
    ids, dists = index.get_nns_by_item(0, 3, -1, True)
    assert ids == [0, 1, 2]
    for rank, expected in enumerate((0, 2, 3)):
        assert dists[rank] == pytest.approx(expected)

    # Query by free vector: distances from (2, 2, 1) are 3, 4 and 5.
    ids, dists = index.get_nns_by_vector([2, 2, 1], 3, -1, True)
    assert ids == [1, 2, 0]
    for rank, expected in enumerate((3, 4, 5)):
        assert dists[rank] == pytest.approx(expected)


def test_include_dists():
    """Querying an item with distances lists the item itself first, at distance zero."""
    dim = 40
    index = AnnoyIndex(dim, "manhattan")
    vec = numpy.random.normal(size=dim)
    # Store the vector and its negation as the only two items.
    for item_id, stored in enumerate((vec, -vec)):
        index.add_item(item_id, stored)
    index.build(10)

    result = index.get_nns_by_item(0, 2, 10, True)
    indices, dists = result
    assert indices == [0, 1]
    assert dists[0] == pytest.approx(0)


def test_distance_consistency():
    """Distances from neighbour queries agree with get_distance and with direct computation."""
    n_items, dim = 1000, 3
    index = AnnoyIndex(dim, "manhattan")
    for item_id in range(n_items):
        index.add_item(item_id, numpy.random.normal(size=dim))
    index.build(10)
    for a in random.sample(range(n_items), 100):
        neighbours, dists = index.get_nns_by_item(a, 100, include_distances=True)
        for b, dist in zip(neighbours, dists):
            assert dist == pytest.approx(index.get_distance(a, b))
            vec_a = numpy.array(index.get_item_vector(a))
            vec_b = numpy.array(index.get_item_vector(b))
            # Once via numpy, once via a plain Python sum of absolute differences.
            assert dist == pytest.approx(numpy.sum(numpy.fabs(vec_a - vec_b)))
            gaps = [abs(float(x) - float(y)) for x, y in zip(vec_a, vec_b)]
            assert dist == pytest.approx(sum(gaps))


================================================
FILE: test/memory_leak_test.py
================================================
# Copyright (c) 2013 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.

import pytest
import random

from annoy import AnnoyIndex


def test_get_item_vector():
    """Call get_item_vector() 100 x 1,000,000 times on a single-item index."""
    dim = 10
    index = AnnoyIndex(dim, "euclidean")
    index.add_item(0, [random.gauss(0, 1) for _axis in range(dim)])
    for batch in range(100):
        # Progress marker, one line per million calls.
        print(batch, "...")
        for _call in range(1000 * 1000):
            index.get_item_vector(0)


def test_get_lots_of_nns():
    """Asking a single-item index for a huge number of neighbours returns just that item."""
    dim = 10
    index = AnnoyIndex(dim, "euclidean")
    index.add_item(0, [random.gauss(0, 1) for _axis in range(dim)])
    index.build(10)
    for _attempt in range(100):
        found = index.get_nns_by_item(0, 999999999)
        assert found == [0]


def test_build_unbuid():
    """Repeated unbuild()/build() cycles leave the item count unchanged."""
    dim = 10
    index = AnnoyIndex(dim, "euclidean")
    for item_id in range(1000):
        index.add_item(item_id, [random.gauss(0, 1) for _axis in range(dim)])
    index.build(10)

    for _cycle in range(100):
        index.unbuild()
        index.build(10)

    item_count = index.get_n_items()
    assert item_count == 1000


def test_include_distances():
    """Run ten million nearest-neighbour queries with distances on one fixed vector."""
    # See #633
    # (Not able to repro it though)
    dim = 10
    index = AnnoyIndex(dim, "euclidean")
    for item_id in range(10000):
        index.add_item(item_id, [random.gauss(0, 1) for _axis in range(dim)])
    index.build(10)

    query = [random.gauss(0, 1) for _axis in range(dim)]
    for _call in range(10000000):
        indices, distances = index.get_nns_by_vector(
            query, 1, include_distances=True
        )


================================================
FILE: test/multithreaded_build_test.py
================================================
import numpy

from annoy import AnnoyIndex


def _test_building_with_threads(n_jobs):
    """Build 31 trees over 10000 random items with ``n_jobs`` workers and check the tree count."""
    n_items, dim = 10000, 10
    n_trees = 31
    index = AnnoyIndex(dim, "euclidean")
    for item_id in range(n_items):
        index.add_item(item_id, numpy.random.normal(size=dim))
    # build() is expected to return a truthy value.
    built = index.build(n_trees, n_jobs=n_jobs)
    assert built
    assert n_trees == index.get_n_trees()


def test_one_thread():
    """Build with n_jobs=1."""
    _test_building_with_threads(n_jobs=1)


def test_two_threads():
    """Build with n_jobs=2."""
    _test_building_with_threads(n_jobs=2)


def test_four_threads():
    """Build with n_jobs=4."""
    _test_building_with_threads(n_jobs=4)


def test_eight_threads():
    """Build with n_jobs=8."""
    _test_building_with_threads(n_jobs=8)


================================================
FILE: test/on_disk_build_test.py
================================================
# Copyright (c) 2013 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.

import os

import pytest

from annoy import AnnoyIndex


@pytest.fixture(scope="module", autouse=True)
def setUp():
    """Module-scoped autouse fixture: delete a leftover on_disk.ann before the tests run."""
    stale_path = "on_disk.ann"
    if not os.path.exists(stale_path):
        return
    os.remove(stale_path)


def add_items(i):
    """Add the three fixed 2-d points used by this module to ``i`` under ids 0, 1 and 2."""
    for item_id, point in enumerate([[2, 2], [3, 2], [3, 3]]):
        i.add_item(item_id, point)


def check_nns(i):
    """Assert that ``i`` ranks the three fixed points correctly for three query vectors."""
    expectations = (
        ([4, 4], [2, 1, 0]),
        ([1, 1], [0, 1, 2]),
        ([4, 2], [1, 2, 0]),
    )
    for query, ranking in expectations:
        assert i.get_nns_by_vector(query, 3) == ranking


def test_on_disk():
    """An index built directly on disk answers queries before and after being reloaded."""
    dim = 2
    path = "on_disk.ann"

    builder = AnnoyIndex(dim, "euclidean")
    builder.on_disk_build(path)
    add_items(builder)
    builder.build(10)
    check_nns(builder)

    # Drop the index and reload it from the file it was built into.
    builder.unload()
    builder.load(path)
    check_nns(builder)

    # A second, independent index object loads the same file.
    reader = AnnoyIndex(dim, "euclidean")
    reader.load(path)
    check_nns(reader)


================================================
FILE: test/seed_test.py
================================================
# Copyright (c) 2013 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.

import numpy

from annoy import AnnoyIndex


def test_seeding():
    """Two indexes built from the same data with the same seed return identical results."""
    dim = 10
    data = numpy.random.rand(1000, dim)
    queries = numpy.random.rand(50, dim)

    def build_seeded():
        index = AnnoyIndex(dim, "angular")
        index.set_seed(42)
        for row_id in range(data.shape[0]):
            index.add_item(row_id, data[row_id])
        index.build(10)
        return index

    first = build_seeded()
    second = build_seeded()

    for k in range(queries.shape[0]):
        from_first = first.get_nns_by_vector(queries[k], 100)
        from_second = second.get_nns_by_vector(queries[k], 100)
        assert from_first == from_second


================================================
FILE: test/threading_test.py
================================================
# Copyright (c) 2013 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.

import multiprocessing.pool

import numpy

from annoy import AnnoyIndex


def test_threads():
    """Query one built index concurrently from a thread pool.

    The same query (``get_nns_by_item(1, 1000)``) is issued ``n`` times via
    ``ThreadPool.map``; the test passes if none of the calls raises.
    """
    n, f = 10000, 10
    i = AnnoyIndex(f, "euclidean")
    for j in range(n):
        i.add_item(j, numpy.random.normal(size=f))
    i.build(10)

    def query_f(j):
        i.get_nns_by_item(1, 1000)

    pool = multiprocessing.pool.ThreadPool()
    try:
        pool.map(query_f, range(n))
    finally:
        # Shut the workers down so the pool's threads do not outlive the test.
        pool.close()
        pool.join()


================================================
FILE: test/types_test.py
================================================
# Copyright (c) 2013 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.

import random

import numpy
import pytest

from annoy import AnnoyIndex


def test_numpy(n_points=1000, n_trees=10):
    """numpy arrays of several dtypes are accepted as item vectors."""
    dim = 10
    dtypes = [numpy.float64, numpy.float32, numpy.uint8, numpy.int16]
    index = AnnoyIndex(dim, "euclidean")
    for item_id in range(n_points):
        sample = numpy.random.normal(size=dim)
        # Cast each sample to a randomly chosen dtype before inserting it.
        index.add_item(item_id, sample.astype(random.choice(dtypes)))

    index.build(n_trees)


def test_tuple(n_points=1000, n_trees=10):
    """Tuples are accepted as item vectors."""
    dim = 10
    index = AnnoyIndex(dim, "euclidean")
    for item_id in range(n_points):
        vector = tuple(random.gauss(0, 1) for _axis in range(dim))
        index.add_item(item_id, vector)

    index.build(n_trees)


def test_wrong_length(n_points=1000, n_trees=10):
    """Vectors that are too long or empty are rejected with IndexError."""
    dim = 10
    index = AnnoyIndex(dim, "euclidean")
    index.add_item(0, [random.gauss(0, 1) for _axis in range(dim)])
    # Far too many components.
    with pytest.raises(IndexError):
        index.add_item(1, [random.gauss(0, 1) for _axis in range(dim + 1000)])
    # No components at all.
    with pytest.raises(IndexError):
        index.add_item(2, [])

    index.build(n_trees)


def test_range_errors(n_points=1000, n_trees=10):
    """Negative or out-of-range item ids raise IndexError on insert and on lookup."""
    dim = 10
    index = AnnoyIndex(dim, "euclidean")
    for item_id in range(n_points):
        index.add_item(item_id, [random.gauss(0, 1) for _axis in range(dim)])
    with pytest.raises(IndexError):
        index.add_item(-1, [random.gauss(0, 1) for _axis in range(dim)])
    index.build(n_trees)

    invalid_ids = [-1000, -1, n_points, n_points + 1000]
    for bad_index in invalid_ids:
        with pytest.raises(IndexError):
            index.get_distance(0, bad_index)
        with pytest.raises(IndexError):
            index.get_nns_by_item(bad_index, 1)
        with pytest.raises(IndexError):
            index.get_item_vector(bad_index)


def test_missing_len():
    """
    Adding an object without ``__len__`` raises TypeError with the standard
    "has no len()" message.
    """

    class FakeCollection:
        pass

    index = AnnoyIndex(10, "euclidean")
    with pytest.raises(TypeError) as excinfo:
        index.add_item(1, FakeCollection())
    message = str(excinfo.value)
    assert message == "object of type 'FakeCollection' has no len()"


def test_missing_getitem():
    """
    Adding an object that has ``__len__`` but no ``__getitem__`` raises
    TypeError with the standard "not subscriptable" message.
    """

    class FakeCollection:
        def __len__(self):
            return 5

    index = AnnoyIndex(5, "euclidean")
    with pytest.raises(TypeError) as excinfo:
        index.add_item(1, FakeCollection())
    message = str(excinfo.value)
    assert message == "'FakeCollection' object is not subscriptable"


def test_short():
    """
    An object that reports a length but raises IndexError on element access
    makes add_item raise IndexError.
    """

    class FakeCollection:
        def __len__(self):
            return 3

        def __getitem__(self, i):
            raise IndexError

    index = AnnoyIndex(3, "euclidean")
    broken = FakeCollection()
    with pytest.raises(IndexError):
        index.add_item(1, broken)


def test_non_float():
    """
    A vector of strings is rejected with TypeError and the standard
    "must be real number" message.
    """
    index = AnnoyIndex(3, "euclidean")
    with pytest.raises(TypeError) as excinfo:
        index.add_item(1, ["1", "2", "3"])
    message = str(excinfo.value)
    assert message == "must be real number, not str"


================================================
FILE: tox.ini
================================================
# Environments tox runs by default: one per listed py* version, plus the
# "go" and "lua" environments defined further down.
[tox]
envlist=py{26,27,33,34,35,36,37,38,39,310,311,312,313}, go, lua

# Base environment: passes TRAVIS through from the calling environment,
# installs numpy and h5py, installs this package, then runs the tests via
# "setup.py nosetests".
# NOTE(review): the modules under test/ are written against pytest
# (pytest.raises, pytest.approx, fixtures) — confirm nosetests is still the
# intended runner here.
[testenv]
setenv =
  TRAVIS = {env:TRAVIS:}
commands =
  pip install numpy h5py
  pip install .
  python setup.py nosetests --verbosity=3

# Go binding build: downloads and unpacks Go 1.5 into /usr/local, installs
# SWIG 3.0 from a PPA, generates the Go wrapper from src/annoygomodule.i,
# copies the generated and library sources into GOPATH/src/annoyindex and
# builds the package. Uses sudo, wget and apt, so it needs a host where those
# are available.
[testenv:go]
setenv =
  GOPATH = {env:HOME:}/gopath
  GOROOT = /usr/local/go
whitelist_externals=*
commands =
  mkdir -p {env:GOPATH:}/src/annoyindex
  wget https://storage.googleapis.com/golang/go1.5.linux-amd64.tar.gz
  sudo tar -C /usr/local -xzf go1.5.linux-amd64.tar.gz
  sudo add-apt-repository -y ppa:timsc/swig-3.0.12
  sudo apt-get update -qq
  sudo apt-get install -y swig3.0
  swig3.0 -go -intgosize 64 -cgo -c++ src/annoygomodule.i
  cp src/annoygomodule_wrap.cxx src/annoyindex.go src/annoygomodule.h src/annoylib.h src/kissrandom.h {env:GOPATH:}/src/annoyindex
  {env:GOROOT}/bin/go build annoyindex

# Lua binding test: installs hererocks, uses it to set up a Lua toolchain
# (selected by the LUA environment variable) with LuaRocks 2.2 under the tox
# work directory, builds the rock with "luarocks make", installs busted and
# runs test/annoy_test.lua.
[testenv:lua]
setenv =
  HOME = {env:HOME}
whitelist_externals=*
commands =
  pip install hererocks
  hererocks {toxworkdir}/here --{env:LUA:} --luarocks 2.2
  {toxworkdir}/here/bin/luarocks make
  {toxworkdir}/here/bin/luarocks install busted
  {toxworkdir}/here/bin/busted test/annoy_test.lua