Showing preview only (224K chars total). Download the full file or copy to clipboard to get everything.
Repository: spotify/annoy
Branch: main
Commit: 379f744667ab
Files: 50
Total size: 211.3 KB
Directory structure:
gitextract__vrufcg9/
├── .github/
│ └── workflows/
│ ├── ci.yml
│ └── publish.yml
├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── MANIFEST.in
├── README.rst
├── README_GO.rst
├── README_Lua.md
├── RELEASE.md
├── annoy/
│ ├── __init__.py
│ ├── __init__.pyi
│ └── py.typed
├── annoy-dev-1.rockspec
├── debian/
│ ├── changelog
│ ├── compat
│ ├── control
│ └── rules
├── examples/
│ ├── mmap_test.py
│ ├── precision_test.cpp
│ ├── precision_test.py
│ ├── s_compile_cpp.sh
│ └── simple_test.py
├── setup.cfg
├── setup.py
├── src/
│ ├── annoygomodule.h
│ ├── annoygomodule.i
│ ├── annoylib.h
│ ├── annoyluamodule.cc
│ ├── annoymodule.cc
│ ├── kissrandom.h
│ └── mman.h
├── test/
│ ├── accuracy_test.py
│ ├── angular_index_test.py
│ ├── annoy_test.go
│ ├── annoy_test.lua
│ ├── dot_index_test.py
│ ├── euclidean_index_test.py
│ ├── examples_test.py
│ ├── hamming_index_test.py
│ ├── holes_test.py
│ ├── index_test.py
│ ├── manhattan_index_test.py
│ ├── memory_leak_test.py
│ ├── multithreaded_build_test.py
│ ├── on_disk_build_test.py
│ ├── seed_test.py
│ ├── threading_test.py
│ └── types_test.py
└── tox.ini
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/workflows/ci.yml
================================================
# Continuous integration: install the package and run the pytest suite on
# every push to main and on every pull request.
name: Annoy

on:
  push:
    branches:
      - main
  pull_request:

jobs:
  unit-tests:
    # Honor the `os` axis of the matrix. Previously this was hard-coded to a
    # single Ubuntu image, so every Python version ran three times on the same
    # platform and macOS / Windows were never exercised.
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
        # ubuntu-22.04 is the image the job was actually running on.
        os: ["ubuntu-22.04", "macos-latest", "windows-latest"]
        exclude:
          # NOTE(review): hosted macos-latest runners are arm64 and, as far as
          # we know, setup-python has no 3.7 build for them - confirm.
          - os: macos-latest
            python-version: "3.7"
    steps:
      - uses: actions/checkout@v3 # Pull the repository
      - uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - run: pip install .
      - run: pip install h5py numpy pytest
      - run: pytest -v
================================================
FILE: .github/workflows/publish.yml
================================================
# Release pipeline: on a version tag push, build wheels for every platform and
# an sdist, then upload everything to PyPI.
name: Publish

on:
  push:
    tags:
      - 'v*.*.*'

jobs:
  build:
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, windows-latest, macos-latest]
    steps:
      - name: Checkout code
        # Bumped from v2 to match the current-major actions used by the other
        # steps in this workflow.
        uses: actions/checkout@v4

      # QEMU lets the Linux runner emulate arm64 for the aarch64 wheels below.
      - name: Set up QEMU (for Linux aarch64)
        if: runner.os == 'Linux'
        uses: docker/setup-qemu-action@v3
        with:
          platforms: arm64

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.x'

      - name: Install cibuildwheel
        run: python -m pip install cibuildwheel==3.2.1

      - name: Build wheels
        run: python -m cibuildwheel --output-dir dist
        env:
          CIBW_BEFORE_BUILD: python -m pip install -U pip && rm -rf build
          CIBW_ARCHS_LINUX: auto aarch64

      - name: Upload wheels
        uses: actions/upload-artifact@v4
        with:
          # Artifact names must be unique per matrix job.
          name: built-wheels-${{ matrix.os }}-${{ strategy.job-index }}
          path: ./dist/*.whl

      # The sdist is platform independent, so it is built on one runner only.
      - name: Build source distribution
        if: matrix.os == 'ubuntu-latest'
        run: python -m pip install build && python -m build --sdist --outdir dist

      - name: Upload sdist
        if: matrix.os == 'ubuntu-latest'
        uses: actions/upload-artifact@v4
        with:
          name: built-sdist
          path: ./dist/*.tar.gz

  publish:
    needs: build
    runs-on: ubuntu-latest
    # pypi trusted publishing via OIDC
    # NOTE(review): the publish step below still passes an API token via
    # `password`; confirm whether token auth or OIDC is the intended mechanism.
    permissions:
      id-token: write
    steps:
      # Collect every built-* artifact (wheels and sdist) into a flat dist/.
      - name: Download all artifacts
        uses: actions/download-artifact@v4
        with:
          pattern: built-*
          path: dist
          merge-multiple: true

      - name: Publish package
        uses: pypa/gh-action-pypi-publish@release/v1
        if: startsWith(github.ref, 'refs/tags/v') && github.event_name == 'push'
        with:
          password: ${{ secrets.PYPI_API_TOKEN }}
================================================
FILE: .gitignore
================================================
# Python packaging and bytecode
*.egg-info/
*.egg/
build/
dist/
MANIFEST
*.py[cod]

# Compiled artifacts
*.so
*.o
*.pdb

# Editor / IDE settings
.vscode/
*.idea

# testing
*.ann
*.tree
*.annoy
*.idx
*.hdf5
================================================
FILE: CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.15...3.25 FATAL_ERROR)

project(Annoy
  DESCRIPTION "Approximate Nearest Neighbors Oh Yeah"
  VERSION 1.17.1
  LANGUAGES CXX)

# Define CMAKE_INSTALL_INCLUDEDIR / CMAKE_INSTALL_LIBDIR up front so the
# target's install-time usage requirements and the install() rules below agree.
include(GNUInstallDirs)

# Header-only library: consumers link Annoy::Annoy to obtain the include path.
add_library(Annoy INTERFACE)
add_library(Annoy::Annoy ALIAS Annoy)

# Stage the public headers under <build>/include/annoy/ so that the build tree
# exposes the same <annoy/...> layout as an installed package.
foreach(HEADER annoylib.h kissrandom.h mman.h)
  configure_file(
    "${CMAKE_CURRENT_SOURCE_DIR}/src/${HEADER}"
    "${CMAKE_CURRENT_BINARY_DIR}/include/annoy/${HEADER}"
    COPYONLY)
endforeach()

# The install-time include path must be the directory the headers are actually
# installed to; a hard-coded "include" is wrong whenever the user overrides
# CMAKE_INSTALL_INCLUDEDIR.
target_include_directories(Annoy INTERFACE
  "$<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/include>"
  "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>")

# Install
install(DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/include/"
  DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")

install(TARGETS Annoy
  EXPORT AnnoyTargets)

# Installed package config, written as AnnoyConfig.cmake.
install(EXPORT AnnoyTargets
  FILE AnnoyConfig.cmake
  NAMESPACE Annoy::
  DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/annoy")

# Build-tree export of the same target, written to the current binary directory.
export(TARGETS Annoy NAMESPACE Annoy:: FILE AnnoyConfig.cmake)
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2021 (c) Spotify and its affiliates.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: MANIFEST.in
================================================
# Top-level project files.
include README.rst LICENSE ann.png
# C++ headers from src/.
include src/annoylib.h src/kissrandom.h src/mman.h
================================================
FILE: README.rst
================================================
Annoy
-----
.. figure:: https://raw.github.com/spotify/annoy/master/ann.png
:alt: Annoy example
:align: center
.. image:: https://github.com/spotify/annoy/actions/workflows/ci.yml/badge.svg
:target: https://github.com/spotify/annoy/actions
Annoy (`Approximate Nearest Neighbors <http://en.wikipedia.org/wiki/Nearest_neighbor_search#Approximate_nearest_neighbor>`__ Oh Yeah) is a C++ library with Python bindings to search for points in space that are close to a given query point. It also creates large read-only file-based data structures that are `mmapped <https://en.wikipedia.org/wiki/Mmap>`__ into memory so that many processes may share the same data.
Install
-------
To install, simply do ``pip install --user annoy`` to pull down the latest version from `PyPI <https://pypi.python.org/pypi/annoy>`_.
For the C++ version, just clone the repo and ``#include "annoylib.h"``.
Background
----------
There are some other libraries to do nearest neighbor search. Annoy is almost as fast as the fastest libraries (see below), but there is actually another feature that really sets Annoy apart: it has the ability to **use static files as indexes**. In particular, this means you can **share indexes across processes**. Annoy also decouples creating indexes from loading them, so you can pass around indexes as files and map them into memory quickly. Another nice thing about Annoy is that it tries to minimize memory footprint so the indexes are quite small.
Why is this useful? If you want to find nearest neighbors and you have many CPUs, you only need to build the index once. You can also pass around and distribute static files to use in production environments, in Hadoop jobs, etc. Any process will be able to load (mmap) the index into memory and will be able to do lookups immediately.
We use it at `Spotify <http://www.spotify.com/>`__ for music recommendations. After running matrix factorization algorithms, every user/item can be represented as a vector in f-dimensional space. This library helps us search for similar users/items. We have many millions of tracks in a high-dimensional space, so memory usage is a prime concern.
Annoy was built by `Erik Bernhardsson <http://www.erikbern.com>`__ in a couple of afternoons during `Hack Week <http://labs.spotify.com/2013/02/15/organizing-a-hack-week/>`__.
Summary of features
-------------------
* `Euclidean distance <https://en.wikipedia.org/wiki/Euclidean_distance>`__, `Manhattan distance <https://en.wikipedia.org/wiki/Taxicab_geometry>`__, `cosine distance <https://en.wikipedia.org/wiki/Cosine_similarity>`__, `Hamming distance <https://en.wikipedia.org/wiki/Hamming_distance>`__, or `Dot (Inner) Product distance <https://en.wikipedia.org/wiki/Dot_product>`__
* Cosine distance is equivalent to Euclidean distance of normalized vectors = sqrt(2-2*cos(u, v))
* Works better if you don't have too many dimensions (like <100) but seems to perform surprisingly well even up to 1,000 dimensions
* Small memory usage
* Lets you share memory between multiple processes
* Index creation is separate from lookup (in particular you can not add more items once the tree has been created)
* Native Python support, tested with 3.7 through 3.13.
* Build index on disk to enable indexing big datasets that won't fit into memory (contributed by `Rene Hollander <https://github.com/ReneHollander>`__)
Python code example
-------------------
.. code-block:: python
from annoy import AnnoyIndex
import random
f = 40 # Length of item vector that will be indexed
t = AnnoyIndex(f, 'angular')
for i in range(1000):
v = [random.gauss(0, 1) for z in range(f)]
t.add_item(i, v)
t.build(10) # 10 trees
t.save('test.ann')
# ...
u = AnnoyIndex(f, 'angular')
u.load('test.ann') # super fast, will just mmap the file
print(u.get_nns_by_item(0, 1000)) # will find the 1000 nearest neighbors
Right now it only accepts integers as identifiers for items. Note that it will allocate memory for max(id)+1 items because it assumes your items are numbered 0 … n-1. If you need other id's, you will have to keep track of a map yourself.
Full Python API
---------------
* ``AnnoyIndex(f, metric)`` returns a new index that's read-write and stores vector of ``f`` dimensions. Metric can be ``"angular"``, ``"euclidean"``, ``"manhattan"``, ``"hamming"``, or ``"dot"``.
* ``a.add_item(i, v)`` adds item ``i`` (any nonnegative integer) with vector ``v``. Note that it will allocate memory for ``max(i)+1`` items.
* ``a.build(n_trees, n_jobs=-1)`` builds a forest of ``n_trees`` trees. More trees gives higher precision when querying. After calling ``build``, no more items can be added. ``n_jobs`` specifies the number of threads used to build the trees. ``n_jobs=-1`` uses all available CPU cores.
* ``a.save(fn, prefault=False)`` saves the index to disk and loads it (see next function). After saving, no more items can be added.
* ``a.load(fn, prefault=False)`` loads (mmaps) an index from disk. If ``prefault`` is set to ``True``, it will pre-read the entire file into memory (using mmap with ``MAP_POPULATE``). Default is ``False``.
* ``a.unload()`` unloads.
* ``a.get_nns_by_item(i, n, search_k=-1, include_distances=False)`` returns the ``n`` closest items. During the query it will inspect up to ``search_k`` nodes which defaults to ``n_trees * n`` if not provided. ``search_k`` gives you a run-time tradeoff between better accuracy and speed. If you set ``include_distances`` to ``True``, it will return a 2 element tuple with two lists in it: the second one containing all corresponding distances.
* ``a.get_nns_by_vector(v, n, search_k=-1, include_distances=False)`` same but query by vector ``v``.
* ``a.get_item_vector(i)`` returns the vector for item ``i`` that was previously added.
* ``a.get_distance(i, j)`` returns the distance between items ``i`` and ``j``. NOTE: this used to return the *squared* distance, but has been changed as of Aug 2016.
* ``a.get_n_items()`` returns the number of items in the index.
* ``a.get_n_trees()`` returns the number of trees in the index.
* ``a.on_disk_build(fn)`` prepares annoy to build the index in the specified file instead of RAM (execute before adding items, no need to save after build)
* ``a.set_seed(seed)`` will initialize the random number generator with the given seed. Only used for building up the tree, i.e. only necessary to pass this before adding the items. Will have no effect after calling ``a.build(n_trees)`` or ``a.load(fn)``.
Notes:
* There's no bounds checking performed on the values so be careful.
* Annoy uses Euclidean distance of normalized vectors for its angular distance, which for two vectors u,v is equal to ``sqrt(2(1-cos(u,v)))``
The C++ API is very similar: just ``#include "annoylib.h"`` to get access to it.
Tradeoffs
---------
There are just two main parameters needed to tune Annoy: the number of trees ``n_trees`` and the number of nodes to inspect during searching ``search_k``.
* ``n_trees`` is provided during build time and affects the build time and the index size. A larger value will give more accurate results, but larger indexes.
* ``search_k`` is provided in runtime and affects the search performance. A larger value will give more accurate results, but will take longer time to return.
If ``search_k`` is not provided, it will default to ``n * n_trees`` where ``n`` is the number of approximate nearest neighbors. Otherwise, ``search_k`` and ``n_trees`` are roughly independent, i.e. the value of ``n_trees`` will not affect search time if ``search_k`` is held constant and vice versa. Basically it's recommended to set ``n_trees`` as large as possible given the amount of memory you can afford, and it's recommended to set ``search_k`` as large as possible given the time constraints you have for the queries.
You can also accept slower search times in favour of reduced loading times, memory usage, and disk IO. On supported platforms, setting ``prefault`` to ``True`` prefaults the index during ``load`` and ``save``, causing the file to be pre-emptively read from disk into memory. With the default ``prefault=False``, pages of the mmapped index are instead read from disk and cached in memory on-demand, as necessary for a search to complete. This can significantly increase early search times but may be better suited for systems with low memory compared to index size, when few queries are executed against a loaded index, and/or when large areas of the index are unlikely to be relevant to search queries.
How does it work
----------------
Using `random projections <http://en.wikipedia.org/wiki/Locality-sensitive_hashing#Random_projection>`__ and by building up a tree. At every intermediate node in the tree, a random hyperplane is chosen, which divides the space into two subspaces. This hyperplane is chosen by sampling two points from the subset and taking the hyperplane equidistant from them.
We do this k times so that we get a forest of trees. k has to be tuned to your need, by looking at what tradeoff you have between precision and performance.
Hamming distance (contributed by `Martin Aumüller <https://github.com/maumueller>`__) packs the data into 64-bit integers under the hood and uses built-in bit count primitives so it could be quite fast. All splits are axis-aligned.
Dot Product distance (contributed by `Peter Sobot <https://github.com/psobot>`__ and `Pavel Korobov <https://github.com/pkorobov>`__) reduces the provided vectors from dot (or "inner-product") space to a more query-friendly cosine space using `a method by Bachrach et al., at Microsoft Research, published in 2014 <https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/XboxInnerProduct.pdf>`__.
More info
---------
* `Dirk Eddelbuettel <https://github.com/eddelbuettel>`__ provides an `R version of Annoy <http://dirk.eddelbuettel.com/code/rcpp.annoy.html>`__.
* `Andy Sloane <https://github.com/a1k0n>`__ provides a `Java version of Annoy <https://github.com/spotify/annoy-java>`__ although currently limited to cosine and read-only.
* `Pishen Tsai <https://github.com/pishen>`__ provides a `Scala wrapper of Annoy <https://github.com/pishen/annoy4s>`__ which uses JNA to call the C++ library of Annoy.
* `Atsushi Tatsuma <https://github.com/yoshoku>`__ provides `Ruby bindings for Annoy <https://github.com/yoshoku/annoy.rb>`__.
* There is `experimental support for Go <https://github.com/spotify/annoy/blob/master/README_GO.rst>`__ provided by `Taneli Leppä <https://github.com/rosmo>`__.
* `Boris Nagaev <https://github.com/starius>`__ wrote `Lua bindings <https://github.com/spotify/annoy/blob/master/README_Lua.md>`__.
* During part of Spotify Hack Week 2016 (and a bit afterward), `Jim Kang <https://github.com/jimkang>`__ wrote `Node bindings <https://github.com/jimkang/annoy-node>`__ for Annoy.
* `Min-Seok Kim <https://github.com/mskimm>`__ built a `Scala version <https://github.com/mskimm/ann4s>`__ of Annoy.
* `hanabi1224 <https://github.com/hanabi1224>`__ built a read-only `Rust version <https://github.com/hanabi1224/RuAnnoy>`__ of Annoy, together with **dotnet, jvm and dart** read-only bindings.
* `Presentation from New York Machine Learning meetup <http://www.slideshare.net/erikbern/approximate-nearest-neighbor-methods-and-vector-models-nyc-ml-meetup>`__ about Annoy
* Annoy is available as a `conda package <https://anaconda.org/conda-forge/python-annoy>`__ on Linux, OS X, and Windows.
* `ann-benchmarks <https://github.com/erikbern/ann-benchmarks>`__ is a benchmark for several approximate nearest neighbor libraries. Annoy seems to be fairly competitive, especially at higher precisions:
.. figure:: https://raw.githubusercontent.com/erikbern/ann-benchmarks/main/results/glove-100-angular.png
:alt: ANN benchmarks
:align: center
:target: https://github.com/erikbern/ann-benchmarks
Source code
-----------
It's all written in C++ with a handful of ugly optimizations for performance and memory usage. You have been warned :)
The code should support Windows, thanks to `Qiang Kou <https://github.com/thirdwing>`__ and `Timothy Riley <https://github.com/tjrileywisc>`__.
To run the tests, install the test dependencies with ``pip install h5py numpy pytest`` and execute ``pytest``. The test suite includes a big real world dataset that is downloaded from the internet, so it will take a few minutes to execute.
Discuss
-------
Feel free to post any questions or comments to the `annoy-user <https://groups.google.com/group/annoy-user>`__ group. I'm `@fulhack <https://twitter.com/fulhack>`__ on Twitter.
================================================
FILE: README_GO.rst
================================================
Install
-------
To install, you'll need Swig (tested with Swig 4.2.1 on Ubuntu 24.04), and then just::
swig -go -intgosize 64 -cgo -c++ src/annoygomodule.i
mkdir -p $(go env GOPATH)/src/annoy
cp src/annoygomodule_wrap.cxx src/annoy.go src/annoygomodule.h src/annoylib.h src/kissrandom.h test/annoy_test.go $(go env GOPATH)/src/annoy
cd $(go env GOPATH)/src/annoy
go mod init github.com/spotify/annoy
go mod tidy
go test
Background
----------
See the main README.
Go code example
-------------------
.. code-block:: go
package main
import (
"fmt"
"math/rand"
"github.com/spotify/annoy"
)
func main() {
f := 40
t := annoy.NewAnnoyIndexAngular(f)
for i := 0; i < 1000; i++ {
item := make([]float32, 0, f)
for x:= 0; x < f; x++ {
item = append(item, rand.Float32())
}
t.AddItem(i, item)
}
t.Build(10)
t.Save("test.ann")
annoy.DeleteAnnoyIndexAngular(t)
t = annoy.NewAnnoyIndexAngular(f)
t.Load("test.ann")
result := annoy.NewAnnoyVectorInt()
defer result.Free()
t.GetNnsByItem(0, 1000, -1, result)
fmt.Printf("%v\n", result.ToSlice())
}
Right now it only accepts integers as identifiers for items. Note that it will allocate memory for max(id)+1 items because it assumes your items are numbered 0 … n-1. If you need other id's, you will have to keep track of a map yourself.
Full Go API
---------------
See annoygomodule.h. Generally the same as Python API except some arguments are not optional. Go binding does not support multithreaded build.
Tests
-------
A simple test is supplied in test/annoy_test.go.
Discuss
-------
The memory leak in previous versions has been fixed thanks to https://github.com/swig/swig/issues/2292. (The memory leak fix is implemented in https://github.com/Rikanishu/annoy-go)
Go glue written by Taneli Leppä (@rosmo). You can contact me via email (see https://github.com/rosmo).
================================================
FILE: README_Lua.md
================================================
Install
-------
To install, you'll need Lua (binary + library) and LuaRocks.
If you have Python and Pip, you can get Lua and LuaRocks
using [hererocks](https://github.com/mpeterv/hererocks/),
written by Peter Melnichenko.
```
pip install hererocks
hererocks here --lua 5.1 --luarocks 2.2
```
This command installs Lua and LuaRocks locally to directory `here`.
To activate it, add `here/bin` to `PATH`:
```
export PATH="$(pwd)/here/bin/:$PATH"
```
Then you can use commands `lua`, `luarocks`,
and tools installed by `luarocks`.
To build and install `annoy`, type:
```
luarocks make
```
Background
----------
See the main README.
Lua code example
----------------
```lua
local annoy = require "annoy"
local f = 3
local t = annoy.AnnoyIndex(f) -- Length of item vector that will be indexed
for i = 0, 999 do
local v = {math.random(), math.random(), math.random()}
t:add_item(i, v)
end
t:build(10) -- 10 trees
t:save('test.ann')
-- ...
local u = annoy.AnnoyIndex(f)
u:load('test.ann') -- super fast, will just mmap the file
-- find the 10 nearest neighbors
local neighbors = u:get_nns_by_item(0, 10)
for rank, i in ipairs(neighbors) do
print("neighbor", rank, "is", i)
end
```
Full Lua API
------------
Lua API closely resembles Python API, see main README. Lua binding does not support multithreaded build.
Tests
-------
File `test/annoy_test.lua` is the literal translation of
`test/annoy_test.py` from Python+Nosetests to Lua+Busted.
To run tests, you need [Busted](http://olivinelabs.com/busted/),
Elegant Lua unit testing. To install it, type:
```
luarocks install busted
```
To run tests, type:
```
busted test/annoy_test.lua
```
It will take a few minutes to execute.
Discuss
-------
There might be some memory leaks if inputs are incorrect.
Some functions allocate stack objects calling Lua functions throwing
Lua errors (e.g., `luaL_checkinteger`). A Lua error may omit calling
C++ destructors when unwinding the stack. (Whether it does depends on
the Lua implementation and platform in use.)
Lua binding was written by Boris Nagaev.
You can contact me via email (see https://github.com/starius).
================================================
FILE: RELEASE.md
================================================
How to release
--------------
1. Make sure you're on main. `git checkout main && git fetch && git reset --hard origin/main`
1. Update `setup.py` to the newest version, `git add setup.py && git commit -m "version 1.2.3"`
1. `python setup.py sdist bdist_wheel`
1. `git tag -a v1.2.3 -m "version 1.2.3"`
1. `git push --tags origin main` to push the last version to Github
1. Go to https://github.com/spotify/annoy/releases and click "Draft a new release"
1. `twine upload dist/annoy-1.2.3*`
TODO
----
* Wheel
================================================
FILE: annoy/__init__.py
================================================
# Copyright (c) 2013 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
# This module is a dummy wrapper around the underlying C++ module.
from .annoylib import Annoy as AnnoyIndex
================================================
FILE: annoy/__init__.pyi
================================================
from typing import Sized, overload
from typing_extensions import Literal, Protocol
# Structural type accepted wherever a vector is expected: any object that is
# `Sized` and yields a float when indexed with an int.
class _Vector(Protocol, Sized):
def __getitem__(self, __index: int) -> float: ...
# Type stub for the index class that `annoy/__init__.py` re-exports from the
# compiled `annoylib` module.
class AnnoyIndex:
# NOTE(review): presumably the vector length given to __init__ - confirm.
f: int
# `metric` is restricted to the five literal names listed in the annotation.
def __init__(self, f: int, metric: Literal["angular", "euclidean", "manhattan", "hamming", "dot"]) -> None: ...
# load/save take a file name and an optional `prefault` flag; both are typed
# as returning the literal True.
def load(self, fn: str, prefault: bool = ...) -> Literal[True]: ...
def save(self, fn: str, prefault: bool = ...) -> Literal[True]: ...
# get_nns_by_item overloads: without `include_distances` (or with False) the
# result is a list of ints; with True (positional or keyword) it is a
# (list of ints, list of floats) tuple.
@overload
def get_nns_by_item(self, i: int, n: int, search_k: int = ..., include_distances: Literal[False] = ...) -> list[int]: ...
@overload
def get_nns_by_item(
self, i: int, n: int, search_k: int, include_distances: Literal[True]
) -> tuple[list[int], list[float]]: ...
@overload
def get_nns_by_item(
self, i: int, n: int, search_k: int = ..., *, include_distances: Literal[True]
) -> tuple[list[int], list[float]]: ...
# get_nns_by_vector mirrors the three overloads above, taking a `_Vector`
# query in place of an item index.
@overload
def get_nns_by_vector(
self, vector: _Vector, n: int, search_k: int = ..., include_distances: Literal[False] = ...
) -> list[int]: ...
@overload
def get_nns_by_vector(
self, vector: _Vector, n: int, search_k: int, include_distances: Literal[True]
) -> tuple[list[int], list[float]]: ...
@overload
def get_nns_by_vector(
self, vector: _Vector, n: int, search_k: int = ..., *, include_distances: Literal[True]
) -> tuple[list[int], list[float]]: ...
# Parameters spelled with a leading double underscore are positional-only.
def get_item_vector(self, __i: int) -> list[float]: ...
def add_item(self, i: int, vector: _Vector) -> None: ...
def on_disk_build(self, fn: str) -> Literal[True]: ...
# `n_jobs` is optional; `n_trees` is required.
def build(self, n_trees: int, n_jobs: int = ...) -> Literal[True]: ...
def unbuild(self) -> Literal[True]: ...
def unload(self) -> Literal[True]: ...
def get_distance(self, __i: int, __j: int) -> float: ...
def get_n_items(self) -> int: ...
def get_n_trees(self) -> int: ...
def verbose(self, __v: bool) -> Literal[True]: ...
def set_seed(self, __s: int) -> None: ...
================================================
FILE: annoy/py.typed
================================================
================================================
FILE: annoy-dev-1.rockspec
================================================
-- Copyright (c) 2016 Boris Nagaev
--
-- Licensed under the Apache License, Version 2.0 (the "License"); you may not
-- use this file except in compliance with the License. You may obtain a copy of
-- the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-- License for the specific language governing permissions and limitations under
-- the License.
package = "annoy"
version = "dev-1"

-- Fetch over HTTPS: GitHub no longer serves the unauthenticated git://
-- protocol, so the previous git:// URL can no longer be cloned.
source = {
   url = "git+https://github.com/spotify/annoy.git",
}

description = {
   summary = "Approximate Nearest Neighbors Oh Yeah",
   homepage = "https://github.com/spotify/annoy",
   license = "Apache",
   detailed = [[
Annoy (Approximate Nearest Neighbors Oh Yeah) is a C++ library with Python,
Go and Lua bindings to search for points in space that are close to a given
query point. It also creates large read-only file-based data structures
that are mmapped into memory so that many processes may share the same data.
]],
}

dependencies = {
   "lua >= 5.1",
}

-- The module is built from a single C++ translation unit with the "builtin"
-- backend; the per-platform overrides below add libstdc++ to the link line.
build = {
   type = "builtin",
   modules = {
      ['annoy'] = {
         sources = {
            "src/annoyluamodule.cc",
         },
      },
   },
   platforms = {
      unix = {
         modules = {
            ['annoy'] = {
               libraries = {"stdc++"},
            },
         },
      },
      mingw32 = {
         modules = {
            ['annoy'] = {
               libraries = {"stdc++"},
            },
         },
      },
   },
}
================================================
FILE: debian/changelog
================================================
spotify-annoy (1.0.0) unstable; urgency=low
* Initial release.
-- Erik Bernhardsson <erikbern@spotify.com> Wed, 20 Feb 2013 00:00:00 +0000
================================================
FILE: debian/compat
================================================
7
================================================
FILE: debian/control
================================================
Source: spotify-annoy
Section: non-free/net
Priority: extra
Maintainer: Erik Bernhardsson <erikbern@spotify.com>
Build-Depends: debhelper (>= 7), python-all-dev, python-setuptools
Standards-Version: 3.7.2
XS-Python-Version: >= 2.6
Package: spotify-annoy
Architecture: any
Depends: ${python:Depends}
Description: Python module (written in C++) for high-dimensional approximate nearest neighbor (ANN) queries
================================================
FILE: debian/rules
================================================
#!/usr/bin/make -f
# Catch-all pattern rule: whatever target is requested is passed to `dh`.
%:
dh $@
================================================
FILE: examples/mmap_test.py
================================================
from annoy import AnnoyIndex

# Build a tiny 3-dimensional angular index and write it to disk.
a = AnnoyIndex(3, 'angular')
a.add_item(0, [1, 0, 0])
a.add_item(1, [0, 1, 0])
a.add_item(2, [0, 0, 1])
a.build(-1)
a.save('test.tree')

# Load the saved file into a second index. The metric is passed explicitly
# (the type stub declares it as a required argument) and is the same one the
# file was built with above.
b = AnnoyIndex(3, 'angular')
b.load('test.tree')

print(b.get_nns_by_item(0, 100))
print(b.get_nns_by_vector([1.0, 0.5, 0.5], 100))
================================================
FILE: examples/precision_test.cpp
================================================
/*
* precision_test.cpp
*
* Created on: Jul 13, 2016
* Author: Claudio Sanhueza
* Contact: csanhuezalobos@gmail.com
*/
#include <iostream>
#include <iomanip>
#include "../src/kissrandom.h"
#include "../src/annoylib.h"
#include <chrono>
#include <algorithm>
#include <map>
#include <random>
using namespace Annoy;
// Builds an angular Annoy index over `n` random `f`-dimensional Gaussian
// vectors, saves it to "precision.tree", then runs `prec_n` random queries.
// For every entry of `limits` it prints the running overlap with a reference
// neighbour list and the mean query time. Always returns 0.
int precision(int f=40, int n=1000000){
  std::chrono::high_resolution_clock::time_point t_start, t_end;

  std::default_random_engine generator;
  std::normal_distribution<double> distribution(0.0, 1.0);

  //******************************************************
  //Building the tree
  AnnoyIndex<int, double, Angular, Kiss32Random, AnnoyIndexMultiThreadedBuildPolicy> t = AnnoyIndex<int, double, Angular, Kiss32Random, AnnoyIndexMultiThreadedBuildPolicy>(f);

  std::cout << "Building index ... be patient !!" << std::endl;
  std::cout << "\"Trees that are slow to grow bear the best fruit\" (Moliere)" << std::endl;

  // One buffer reused for every item. The previous code malloc'ed a fresh
  // array per item and never freed it, leaking n * f doubles.
  std::vector<double> vec(f);
  for(int i=0; i<n; ++i){
    for(int z=0; z<f; ++z){
      vec[z] = (distribution(generator));
    }

    t.add_item(i, vec.data());

    std::cout << "Loading objects ...\t object: "<< i+1 << "\tProgress:"<< std::fixed << std::setprecision(2) << (double) i / (double)(n + 1) * 100 << "%\r";
  }
  std::cout << std::endl;
  std::cout << "Building index num_trees = 2 * num_features ...";
  t_start = std::chrono::high_resolution_clock::now();
  t.build(2 * f);
  t_end = std::chrono::high_resolution_clock::now();
  auto duration = std::chrono::duration_cast<std::chrono::seconds>( t_end - t_start ).count();
  std::cout << " Done in "<< duration << " secs." << std::endl;

  std::cout << "Saving index ...";
  t.save("precision.tree");
  std::cout << " Done" << std::endl;

  //******************************************************
  std::vector<int> limits = {10, 100, 1000, 10000};
  int K=10;
  int prec_n = 1000;

  std::map<int, double> prec_sum;
  std::map<int, double> time_sum;
  std::vector<int> closest;

  //init precision and timers map
  for(std::vector<int>::iterator it = limits.begin(); it!=limits.end(); ++it){
    prec_sum[(*it)] = 0.0;
    time_sum[(*it)] = 0.0;
  }

  // doing the work
  for(int i=0; i<prec_n; ++i){

    //select a random node
    int j = rand() % n;

    std::cout << "finding nbs for " << j << std::endl;

    // getting the K closest
    t.get_nns_by_item(j, K, n, &closest, nullptr);
    // set_intersection needs sorted input; the reference list is the same
    // for every limit, so sort it once here instead of once per limit.
    std::sort(closest.begin(), closest.end(), std::less<int>());

    std::vector<int> toplist;
    std::vector<int> intersection;

    for(std::vector<int>::iterator limit = limits.begin(); limit!=limits.end(); ++limit){

      t_start = std::chrono::high_resolution_clock::now();
      t.get_nns_by_item(j, (*limit), (size_t) -1, &toplist, nullptr); //search_k defaults to "n_trees * n" if not provided.
      t_end = std::chrono::high_resolution_clock::now();
      auto duration = std::chrono::duration_cast<std::chrono::milliseconds>( t_end - t_start ).count();

      //intersecting results
      std::sort(toplist.begin(), toplist.end(), std::less<int>());
      intersection.resize(std::max(closest.size(), toplist.size()));
      std::vector<int>::iterator it_set = std::set_intersection(closest.begin(), closest.end(), toplist.begin(), toplist.end(), intersection.begin());
      intersection.resize(it_set-intersection.begin());

      // storing metrics
      int found = intersection.size();
      double hitrate = found / (double) K;
      prec_sum[(*limit)] += hitrate;

      time_sum[(*limit)] += duration;

      //deallocate memory (also empties the vectors for the next limit)
      std::vector<int>().swap(intersection);
      std::vector<int>().swap(toplist);
    }

    //print resulting metrics
    for(std::vector<int>::iterator limit = limits.begin(); limit!=limits.end(); ++limit){
      // time_sum is accumulated in milliseconds, so 1e-03 converts the
      // average to seconds (the previous 1e-04 under-reported it tenfold).
      std::cout << "limit: " << (*limit) << "\tprecision: "<< std::fixed << std::setprecision(2) << (100.0 * prec_sum[(*limit)] / (i + 1)) << "% \tavg. time: "<< std::fixed<< std::setprecision(6) << (time_sum[(*limit)] / (i + 1)) * 1e-03 << "s" << std::endl;
    }

    closest.clear(); std::vector<int>().swap(closest);

  }

  std::cout << "\nDone" << std::endl;
  return 0;
}
// Writes the command-line usage summary to stdout, one line per entry,
// followed by a blank line.
void help(){
  static const char* const usage_lines[] = {
    "Annoy Precision C++ example",
    "Usage:",
    "(default) ./precision",
    "(using parameters) ./precision num_features num_nodes",
    "",
  };
  for (const char* line : usage_lines) {
    std::cout << line << std::endl;
  }
}
// Echoes the run configuration (feature count `f`, node count `n`) to stdout,
// followed by a blank line.
void feedback(int f, int n){
  // Fixed the "Runing" typo in the banner.
  std::cout<<"Running precision example with:" << std::endl;
  std::cout<<"num. features: "<< f << std::endl;
  std::cout<<"num. nodes: "<< n << std::endl;
  std::cout << std::endl;
}
// Entry point. With no arguments runs the default configuration (f = 40,
// n = 1000000); with exactly two arguments uses them as num_features and
// num_nodes. Any other argument count, or a non-positive value, prints the
// usage text and exits with EXIT_FAILURE.
int main(int argc, char **argv) {
  int f, n;

  if(argc == 1){
    f = 40;
    n = 1000000;
  }
  else if(argc == 3){
    f = atoi(argv[1]);
    n = atoi(argv[2]);
    // atoi yields 0 for non-numeric input; n == 0 would make precision()
    // evaluate `rand() % 0`, and f <= 0 is not a usable dimension.
    if(f <= 0 || n <= 0){
      help();
      return EXIT_FAILURE;
    }
  }
  else {
    help();
    return EXIT_FAILURE;
  }

  feedback(f, n);
  precision(f, n);

  return EXIT_SUCCESS;
}
================================================
FILE: examples/precision_test.py
================================================
from __future__ import print_function
import random, time
from annoy import AnnoyIndex
try:
    xrange
except NameError:
    # Python 3 compat
    xrange = range

n, f = 100000, 40

# Fill the index with n random Gaussian vectors of length f, build it with
# 2 * f trees and save it.
t = AnnoyIndex(f, 'angular')
for i in xrange(n):
    t.add_item(i, [random.gauss(0, 1) for _ in xrange(f)])

t.build(2 * f)
t.save('test.tree')

limits = [10, 100, 1000, 10000]
k = 10
prec_n = 1000

# Running totals per search_k limit.
prec_sum = dict((limit, 0.0) for limit in limits)
time_sum = dict((limit, 0.0) for limit in limits)

for i in xrange(prec_n):
    j = random.randrange(0, n)

    # Reference answer: the k neighbours found with search_k = n.
    closest = set(t.get_nns_by_item(j, k, n))
    for limit in limits:
        started = time.time()
        toplist = t.get_nns_by_item(j, k, limit)
        elapsed = time.time() - started

        found = len(closest.intersection(toplist))
        prec_sum[limit] += 1.0 * found / k
        time_sum[limit] += elapsed

    queries_done = i + 1
    for limit in limits:
        print('limit: %-9d precision: %6.2f%% avg time: %.6fs'
              % (limit, 100.0 * prec_sum[limit] / queries_done,
                 time_sum[limit] / queries_done))
================================================
FILE: examples/s_compile_cpp.sh
================================================
#!/bin/bash
# Compiles examples/precision_test.cpp into ./precision_test.
# Run it from the examples/ directory.

# Abort on the first failing command so "Done" is only printed after a
# successful compile.
set -e

echo "compiling precision example..."
# Invoke the compiler directly; routing a fixed command line through `eval`
# adds nothing and re-parses the string.
g++ precision_test.cpp -DANNOYLIB_MULTITHREADED_BUILD -o precision_test -std=c++14 -pthread
echo "Done"
================================================
FILE: examples/simple_test.py
================================================
from annoy import AnnoyIndex

# Index the three unit basis vectors under the angular metric.
index = AnnoyIndex(3, 'angular')
for item, vector in enumerate([[1, 0, 0], [0, 1, 0], [0, 0, 1]]):
    index.add_item(item, vector)
index.build(-1)

# Query once by stored item and once by an arbitrary vector.
print(index.get_nns_by_item(0, 100))
print(index.get_nns_by_vector([1.0, 0.5, 0.5], 100))
================================================
FILE: setup.cfg
================================================
[nosetests]
attr=!slow
nocapture=1
================================================
FILE: setup.py
================================================
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (c) 2013 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
from setuptools import setup, Extension
import os
import platform
import sys
readme_note = """\
.. note::
For the latest source, discussion, etc, please visit the
`GitHub repository <https://github.com/spotify/annoy>`_\n\n
.. image:: https://img.shields.io/github/stars/spotify/annoy.svg
:target: https://github.com/spotify/annoy
"""
with open('README.rst', encoding='utf-8') as fobj:
long_description = readme_note + fobj.read()
# Various platform-dependent extras
extra_compile_args = ['-D_CRT_SECURE_NO_WARNINGS', '-fpermissive']
extra_link_args = []

if platform.machine() == 'ppc64le':
    extra_compile_args += ['-mcpu=native',]

if platform.machine() == 'x86_64':
    # do not apply march on Intel Darwin
    if platform.system() != 'Darwin':
        # Not all CPUs have march as a tuning parameter
        extra_compile_args += ['-march=native',]

if os.name != 'nt':
    extra_compile_args += ['-O3', '-ffast-math', '-fno-associative-math']

# Multithreaded build flag. This script only runs under Python 3 (it calls
# open() with an `encoding` argument above), so the former Python 2 special
# case was unreachable and the flag is always added.
extra_compile_args += ['-DANNOYLIB_MULTITHREADED_BUILD']

if os.name != 'nt':
    extra_compile_args += ['-std=c++14']

# #349: something with OS X Mojave causes libstd not to be found
if platform.system() == 'Darwin':
    extra_compile_args += ['-mmacosx-version-min=10.12']
    extra_link_args += ['-stdlib=libc++', '-mmacosx-version-min=10.12']

# Manual configuration, you're on your own here.
# Each variable is a comma-separated list that REPLACES the computed flags.
manual_compiler_args = os.environ.get('ANNOY_COMPILER_ARGS', None)
if manual_compiler_args:
    extra_compile_args = manual_compiler_args.split(',')
manual_linker_args = os.environ.get('ANNOY_LINKER_ARGS', None)
if manual_linker_args:
    extra_link_args = manual_linker_args.split(',')

setup(name='annoy',
      version='1.17.3',
      description='Approximate Nearest Neighbors in C++/Python optimized for memory usage and loading/saving to disk.',
      packages=['annoy'],
      package_data={'annoy': ['__init__.pyi', 'py.typed']},
      ext_modules=[
          Extension(
              'annoy.annoylib', ['src/annoymodule.cc'],
              depends=['src/annoylib.h', 'src/kissrandom.h', 'src/mman.h'],
              extra_compile_args=extra_compile_args,
              extra_link_args=extra_link_args,
          )
      ],
      long_description=long_description,
      long_description_content_type='text/x-rst',
      author='Erik Bernhardsson',
      author_email='mail@erikbern.com',
      url='https://github.com/spotify/annoy',
      license='Apache License 2.0',
      # Python 2.x classifiers removed: this setup script cannot run on
      # Python 2, so advertising those versions was inaccurate.
      classifiers=[
          'Development Status :: 5 - Production/Stable',
          'Programming Language :: Python',
          'Programming Language :: Python :: 3.3',
          'Programming Language :: Python :: 3.4',
          'Programming Language :: Python :: 3.5',
          'Programming Language :: Python :: 3.6',
          'Programming Language :: Python :: 3.7',
          'Programming Language :: Python :: 3.8',
          'Programming Language :: Python :: 3.9',
          'Programming Language :: Python :: 3.10',
          'Programming Language :: Python :: 3.11',
          'Programming Language :: Python :: 3.12',
          'Programming Language :: Python :: 3.13',
      ],
      keywords='nns, approximate nearest neighbor search',
      setup_requires=['nose>=1.0'],
      tests_require=['numpy', 'h5py']
      )
================================================
FILE: src/annoygomodule.h
================================================
#include "annoylib.h"
#include "kissrandom.h"
using namespace Annoy;
namespace GoAnnoy {
// Owning, malloc-backed float array used to hand results to the Go wrapper.
// `ptr` is NULL and `len` is 0 until fill_from_vector() stores data.
class AnnoyVectorFloat {
  protected:
    float *ptr;
    int len;

  public:
    // Start empty so the destructor and fill_from_vector() never see an
    // uninitialised pointer.
    AnnoyVectorFloat() : ptr(NULL), len(0) {}

    ~AnnoyVectorFloat() {
      free(ptr);
    };

    // Raw pointer to the first element (NULL when nothing was stored).
    float* ArrayPtr() {
      return ptr;
    };

    // Number of stored elements.
    int Len() {
      return len;
    };

    // Element `i`, or 0.0 when `i` is outside [0, Len()).
    float Get(int i) {
      if (i < 0 || i >= len) {
        return 0.0;
      }
      return ptr[i];
    };

    // Replaces the contents with a copy of `*v`. If the allocation fails the
    // object is left empty instead of writing through a NULL pointer.
    void fill_from_vector(std::vector<float>* v) {
      free(ptr);  // free(NULL) is a no-op
      len = 0;
      const size_t count = v->size();
      ptr = (float*) malloc(count * sizeof(float));
      if (ptr == NULL) {
        return;
      }
      for (size_t i = 0; i < count; i++) {
        ptr[i] = (float)(*v)[i];
      }
      len = (int)count;
    };
};
// Owning, malloc-backed int32 array used to hand results to the Go wrapper.
// `ptr` is NULL and `len` is 0 until fill_from_vector() stores data.
class AnnoyVectorInt {
  protected:
    int32_t *ptr;
    int len;

  public:
    // Start empty so the destructor and fill_from_vector() never see an
    // uninitialised pointer.
    AnnoyVectorInt() : ptr(NULL), len(0) {}

    ~AnnoyVectorInt() {
      free(ptr);
    };

    // Raw pointer to the first element (NULL when nothing was stored).
    int32_t* ArrayPtr() {
      return ptr;
    };

    // Number of stored elements.
    int Len() {
      return len;
    };

    // Element `i`, or 0 when `i` is outside [0, Len()).
    int32_t Get(int i) {
      if (i < 0 || i >= len) {
        return 0;
      }
      return ptr[i];
    };

    // Replaces the contents with a copy of `*v`. If the allocation fails the
    // object is left empty instead of writing through a NULL pointer.
    void fill_from_vector(std::vector<int32_t>* v) {
      free(ptr);  // free(NULL) is a no-op
      len = 0;
      const size_t count = v->size();
      ptr = (int32_t*) malloc(count * sizeof(int32_t));
      if (ptr == NULL) {
        return;
      }
      for (size_t i = 0; i < count; i++) {
        ptr[i] = (int32_t)(*v)[i];
      }
      len = (int)count;
    };
};
class AnnoyIndex {
protected:
::AnnoyIndexInterface<int32_t, float> *ptr;
int f;
public:
~AnnoyIndex() {
delete ptr;
};
void addItem(int item, const float* w) {
ptr->add_item(item, w);
};
void build(int q) {
ptr->build(q, 1);
};
bool save(const char* filename, bool prefault) {
return ptr->save(filename, prefault);
};
bool save(const char* filename) {
return ptr->save(filename, true);
};
void unload() {
ptr->unload();
};
bool load(const char* filename, bool prefault) {
return ptr->load(filename, prefault);
};
bool load(const char* filename) {
return ptr->load(filename, true);
};
float getDistance(int i, int j) {
return ptr->get_distance(i, j);
};
void getNnsByItem(int item, int n, int search_k, AnnoyVectorInt* out_result, AnnoyVectorFloat* out_distances) {
vector<int32_t>* result = new vector<int32_t>();
vector<float>* distances = new vector<float>();
ptr->get_nns_by_item(item, n, search_k, result, distances);
out_result->fill_from_vector(result);
out_distances->fill_from_vector(distances);
delete result;
delete distances;
};
void getNnsByVector(const float* w, int n, int search_k, AnnoyVectorInt* out_result, AnnoyVectorFloat* out_distances) {
vector<int32_t>* result = new vector<int32_t>();
vector<float>* distances = new vector<float>();
ptr->get_nns_by_vector(w, n, search_k, result, distances);
out_result->fill_from_vector(result);
out_distances->fill_from_vector(distances);
delete result;
delete distances;
};
void getNnsByItem(int item, int n, int search_k, AnnoyVectorInt* out_result) {
vector<int32_t>* result = new vector<int32_t>();
ptr->get_nns_by_item(item, n, search_k, result, NULL);
out_result->fill_from_vector(result);
delete result;
};
void getNnsByVector(const float* w, int n, int search_k, AnnoyVectorInt* out_result) {
vector<int32_t>* result = new vector<int32_t>();
ptr->get_nns_by_vector(w, n, search_k, result, NULL);
out_result->fill_from_vector(result);
delete result;
};
int getNItems() {
return (int)ptr->get_n_items();
};
void verbose(bool v) {
ptr->verbose(v);
};
void getItem(int item, AnnoyVectorFloat *v) {
vector<float>* r = new vector<float>();
r->resize(this->f);
ptr->get_item(item, &r->front());
v->fill_from_vector(r);
};
bool onDiskBuild(const char* filename) {
return ptr->on_disk_build(filename);
};
};
class AnnoyIndexAngular : public AnnoyIndex
{
public:
AnnoyIndexAngular(int f) {
ptr = new ::AnnoyIndex<int32_t, float, ::Angular, ::Kiss64Random, AnnoyIndexSingleThreadedBuildPolicy>(f);
this->f = f;
}
};
class AnnoyIndexEuclidean : public AnnoyIndex {
public:
AnnoyIndexEuclidean(int f) {
ptr = new ::AnnoyIndex<int32_t, float, ::Euclidean, ::Kiss64Random, AnnoyIndexSingleThreadedBuildPolicy>(f);
this->f = f;
}
};
class AnnoyIndexManhattan : public AnnoyIndex {
public:
AnnoyIndexManhattan(int f) {
ptr = new ::AnnoyIndex<int32_t, float, ::Manhattan, ::Kiss64Random, AnnoyIndexSingleThreadedBuildPolicy>(f);
this->f = f;
}
};
class AnnoyIndexDotProduct : public AnnoyIndex {
public:
AnnoyIndexDotProduct(int f) {
ptr = new ::AnnoyIndex<int32_t, float, ::DotProduct, ::Kiss64Random, AnnoyIndexSingleThreadedBuildPolicy>(f);
this->f = f;
}
};
}
================================================
FILE: src/annoygomodule.i
================================================
%module annoy
namespace Annoy {}
%{
#include "annoygomodule.h"
%}
// const float *
// Go-side types: `const float *` parameters appear as []float32 and
// `int32_t` as int32.
%typemap(gotype) (const float *) "[]float32"
%typemap(gotype) (int32_t) "int32"
// Input conversion: the elements of the incoming Go slice ($input.array,
// $input.len) are copied one by one into a local vector<float>, and the
// wrapped function receives the address of that vector's first element.
// NOTE(review): `&w[0]` is evaluated even when the slice is empty - confirm
// callers never pass a zero-length slice.
%typemap(in) (const float *)
%{
float *v;
vector<float> w;
v = (float *)$input.array;
for (int i = 0; i < $input.len; i++) {
w.push_back(v[i]);
}
$1 = &w[0];
%}
// `const char *` parameters appear as Go strings.
%typemap(gotype) (const char *) "string"
// Input conversion: allocate n + 1 zeroed bytes with calloc and copy the n
// bytes of the Go string into them, so the result is NUL-terminated.
%typemap(in) (const char *)
%{
$1 = (char *)calloc((((_gostring_)$input).n + 1), sizeof(char));
strncpy($1, (((_gostring_)$input).p), ((_gostring_)$input).n);
%}
// Release the temporary C string allocated by the input typemap above.
%typemap(freearg) (const char *)
%{
free($1);
%}
%ignore fill_from_vector;
%rename(X_RawAnnoyVectorInt) AnnoyVectorInt;
%rename(X_RawAnnoyVectorFloat) AnnoyVectorFloat;
%insert(go_wrapper) %{

// AnnoyVectorInt extends the raw SWIG wrapper with slice helpers.
type AnnoyVectorInt interface {
	X_RawAnnoyVectorInt
	ToSlice() []int32
	Copy(in *[]int32)
	InnerArray() []int32
	Free()
}

// NewAnnoyVectorInt allocates an empty C++ vector wrapper. Release it with
// Free when it is no longer needed.
func NewAnnoyVectorInt() AnnoyVectorInt {
	vec := NewX_RawAnnoyVectorInt()
	return vec.(SwigcptrX_RawAnnoyVectorInt)
}

// ToSlice returns the contents copied into a Go-owned slice.
func (p SwigcptrX_RawAnnoyVectorInt) ToSlice() []int32 {
	var out []int32
	p.Copy(&out)
	return out
}

// Copy copies the contents into *in, reusing its backing array when the
// capacity is sufficient and allocating a new slice otherwise.
func (p SwigcptrX_RawAnnoyVectorInt) Copy(in *[]int32) {
	out := *in
	inner := p.InnerArray()
	if cap(out) >= len(inner) {
		if len(out) != len(inner) {
			out = out[:len(inner)]
		}
	} else {
		out = make([]int32, len(inner))
	}
	copy(out, inner)
	*in = out
}

// Free deletes the underlying C++ object.
func (p SwigcptrX_RawAnnoyVectorInt) Free() {
	DeleteX_RawAnnoyVectorInt(p)
}

// InnerArray returns a slice that aliases the C++ buffer without copying.
// It returns nil when the vector holds no data: slicing through a nil array
// pointer would panic.
func (p SwigcptrX_RawAnnoyVectorInt) InnerArray() []int32 {
	length := p.Len()
	ptr := unsafe.Pointer(p.ArrayPtr())
	if ptr == nil || length <= 0 {
		return nil
	}
	return ((*[1 << 30]int32)(ptr))[:length:length]
}

%}
%insert(go_wrapper) %{

// AnnoyVectorFloat extends the raw SWIG wrapper with slice helpers.
type AnnoyVectorFloat interface {
	X_RawAnnoyVectorFloat
	ToSlice() []float32
	Copy(in *[]float32)
	InnerArray() []float32
	Free()
}

// NewAnnoyVectorFloat allocates an empty C++ vector wrapper. Release it with
// Free when it is no longer needed.
func NewAnnoyVectorFloat() AnnoyVectorFloat {
	vec := NewX_RawAnnoyVectorFloat()
	return vec.(SwigcptrX_RawAnnoyVectorFloat)
}

// ToSlice returns the contents copied into a Go-owned slice.
func (p SwigcptrX_RawAnnoyVectorFloat) ToSlice() []float32 {
	var out []float32
	p.Copy(&out)
	return out
}

// Copy copies the contents into *in, reusing its backing array when the
// capacity is sufficient and allocating a new slice otherwise.
func (p SwigcptrX_RawAnnoyVectorFloat) Copy(in *[]float32) {
	out := *in
	inner := p.InnerArray()
	if cap(out) >= len(inner) {
		if len(out) != len(inner) {
			out = out[:len(inner)]
		}
	} else {
		out = make([]float32, len(inner))
	}
	copy(out, inner)
	*in = out
}

// Free deletes the underlying C++ object.
func (p SwigcptrX_RawAnnoyVectorFloat) Free() {
	DeleteX_RawAnnoyVectorFloat(p)
}

// InnerArray returns a slice that aliases the C++ buffer without copying.
// It returns nil when the vector holds no data: slicing through a nil array
// pointer would panic.
func (p SwigcptrX_RawAnnoyVectorFloat) InnerArray() []float32 {
	length := p.Len()
	ptr := unsafe.Pointer(p.ArrayPtr())
	if ptr == nil || length <= 0 {
		return nil
	}
	return ((*[1 << 30]float32)(ptr))[:length:length]
}

%}
/* Let's just grab the original header file here */
%include "annoygomodule.h"
%feature("notabstract") GoAnnoyIndexAngular;
%feature("notabstract") GoAnnoyIndexEuclidean;
%feature("notabstract") GoAnnoyIndexManhattan;
%feature("notabstract") GoAnnoyIndexDotProduct;
================================================
FILE: src/annoylib.h
================================================
// Copyright (c) 2013 Spotify AB
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.
#ifndef ANNOY_ANNOYLIB_H
#define ANNOY_ANNOYLIB_H
#include <stdio.h>
#include <sys/stat.h>
#ifndef _MSC_VER
#include <unistd.h>
#endif
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <fcntl.h>
#include <stddef.h>
#if defined(_MSC_VER) && _MSC_VER == 1500
typedef unsigned char uint8_t;
typedef signed __int32 int32_t;
typedef unsigned __int64 uint64_t;
typedef signed __int64 int64_t;
#else
#include <stdint.h>
#endif
#if defined(_MSC_VER) || defined(__MINGW32__)
// a bit hacky, but override some definitions to support 64 bit
#define off_t int64_t
#define lseek_getsize(fd) _lseeki64(fd, 0, SEEK_END)
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include "mman.h"
#include <windows.h>
#else
#include <sys/mman.h>
#define lseek_getsize(fd) lseek(fd, 0, SEEK_END)
#endif
#include <cerrno>
#include <string.h>
#include <math.h>
#include <vector>
#include <algorithm>
#include <queue>
#include <limits>
#if __cplusplus >= 201103L
#include <type_traits>
#endif
#ifdef ANNOYLIB_MULTITHREADED_BUILD
#include <thread>
#include <mutex>
#include <shared_mutex>
#endif
#ifdef _MSC_VER
// Needed for Visual Studio to disable runtime checks for memcpy
#pragma runtime_checks("s", off)
#endif
// This allows others to supply their own logger / error printer without
// requiring Annoy to import their headers. See RcppAnnoy for a use case.
#ifndef __ERROR_PRINTER_OVERRIDE__
#define annoylib_showUpdate(...) { fprintf(stderr, __VA_ARGS__ ); }
#else
#define annoylib_showUpdate(...) { __ERROR_PRINTER_OVERRIDE__( __VA_ARGS__ ); }
#endif
// Portable alloc definition, cf Writing R Extensions, Section 1.6.4
#ifdef __GNUC__
// Includes GCC, clang and Intel compilers
# undef alloca
# define alloca(x) __builtin_alloca((x))
#elif defined(__sun) || defined(_AIX)
// this is necessary (and sufficient) for Solaris 10 and AIX 6:
# include <alloca.h>
#endif
// We let the v array in the Node struct take whatever space is needed, so this is a mostly insignificant number.
// Compilers need *some* size defined for the v array, and some memory checking tools will flag for buffer overruns if this is set too low.
#define ANNOYLIB_V_ARRAY_SIZE 65536
#ifndef _MSC_VER
#define annoylib_popcount __builtin_popcountll
#else // See #293, #358
#define annoylib_popcount cole_popcount
#endif
#if !defined(NO_MANUAL_VECTORIZATION) && defined(__GNUC__) && (__GNUC__ >6) && defined(__AVX512F__) // See #402
#define ANNOYLIB_USE_AVX512
#elif !defined(NO_MANUAL_VECTORIZATION) && defined(__AVX__) && defined (__SSE__) && defined(__SSE2__) && defined(__SSE3__)
#define ANNOYLIB_USE_AVX
#else
#endif
#if defined(ANNOYLIB_USE_AVX) || defined(ANNOYLIB_USE_AVX512)
#if defined(_MSC_VER)
#include <intrin.h>
#elif defined(__GNUC__)
#include <x86intrin.h>
#endif
#endif
#if !defined(__MINGW32__)
#define ANNOYLIB_FTRUNCATE_SIZE(x) static_cast<int64_t>(x)
#else
#define ANNOYLIB_FTRUNCATE_SIZE(x) (x)
#endif
namespace Annoy {
// Logs "<msg>: <strerror(errno)> (<errno>)" through annoylib_showUpdate and,
// when `error` is non-NULL, stores a malloc'ed copy of that message in
// `*error`. `*error` is NULL if the allocation fails.
inline void set_error_from_errno(char **error, const char* msg) {
  // Snapshot errno first: the logging call below may itself modify it, which
  // would make the stored message describe the wrong failure.
  const int saved_errno = errno;
  annoylib_showUpdate("%s: %s (%d)\n", msg, strerror(saved_errno), saved_errno);
  if (error) {
    *error = (char *)malloc(256);  // TODO: win doesn't support snprintf
    if (*error) {
      snprintf(*error, 255, "%s: %s (%d)", msg, strerror(saved_errno), saved_errno);
    }
  }
}
// Logs `msg` through annoylib_showUpdate and, when `error` is non-NULL,
// stores a malloc'ed copy of it in `*error`. `*error` is NULL if the
// allocation fails.
inline void set_error_from_string(char **error, const char* msg) {
  annoylib_showUpdate("%s\n", msg);
  if (error) {
    *error = (char *)malloc(strlen(msg) + 1);
    // Only copy when the allocation succeeded; strcpy into NULL would crash.
    if (*error) {
      strcpy(*error, msg);
    }
  }
}
using std::vector;
using std::pair;
using std::numeric_limits;
using std::make_pair;
// Resizes the file behind `_fd` to `new_size` bytes and replaces the mapping
// stored in `*_ptr` (currently `old_size` bytes) with one of `new_size` bytes.
// On Linux the mapping is resized with mremap; elsewhere it is unmapped and
// mapped again after the truncate.
// Returns true only if both the truncate and the (re)mapping succeeded.
// `*_ptr` is overwritten with the result of the mapping call either way, so
// it holds MAP_FAILED when the mapping step failed.
inline bool remap_memory_and_truncate(void** _ptr, int _fd, size_t old_size, size_t new_size) {
#ifdef __linux__
    *_ptr = mremap(*_ptr, old_size, new_size, MREMAP_MAYMOVE);
    bool ok = ftruncate(_fd, new_size) != -1;
#else
    munmap(*_ptr, old_size);
    bool ok = ftruncate(_fd, ANNOYLIB_FTRUNCATE_SIZE(new_size)) != -1;
#ifdef MAP_POPULATE
    *_ptr = mmap(*_ptr, new_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, _fd, 0);
#else
    *_ptr = mmap(*_ptr, new_size, PROT_READ | PROT_WRITE, MAP_SHARED, _fd, 0);
#endif
#endif
    // A failed mremap/mmap used to go unreported; fold it into the result so
    // callers do not go on to use an invalid mapping.
    if (*_ptr == MAP_FAILED) {
      ok = false;
    }
    return ok;
}
namespace {
// Returns a pointer to record `i` in a packed buffer whose records are `_s`
// bytes each, i.e. the address `_nodes + _s * i` viewed as a Node.
template<typename S, typename Node>
inline Node* get_node_ptr(const void* _nodes, const size_t _s, const S i) {
  uint8_t* const base = static_cast<uint8_t*>(const_cast<void*>(_nodes));
  return reinterpret_cast<Node*>(base + (_s * i));
}
// Dot product of the first `f` elements of `x` and `y`, accumulated in index
// order. Returns 0 when `f` is not positive.
template<typename T>
inline T dot(const T* x, const T* y, int f) {
  T acc = 0;
  for (int idx = 0; idx < f; ++idx) {
    acc += x[idx] * y[idx];
  }
  return acc;
}
// Sum of |x[i] - y[i]| over the first `f` elements (L1 distance),
// accumulated in index order.
template<typename T>
inline T manhattan_distance(const T* x, const T* y, int f) {
  T total = 0.0;
  int remaining = f;
  while (remaining-- > 0) {
    total += fabs(*x - *y);
    ++x;
    ++y;
  }
  return total;
}
// Sum of squared differences over the first `f` elements. Note that no
// square root is taken here.
template<typename T>
inline T euclidean_distance(const T* x, const T* y, int f) {
  // Don't use dot-product: avoid catastrophic cancellation in #314.
  T sum_sq = 0.0;
  for (int idx = 0; idx < f; ++idx) {
    const T diff = x[idx] - y[idx];
    sum_sq += diff * diff;
  }
  return sum_sq;
}
#ifdef ANNOYLIB_USE_AVX
// Horizontal single sum of 256bit vector.
inline float hsum256_ps_avx(__m256 v) {
const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(v, 1), _mm256_castps256_ps128(v));
const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
return _mm_cvtss_f32(x32);
}
// AVX specialisation of dot() for float: processes eight lanes per step,
// then finishes the remaining (< 8) elements with scalar code.
template<>
inline float dot<float>(const float* x, const float *y, int f) {
  float total = 0;
  if (f >= 8) {
    __m256 acc = _mm256_setzero_ps();
    while (f >= 8) {
      const __m256 product = _mm256_mul_ps(_mm256_loadu_ps(x), _mm256_loadu_ps(y));
      acc = _mm256_add_ps(acc, product);
      x += 8;
      y += 8;
      f -= 8;
    }
    // Collapse the eight partial sums into one value.
    total += hsum256_ps_avx(acc);
  }
  // Scalar tail.
  while (f > 0) {
    total += *x * *y;
    ++x;
    ++y;
    --f;
  }
  return total;
}
// AVX specialisation of manhattan_distance() for float: eight lanes per
// step, then a scalar loop over the remaining (< 8) elements.
template<>
inline float manhattan_distance<float>(const float* x, const float* y, int f) {
  float total = 0;
  int remaining = f;
  if (f >= 8) {
    __m256 acc = _mm256_setzero_ps();
    // -0.0f has only the sign bit set; and-not with it clears the sign bit.
    const __m256 sign_mask = _mm256_set1_ps(-0.0f);
    while (remaining >= 8) {
      const __m256 diff = _mm256_sub_ps(_mm256_loadu_ps(x), _mm256_loadu_ps(y));
      const __m256 abs_diff = _mm256_andnot_ps(sign_mask, diff); // |diff|
      acc = _mm256_add_ps(acc, abs_diff);
      x += 8;
      y += 8;
      remaining -= 8;
    }
    // Collapse the eight partial sums into one value.
    total = hsum256_ps_avx(acc);
  }
  // Scalar tail.
  while (remaining > 0) {
    total += fabsf(*x - *y);
    ++x;
    ++y;
    --remaining;
  }
  return total;
}
// AVX specialisation of euclidean_distance() for float (sum of squared
// differences): eight lanes per step, then a scalar tail.
template<>
inline float euclidean_distance<float>(const float* x, const float* y, int f) {
  float total = 0;
  if (f >= 8) {
    __m256 acc = _mm256_setzero_ps();
    while (f >= 8) {
      const __m256 delta = _mm256_sub_ps(_mm256_loadu_ps(x), _mm256_loadu_ps(y));
      acc = _mm256_add_ps(acc, _mm256_mul_ps(delta, delta)); // no support for fmadd in AVX...
      x += 8;
      y += 8;
      f -= 8;
    }
    // Collapse the eight partial sums into one value.
    total = hsum256_ps_avx(acc);
  }
  // Scalar tail.
  while (f > 0) {
    const float delta = *x - *y;
    total += delta * delta;
    ++x;
    ++y;
    --f;
  }
  return total;
}
#endif
#ifdef ANNOYLIB_USE_AVX512
// AVX-512 specialisation of dot() for float: sixteen lanes per step using
// fused multiply-add, then a scalar loop over the remaining (< 16) elements.
template<>
inline float dot<float>(const float* x, const float *y, int f) {
  float total = 0;
  if (f >= 16) {
    __m512 acc = _mm512_setzero_ps();
    while (f >= 16) {
      //AVX512F includes FMA
      acc = _mm512_fmadd_ps(_mm512_loadu_ps(x), _mm512_loadu_ps(y), acc);
      x += 16;
      y += 16;
      f -= 16;
    }
    // Collapse the sixteen partial sums into one value.
    total += _mm512_reduce_add_ps(acc);
  }
  // Scalar tail.
  while (f > 0) {
    total += *x * *y;
    ++x;
    ++y;
    --f;
  }
  return total;
}
// AVX-512 specialisation of manhattan_distance() for float: sixteen lanes
// per step, then a scalar loop over the remaining (< 16) elements.
template<>
inline float manhattan_distance<float>(const float* x, const float* y, int f) {
  float total = 0;
  int remaining = f;
  if (f >= 16) {
    __m512 acc = _mm512_setzero_ps();
    while (remaining >= 16) {
      const __m512 diff = _mm512_sub_ps(_mm512_loadu_ps(x), _mm512_loadu_ps(y));
      acc = _mm512_add_ps(acc, _mm512_abs_ps(diff));
      x += 16;
      y += 16;
      remaining -= 16;
    }
    // Collapse the sixteen partial sums into one value.
    total = _mm512_reduce_add_ps(acc);
  }
  // Scalar tail.
  while (remaining > 0) {
    total += fabsf(*x - *y);
    ++x;
    ++y;
    --remaining;
  }
  return total;
}
// AVX-512 specialisation of euclidean_distance() for float (sum of squared
// differences): sixteen lanes per step with fused multiply-add, then a
// scalar tail.
template<>
inline float euclidean_distance<float>(const float* x, const float* y, int f) {
  float total = 0;
  if (f >= 16) {
    __m512 acc = _mm512_setzero_ps();
    while (f >= 16) {
      const __m512 delta = _mm512_sub_ps(_mm512_loadu_ps(x), _mm512_loadu_ps(y));
      acc = _mm512_fmadd_ps(delta, delta, acc);
      x += 16;
      y += 16;
      f -= 16;
    }
    // Collapse the sixteen partial sums into one value.
    total = _mm512_reduce_add_ps(acc);
  }
  // Scalar tail.
  while (f > 0) {
    const float delta = *x - *y;
    total += delta * delta;
    ++x;
    ++y;
    --f;
  }
  return total;
}
#endif
// Picks two centroids for `nodes` and writes them into `p` and `q`.
//   nodes  - candidate nodes; two distinct entries seed the centroids
//   f      - forwarded to every Distance:: helper
//   random - index source; random.index(k) is used to pick positions
//   cosine - when true the seeds are normalized and each sampled node's norm
//            is fetched and passed to update_mean
//   p, q   - output nodes, overwritten and then refined in place
// NOTE(review): `count - 1` is passed to random.index(), so this presumably
// expects at least two nodes - confirm against callers.
template<typename T, typename Random, typename Distance, typename Node>
inline void two_means(const vector<Node*>& nodes, int f, Random& random, bool cosine, Node* p, Node* q) {
/*
This algorithm is a huge heuristic. Empirically it works really well, but I
can't motivate it well. The basic idea is to keep two centroids and assign
points to either one of them. We weight each centroid by the number of points
assigned to it, so to balance it.
*/
// Number of sampling rounds performed below.
static int iteration_steps = 200;
size_t count = nodes.size();
// Draw two seed positions; j comes from a range one smaller and is shifted
// past i so the two can never coincide.
size_t i = random.index(count);
size_t j = random.index(count-1);
j += (j >= i); // ensure that i != j
Distance::template copy_node<T, Node>(p, nodes[i], f);
Distance::template copy_node<T, Node>(q, nodes[j], f);
if (cosine) { Distance::template normalize<T, Node>(p, f); Distance::template normalize<T, Node>(q, f); }
Distance::init_node(p, f);
Distance::init_node(q, f);
// ic / jc count how many points each centroid has absorbed (seed included).
int ic = 1, jc = 1;
for (int l = 0; l < iteration_steps; l++) {
size_t k = random.index(count);
// Distances are scaled by the centroid's count, which favours the centroid
// that has absorbed fewer points.
T di = ic * Distance::distance(p, nodes[k], f),
dj = jc * Distance::distance(q, nodes[k], f);
T norm = cosine ? Distance::template get_norm<T, Node>(nodes[k], f) : 1;
// Skip samples whose norm is not strictly positive (this also rejects NaN).
if (!(norm > T(0))) {
continue;
}
// Fold the sample into the nearer centroid; on an exact tie neither
// centroid changes.
if (di < dj) {
Distance::update_mean(p, nodes[k], norm, ic, f);
Distance::init_node(p, f);
ic++;
} else if (dj < di) {
Distance::update_mean(q, nodes[k], norm, jc, f);
Distance::init_node(q, f);
jc++;
}
}
}
} // namespace
struct Base {
  // Defaults shared by the metric structs. Metrics that need different
  // behavior declare a member with the same name, which hides the one here.

  // Hook run over all item nodes before trees are built. No-op by default.
  template<typename T, typename S, typename Node>
  static inline void preprocess(void* nodes, size_t _s, const S node_count, const int f) {
  }

  // Hook run over all item nodes after trees are built. No-op by default.
  template<typename T, typename S, typename Node>
  static inline void postprocess(void* nodes, size_t _s, const S node_count, const int f) {
  }

  // Gives metric-specific node fields a sane default. No-op by default.
  template<typename Node>
  static inline void zero_value(Node* dest) {
  }

  // Copies the first f components of source's vector into dest.
  template<typename T, typename Node>
  static inline void copy_node(Node* dest, const Node* source, const int f) {
    const size_t n_bytes = f * sizeof(T);
    memcpy(dest->v, source->v, n_bytes);
  }

  // Length of the node's vector: square root of its dot product with itself.
  template<typename T, typename Node>
  static inline T get_norm(Node* node, int f) {
    return sqrt(dot(node->v, node->v, f));
  }

  // Scales the node's vector to unit length; left untouched when the norm is
  // not positive.
  template<typename T, typename Node>
  static inline void normalize(Node* node, int f) {
    const T length = Base::get_norm<T, Node>(node, f);
    if (!(length > 0))
      return;
    for (int k = 0; k < f; ++k)
      node->v[k] /= length;
  }

  // Folds new_node (divided by norm) into a running mean that currently
  // represents c points.
  template<typename T, typename Node>
  static inline void update_mean(Node* mean, Node* new_node, T norm, int c, int f) {
    for (int k = 0; k < f; ++k) {
      mean->v[k] = (mean->v[k] * c + new_node->v[k] / norm) / (c + 1);
    }
  }
};
struct Angular : Base {
  template<typename S, typename T>
  struct Node {
    /*
     * Every tree node has the same size and holds a descendant count, a
     * children slot and a vector:
     * - n_descendants == 1: leaf; the vector is a data point.
     * - 2 <= n_descendants <= K: the vector is skipped and the space is used
     *   for the list of descendants instead (K is however many ids fit).
     * - n_descendants > K: the vector is the normal of the split plane.
     * sizeof(Node) is not the real node size: extra memory is allocated so
     * that the vector can extend past the end of the struct.
     */
    S n_descendants;
    union {
      S children[2]; // Will possibly store more than 2
      T norm;
    };
    T v[ANNOYLIB_V_ARRAY_SIZE];
  };

  // Returns 2 - 2*cos(x, y), i.e. (x/|x| - y/|y|)^2. Returns 2 when either
  // vector has zero length.
  template<typename S, typename T>
  static inline T distance(const Node<S, T>* x, const Node<S, T>* y, int f) {
    // For backwards compatibility reasons a zero `norm` field means "not
    // stored", in which case the squared length is computed on the spot.
    const T xx = x->norm ? x->norm : dot(x->v, x->v, f);
    const T yy = y->norm ? y->norm : dot(y->v, y->v, f);
    const T xy = dot(x->v, y->v, f);
    const T xxyy = xx * yy;
    if (!(xxyy > 0))
      return 2.0; // cos is 0
    return 2.0 - 2.0 * xy / sqrt(xxyy);
  }

  // Signed projection of y onto the split normal stored in n.
  template<typename S, typename T>
  static inline T margin(const Node<S, T>* n, const T* y, int f) {
    return dot(n->v, y, f);
  }

  // Side of the split plane that y falls on; a point exactly on the plane is
  // assigned by a coin flip.
  template<typename S, typename T, typename Random>
  static inline bool side(const Node<S, T>* n, const T* y, int f, Random& random) {
    const T projection = margin(n, y, f);
    if (projection == 0)
      return (bool)random.flip();
    return (projection > 0);
  }

  template<typename S, typename T, typename Random>
  static inline bool side(const Node<S, T>* n, const Node<S, T>* y, int f, Random& random) {
    return side(n, y->v, f, random);
  }

  // Fills n->v with the normalized difference of two centroids found by
  // two_means; the centroids live in stack scratch space of s bytes each.
  template<typename S, typename T, typename Random>
  static inline void create_split(const vector<Node<S, T>*>& nodes, int f, size_t s, Random& random, Node<S, T>* n) {
    Node<S, T>* centroid_a = (Node<S, T>*)alloca(s);
    Node<S, T>* centroid_b = (Node<S, T>*)alloca(s);
    two_means<T, Random, Angular, Node<S, T> >(nodes, f, random, true, centroid_a, centroid_b);
    for (int k = 0; k < f; k++) {
      n->v[k] = centroid_a->v[k] - centroid_b->v[k];
    }
    Base::normalize<T, Node<S, T> >(n, f);
  }

  // Converts the internal (squared) distance into the reported one. The
  // squared distance is sometimes -0.0, hence the clamp before the root.
  template<typename T>
  static inline T normalized_distance(T distance) {
    return sqrt(std::max(distance, T(0)));
  }

  // Priority for a child in the search queue: the smaller of the parent's
  // priority and the margin, with the margin negated for child 0.
  template<typename T>
  static inline T pq_distance(T distance, T margin, int child_nr) {
    if (child_nr == 0) {
      return std::min(distance, T(-margin));
    }
    return std::min(distance, margin);
  }

  // Priority given to tree roots.
  template<typename T>
  static inline T pq_initial_value() {
    return numeric_limits<T>::infinity();
  }

  // Caches the squared length of the node's vector in its norm field.
  template<typename S, typename T>
  static inline void init_node(Node<S, T>* n, int f) {
    n->norm = dot(n->v, n->v, f);
  }

  static const char* name() {
    return "angular";
  }
};
struct DotProduct : Angular {
  template<typename S, typename T>
  struct Node {
    /*
     * Angular node extended for the DotProduct metric:
     * - dot_factor is the extra coordinate that reduces the problem to the
     *   angular metric (see preprocess).
     * - built records that the index has been built, after which exact dot
     *   products are returned as distances.
     */
    S n_descendants;
    S children[2]; // Will possibly store more than 2
    T dot_factor;
    T norm;
    bool built;
    T v[ANNOYLIB_V_ARRAY_SIZE];
  };

  static const char* name() {
    return "dot";
  }

  // Length of the vector including the extra dot_factor coordinate.
  template<typename T, typename Node>
  static inline T get_norm(Node* node, int f) {
    return sqrt(dot(node->v, node->v, f) + node->dot_factor * node->dot_factor);
  }

  // Running-mean update over the vector and the dot_factor coordinate.
  template<typename T, typename Node>
  static inline void update_mean(Node* mean, Node* new_node, T norm, int c, int f) {
    for (int k = 0; k < f; k++) {
      mean->v[k] = (mean->v[k] * c + new_node->v[k] / norm) / (c + 1);
    }
    mean->dot_factor = (mean->dot_factor * c + new_node->dot_factor / norm) / (c + 1);
  }

  // Negated dot product once either node is flagged as built; otherwise the
  // angular-style distance over the vector plus the dot_factor coordinate.
  template<typename S, typename T>
  static inline T distance(const Node<S, T>* x, const Node<S, T>* y, int f) {
    if (x->built || y->built) {
      // A built index no longer needs angular distances to retrieve NNs,
      // so the dot product score itself is returned.
      return -dot(x->v, y->v, f);
    }
    // Same computation as the angular case, with one extra coordinate.
    const T xx = x->norm ? x->norm : dot(x->v, x->v, f) + x->dot_factor * x->dot_factor;
    const T yy = y->norm ? y->norm : dot(y->v, y->v, f) + y->dot_factor * y->dot_factor;
    const T xy = dot(x->v, y->v, f) + x->dot_factor * y->dot_factor;
    const T xxyy = xx * yy;
    if (!(xxyy > 0))
      return 2.0;
    return 2.0 - 2.0 * xy / sqrt(xxyy);
  }

  template<typename Node>
  static inline void zero_value(Node* dest) {
    dest->dot_factor = 0;
  }

  // Caches the squared length (including dot_factor) and clears the built flag.
  template<typename S, typename T>
  static inline void init_node(Node<S, T>* n, int f) {
    n->norm = dot(n->v, n->v, f) + n->dot_factor * n->dot_factor;
    n->built = false;
  }

  // Copies the vector and the dot_factor coordinate.
  template<typename T, typename Node>
  static inline void copy_node(Node* dest, const Node* source, const int f) {
    dest->dot_factor = source->dot_factor;
    memcpy(dest->v, source->v, f * sizeof(T));
  }

  // Fills n with the normalized difference (vector and dot_factor) of two
  // centroids found by two_means.
  template<typename S, typename T, typename Random>
  static inline void create_split(const vector<Node<S, T>*>& nodes, int f, size_t s, Random& random, Node<S, T>* n) {
    Node<S, T>* centroid_a = (Node<S, T>*)alloca(s);
    Node<S, T>* centroid_b = (Node<S, T>*)alloca(s);
    DotProduct::zero_value(centroid_a);
    DotProduct::zero_value(centroid_b);
    two_means<T, Random, DotProduct, Node<S, T> >(nodes, f, random, true, centroid_a, centroid_b);
    for (int k = 0; k < f; k++) {
      n->v[k] = centroid_a->v[k] - centroid_b->v[k];
    }
    n->dot_factor = centroid_a->dot_factor - centroid_b->dot_factor;
    DotProduct::normalize<T, Node<S, T> >(n, f);
  }

  // Scales vector and dot_factor to unit length; no-op for non-positive norm.
  template<typename T, typename Node>
  static inline void normalize(Node* node, int f) {
    const T length = sqrt(dot(node->v, node->v, f) + pow(node->dot_factor, 2));
    if (!(length > 0))
      return;
    for (int k = 0; k < f; k++) {
      node->v[k] /= length;
    }
    node->dot_factor /= length;
  }

  // Margin against a raw vector: the dot_factor coordinate is not included.
  template<typename S, typename T>
  static inline T margin(const Node<S, T>* n, const T* y, int f) {
    return dot(n->v, y, f);
  }

  // Margin against a node: includes the dot_factor coordinate.
  template<typename S, typename T>
  static inline T margin(const Node<S, T>* n, const Node<S, T>* y, int f) {
    return dot(n->v, y->v, f) + n->dot_factor * y->dot_factor;
  }

  template<typename S, typename T, typename Random>
  static inline bool side(const Node<S, T>* n, const Node<S, T>* y, int f, Random& random) {
    const T projection = margin(n, y, f);
    if (projection == 0)
      return (bool)random.flip();
    return (projection > 0);
  }

  template<typename S, typename T, typename Random>
  static inline bool side(const Node<S, T>* n, const T* y, int f, Random& random) {
    const T projection = margin(n, y, f);
    if (projection == 0)
      return (bool)random.flip();
    return (projection > 0);
  }

  // Internal distances are negated dot products; flip the sign back.
  template<typename T>
  static inline T normalized_distance(T distance) {
    return -distance;
  }

  template<typename T, typename S, typename Node>
  static inline void preprocess(void* nodes, size_t _s, const S node_count, const int f) {
    // Transforms the inner product space into a cosine/angular-compatible one
    // using a method from Microsoft Research
    // (Bachrach et al., 2014, see https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/XboxInnerProduct.pdf)

    // Step one: park each vector's norm in dot_factor and clear the built flag.
    for (S idx = 0; idx < node_count; idx++) {
      Node* node = get_node_ptr<S, Node>(nodes, _s, idx);
      T d = dot(node->v, node->v, f);
      node->dot_factor = d < 0 ? 0 : sqrt(d);
      node->built = false;
    }

    // Step two: find the maximum norm.
    T max_norm = 0;
    for (S idx = 0; idx < node_count; idx++) {
      Node* node = get_node_ptr<S, Node>(nodes, _s, idx);
      if (node->dot_factor > max_norm)
        max_norm = node->dot_factor;
    }

    // Step three: the extra dimension becomes sqrt(max_norm^2 - norm^2) and
    // the cached norm becomes max_norm^2.
    for (S idx = 0; idx < node_count; idx++) {
      Node* node = get_node_ptr<S, Node>(nodes, _s, idx);
      T node_norm = node->dot_factor;
      T squared_norm_diff = pow(max_norm, static_cast<T>(2.0)) - pow(node_norm, static_cast<T>(2.0));
      node->norm = pow(max_norm, static_cast<T>(2.0));
      node->dot_factor = squared_norm_diff < 0 ? 0 : sqrt(squared_norm_diff);
    }
  }

  template<typename T, typename S, typename Node>
  static inline void postprocess(void* nodes, size_t _s, const S node_count, const int f) {
    // Flag every item node as built so that distances are computed as plain
    // dot products from now on.
    for (S idx = 0; idx < node_count; idx++) {
      get_node_ptr<S, Node>(nodes, _s, idx)->built = true;
    }
  }
};
struct Hamming : Base {
  template<typename S, typename T>
  struct Node {
    S n_descendants;
    S children[2];
    T v[ANNOYLIB_V_ARRAY_SIZE];
  };

  // Number of random bit positions tried before create_split falls back to
  // scanning every position.
  static const size_t max_iterations = 20;

  // Child priority: the parent's priority, minus one when the margin differs
  // from the child number.
  template<typename T>
  static inline T pq_distance(T distance, T margin, int child_nr) {
    return distance - (margin != (unsigned int) child_nr);
  }

  // Priority given to tree roots.
  template<typename T>
  static inline T pq_initial_value() {
    return numeric_limits<T>::max();
  }

  template<typename T>
  static inline int cole_popcount(T v) {
    // Generalized bit count by Eric Cole. Note: Only used with MSVC 9, which
    // lacks intrinsics and fails to calculate std::bitset::count for
    // v > 32bit.
    // See https://graphics.stanford.edu/~seander/bithacks.html#CountBitsSet64
    v = v - ((v >> 1) & (T)~(T)0/3);
    v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3);
    v = (v + (v >> 4)) & (T)~(T)0/255*15;
    return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * 8;
  }

  // Number of differing bits between the two nodes' vectors.
  template<typename S, typename T>
  static inline T distance(const Node<S, T>* x, const Node<S, T>* y, int f) {
    size_t differing = 0;
    for (int k = 0; k < f; k++) {
      differing += annoylib_popcount(x->v[k] ^ y->v[k]);
    }
    return differing;
  }

  // Tests the bit of y selected by n->v[0]; bits are numbered from the most
  // significant bit of y[0].
  template<typename S, typename T>
  static inline bool margin(const Node<S, T>* n, const T* y, int f) {
    static const size_t n_bits = sizeof(T) * 8;
    const T word = n->v[0] / n_bits;
    return (y[word] & (static_cast<T>(1) << (n_bits - 1 - (n->v[0] % n_bits)))) != 0;
  }

  template<typename S, typename T, typename Random>
  static inline bool side(const Node<S, T>* n, const T* y, int f, Random& random) {
    return margin(n, y, f);
  }

  template<typename S, typename T, typename Random>
  static inline bool side(const Node<S, T>* n, const Node<S, T>* y, int f, Random& random) {
    return side(n, y->v, f, random);
  }

  // Stores in n->v[0] a bit position that puts at least one node on each
  // side: random positions first, then an exhaustive scan. If no position
  // separates the nodes, n->v[0] is left at the last position scanned.
  template<typename S, typename T, typename Random>
  static inline void create_split(const vector<Node<S, T>*>& nodes, int f, size_t s, Random& random, Node<S, T>* n) {
    const size_t total = nodes.size();
    const int dim = f * 8 * sizeof(T);

    bool separated = false;
    for (size_t attempt = 0; attempt < max_iterations && !separated; attempt++) {
      // choose random position to split at
      n->v[0] = random.index(dim);
      size_t ones = 0;
      for (size_t k = 0; k < total; k++) {
        if (margin(n, nodes[k]->v, f))
          ones++;
      }
      separated = (ones > 0 && ones < total);
    }
    if (separated)
      return;

    // brute-force search for splitting coordinate
    for (int bit = 0; bit < dim; bit++) {
      n->v[0] = bit;
      size_t ones = 0;
      for (size_t k = 0; k < total; k++) {
        if (margin(n, nodes[k]->v, f))
          ones++;
      }
      if (ones > 0 && ones < total)
        break;
    }
  }

  template<typename T>
  static inline T normalized_distance(T distance) {
    return distance;
  }

  // Nothing is cached per node for this metric.
  template<typename S, typename T>
  static inline void init_node(Node<S, T>* n, int f) {
  }

  static const char* name() {
    return "hamming";
  }
};
struct Minkowski : Base {
  // Shared pieces of the Euclidean and Manhattan metrics: split planes carry
  // an offset term in addition to the normal vector.
  template<typename S, typename T>
  struct Node {
    S n_descendants;
    T a; // need an extra constant term to determine the offset of the plane
    S children[2];
    T v[ANNOYLIB_V_ARRAY_SIZE];
  };

  // Signed value of the plane equation at y: offset plus normal . y
  template<typename S, typename T>
  static inline T margin(const Node<S, T>* n, const T* y, int f) {
    return n->a + dot(n->v, y, f);
  }

  // Side of the plane that y falls on; a point exactly on the plane is
  // assigned by a coin flip.
  template<typename S, typename T, typename Random>
  static inline bool side(const Node<S, T>* n, const T* y, int f, Random& random) {
    const T value = margin(n, y, f);
    if (value == 0)
      return (bool)random.flip();
    return (value > 0);
  }

  template<typename S, typename T, typename Random>
  static inline bool side(const Node<S, T>* n, const Node<S, T>* y, int f, Random& random) {
    return side(n, y->v, f, random);
  }

  // Priority for a child in the search queue: the smaller of the parent's
  // priority and the margin, with the margin negated for child 0.
  template<typename T>
  static inline T pq_distance(T distance, T margin, int child_nr) {
    if (child_nr == 0) {
      return std::min(distance, T(-margin));
    }
    return std::min(distance, margin);
  }

  // Priority given to tree roots.
  template<typename T>
  static inline T pq_initial_value() {
    return numeric_limits<T>::infinity();
  }
};
struct Euclidean : Minkowski {
template<typename S, typename T>
static inline T distance(const Node<S, T>* x, const Node<S, T>* y, int f) {
return euclidean_distance(x->v, y->v, f);
}
template<typename S, typename T, typename Random>
static inline void create_split(const vector<Node<S, T>*>& nodes, int f, size_t s, Random& random, Node<S, T>* n) {
Node<S, T>* p = (Node<S, T>*)alloca(s);
Node<S, T>* q = (Node<S, T>*)alloca(s);
two_means<T, Random, Euclidean, Node<S, T> >(nodes, f, random, false, p, q);
for (int z = 0; z < f; z++)
n->v[z] = p->v[z] - q->v[z];
Base::normalize<T, Node<S, T> >(n, f);
n->a = 0.0;
for (int z = 0; z < f; z++)
n->a += -n->v[z] * (p->v[z] + q->v[z]) / 2;
}
template<typename T>
static inline T normalized_distance(T distance) {
return sqrt(std::max(distance, T(0)));
}
template<typename S, typename T>
static inline void init_node(Node<S, T>* n, int f) {
}
static const char* name() {
return "euclidean";
}
};
struct Manhattan : Minkowski {
template<typename S, typename T>
static inline T distance(const Node<S, T>* x, const Node<S, T>* y, int f) {
return manhattan_distance(x->v, y->v, f);
}
template<typename S, typename T, typename Random>
static inline void create_split(const vector<Node<S, T>*>& nodes, int f, size_t s, Random& random, Node<S, T>* n) {
Node<S, T>* p = (Node<S, T>*)alloca(s);
Node<S, T>* q = (Node<S, T>*)alloca(s);
two_means<T, Random, Manhattan, Node<S, T> >(nodes, f, random, false, p, q);
for (int z = 0; z < f; z++)
n->v[z] = p->v[z] - q->v[z];
Base::normalize<T, Node<S, T> >(n, f);
n->a = 0.0;
for (int z = 0; z < f; z++)
n->a += -n->v[z] * (p->v[z] + q->v[z]) / 2;
}
template<typename T>
static inline T normalized_distance(T distance) {
return std::max(distance, T(0));
}
template<typename S, typename T>
static inline void init_node(Node<S, T>* n, int f) {
}
static const char* name() {
return "manhattan";
}
};
// Abstract interface of an index, independent of the metric, random number
// generator and build policy chosen by the concrete AnnoyIndex.
// S is the item id type, T the vector component type and R the seed type.
template<typename S, typename T, typename R = uint64_t>
class AnnoyIndexInterface {
public:
// Note that the methods with an **error argument will allocate memory and write the pointer to that string if error is non-NULL
virtual ~AnnoyIndexInterface() {};
// Stores vector w under id `item`. Returns false on failure.
virtual bool add_item(S item, const T* w, char** error=NULL) = 0;
// Builds q trees (-1 lets the implementation decide). Returns false on failure.
virtual bool build(int q, int n_threads=-1, char** error=NULL) = 0;
// Discards the built trees. Returns false on failure.
virtual bool unbuild(char** error=NULL) = 0;
// Writes the built index to `filename`. Returns false on failure.
virtual bool save(const char* filename, bool prefault=false, char** error=NULL) = 0;
// Releases the index data held by this object.
virtual void unload() = 0;
// Loads a previously saved index from `filename`. Returns false on failure.
virtual bool load(const char* filename, bool prefault=false, char** error=NULL) = 0;
// Distance between items i and j.
virtual T get_distance(S i, S j) const = 0;
// Appends up to n neighbours of a stored item to *result (and their
// distances to *distances).
virtual void get_nns_by_item(S item, size_t n, int search_k, vector<S>* result, vector<T>* distances) const = 0;
// Same as get_nns_by_item, but for an arbitrary query vector w.
virtual void get_nns_by_vector(const T* w, size_t n, int search_k, vector<S>* result, vector<T>* distances) const = 0;
virtual S get_n_items() const = 0;
virtual S get_n_trees() const = 0;
// Turns diagnostic output on or off.
virtual void verbose(bool v) = 0;
// Copies the vector stored under `item` into v.
virtual void get_item(S item, T* v) const = 0;
// Sets the seed for the random number generator used while building.
virtual void set_seed(R q) = 0;
// Arranges for the index to be built in the file `filename`. Returns false on failure.
virtual bool on_disk_build(const char* filename, char** error=NULL) = 0;
};
template<typename S, typename T, typename Distance, typename Random, class ThreadedBuildPolicy>
class AnnoyIndex : public AnnoyIndexInterface<S, T,
#if __cplusplus >= 201103L
typename std::remove_const<decltype(Random::default_seed)>::type
#else
typename Random::seed_type
#endif
> {
/*
* We use random projection to build a forest of binary trees of all items.
* Basically just split the hyperspace into two sides by a hyperplane,
* then recursively split each of those subtrees etc.
* We create a tree like this q times. The default q is determined automatically
* in such a way that we at most use 2x as much memory as the vectors take.
*/
public:
typedef Distance D;
typedef typename D::template Node<S, T> Node;
#if __cplusplus >= 201103L
typedef typename std::remove_const<decltype(Random::default_seed)>::type R;
#else
typedef typename Random::seed_type R;
#endif
protected:
const int _f;
size_t _s;
S _n_items;
void* _nodes; // Could either be mmapped, or point to a memory buffer that we reallocate
S _n_nodes;
S _nodes_size;
vector<S> _roots;
S _K;
R _seed;
bool _loaded;
bool _verbose;
int _fd;
bool _on_disk;
bool _built;
public:
AnnoyIndex(int f) : _f(f), _seed(Random::default_seed) {
  // Creates an empty index for vectors with f components.
  const size_t vector_offset = offsetof(Node, v);
  const size_t children_offset = offsetof(Node, children);

  // Size of each node: the fixed header followed by f vector components.
  _s = vector_offset + _f * sizeof(T);
  // Max number of descendants to fit into node: everything from `children`
  // to the end of the node can hold ids.
  _K = (S) (((size_t) (_s - children_offset)) / sizeof(S));

  _built = false;
  _verbose = false;
  reinitialize(); // Reset everything
}
// Releases whatever node storage the index holds (see unload()).
~AnnoyIndex() {
unload();
}
// Returns the number of vector components given to the constructor.
int get_f() const {
return _f;
}
// Interface entry point for adding an item from a raw array; forwards to
// add_item_impl.
bool add_item(S item, const T* w, char** error=NULL) {
return add_item_impl(item, w, error);
}
template<typename W>
bool add_item_impl(S item, const W& w, char** error=NULL) {
  // Stores the first _f values of w (anything indexable with []) as the
  // vector of `item`, growing the node storage when needed.
  // Returns false and sets *error when the index is loaded from a file or
  // when `item` is negative.
  if (_loaded) {
    set_error_from_string(error, "You can't add an item to a loaded index");
    return false;
  }
  // A negative id would skip the reallocation below and then write before
  // the start of the node buffer.
  if (item < 0) {
    set_error_from_string(error, "Item index can not be negative");
    return false;
  }
  _allocate_size(item + 1);
  Node* n = _get(item);

  D::zero_value(n);

  n->children[0] = 0;
  n->children[1] = 0;
  n->n_descendants = 1;

  for (int z = 0; z < _f; z++)
    n->v[z] = w[z];

  // Let the metric cache whatever it derives from the vector.
  D::init_node(n, _f);

  if (item >= _n_items)
    _n_items = item + 1;

  return true;
}
bool on_disk_build(const char* file, char** error=NULL) {
_on_disk = true;
#ifndef _MSC_VER
_fd = open(file, O_RDWR | O_CREAT | O_TRUNC, (int) 0600);
#else
_fd = _open(file, _O_RDWR | _O_CREAT | _O_TRUNC, (int) 0600);
#endif
if (_fd == -1) {
set_error_from_errno(error, "Unable to open");
_fd = 0;
return false;
}
_nodes_size = 1;
if (ftruncate(_fd, ANNOYLIB_FTRUNCATE_SIZE(_s) * ANNOYLIB_FTRUNCATE_SIZE(_nodes_size)) == -1) {
set_error_from_errno(error, "Unable to truncate");
return false;
}
#ifdef MAP_POPULATE
_nodes = (Node*) mmap(0, _s * _nodes_size, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, _fd, 0);
#else
_nodes = (Node*) mmap(0, _s * _nodes_size, PROT_READ | PROT_WRITE, MAP_SHARED, _fd, 0);
#endif
return true;
}
bool build(int q, int n_threads=-1, char** error=NULL) {
if (_loaded) {
set_error_from_string(error, "You can't build a loaded index");
return false;
}
if (_built) {
set_error_from_string(error, "You can't build a built index");
return false;
}
D::template preprocess<T, S, Node>(_nodes, _s, _n_items, _f);
_n_nodes = _n_items;
ThreadedBuildPolicy::template build<S, T>(this, q, n_threads);
// Also, copy the roots into the last segment of the array
// This way we can load them faster without reading the whole file
_allocate_size(_n_nodes + (S)_roots.size());
for (size_t i = 0; i < _roots.size(); i++)
memcpy(_get(_n_nodes + (S)i), _get(_roots[i]), _s);
_n_nodes += _roots.size();
if (_verbose) annoylib_showUpdate("has %d nodes\n", _n_nodes);
if (_on_disk) {
if (!remap_memory_and_truncate(&_nodes, _fd,
static_cast<size_t>(_s) * static_cast<size_t>(_nodes_size),
static_cast<size_t>(_s) * static_cast<size_t>(_n_nodes))) {
// TODO: this probably creates an index in a corrupt state... not sure what to do
set_error_from_errno(error, "Unable to truncate");
return false;
}
_nodes_size = _n_nodes;
}
D::template postprocess<T, S, Node>(_nodes, _s, _n_items, _f);
_built = true;
return true;
}
bool unbuild(char** error=NULL) {
  // Forgets the trees: the roots are dropped and the node count falls back to
  // the number of items. Not allowed on a loaded index.
  if (!_loaded) {
    _roots.clear();
    _n_nodes = _n_items;
    _built = false;
    return true;
  }
  set_error_from_string(error, "You can't unbuild a loaded index");
  return false;
}
bool save(const char* filename, bool prefault=false, char** error=NULL) {
  // Writes all nodes of a built index to `filename`, then drops the in-memory
  // copy and loads the file back. For an on-disk build there is nothing to
  // write and the call just returns true.
  // Returns false and sets *error when the index is not built or on any
  // I/O failure.
  if (!_built) {
    set_error_from_string(error, "You can't save an index that hasn't been built");
    return false;
  }
  if (_on_disk) {
    return true;
  }

  // Delete file if it already exists (See issue #335)
#ifndef _MSC_VER
  unlink(filename);
#else
  _unlink(filename);
#endif

  FILE *f = fopen(filename, "wb");
  if (f == NULL) {
    set_error_from_errno(error, "Unable to open");
    return false;
  }

  if (fwrite(_nodes, _s, _n_nodes, f) != (size_t) _n_nodes) {
    // Record the error first so that fclose() cannot overwrite errno, then
    // close the stream so that the handle is not leaked.
    set_error_from_errno(error, "Unable to write");
    fclose(f);
    return false;
  }

  if (fclose(f) == EOF) {
    set_error_from_errno(error, "Unable to close");
    return false;
  }

  unload();
  return load(filename, prefault, error);
}
void reinitialize() {
  // Puts the index back into its empty state. Nothing is released here: the
  // storage must already have been dealt with (see unload()).
  // Storage bookkeeping.
  _nodes = NULL;
  _nodes_size = 0;
  _n_nodes = 0;
  _n_items = 0;
  _roots.clear();
  // Backing file state.
  _fd = 0;
  _on_disk = false;
  _loaded = false;
  // Random seed goes back to the generator's default.
  _seed = Random::default_seed;
}
void unload() {
  // Releases the node storage and resets the index to its empty state.
  if (_fd) {
    // File-backed storage: close the descriptor, then drop the mapping. An
    // on-disk build maps the whole allocated capacity, any other mapping
    // covers exactly the nodes in use.
#ifndef _MSC_VER
    close(_fd);
#else
    _close(_fd);
#endif
    if (_on_disk) {
      munmap(_nodes, _s * _nodes_size);
    } else {
      munmap(_nodes, _n_nodes * _s);
    }
  } else if (_nodes) {
    // We have heap allocated data
    free(_nodes);
  }
  reinitialize();
  if (_verbose) annoylib_showUpdate("unloaded\n");
}
bool load(const char* filename, bool prefault=false, char** error=NULL) {
#ifndef _MSC_VER
_fd = open(filename, O_RDONLY, (int)0400);
#else
_fd = _open(filename, _O_RDONLY, (int)0400);
#endif
if (_fd == -1) {
set_error_from_errno(error, "Unable to open");
_fd = 0;
return false;
}
off_t size = lseek_getsize(_fd);
if (size == -1) {
set_error_from_errno(error, "Unable to get size");
return false;
} else if (size == 0) {
set_error_from_errno(error, "Size of file is zero");
return false;
} else if (size % _s) {
// Something is fishy with this index!
set_error_from_errno(error, "Index size is not a multiple of vector size. Ensure you are opening using the same metric you used to create the index.");
return false;
}
int flags = MAP_SHARED;
if (prefault) {
#ifdef MAP_POPULATE
flags |= MAP_POPULATE;
#else
annoylib_showUpdate("prefault is set to true, but MAP_POPULATE is not defined on this platform");
#endif
}
_nodes = (Node*)mmap(0, size, PROT_READ, flags, _fd, 0);
_n_nodes = (S)(size / _s);
// Find the roots by scanning the end of the file and taking the nodes with most descendants
_roots.clear();
S m = -1;
for (S i = _n_nodes - 1; i >= 0; i--) {
S k = _get(i)->n_descendants;
if (m == -1 || k == m) {
_roots.push_back(i);
m = k;
} else {
break;
}
}
// hacky fix: since the last root precedes the copy of all roots, delete it
if (_roots.size() > 1 && _get(_roots.front())->children[0] == _get(_roots.back())->children[0])
_roots.pop_back();
_loaded = true;
_built = true;
_n_items = m;
if (_verbose) annoylib_showUpdate("found %zu roots with degree %d\n", _roots.size(), m);
return true;
}
// Distance between items i and j in the metric's reported form
// (D::normalized_distance applied to D::distance). No bounds checking is done.
T get_distance(S i, S j) const {
return D::normalized_distance(D::distance(_get(i), _get(j), _f));
}
// Neighbour search that uses the stored vector of `item` as the query.
// Results are appended to *result and, when distances is non-NULL, to
// *distances.
void get_nns_by_item(S item, size_t n, int search_k, vector<S>* result, vector<T>* distances) const {
// TODO: handle OOB
const Node* m = _get(item);
_get_all_nns(m->v, n, search_k, result, distances);
}
// Neighbour search for an arbitrary query vector w, which must provide _f
// components. Results are appended to *result and, when distances is
// non-NULL, to *distances.
void get_nns_by_vector(const T* w, size_t n, int search_k, vector<S>* result, vector<T>* distances) const {
_get_all_nns(w, n, search_k, result, distances);
}
// Returns the item count: one past the highest id added, or the count
// recovered from the file after load().
S get_n_items() const {
return _n_items;
}
// Returns the number of trees, i.e. the number of roots currently known.
S get_n_trees() const {
return (S)_roots.size();
}
// Enables or disables the diagnostic messages printed through
// annoylib_showUpdate.
void verbose(bool v) {
_verbose = v;
}
// Copies the _f components stored for `item` into v, which must have room
// for them.
void get_item(S item, T* v) const {
// TODO: handle OOB
Node* m = _get(item);
memcpy(v, m->v, (_f) * sizeof(T));
}
// Sets the base seed; thread_build() derives each thread's generator from
// it. Note that reinitialize() puts the default seed back.
void set_seed(R seed) {
_seed = seed;
}
void thread_build(int q, int thread_idx, ThreadedBuildPolicy& threaded_build_policy) {
// Each thread needs its own seed, otherwise each thread would be building the same tree(s)
Random _random(_seed + thread_idx);
vector<S> thread_roots;
while (1) {
if (q == -1) {
threaded_build_policy.lock_n_nodes();
if (_n_nodes >= 2 * _n_items) {
threaded_build_policy.unlock_n_nodes();
break;
}
threaded_build_policy.unlock_n_nodes();
} else {
if (thread_roots.size() >= (size_t)q) {
break;
}
}
if (_verbose) annoylib_showUpdate("pass %zd...\n", thread_roots.size());
vector<S> indices;
threaded_build_policy.lock_shared_nodes();
for (S i = 0; i < _n_items; i++) {
if (_get(i)->n_descendants >= 1) { // Issue #223
indices.push_back(i);
}
}
threaded_build_policy.unlock_shared_nodes();
thread_roots.push_back(_make_tree(indices, true, _random, threaded_build_policy));
}
threaded_build_policy.lock_roots();
_roots.insert(_roots.end(), thread_roots.begin(), thread_roots.end());
threaded_build_policy.unlock_roots();
}
protected:
void _reallocate_nodes(S n) {
  // Grows the node storage to hold at least n nodes, and at least about 1.3
  // times the current capacity. On-disk storage is remapped; heap storage is
  // reallocated and the newly added part is zeroed.
  const double reallocation_factor = 1.3;
  const S grown = (S) ((_nodes_size + 1) * reallocation_factor);
  S new_nodes_size = std::max(n, grown);
  void *old = _nodes;

  if (_on_disk) {
    const size_t old_bytes = static_cast<size_t>(_s) * static_cast<size_t>(_nodes_size);
    const size_t new_bytes = static_cast<size_t>(_s) * static_cast<size_t>(new_nodes_size);
    const bool remapped = remap_memory_and_truncate(&_nodes, _fd, old_bytes, new_bytes);
    if (!remapped && _verbose)
      annoylib_showUpdate("File truncation error\n");
  } else {
    _nodes = realloc(_nodes, _s * new_nodes_size);
    char* fresh = (char *) _nodes + (_nodes_size * _s) / sizeof(char);
    memset(fresh, 0, (new_nodes_size - _nodes_size) * _s);
  }

  _nodes_size = new_nodes_size;
  if (_verbose) annoylib_showUpdate("Reallocating to %d nodes: old_address=%p, new_address=%p\n", new_nodes_size, old, _nodes);
}
void _allocate_size(S n, ThreadedBuildPolicy& threaded_build_policy) {
  // Makes room for n nodes, holding the policy's nodes lock for the duration
  // of the reallocation. Does nothing when the capacity already suffices.
  if (n <= _nodes_size)
    return;
  threaded_build_policy.lock_nodes();
  _reallocate_nodes(n);
  threaded_build_policy.unlock_nodes();
}
void _allocate_size(S n) {
  // Makes room for n nodes without taking any lock. Does nothing when the
  // capacity already suffices.
  if (n <= _nodes_size)
    return;
  _reallocate_nodes(n);
}
// Returns a pointer to node i inside the node storage, where consecutive
// nodes are _s bytes apart. No bounds checking is done.
Node* _get(const S i) const {
return get_node_ptr<S, Node>(_nodes, _s, i);
}
double _split_imbalance(const vector<S>& left_indices, const vector<S>& right_indices) {
  // Fraction of the items that ended up on the larger side of a split:
  // 0.5 is perfectly balanced, values near 1 mean almost everything went to
  // one side.
  const double n_left = (float)left_indices.size();
  const double n_right = (float)right_indices.size();
  const float left_share = n_left / (n_left + n_right + 1e-9); // Avoid 0/0
  const float right_share = 1-left_share;
  return std::max(left_share, right_share);
}
// Recursively builds a (sub)tree over the items in `indices` and returns the
// id of the node that represents it. `is_root` marks the top-level call;
// `_random` drives the split heuristics; the policy object supplies the locks
// taken around shared state.
S _make_tree(const vector<S>& indices, bool is_root, Random& _random, ThreadedBuildPolicy& threaded_build_policy) {
// The basic rule is that if we have <= _K items, then it's a leaf node, otherwise it's a split node.
// There's some regrettable complications caused by the problem that root nodes have to be "special":
// 1. We identify root nodes by the arguable logic that _n_items == n->n_descendants, regardless of how many descendants they actually have
// 2. Root nodes with only 1 child need to be a "dummy" parent
// 3. Due to the _n_items "hack", we need to be careful with the cases where _n_items <= _K or _n_items > _K
// A single non-root item needs no node of its own: the item node is used.
if (indices.size() == 1 && !is_root)
return indices[0];
// Few enough items: create one node that simply lists them.
if (indices.size() <= (size_t)_K && (!is_root || (size_t)_n_items <= (size_t)_K || indices.size() == 1)) {
// Reserve the next free node id.
threaded_build_policy.lock_n_nodes();
_allocate_size(_n_nodes + 1, threaded_build_policy);
S item = _n_nodes++;
threaded_build_policy.unlock_n_nodes();
threaded_build_policy.lock_shared_nodes();
Node* m = _get(item);
m->n_descendants = is_root ? _n_items : (S)indices.size();
// Using std::copy instead of a loop seems to resolve issues #3 and #13,
// probably because gcc 4.8 goes overboard with optimizations.
// Using memcpy instead of std::copy for MSVC compatibility. #235
// Only copy when necessary to avoid crash in MSVC 9. #293
if (!indices.empty())
memcpy(m->children, &indices[0], indices.size() * sizeof(S));
threaded_build_policy.unlock_shared_nodes();
return item;
}
// Split node: gather the item nodes and look for a separating plane.
threaded_build_policy.lock_shared_nodes();
vector<Node*> children;
for (size_t i = 0; i < indices.size(); i++) {
S j = indices[i];
Node* n = _get(j);
if (n)
children.push_back(n);
}
vector<S> children_indices[2];
// The split node is assembled in stack scratch space and copied into the
// node storage at the very end.
Node* m = (Node*)alloca(_s);
// Up to three attempts; the first split with imbalance below 0.95 is kept.
for (int attempt = 0; attempt < 3; attempt++) {
children_indices[0].clear();
children_indices[1].clear();
D::create_split(children, _f, _s, _random, m);
for (size_t i = 0; i < indices.size(); i++) {
S j = indices[i];
Node* n = _get(j);
if (n) {
bool side = D::side(m, n, _f, _random);
children_indices[side].push_back(j);
} else {
annoylib_showUpdate("No node for index %d?\n", j);
}
}
if (_split_imbalance(children_indices[0], children_indices[1]) < 0.95)
break;
}
threaded_build_policy.unlock_shared_nodes();
// If we didn't find a hyperplane, just randomize sides as a last option
while (_split_imbalance(children_indices[0], children_indices[1]) > 0.99) {
if (_verbose)
annoylib_showUpdate("\tNo hyperplane found (left has %zu children, right has %zu children)\n",
children_indices[0].size(), children_indices[1].size());
children_indices[0].clear();
children_indices[1].clear();
// Set the vector to 0.0
for (int z = 0; z < _f; z++)
m->v[z] = 0;
for (size_t i = 0; i < indices.size(); i++) {
S j = indices[i];
// Just randomize...
children_indices[_random.flip()].push_back(j);
}
}
// flip is 1 when side 0 is the larger one, so that the loop below visits
// the smaller side first.
int flip = (children_indices[0].size() > children_indices[1].size());
m->n_descendants = is_root ? _n_items : (S)indices.size();
for (int side = 0; side < 2; side++) {
// run _make_tree for the smallest child first (for cache locality)
m->children[side^flip] = _make_tree(children_indices[side^flip], false, _random, threaded_build_policy);
}
// Reserve a node id for the split node and copy the scratch node into it.
threaded_build_policy.lock_n_nodes();
_allocate_size(_n_nodes + 1, threaded_build_policy);
S item = _n_nodes++;
threaded_build_policy.unlock_n_nodes();
threaded_build_policy.lock_shared_nodes();
memcpy(_get(item), m, _s);
threaded_build_policy.unlock_shared_nodes();
return item;
}
// Core neighbour search. Walks all trees through one shared priority queue
// until about search_k candidate items have been collected, computes the
// distance from the query v to each distinct candidate, and appends the n
// closest to *result (and their reported distances to *distances when it is
// non-NULL). search_k == -1 selects n times the number of trees.
void _get_all_nns(const T* v, size_t n, int search_k, vector<S>* result, vector<T>* distances) const {
// Wrap the query vector in a scratch node so that D::distance can be used.
Node* v_node = (Node *)alloca(_s);
D::template zero_value<Node>(v_node);
memcpy(v_node->v, v, sizeof(T) * _f);
D::init_node(v_node, _f);
// Max-heap of (priority, node id); every root starts with the metric's
// initial priority.
std::priority_queue<pair<T, S> > q;
if (search_k == -1) {
search_k = n * _roots.size();
}
for (size_t i = 0; i < _roots.size(); i++) {
q.push(make_pair(Distance::template pq_initial_value<T>(), _roots[i]));
}
std::vector<S> nns;
while (nns.size() < (size_t)search_k && !q.empty()) {
const pair<T, S>& top = q.top();
// Copy both fields out before pop() invalidates the reference.
T d = top.first;
S i = top.second;
Node* nd = _get(i);
q.pop();
if (nd->n_descendants == 1 && i < _n_items) {
// An item node: it is a candidate itself.
nns.push_back(i);
} else if (nd->n_descendants <= _K) {
// A node that lists its items: all of them become candidates.
const S* dst = nd->children;
nns.insert(nns.end(), dst, &dst[nd->n_descendants]);
} else {
// A split node: queue both children with updated priorities.
T margin = D::margin(nd, v, _f);
q.push(make_pair(D::pq_distance(d, margin, 1), static_cast<S>(nd->children[1])));
q.push(make_pair(D::pq_distance(d, margin, 0), static_cast<S>(nd->children[0])));
}
}
// Get distances for all items
// To avoid calculating distance multiple times for any items, sort by id
std::sort(nns.begin(), nns.end());
vector<pair<T, S> > nns_dist;
S last = -1;
for (size_t i = 0; i < nns.size(); i++) {
S j = nns[i];
if (j == last)
continue;
last = j;
if (_get(j)->n_descendants == 1) // This is only to guard a really obscure case, #284
nns_dist.push_back(make_pair(D::distance(v_node, _get(j), _f), j));
}
size_t m = nns_dist.size();
size_t p = n < m ? n : m; // Return this many items
// Only the first p entries need to be in order.
std::partial_sort(nns_dist.begin(), nns_dist.begin() + p, nns_dist.end());
for (size_t i = 0; i < p; i++) {
if (distances)
distances->push_back(D::normalized_distance(nns_dist[i].first));
result->push_back(nns_dist[i].second);
}
}
};
// Build policy that grows all trees on the calling thread.
// Every lock/unlock hook is a no-op because nothing is shared across threads.
class AnnoyIndexSingleThreadedBuildPolicy {
public:
  // Builds `q` trees for `annoy` on the current thread (thread index 0).
  // `n_threads` is accepted for interface parity with other policies and is ignored.
  template<typename S, typename T, typename D, typename Random>
  static void build(AnnoyIndex<S, T, D, Random, AnnoyIndexSingleThreadedBuildPolicy>* annoy, int q, int n_threads) {
    const int only_thread_idx = 0;
    AnnoyIndexSingleThreadedBuildPolicy policy;
    annoy->thread_build(q, only_thread_idx, policy);
  }

  // Guards around the node counter.
  void lock_n_nodes() {}
  void unlock_n_nodes() {}

  // Exclusive guards around node storage.
  void lock_nodes() {}
  void unlock_nodes() {}

  // Shared guards around node storage.
  void lock_shared_nodes() {}
  void unlock_shared_nodes() {}

  // Guards around the list of roots.
  void lock_roots() {}
  void unlock_roots() {}
};
#ifdef ANNOYLIB_MULTITHREADED_BUILD
// Build policy that spreads tree construction over several std::thread workers.
// The lock hooks map onto three mutexes: a shared (reader/writer) mutex for
// node storage, a mutex for the node counter and a mutex for the root list.
class AnnoyIndexMultiThreadedBuildPolicy {
private:
  std::shared_timed_mutex nodes_mutex;
  std::mutex n_nodes_mutex;
  std::mutex roots_mutex;

public:
  // Builds `q` trees for `annoy` using `n_threads` worker threads.
  //
  // q          total number of trees, or -1 to forward -1 to every worker.
  // n_threads  number of workers; -1 means "one per hardware thread".
  //            Any other value below 1 is clamped to 1 so that at least one
  //            worker runs and the thread vector never gets a negative size.
  template<typename S, typename T, typename D, typename Random>
  static void build(AnnoyIndex<S, T, D, Random, AnnoyIndexMultiThreadedBuildPolicy>* annoy, int q, int n_threads) {
    AnnoyIndexMultiThreadedBuildPolicy threaded_build_policy;
    if (n_threads == -1) {
      // If the hardware_concurrency() value is not well defined or not computable, it returns 0.
      // We guard against this by using at least 1 thread.
      n_threads = std::max(1, (int)std::thread::hardware_concurrency());
    } else if (n_threads < 1) {
      // 0 would silently build nothing; a negative count would be converted
      // to an enormous size_t by the vector constructor below.
      n_threads = 1;
    }

    vector<std::thread> threads(n_threads);

    for (int thread_idx = 0; thread_idx < n_threads; thread_idx++) {
      // Integer division already rounds down; adding thread_idx distributes
      // the remainder so the per-thread counts sum to q.
      int trees_per_thread = q == -1 ? -1 : (q + thread_idx) / n_threads;

      threads[thread_idx] = std::thread(
        &AnnoyIndex<S, T, D, Random, AnnoyIndexMultiThreadedBuildPolicy>::thread_build,
        annoy,
        trees_per_thread,
        thread_idx,
        std::ref(threaded_build_policy)
      );
    }

    for (auto& thread : threads) {
      thread.join();
    }
  }

  // Exclusive access to the node counter.
  void lock_n_nodes() {
    n_nodes_mutex.lock();
  }
  void unlock_n_nodes() {
    n_nodes_mutex.unlock();
  }

  // Exclusive (writer) access to node storage.
  void lock_nodes() {
    nodes_mutex.lock();
  }
  void unlock_nodes() {
    nodes_mutex.unlock();
  }

  // Shared (reader) access to node storage.
  void lock_shared_nodes() {
    nodes_mutex.lock_shared();
  }
  void unlock_shared_nodes() {
    nodes_mutex.unlock_shared();
  }

  // Exclusive access to the list of roots.
  void lock_roots() {
    roots_mutex.lock();
  }
  void unlock_roots() {
    roots_mutex.unlock();
  }
};
#endif
}
#endif
// vim: tabstop=2 shiftwidth=2
================================================
FILE: src/annoyluamodule.cc
================================================
// Copyright (c) 2016 Boris Nagaev
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.
#include <cstring>
#include <typeinfo>
#include <lua.hpp>
#include "annoylib.h"
#include "kissrandom.h"
#if LUA_VERSION_NUM == 501
#define compat_setfuncs(L, funcs) luaL_register(L, NULL, funcs)
#define compat_rawlen lua_objlen
#else
#define compat_setfuncs(L, funcs) luaL_setfuncs(L, funcs, 0)
#define compat_rawlen lua_rawlen
#endif
using namespace Annoy;
template<typename Distance>
class LuaAnnoy {
public:
typedef int32_t AnnoyS;
typedef float AnnoyT;
typedef AnnoyIndex<AnnoyS, AnnoyT, Distance, Kiss64Random, AnnoyIndexSingleThreadedBuildPolicy> Impl;
typedef LuaAnnoy<Distance> ThisClass;
class LuaArrayProxy {
public:
LuaArrayProxy(lua_State* L, int object, int f)
: L_(L)
, object_(object)
{
luaL_checktype(L, object, LUA_TTABLE);
int v_len = compat_rawlen(L, object);
luaL_argcheck(L, v_len == f, object, "Length of v != f");
}
double operator[](int index) const {
lua_rawgeti(L_, object_, index + 1);
double result = lua_tonumber(L_, -1);
lua_pop(L_, 1);
return result;
}
private:
lua_State* L_;
int object_;
};
static void toVector(lua_State* L, int object, int f, AnnoyT* dst) {
LuaArrayProxy proxy(L, object, f);
for (int i = 0; i < f; i++) {
dst[i] = proxy[i];
}
}
template <typename Vector>
static void pushVector(lua_State* L, const Vector& v) {
lua_createtable(L, v.size(), 0);
for (int j = 0; j < v.size(); j++) {
lua_pushnumber(L, v[j]);
lua_rawseti(L, -2, j + 1);
}
}
static const char* typeAsString() {
return typeid(Impl).name();
}
static Impl* getAnnoy(lua_State* L, int object) {
return reinterpret_cast<Impl*>(
luaL_checkudata(L, object, typeAsString())
);
}
static int getItemIndex(lua_State* L, int object, int size = -1) {
int item = luaL_checkinteger(L, object);
luaL_argcheck(L, item >= 0, object, "Index must be >= 0");
if (size != -1) {
luaL_argcheck(L, item < size, object, "Index must be < size");
}
return item;
}
static int gc(lua_State* L) {
Impl* self = getAnnoy(L, 1);
self->~Impl();
return 0;
}
static int tostring(lua_State* L) {
Impl* self = getAnnoy(L, 1);
lua_pushfstring(
L,
"annoy.AnnoyIndex object (%dx%d, %s distance)",
self->get_n_items(), self->get_f(), Distance::name()
);
return 1;
}
static int add_item(lua_State* L) {
Impl* self = getAnnoy(L, 1);
int item = getItemIndex(L, 2);
self->add_item_impl(item, LuaArrayProxy(L, 3, self->get_f()));
return 0;
}
static int build(lua_State* L) {
int nargs = lua_gettop(L);
Impl* self = getAnnoy(L, 1);
int n_trees = luaL_checkinteger(L, 2);
self->build(n_trees, 1);
lua_pushboolean(L, true);
return 1;
}
static int on_disk_build(lua_State* L) {
Impl* self = getAnnoy(L, 1);
const char* filename = luaL_checkstring(L, 2);
self->on_disk_build(filename);
lua_pushboolean(L, true);
return 1;
}
static int save(lua_State* L) {
int nargs = lua_gettop(L);
Impl* self = getAnnoy(L, 1);
const char* filename = luaL_checkstring(L, 2);
bool prefault = true;
if (nargs >= 3) {
prefault = lua_toboolean(L, 3);
}
self->save(filename, prefault);
lua_pushboolean(L, true);
return 1;
}
static int load(lua_State* L) {
Impl* self = getAnnoy(L, 1);
int nargs = lua_gettop(L);
const char* filename = luaL_checkstring(L, 2);
bool prefault = true;
if (nargs >= 3) {
prefault = lua_toboolean(L, 3);
}
if (!self->load(filename, prefault)) {
return luaL_error(L, "Can't load file: %s", filename);
}
lua_pushboolean(L, true);
return 1;
}
static int unload(lua_State* L) {
Impl* self = getAnnoy(L, 1);
self->unload();
lua_pushboolean(L, true);
return 1;
}
struct Searcher {
std::vector<AnnoyS> result;
std::vector<AnnoyT> distances;
Impl* self;
int n;
int search_k;
bool include_distances;
Searcher(lua_State* L) {
int nargs = lua_gettop(L);
self = getAnnoy(L, 1);
n = luaL_checkinteger(L, 3);
search_k = -1;
if (nargs >= 4) {
search_k = luaL_checkinteger(L, 4);
}
include_distances = false;
if (nargs >= 5) {
include_distances = lua_toboolean(L, 5);
}
}
int pushResults(lua_State* L) {
pushVector(L, result);
if (include_distances) {
pushVector(L, distances);
}
return include_distances ? 2 : 1;
}
};
static int get_nns_by_item(lua_State* L) {
Searcher s(L);
int item = getItemIndex(L, 2, s.self->get_n_items());
s.self->get_nns_by_item(item, s.n, s.search_k, &s.result,
s.include_distances ? &s.distances : NULL);
return s.pushResults(L);
}
static int get_nns_by_vector(lua_State* L) {
Searcher s(L);
std::vector<AnnoyT> _vec(s.self->get_f());
AnnoyT* vec = &(_vec[0]);
toVector(L, 2, s.self->get_f(), vec);
s.self->get_nns_by_vector(vec, s.n, s.search_k, &s.result,
s.include_distances ? &s.distances : NULL);
return s.pushResults(L);
}
static int get_item_vector(lua_State* L) {
Impl* self = getAnnoy(L, 1);
int item = getItemIndex(L, 2, self->get_n_items());
std::vector<AnnoyT> _vec(self->get_f());
AnnoyT* vec = &(_vec[0]);
self->get_item(item, vec);
pushVector(L, _vec);
return 1;
}
static int get_distance(lua_State* L) {
Impl* self = getAnnoy(L, 1);
int i = getItemIndex(L, 2, self->get_n_items());
int j = getItemIndex(L, 3, self->get_n_items());
AnnoyT distance = self->get_distance(i, j);
lua_pushnumber(L, distance);
return 1;
}
static int get_n_items(lua_State* L) {
Impl* self = getAnnoy(L, 1);
lua_pushnumber(L, self->get_n_items());
return 1;
}
static const luaL_Reg* getMetatable() {
static const luaL_Reg funcs[] = {
{"__gc", &ThisClass::gc},
{"__tostring", &ThisClass::tostring},
{NULL, NULL},
};
return funcs;
}
static const luaL_Reg* getMethods() {
static const luaL_Reg funcs[] = {
{"add_item", &ThisClass::add_item},
{"build", &ThisClass::build},
{"save", &ThisClass::save},
{"load", &ThisClass::load},
{"unload", &ThisClass::unload},
{"get_nns_by_item", &ThisClass::get_nns_by_item},
{"get_nns_by_vector", &ThisClass::get_nns_by_vector},
{"get_item_vector", &ThisClass::get_item_vector},
{"get_distance", &ThisClass::get_distance},
{"get_n_items", &ThisClass::get_n_items},
{"on_disk_build", &ThisClass::on_disk_build},
{NULL, NULL},
};
return funcs;
}
static void createNew(lua_State* L, int f) {
void* self = lua_newuserdata(L, sizeof(Impl));
if (luaL_newmetatable(L, typeAsString())) {
compat_setfuncs(L, getMetatable());
lua_newtable(L);
compat_setfuncs(L, getMethods());
lua_setfield(L, -2, "__index");
}
new (self) Impl(f);
lua_setmetatable(L, -2);
}
};
// annoy.AnnoyIndex(f [, metric = "angular"])
// Pushes a new index of dimension `f` for the requested metric, or raises a
// Lua error when the metric name is not one of the supported ones.
static int lua_an_make(lua_State* L) {
  const int dimension = luaL_checkinteger(L, 1);
  const char* metric_name =
      (lua_gettop(L) >= 2) ? luaL_checkstring(L, 2) : "angular";

  if (!strcmp(metric_name, "angular")) {
    LuaAnnoy<Angular>::createNew(L, dimension);
    return 1;
  }
  if (!strcmp(metric_name, "euclidean")) {
    LuaAnnoy<Euclidean>::createNew(L, dimension);
    return 1;
  }
  if (!strcmp(metric_name, "manhattan")) {
    LuaAnnoy<Manhattan>::createNew(L, dimension);
    return 1;
  }
  return luaL_error(L, "Unknown metric: %s", metric_name);
}
// Functions exported by the module table; the list is terminated by a {NULL, NULL} entry.
static const luaL_Reg LUA_ANNOY_FUNCS[] = {
{"AnnoyIndex", lua_an_make},
{NULL, NULL},
};
// C linkage keeps the symbol name unmangled.
extern "C" {
// Module entry point: pushes a new table populated with LUA_ANNOY_FUNCS and returns it.
int luaopen_annoy(lua_State* L) {
lua_newtable(L);
compat_setfuncs(L, LUA_ANNOY_FUNCS);
return 1;
}
}
// vim: tabstop=2 shiftwidth=2
================================================
FILE: src/annoymodule.cc
================================================
// Copyright (c) 2013 Spotify AB
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.
#include "annoylib.h"
#include "kissrandom.h"
#include "Python.h"
#include "structmember.h"
#include <exception>
#if defined(_MSC_VER) && _MSC_VER == 1500
typedef signed __int32 int32_t;
#else
#include <stdint.h>
#endif
#if defined(ANNOYLIB_USE_AVX512)
#define AVX_INFO "Using 512-bit AVX instructions"
#elif defined(ANNOYLIB_USE_AVX128)
#define AVX_INFO "Using 128-bit AVX instructions"
#else
#define AVX_INFO "Not using AVX instructions"
#endif
#if defined(_MSC_VER)
#define COMPILER_INFO "Compiled using MSC"
#elif defined(__GNUC__)
#define COMPILER_INFO "Compiled on GCC"
#else
#define COMPILER_INFO "Compiled on unknown platform"
#endif
#define ANNOY_DOC (COMPILER_INFO ". " AVX_INFO ".")
#if PY_MAJOR_VERSION >= 3
#define IS_PY3K
#endif
#ifndef Py_TYPE
#define Py_TYPE(ob) (((PyObject*)(ob))->ob_type)
#endif
#ifdef IS_PY3K
#define PyInt_FromLong PyLong_FromLong
#endif
using namespace Annoy;
#ifdef ANNOYLIB_MULTITHREADED_BUILD
typedef AnnoyIndexMultiThreadedBuildPolicy AnnoyIndexThreadedBuildPolicy;
#else
typedef AnnoyIndexSingleThreadedBuildPolicy AnnoyIndexThreadedBuildPolicy;
#endif
template class Annoy::AnnoyIndexInterface<int32_t, float>;
class HammingWrapper : public AnnoyIndexInterface<int32_t, float> {
  // Wrapper class for Hamming distance, using composition.
  // This translates binary (float) vectors into packed uint64_t vectors.
  // This is questionable from a performance point of view. Should reconsider this solution.
private:
  // _f_external: number of float components seen by callers.
  // _f_internal: number of 64-bit words needed to hold that many bits.
  int32_t _f_external, _f_internal;
  AnnoyIndex<int32_t, uint64_t, Hamming, Kiss64Random, AnnoyIndexThreadedBuildPolicy> _index;

  // Packs `_f_external` floats into `_f_internal` words: bit j of word i is
  // set when src[i * 64 + j] > 0.5. Unused high bits of the last word stay 0.
  void _pack(const float* src, uint64_t* dst) const {
    for (int32_t i = 0; i < _f_internal; i++) {
      dst[i] = 0;
      for (int32_t j = 0; j < 64 && i*64+j < _f_external; j++) {
        dst[i] |= (uint64_t)(src[i * 64 + j] > 0.5) << j;
      }
    }
  }

  // Inverse of _pack: writes 0.0f or 1.0f for each of the `_f_external` bits.
  void _unpack(const uint64_t* src, float* dst) const {
    for (int32_t i = 0; i < _f_external; i++) {
      dst[i] = (src[i / 64] >> (i % 64)) & 1;
    }
  }

public:
  // `f` is the number of binary components per vector.
  HammingWrapper(int f) : _f_external(f), _f_internal((f + 63) / 64), _index((f + 63) / 64) {}

  bool add_item(int32_t item, const float* w, char**error) {
    vector<uint64_t> w_internal(_f_internal, 0);
    _pack(w, &w_internal[0]);
    return _index.add_item(item, &w_internal[0], error);
  }
  bool build(int q, int n_threads, char** error) { return _index.build(q, n_threads, error); }
  bool unbuild(char** error) { return _index.unbuild(error); }
  bool save(const char* filename, bool prefault, char** error) { return _index.save(filename, prefault, error); }
  void unload() { _index.unload(); }
  bool load(const char* filename, bool prefault, char** error) { return _index.load(filename, prefault, error); }
  float get_distance(int32_t i, int32_t j) const { return _index.get_distance(i, j); }

  // Forwards to the packed index. Integer distances are converted to float
  // and appended to `distances`, matching how ids are appended to `result`
  // (inserting at the front would misalign the two if they were non-empty).
  void get_nns_by_item(int32_t item, size_t n, int search_k, vector<int32_t>* result, vector<float>* distances) const {
    if (distances) {
      vector<uint64_t> distances_internal;
      _index.get_nns_by_item(item, n, search_k, result, &distances_internal);
      distances->insert(distances->end(), distances_internal.begin(), distances_internal.end());
    } else {
      _index.get_nns_by_item(item, n, search_k, result, NULL);
    }
  }

  // Same as get_nns_by_item, but the query is a float vector packed on the fly.
  void get_nns_by_vector(const float* w, size_t n, int search_k, vector<int32_t>* result, vector<float>* distances) const {
    vector<uint64_t> w_internal(_f_internal, 0);
    _pack(w, &w_internal[0]);
    if (distances) {
      vector<uint64_t> distances_internal;
      _index.get_nns_by_vector(&w_internal[0], n, search_k, result, &distances_internal);
      distances->insert(distances->end(), distances_internal.begin(), distances_internal.end());
    } else {
      _index.get_nns_by_vector(&w_internal[0], n, search_k, result, NULL);
    }
  }

  int32_t get_n_items() const { return _index.get_n_items(); }
  int32_t get_n_trees() const { return _index.get_n_trees(); }
  void verbose(bool v) { _index.verbose(v); }

  // Writes the stored item back as `_f_external` floats (0 or 1) into `v`.
  void get_item(int32_t item, float* v) const {
    vector<uint64_t> v_internal(_f_internal, 0);
    _index.get_item(item, &v_internal[0]);
    _unpack(&v_internal[0], v);
  }
  void set_seed(uint64_t q) { _index.set_seed(q); }
  bool on_disk_build(const char* filename, char** error) { return _index.on_disk_build(filename, error); }
};
// annoy python object
// annoy python object
// Instance layout of the Python-level Annoy type.
typedef struct {
PyObject_HEAD
// Vector dimension as passed to the constructor; exposed to Python as member "f".
int f;
// Owned index implementation; allocated in py_an_new and deleted in py_an_dealloc.
AnnoyIndexInterface<int32_t, float>* ptr;
} py_annoy;
// tp_new for the Annoy type: Annoy(f, metric=None).
//
// Allocates the Python object and creates the C++ index matching `metric`
// ("angular", "euclidean", "manhattan", "hamming" or "dot"). When `metric`
// is omitted a FutureWarning is issued and "angular" is used.
// Returns NULL with an exception set on failure; the partially built object
// is released on every error path so it does not leak.
static PyObject *
py_an_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) {
  py_annoy *self = (py_annoy *)type->tp_alloc(type, 0);
  if (self == NULL) {
    return NULL;
  }
  // Make sure py_an_dealloc sees a well-defined pointer if we bail out early.
  self->ptr = NULL;
  const char *metric = NULL;

  static char const * kwlist[] = {"f", "metric", NULL};
  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "i|s", (char**)kwlist, &self->f, &metric)) {
    Py_DECREF((PyObject *)self);
    return NULL;
  }
  if (!metric) {
    // This keeps coming up, see #368 etc
    // PyErr_WarnEx returns -1 when the warning was turned into an exception.
    if (PyErr_WarnEx(PyExc_FutureWarning, "The default argument for metric will be removed "
                     "in future version of Annoy. Please pass metric='angular' explicitly.", 1) < 0) {
      Py_DECREF((PyObject *)self);
      return NULL;
    }
    self->ptr = new AnnoyIndex<int32_t, float, Angular, Kiss64Random, AnnoyIndexThreadedBuildPolicy>(self->f);
  } else if (!strcmp(metric, "angular")) {
    self->ptr = new AnnoyIndex<int32_t, float, Angular, Kiss64Random, AnnoyIndexThreadedBuildPolicy>(self->f);
  } else if (!strcmp(metric, "euclidean")) {
    self->ptr = new AnnoyIndex<int32_t, float, Euclidean, Kiss64Random, AnnoyIndexThreadedBuildPolicy>(self->f);
  } else if (!strcmp(metric, "manhattan")) {
    self->ptr = new AnnoyIndex<int32_t, float, Manhattan, Kiss64Random, AnnoyIndexThreadedBuildPolicy>(self->f);
  } else if (!strcmp(metric, "hamming")) {
    self->ptr = new HammingWrapper(self->f);
  } else if (!strcmp(metric, "dot")) {
    self->ptr = new AnnoyIndex<int32_t, float, DotProduct, Kiss64Random, AnnoyIndexThreadedBuildPolicy>(self->f);
  } else {
    PyErr_SetString(PyExc_ValueError, "No such metric");
    Py_DECREF((PyObject *)self);
    return NULL;
  }

  return (PyObject *)self;
}
// tp_init for the Annoy type.
//
// All real construction happens in py_an_new; this only re-validates the
// arguments so that __init__ accepts the same signature (f, metric=None).
// Returns 0 on success and -1 with an exception set when parsing fails.
static int
py_an_init(py_annoy *self, PyObject *args, PyObject *kwargs) {
  // Seems to be needed for Python 3
  const char *metric = NULL;
  int f;
  static char const * kwlist[] = {"f", "metric", NULL};
  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "i|s", (char**)kwlist, &f, &metric))
    return -1;  // (int) NULL is 0, which would have signalled success
  return 0;
}
// tp_dealloc: deletes the owned C++ index, then frees the Python object
// through the type's tp_free slot.
static void
py_an_dealloc(py_annoy* self) {
delete self->ptr;
Py_TYPE(self)->tp_free((PyObject*)self);
}
// Data members exposed to Python: "f" maps to the int field py_annoy::f,
// with flags 0 and an empty docstring.
static PyMemberDef py_annoy_members[] = {
{(char*)"f", T_INT, offsetof(py_annoy, f), 0,
(char*)""},
{NULL} /* Sentinel */
};
// Annoy.load(fn, prefault=False) -> True
// Loads an index from the file `fn`. Raises IOError with the message
// reported by the index when loading fails.
static PyObject *
py_an_load(py_annoy *self, PyObject *args, PyObject *kwargs) {
  char *filename;
  char *error = NULL;
  // The "b" format unit stores an unsigned char, so do not point it at a bool.
  unsigned char prefault = 0;
  if (!self->ptr) {
    // Returning NULL without an exception set would raise SystemError.
    PyErr_SetString(PyExc_RuntimeError, "Annoy index is not initialized");
    return NULL;
  }
  static char const * kwlist[] = {"fn", "prefault", NULL};
  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|b", (char**)kwlist, &filename, &prefault))
    return NULL;

  if (!self->ptr->load(filename, prefault != 0, &error)) {
    PyErr_SetString(PyExc_IOError, error ? error : "Unable to load index");
    free(error);
    return NULL;
  }
  Py_RETURN_TRUE;
}
// Annoy.save(fn, prefault=False) -> True
// Saves the index to the file `fn`. Raises IOError with the message
// reported by the index when saving fails.
static PyObject *
py_an_save(py_annoy *self, PyObject *args, PyObject *kwargs) {
  char *filename;
  char *error = NULL;
  // The "b" format unit stores an unsigned char, so do not point it at a bool.
  unsigned char prefault = 0;
  if (!self->ptr) {
    // Returning NULL without an exception set would raise SystemError.
    PyErr_SetString(PyExc_RuntimeError, "Annoy index is not initialized");
    return NULL;
  }
  static char const * kwlist[] = {"fn", "prefault", NULL};
  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s|b", (char**)kwlist, &filename, &prefault))
    return NULL;

  if (!self->ptr->save(filename, prefault != 0, &error)) {
    PyErr_SetString(PyExc_IOError, error ? error : "Unable to save index");
    free(error);
    return NULL;
  }
  Py_RETURN_TRUE;
}
// Converts a nearest-neighbour query result into Python objects.
//
// Returns a new list of item ids when `include_distances` is 0, otherwise a
// new 2-tuple (ids, distances). Returns NULL if any allocation fails, after
// releasing whatever had been created so far.
PyObject*
get_nns_to_python(const vector<int32_t>& result, const vector<float>& distances, int include_distances) {
  PyObject* ids = PyList_New(result.size());
  if (ids == NULL) {
    return NULL;
  }
  for (size_t k = 0; k < result.size(); k++) {
    PyObject* id = PyInt_FromLong(result[k]);
    if (id == NULL) {
      Py_DECREF(ids);
      return NULL;
    }
    PyList_SetItem(ids, k, id);  // steals the reference to `id`
  }
  if (!include_distances) {
    return ids;
  }

  PyObject* dists = PyList_New(distances.size());
  if (dists == NULL) {
    Py_DECREF(ids);
    return NULL;
  }
  for (size_t k = 0; k < distances.size(); k++) {
    PyObject* value = PyFloat_FromDouble(distances[k]);
    if (value == NULL) {
      Py_DECREF(ids);
      Py_DECREF(dists);
      return NULL;
    }
    PyList_SetItem(dists, k, value);  // steals the reference to `value`
  }

  // PyTuple_Pack takes its own references, so ours are dropped whether or
  // not the tuple could be created.
  PyObject* pair = PyTuple_Pack(2, ids, dists);
  Py_DECREF(ids);
  Py_DECREF(dists);
  return pair;
}
// Validates an item index coming from Python.
// Negative indices are always rejected; unless `building` is true the index
// must also be below the current number of items. On rejection an IndexError
// is set and false is returned.
bool check_constraints(py_annoy *self, int32_t item, bool building) {
  if (item < 0) {
    PyErr_SetString(PyExc_IndexError, "Item index can not be negative");
    return false;
  }
  const bool must_exist = !building;
  if (must_exist && item >= self->ptr->get_n_items()) {
    PyErr_SetString(PyExc_IndexError, "Item index larger than the largest item index");
    return false;
  }
  return true;
}
// Annoy.get_nns_by_item(i, n, search_k=-1, include_distances=False)
// Returns the ids of the items closest to item `i`, or a tuple
// (ids, distances) when `include_distances` is true. The GIL is released
// while the index is searched.
static PyObject*
py_an_get_nns_by_item(py_annoy *self, PyObject *args, PyObject *kwargs) {
  int32_t item, n, search_k=-1, include_distances=0;
  if (!self->ptr) {
    // Returning NULL without an exception set would raise SystemError.
    PyErr_SetString(PyExc_RuntimeError, "Annoy index is not initialized");
    return NULL;
  }

  static char const * kwlist[] = {"i", "n", "search_k", "include_distances", NULL};
  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "ii|ii", (char**)kwlist, &item, &n, &search_k, &include_distances))
    return NULL;

  if (!check_constraints(self, item, false)) {
    return NULL;
  }

  vector<int32_t> result;
  vector<float> distances;

  Py_BEGIN_ALLOW_THREADS;
  self->ptr->get_nns_by_item(item, n, search_k, &result, include_distances ? &distances : NULL);
  Py_END_ALLOW_THREADS;

  return get_nns_to_python(result, distances, include_distances);
}
// Copies the `f` elements of the Python object `v` into `*w` as floats.
//
// `v` may be any object supporting len() and integer indexing. `*w` must
// already hold at least `f` elements. Returns false with a Python exception
// set if the length differs from `f`, an element cannot be fetched, or an
// element cannot be converted to a float.
bool
convert_list_to_vector(PyObject* v, int f, vector<float>* w) {
  Py_ssize_t length = PyObject_Size(v);
  if (length == -1) {
    return false;
  }
  if (length != f) {
    // %zd is the format for Py_ssize_t; %ld is wrong where long is 32 bits
    // and Py_ssize_t is 64 bits (e.g. 64-bit Windows).
    PyErr_Format(PyExc_IndexError, "Vector has wrong length (expected %d, got %zd)", f, length);
    return false;
  }

  for (int z = 0; z < f; z++) {
    PyObject *key = PyInt_FromLong(z);
    if (key == NULL) {
      return false;
    }
    PyObject *pf = PyObject_GetItem(v, key);
    Py_DECREF(key);
    if (pf == NULL) {
      return false;
    }
    double value = PyFloat_AsDouble(pf);
    Py_DECREF(pf);
    // -1.0 is also a legitimate value, so an error is only signalled when
    // an exception is pending as well.
    if (value == -1.0 && PyErr_Occurred()) {
      return false;
    }
    (*w)[z] = value;
  }
  return true;
}
// Annoy.get_nns_by_vector(vector, n, search_k=-1, include_distances=False)
// Returns the ids of the items closest to `vector`, or a tuple
// (ids, distances) when `include_distances` is true. The GIL is released
// while the index is searched.
static PyObject*
py_an_get_nns_by_vector(py_annoy *self, PyObject *args, PyObject *kwargs) {
  PyObject* v;
  int32_t n, search_k=-1, include_distances=0;
  if (!self->ptr) {
    // Returning NULL without an exception set would raise SystemError.
    PyErr_SetString(PyExc_RuntimeError, "Annoy index is not initialized");
    return NULL;
  }

  static char const * kwlist[] = {"vector", "n", "search_k", "include_distances", NULL};
  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "Oi|ii", (char**)kwlist, &v, &n, &search_k, &include_distances))
    return NULL;

  vector<float> w(self->f);
  if (!convert_list_to_vector(v, self->f, &w)) {
    return NULL;
  }

  vector<int32_t> result;
  vector<float> distances;

  Py_BEGIN_ALLOW_THREADS;
  self->ptr->get_nns_by_vector(&w[0], n, search_k, &result, include_distances ? &distances : NULL);
  Py_END_ALLOW_THREADS;

  return get_nns_to_python(result, distances, include_distances);
}
// Annoy.get_item_vector(i) -> list of f floats
// Returns the vector stored for item `i`.
static PyObject*
py_an_get_item_vector(py_annoy *self, PyObject *args) {
  int32_t item;
  if (!self->ptr) {
    // Returning NULL without an exception set would raise SystemError.
    PyErr_SetString(PyExc_RuntimeError, "Annoy index is not initialized");
    return NULL;
  }
  if (!PyArg_ParseTuple(args, "i", &item))
    return NULL;

  if (!check_constraints(self, item, false)) {
    return NULL;
  }

  vector<float> v(self->f);
  self->ptr->get_item(item, &v[0]);
  PyObject* l = PyList_New(self->f);
  if (l == NULL) {
    return NULL;
  }
  for (int z = 0; z < self->f; z++) {
    PyObject* dist = PyFloat_FromDouble(v[z]);
    if (dist == NULL) {
      // Releasing the list also releases the elements stored so far.
      Py_DECREF(l);
      return NULL;
    }
    PyList_SetItem(l, z, dist);  // steals the reference to `dist`
  }

  return l;
}
// Annoy.add_item(i, vector) -> None
// Adds item `i` with the given vector. Raises Exception with the message
// reported by the index when the item cannot be added.
static PyObject*
py_an_add_item(py_annoy *self, PyObject *args, PyObject* kwargs) {
  PyObject* v;
  int32_t item;
  if (!self->ptr) {
    // Returning NULL without an exception set would raise SystemError.
    PyErr_SetString(PyExc_RuntimeError, "Annoy index is not initialized");
    return NULL;
  }
  static char const * kwlist[] = {"i", "vector", NULL};
  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "iO", (char**)kwlist, &item, &v))
    return NULL;

  if (!check_constraints(self, item, true)) {
    return NULL;
  }

  vector<float> w(self->f);
  if (!convert_list_to_vector(v, self->f, &w)) {
    return NULL;
  }
  // Initialised so that a failure which sets no message cannot hand an
  // indeterminate pointer to PyErr_SetString/free.
  char* error = NULL;
  if (!self->ptr->add_item(item, &w[0], &error)) {
    PyErr_SetString(PyExc_Exception, error ? error : "Unable to add item");
    free(error);
    return NULL;
  }

  Py_RETURN_NONE;
}
// Annoy.on_disk_build(fn) -> True
// Switches the index to building with storage in the file `fn`.
// Raises IOError with the message reported by the index on failure.
static PyObject *
py_an_on_disk_build(py_annoy *self, PyObject *args, PyObject *kwargs) {
  char *filename;
  // Initialised so that a failure which sets no message cannot hand an
  // indeterminate pointer to PyErr_SetString/free.
  char *error = NULL;
  if (!self->ptr) {
    // Returning NULL without an exception set would raise SystemError.
    PyErr_SetString(PyExc_RuntimeError, "Annoy index is not initialized");
    return NULL;
  }
  static char const * kwlist[] = {"fn", NULL};
  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "s", (char**)kwlist, &filename))
    return NULL;

  if (!self->ptr->on_disk_build(filename, &error)) {
    PyErr_SetString(PyExc_IOError, error ? error : "Unable to build on disk");
    free(error);
    return NULL;
  }
  Py_RETURN_TRUE;
}
// Annoy.build(n_trees, n_jobs=-1) -> True
// Builds the forest. The GIL is released while the index is being built.
// Raises Exception with the message reported by the index on failure.
static PyObject *
py_an_build(py_annoy *self, PyObject *args, PyObject *kwargs) {
  int q;
  int n_jobs = -1;
  if (!self->ptr) {
    // Returning NULL without an exception set would raise SystemError.
    PyErr_SetString(PyExc_RuntimeError, "Annoy index is not initialized");
    return NULL;
  }
  static char const * kwlist[] = {"n_trees", "n_jobs", NULL};
  if (!PyArg_ParseTupleAndKeywords(args, kwargs, "i|i", (char**)kwlist, &q, &n_jobs))
    return NULL;

  bool res;
  // Initialised so that a failure which sets no message cannot hand an
  // indeterminate pointer to PyErr_SetString/free.
  char* error = NULL;
  Py_BEGIN_ALLOW_THREADS;
  res = self->ptr->build(q, n_jobs, &error);
  Py_END_ALLOW_THREADS;
  if (!res) {
    PyErr_SetString(PyExc_Exception, error ? error : "Unable to build index");
    free(error);
    return NULL;
  }

  Py_RETURN_TRUE;
}
// Annoy.unbuild() -> True
// Raises Exception with the message reported by the index on failure.
//
// METH_NOARGS handlers are invoked with two arguments (self and an unused
// second one), so the signature declares both.
static PyObject *
py_an_unbuild(py_annoy *self, PyObject * /* unused */) {
  if (!self->ptr) {
    // Returning NULL without an exception set would raise SystemError.
    PyErr_SetString(PyExc_RuntimeError, "Annoy index is not initialized");
    return NULL;
  }

  // Initialised so that a failure which sets no message cannot hand an
  // indeterminate pointer to PyErr_SetString/free.
  char* error = NULL;
  if (!self->ptr->unbuild(&error)) {
    PyErr_SetString(PyExc_Exception, error ? error : "Unable to unbuild index");
    free(error);
    return NULL;
  }

  Py_RETURN_TRUE;
}
// Annoy.unload() -> True
//
// METH_NOARGS handlers are invoked with two arguments (self and an unused
// second one), so the signature declares both.
static PyObject *
py_an_unload(py_annoy *self, PyObject * /* unused */) {
  if (!self->ptr) {
    // Returning NULL without an exception set would raise SystemError.
    PyErr_SetString(PyExc_RuntimeError, "Annoy index is not initialized");
    return NULL;
  }

  self->ptr->unload();

  Py_RETURN_TRUE;
}
// Annoy.get_distance(i, j) -> float
// Returns the distance between items `i` and `j`; both must be valid indices.
static PyObject *
py_an_get_distance(py_annoy *self, PyObject *args) {
  int32_t i, j;
  if (!self->ptr) {
    // Returning NULL without an exception set would raise SystemError.
    PyErr_SetString(PyExc_RuntimeError, "Annoy index is not initialized");
    return NULL;
  }
  if (!PyArg_ParseTuple(args, "ii", &i, &j))
    return NULL;

  if (!check_constraints(self, i, false) || !check_constraints(self, j, false)) {
    return NULL;
  }

  double d = self->ptr->get_distance(i,j);
  return PyFloat_FromDouble(d);
}
// Annoy.get_n_items() -> int
//
// METH_NOARGS handlers are invoked with two arguments (self and an unused
// second one), so the signature declares both.
static PyObject *
py_an_get_n_items(py_annoy *self, PyObject * /* unused */) {
  if (!self->ptr) {
    // Returning NULL without an exception set would raise SystemError.
    PyErr_SetString(PyExc_RuntimeError, "Annoy index is not initialized");
    return NULL;
  }

  int32_t n = self->ptr->get_n_items();
  return PyInt_FromLong(n);
}
// Annoy.get_n_trees() -> int
//
// METH_NOARGS handlers are invoked with two arguments (self and an unused
// second one), so the signature declares both.
static PyObject *
py_an_get_n_trees(py_annoy *self, PyObject * /* unused */) {
  if (!self->ptr) {
    // Returning NULL without an exception set would raise SystemError.
    PyErr_SetString(PyExc_RuntimeError, "Annoy index is not initialized");
    return NULL;
  }

  int32_t n = self->ptr->get_n_trees();
  return PyInt_FromLong(n);
}
// Annoy.verbose(v) -> True
// Forwards the flag to the index; any non-zero integer enables it.
static PyObject *
py_an_verbose(py_annoy *self, PyObject *args) {
  int verbose;
  if (!self->ptr) {
    // Returning NULL without an exception set would raise SystemError.
    PyErr_SetString(PyExc_RuntimeError, "Annoy index is not initialized");
    return NULL;
  }
  if (!PyArg_ParseTuple(args, "i", &verbose))
    return NULL;

  self->ptr->verbose(verbose != 0);

  Py_RETURN_TRUE;
}
// Annoy.set_seed(seed) -> None
//
// The generator takes a 64-bit unsigned seed, so the argument is parsed with
// the "K" format unit (unsigned long long, no overflow checking) instead of
// a 32-bit int. Values that fitted in an int before map to the same 64-bit
// seed as they did with the old "i" parsing (negative values wrap the same way).
static PyObject *
py_an_set_seed(py_annoy *self, PyObject *args) {
  unsigned long long q;
  if (!self->ptr) {
    // Returning NULL without an exception set would raise SystemError.
    PyErr_SetString(PyExc_RuntimeError, "Annoy index is not initialized");
    return NULL;
  }
  if (!PyArg_ParseTuple(args, "K", &q))
    return NULL;

  self->ptr->set_seed((uint64_t)q);

  Py_RETURN_NONE;
}
// Method table of the Annoy type: Python name, C handler, calling convention
// and docstring. Terminated by an all-NULL sentinel entry.
static PyMethodDef AnnoyMethods[] = {
  {"load",	(PyCFunction)py_an_load, METH_VARARGS | METH_KEYWORDS, "Loads (mmaps) an index from disk."},
  {"save",	(PyCFunction)py_an_save, METH_VARARGS | METH_KEYWORDS, "Saves the index to disk."},
  {"get_nns_by_item",(PyCFunction)py_an_get_nns_by_item, METH_VARARGS | METH_KEYWORDS, "Returns the `n` closest items to item `i`.\n\n:param search_k: the query will inspect up to `search_k` nodes.\n`search_k` gives you a run-time tradeoff between better accuracy and speed.\n`search_k` defaults to `n_trees * n` if not provided.\n\n:param include_distances: If `True`, this function will return a\n2 element tuple of lists. The first list contains the `n` closest items.\nThe second list contains the corresponding distances."},
  {"get_nns_by_vector",(PyCFunction)py_an_get_nns_by_vector, METH_VARARGS | METH_KEYWORDS, "Returns the `n` closest items to vector `vector`.\n\n:param search_k: the query will inspect up to `search_k` nodes.\n`search_k` gives you a run-time tradeoff between better accuracy and speed.\n`search_k` defaults to `n_trees * n` if not provided.\n\n:param include_distances: If `True`, this function will return a\n2 element tuple of lists. The first list contains the `n` closest items.\nThe second list contains the corresponding distances."},
  {"get_item_vector",(PyCFunction)py_an_get_item_vector, METH_VARARGS, "Returns the vector for item `i` that was previously added."},
  {"add_item",(PyCFunction)py_an_add_item, METH_VARARGS | METH_KEYWORDS, "Adds item `i` (any nonnegative integer) with vector `v`.\n\nNote that it will allocate memory for `max(i)+1` items."},
  {"on_disk_build",(PyCFunction)py_an_on_disk_build, METH_VARARGS | METH_KEYWORDS, "Build will be performed with storage on disk instead of RAM."},
  {"build",(PyCFunction)py_an_build, METH_VARARGS | METH_KEYWORDS, "Builds a forest of `n_trees` trees.\n\nMore trees give higher precision when querying. After calling `build`,\nno more items can be added. `n_jobs` specifies the number of threads used to build the trees. `n_jobs=-1` uses all available CPU cores."},
  {"unbuild",(PyCFunction)py_an_unbuild, METH_NOARGS, "Unbuilds the tree in order to allow adding new items.\n\nbuild() has to be called again afterwards in order to\nrun queries."},
  {"unload",(PyCFunction)py_an_unload, METH_NOARGS, "Unloads an index from disk."},
  {"get_distance",(PyCFunction)py_an_get_distance, METH_VARARGS, "Returns the distance between items `i` and `j`."},
  {"get_n_items",(PyCFunction)py_an_get_n_items, METH_NOARGS, "Returns the number of items in the index."},
  {"get_n_trees",(PyCFunction)py_an_get_n_trees, METH_NOARGS, "Returns the number of trees in the index."},
  {"verbose",(PyCFunction)py_an_verbose, METH_VARARGS, ""},
  {"set_seed",(PyCFunction)py_an_set_seed, METH_VARARGS, "Sets the seed of Annoy's random number generator."},
  {NULL, NULL, 0, NULL}		 /* Sentinel */
};
// Type object for the Python-level Annoy class. Only the slots used by this
// module are filled in: deallocation, flags (default + subclassable), the
// docstring, the method and member tables, and tp_init/tp_new.
static PyTypeObject PyAnnoyType = {
PyVarObject_HEAD_INIT(NULL, 0)
"annoy.Annoy", /*tp_name*/
sizeof(py_annoy), /*tp_basicsize*/
0, /*tp_itemsize*/
(destructor)py_an_dealloc, /*tp_dealloc*/
0, /*tp_print*/
0, /*tp_getattr*/
0, /*tp_setattr*/
0, /*tp_compare*/
0, /*tp_repr*/
0, /*tp_as_number*/
0, /*tp_as_sequence*/
0, /*tp_as_mapping*/
0, /*tp_hash */
0, /*tp_call*/
0, /*tp_str*/
0, /*tp_getattro*/
0, /*tp_setattro*/
0, /*tp_as_buffer*/
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/
ANNOY_DOC, /* tp_doc */
0, /* tp_traverse */
0, /* tp_clear */
0, /* tp_richcompare */
0, /* tp_weaklistoffset */
0, /* tp_iter */
0, /* tp_iternext */
AnnoyMethods, /* tp_methods */
py_annoy_members, /* tp_members */
0, /* tp_getset */
0, /* tp_base */
0, /* tp_dict */
0, /* tp_descr_get */
0, /* tp_descr_set */
0, /* tp_dictoffset */
(initproc)py_an_init, /* tp_init */
0, /* tp_alloc */
py_an_new, /* tp_new */
};
// The module exposes no module-level functions; the table holds only the sentinel.
static PyMethodDef module_methods[] = {
{NULL} /* Sentinel */
};
#if PY_MAJOR_VERSION >= 3
// Python 3 module definition for "annoylib"; m_size is -1 and the optional
// reload/traverse/clear/free hooks are unused.
static struct PyModuleDef moduledef = {
PyModuleDef_HEAD_INIT,
"annoylib", /* m_name */
ANNOY_DOC, /* m_doc */
-1, /* m_size */
module_methods, /* m_methods */
NULL, /* m_reload */
NULL, /* m_traverse */
NULL, /* m_clear */
NULL, /* m_free */
};
#endif
// Creates the "annoylib" module and registers the Annoy type in it.
// Returns the module object, or NULL with an exception set on failure.
PyObject *create_module(void) {
  PyObject *m;

  if (PyType_Ready(&PyAnnoyType) < 0)
    return NULL;

#if PY_MAJOR_VERSION >= 3
  m = PyModule_Create(&moduledef);
#else
  m = Py_InitModule("annoylib", module_methods);
#endif

  if (m == NULL)
    return NULL;

  Py_INCREF(&PyAnnoyType);
  // PyModule_AddObject steals the reference only on success, so on failure
  // the reference taken above has to be given back here.
  if (PyModule_AddObject(m, "Annoy", (PyObject *)&PyAnnoyType) < 0) {
    Py_DECREF(&PyAnnoyType);
#if PY_MAJOR_VERSION >= 3
    // PyModule_Create returned a new reference; Py_InitModule's is borrowed.
    Py_DECREF(m);
#endif
    return NULL;
  }
  return m;
}
// Interpreter entry points for the extension module "annoylib".
#if PY_MAJOR_VERSION >= 3
PyMODINIT_FUNC PyInit_annoylib(void) {
return create_module(); // Python 3 expects the module object to be returned
}
#else
// Python 2 init functions return nothing; the result of create_module() is discarded.
PyMODINIT_FUNC initannoylib(void) {
create_module();
}
#endif
// vim: tabstop=2 shiftwidth=2
================================================
FILE: src/kissrandom.h
================================================
#ifndef ANNOY_KISSRANDOM_H
#define ANNOY_KISSRANDOM_H
#if defined(_MSC_VER) && _MSC_VER == 1500
typedef unsigned __int32 uint32_t;
typedef unsigned __int64 uint64_t;
#else
#include <stdint.h>
#endif
namespace Annoy {
// KISS = "keep it simple, stupid", but high quality random number generator
// http://www0.cs.ucl.ac.uk/staff/d.jones/GoodPracticeRNG.pdf -> "Use a good RNG and build it into your code"
// http://mathforum.org/kb/message.jspa?messageID=6627731
// https://de.wikipedia.org/wiki/KISS_(Zufallszahlengenerator)
// 32 bit KISS
// 32-bit KISS generator: each draw adds the outputs of a linear congruential
// generator (x), an xorshift generator (y) and a multiply-with-carry
// generator (z with carry c).
struct Kiss32Random {
  uint32_t x;
  uint32_t y;
  uint32_t z;
  uint32_t c;

  static const uint32_t default_seed = 123456789;
#if __cplusplus < 201103L
  typedef uint32_t seed_type;
#endif

  // seed must be != 0
  // The seed only initialises x; y, z and c start from fixed constants.
  Kiss32Random(uint32_t seed = default_seed)
    : x(seed), y(362436000), z(521288629), c(7654321) {}

  // Advances all three component generators and returns their sum.
  uint32_t kiss() {
    // Linear congruence generator
    x = 69069 * x + 12345;

    // Xor shift
    y ^= y << 13;
    y ^= y >> 17;
    y ^= y << 5;

    // Multiply-with-carry: high half becomes the new carry, low half the new z
    const uint64_t product = 698769069ULL * z + c;
    c = product >> 32;
    z = (uint32_t) product;

    return x + y + z;
  }

  // Draw random 0 or 1
  inline int flip() {
    const uint32_t draw = kiss();
    return draw & 1;
  }

  // Draw random integer between 0 and n-1 where n is at most the number of data points you have
  inline size_t index(size_t n) {
    const uint32_t draw = kiss();
    return draw % n;
  }

  // Replaces x with `seed`; y, z and c are left as they are.
  inline void set_seed(uint32_t seed) {
    x = seed;
  }
};
// 64 bit KISS. Use this if you have more than about 2^24 data points ("big data" ;) )
// 64-bit KISS generator: each draw adds the outputs of a multiply-with-carry
// generator (x with carry c), an xorshift generator (y) and a linear
// congruential generator (z).
struct Kiss64Random {
  uint64_t x;
  uint64_t y;
  uint64_t z;
  uint64_t c;

  static const uint64_t default_seed = 1234567890987654321ULL;
#if __cplusplus < 201103L
  typedef uint64_t seed_type;
#endif

  // seed must be != 0
  // The seed only initialises x; y, z and c start from fixed constants.
  Kiss64Random(uint64_t seed = default_seed)
    : x(seed),
      y(362436362436362436ULL),
      z(1066149217761810ULL),
      c(123456123456123456ULL) {}

  // Advances all three component generators and returns their sum.
  uint64_t kiss() {
    // Linear congruence generator
    z = 6906969069LL*z+1234567;

    // Xor shift
    y ^= (y<<13);
    y ^= (y>>17);
    y ^= (y<<43);

    // Multiply-with-carry (uint128_t t = (2^58 + 1) * x + c; c = t >> 64; x = (uint64_t) t)
    const uint64_t low_part = (x<<58)+c;
    c = (x>>6);
    x += low_part;
    c += (x<low_part);  // carry out of the 64-bit addition above

    return x + y + z;
  }

  // Draw random 0 or 1
  inline int flip() {
    const uint64_t draw = kiss();
    return draw & 1;
  }

  // Draw random integer between 0 and n-1 where n is at most the number of data points you have
  inline size_t index(size_t n) {
    const uint64_t draw = kiss();
    return draw % n;
  }

  // Replaces x with `seed`; y, z and c are left as they are.
  inline void set_seed(uint64_t seed) {
    x = seed;
  }
};
}
#endif
// vim: tabstop=2 shiftwidth=2
================================================
FILE: src/mman.h
================================================
// This is from https://code.google.com/p/mman-win32/
//
// Licensed under MIT
#ifndef _MMAN_WIN32_H
#define _MMAN_WIN32_H
#ifndef _WIN32_WINNT // Allow use of features specific to Windows XP or later.
#define _WIN32_WINNT 0x0501 // Change this to the appropriate value to target other versions of Windows.
#endif
#include <sys/types.h>
#include <windows.h>
#include <errno.h>
#include <io.h>
/* Protection bits passed as `prot` to mmap()/mprotect(); they are translated
 * to Windows page/file-mapping access values by the __map_mmap_prot_* helpers
 * below. */
#define PROT_NONE 0
#define PROT_READ 1
#define PROT_WRITE 2
#define PROT_EXEC 4
/* Mapping flags passed as `flags` to mmap(). Of these, mmap() below only
 * inspects MAP_FIXED (rejected) and MAP_ANONYMOUS (no backing file). */
#define MAP_FILE 0
#define MAP_SHARED 1
#define MAP_PRIVATE 2
#define MAP_TYPE 0xf
#define MAP_FIXED 0x10
#define MAP_ANONYMOUS 0x20
#define MAP_ANON MAP_ANONYMOUS
/* Value returned by mmap() on failure. */
#define MAP_FAILED ((void *)-1)
/* Flags for msync. */
/* NOTE: msync() in this header does not look at its flags argument. */
#define MS_ASYNC 1
#define MS_SYNC 2
#define MS_INVALIDATE 4
/* Fallback definition, used only when the included headers do not already
 * provide FILE_MAP_EXECUTE. */
#ifndef FILE_MAP_EXECUTE
#define FILE_MAP_EXECUTE 0x0020
#endif
/* Converts a Windows error code into the value stored in errno by the
 * wrappers in this header. Currently the code is passed through unchanged
 * (0 stays 0); `deferr` is accepted but not used. */
static int __map_mman_error(const DWORD err, const int deferr)
{
    //TODO: implement
    return (err != 0) ? (int)err : 0;
}
/* Translates PROT_* bits into a Windows page-protection constant.
 * PROT_NONE yields 0; otherwise the result depends only on whether
 * PROT_EXEC and PROT_WRITE are set (PROT_READ is implied). */
static DWORD __map_mmap_prot_page(const int prot)
{
    const int wantsWrite = (prot & PROT_WRITE) != 0;
    const int wantsExec = (prot & PROT_EXEC) != 0;

    if (prot == PROT_NONE)
        return 0;

    if (wantsExec)
        return wantsWrite ? PAGE_EXECUTE_READWRITE : PAGE_EXECUTE_READ;

    return wantsWrite ? PAGE_READWRITE : PAGE_READONLY;
}
/* Translates PROT_* bits into the FILE_MAP_* access mask handed to
 * MapViewOfFile. Each PROT bit contributes its FILE_MAP counterpart;
 * PROT_NONE (no bits set) yields 0. */
static DWORD __map_mmap_prot_file(const int prot)
{
    DWORD access = 0;

    if (prot != PROT_NONE)
    {
        if (prot & PROT_READ)
            access |= FILE_MAP_READ;
        if (prot & PROT_WRITE)
            access |= FILE_MAP_WRITE;
        if (prot & PROT_EXEC)
            access |= FILE_MAP_EXECUTE;
    }

    return access;
}
inline void* mmap(void *addr, size_t len, int prot, int flags, int fildes, off_t off)
{
HANDLE fm, h;
void * map = MAP_FAILED;
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable: 4293)
#endif
const DWORD dwFileOffsetLow = (sizeof(off_t) <= sizeof(DWORD)) ?
(DWORD)off : (DWORD)(off & 0xFFFFFFFFL);
const DWORD dwFileOffsetHigh = (sizeof(off_t) <= sizeof(DWORD)) ?
(DWORD)0 : (DWORD)((off >> 32) & 0xFFFFFFFFL);
const DWORD protect = __map_mmap_prot_page(prot);
const DWORD desiredAccess = __map_mmap_prot_file(prot);
const off_t maxSize = off + (off_t)len;
const DWORD dwMaxSizeLow = (sizeof(off_t) <= sizeof(DWORD)) ?
(DWORD)maxSize : (DWORD)(maxSize & 0xFFFFFFFFL);
const DWORD dwMaxSizeHigh = (sizeof(off_t) <= sizeof(DWORD)) ?
(DWORD)0 : (DWORD)((maxSize >> 32) & 0xFFFFFFFFL);
#ifdef _MSC_VER
#pragma warning(pop)
#endif
errno = 0;
if (len == 0
/* Unsupported flag combinations */
|| (flags & MAP_FIXED) != 0
/* Usupported protection combinations */
|| prot == PROT_EXEC)
{
errno = EINVAL;
return MAP_FAILED;
}
h = ((flags & MAP_ANONYMOUS) == 0) ?
(HANDLE)_get_osfhandle(fildes) : INVALID_HANDLE_VALUE;
if ((flags & MAP_ANONYMOUS) == 0 && h == INVALID_HANDLE_VALUE)
{
errno = EBADF;
return MAP_FAILED;
}
fm = CreateFileMapping(h, NULL, protect, dwMaxSizeHigh, dwMaxSizeLow, NULL);
if (fm == NULL)
{
errno = __map_mman_error(GetLastError(), EPERM);
return MAP_FAILED;
}
map = MapViewOfFile(fm, desiredAccess, dwFileOffsetHigh, dwFileOffsetLow, len);
CloseHandle(fm);
if (map == NULL)
{
errno = __map_mman_error(GetLastError(), EPERM);
return MAP_FAILED;
}
return map;
}
/* Unmaps the view starting at addr via UnmapViewOfFile. `len` is not used.
 * Returns 0 on success, or -1 with errno set on failure. */
inline int munmap(void *addr, size_t len)
{
    if (!UnmapViewOfFile(addr))
    {
        errno = __map_mman_error(GetLastError(), EPERM);
        return -1;
    }

    return 0;
}
inline int mprotect(void *addr, size_t len, int prot)
{
DWORD newProtect = __map_mmap_prot_page(prot);
DWORD oldProtect = 0;
if (VirtualProtect(addr, len, newProtect, &oldProtect))
return 0;
errno = __map_mman_error(GetLastError(), EPERM);
return -1;
}
/* Flushes [addr, addr + len) of a mapped view via FlushViewOfFile.
 * `flags` (MS_*) is accepted but not consulted.
 * Returns 0 on success, or -1 with errno set on failure. */
inline int msync(void *addr, size_t len, int flags)
{
    if (!FlushViewOfFile(addr, len))
    {
        errno = __map_mman_error(GetLastError(), EPERM);
        return -1;
    }

    return 0;
}
/* Locks [addr, addr + len) via VirtualLock.
 * Returns 0 on success, or -1 with errno set on failure. */
inline int mlock(const void *addr, size_t len)
{
    if (!VirtualLock((LPVOID)addr, len))
    {
        errno = __map_mman_error(GetLastError(), EPERM);
        return -1;
    }

    return 0;
}
/* Unlocks [addr, addr + len) via VirtualUnlock.
 * Returns 0 on success, or -1 with errno set on failure. */
inline int munlock(const void *addr, size_t len)
{
    if (!VirtualUnlock((LPVOID)addr, len))
    {
        errno = __map_mman_error(GetLastError(), EPERM);
        return -1;
    }

    return 0;
}
#if !defined(__MINGW32__)
// Sets the size of the file behind C runtime descriptor `fd` to `size` bytes
// by seeking to `size` and calling SetEndOfFile. The file pointer is left at
// `size` afterwards.
//
// Returns 0 on success. On failure returns -1, prints the Windows error code
// to stderr, and sets errno to EBADF (negative fd or invalid handle) or EIO
// (any other failure).
inline int ftruncate(const int fd, const int64_t size) {
  if (fd < 0) {
    errno = EBADF;
    return -1;
  }

  HANDLE h = reinterpret_cast<HANDLE>(_get_osfhandle(fd));
  LARGE_INTEGER li_start, li_size;
  li_start.QuadPart = static_cast<int64_t>(0);
  li_size.QuadPart = size;
  // SetFilePointerEx and SetEndOfFile both return a BOOL that is zero on
  // failure, so failure is detected with `!`. (Comparing against ~0, as for
  // the older SetFilePointer API, would never notice a failed seek.)
  if (!SetFilePointerEx(h, li_start, NULL, FILE_CURRENT) ||
      !SetFilePointerEx(h, li_size, NULL, FILE_BEGIN) ||
      !SetEndOfFile(h)) {
    unsigned long error = GetLastError();
    fprintf(stderr, "I/O error while truncating: %lu\n", error);
    switch (error) {
      case ERROR_INVALID_HANDLE:
        errno = EBADF;
        break;
      default:
        errno = EIO;
        break;
    }
    return -1;
  }
  return 0;
}
#endif
#endif
================================================
FILE: test/accuracy_test.py
================================================
# Copyright (c) 2013 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
from __future__ import print_function
import os
import h5py
from annoy import AnnoyIndex
try:
from urllib import urlretrieve
except ImportError:
from urllib.request import urlretrieve # Python 3
def _get_index(dataset, custom_distance=None, custom_dim=None):
    """Return ``(annoy_index, hdf5_file, dataset_name)`` for a benchmark dataset.

    The HDF5 file is downloaded into ``test/`` if it is not already there.
    The Annoy index is loaded from ``test/<name>.annoy`` when that file
    exists; otherwise it is built with 10 trees from the ``train`` vectors
    (truncated to the index dimension) and saved there.

    ``custom_distance`` / ``custom_dim`` override the metric and dimension
    read from the file; when ``custom_distance`` is given the returned
    dataset name (and the index file name) is rewritten to reflect both.
    """
    url = 'http://ann-benchmarks.com/%s.hdf5' % dataset
    vectors_fn = os.path.join("test", dataset + ".hdf5")
    index_fn = os.path.join("test", dataset + ".annoy")

    if not os.path.exists(vectors_fn):
        print("downloading", url, "->", vectors_fn)
        urlretrieve(url, vectors_fn)

    dataset_f = h5py.File(vectors_fn, "r")

    # Always read the file's own settings first, then apply any overrides.
    file_distance = dataset_f.attrs["distance"]
    distance = file_distance if custom_distance is None else custom_distance
    file_dim = dataset_f["train"].shape[1]
    f = custom_dim if custom_dim else file_dim

    if custom_distance:
        base_name = dataset.rsplit('-', 2)[0]
        dataset = base_name + "-%d-%s" % (f, custom_distance)
        index_fn = os.path.join('test', dataset + '.annoy')

    annoy = AnnoyIndex(f, distance)

    if os.path.exists(index_fn):
        annoy.load(index_fn)
        return annoy, dataset_f, dataset

    print("adding items", distance, f)
    for item_id, vector in enumerate(dataset_f["train"]):
        # Drop trailing components when the stored vectors are wider than f.
        annoy.add_item(item_id, vector[:f] if len(vector) > f else vector)
    print("building index")
    annoy.build(10)
    annoy.save(index_fn)
    return annoy, dataset_f, dataset
def _test_index(dataset, exp_accuracy, custom_metric=None, custom_dim=None):
    """Check that top-10 recall on ``dataset`` is within 1% of ``exp_accuracy``.

    For every ``test`` vector the 10 neighbours returned by Annoy
    (``search_k=10000``) are compared with the first 10 entries of the
    dataset's ``neighbors`` row; accuracy is the percentage of overlap.
    """
    annoy, dataset_f, dataset = _get_index(dataset, custom_metric, custom_dim)

    total = 0
    hits = 0
    for row, query in enumerate(dataset_f["test"]):
        if custom_dim:
            query = query[:custom_dim]

        approximate = annoy.get_nns_by_vector(query, 10, 10000)
        exact = dataset_f["neighbors"][row][:10]
        assert len(approximate) == 10
        assert len(exact) == 10

        total += 10
        hits += len(set(approximate).intersection(exact))

    accuracy = 100.0 * hits / total
    report = "%50s accuracy: %5.2f%% (expected %5.2f%%)" % (
        dataset,
        accuracy,
        exp_accuracy,
    )
    print(report)

    assert accuracy > exp_accuracy - 1.0  # should be within 1%
def test_glove_25():
    """Recall check on the glove-25-angular dataset."""
    _test_index(dataset="glove-25-angular", exp_accuracy=69.00)
def test_nytimes_16():
    """Recall check on the nytimes-16-angular dataset."""
    _test_index(dataset="nytimes-16-angular", exp_accuracy=80.00)
def test_lastfm_dot():
    """Recall check on lastfm-64-dot with the dot metric and 64 dimensions."""
    _test_index(
        dataset='lastfm-64-dot',
        exp_accuracy=60.00,
        custom_metric='dot',
        custom_dim=64,
    )
def test_lastfm_angular():
    """Recall check on lastfm-64-dot with the angular metric and 65 dimensions."""
    _test_index(
        dataset='lastfm-64-dot',
        exp_accuracy=60.00,
        custom_metric='angular',
        custom_dim=65,
    )
================================================
FILE: test/angular_index_test.py
================================================
# Copyright (c) 2013 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
import random
import numpy
import pytest
from annoy import AnnoyIndex
def test_get_nns_by_vector():
    """Vector queries against the three unit axes come back in angular order."""
    index = AnnoyIndex(3, "angular")
    for item, axis in enumerate([[0, 0, 1], [0, 1, 0], [1, 0, 0]]):
        index.add_item(item, axis)
    index.build(10)

    cases = [
        ([3, 2, 1], [2, 1, 0]),
        ([1, 2, 3], [0, 1, 2]),
        ([2, 0, 1], [2, 0, 1]),
    ]
    for query, ranking in cases:
        assert index.get_nns_by_vector(query, 3) == ranking
def test_get_nns_by_item():
    """Item queries return the item itself first, then the others by angle."""
    index = AnnoyIndex(3, "angular")
    for item, vector in enumerate([[2, 1, 0], [1, 2, 0], [0, 0, 1]]):
        index.add_item(item, vector)
    index.build(10)

    assert index.get_nns_by_item(0, 3) == [0, 1, 2]
    assert index.get_nns_by_item(1, 3) == [1, 0, 2]
    # Items 0 and 1 are equally far from item 2, so either order is accepted.
    acceptable = [[2, 0, 1], [2, 1, 0]]
    assert index.get_nns_by_item(2, 3) in acceptable  # could be either
def test_dist():
    """Angular distance between [0, 1] and [1, 1] matches sqrt(2 - sqrt(2))."""
    index = AnnoyIndex(2, "angular")
    index.add_item(0, [0, 1])
    index.add_item(1, [1, 1])

    expected = (2 * (1.0 - 2**-0.5)) ** 0.5
    assert index.get_distance(0, 1) == pytest.approx(expected)
def test_dist_2():
    """Parallel vectors of different length are at angular distance zero."""
    index = AnnoyIndex(2, "angular")
    index.add_item(0, [1000, 0])
    index.add_item(1, [10, 0])

    measured = index.get_distance(0, 1)
    assert measured == pytest.approx(0)
def test_dist_3():
    """Distance between [97, 0] and [42, 42] equals that of their unit vectors."""
    index = AnnoyIndex(2, "angular")
    index.add_item(0, [97, 0])
    index.add_item(1, [42, 42])

    # Euclidean distance between (1, 0) and (2**-0.5, 2**-0.5).
    dist = ((1 - 2**-0.5) ** 2 + (2**-0.5) ** 2) ** 0.5
    measured = index.get_distance(0, 1)
    assert measured == pytest.approx(dist)
def test_dist_degen():
    """Distance to the all-zero vector is expected to be sqrt(2)."""
    index = AnnoyIndex(2, "angular")
    index.add_item(0, [1, 0])
    index.add_item(1, [0, 0])

    expected = 2.0**0.5
    assert index.get_distance(0, 1) == pytest.approx(expected)
def test_large_index():
    """Each item's nearest neighbour is its partner in a near-parallel pair."""
    # Generate pairs of random points where the pair is super close
    f = 10
    n_items = 10000
    index = AnnoyIndex(f, "angular")
    for first in range(0, n_items, 2):
        direction = [random.gauss(0, 1) for _ in range(f)]
        scale_x = random.random() + 1
        scale_y = random.random() + 1
        # Same direction, different scale, plus a little noise.
        x = [scale_x * coord + random.gauss(0, 1e-2) for coord in direction]
        y = [scale_y * coord + random.gauss(0, 1e-2) for coord in direction]
        index.add_item(first, x)
        index.add_item(first + 1, y)

    index.build(10)
    for first in range(0, n_items, 2):
        second = first + 1
        assert index.get_nns_by_item(first, 2) == [first, second]
        assert index.get_nns_by_item(second, 2) == [second, first]
def precision(n, n_trees=10, n_points=10000, n_rounds=10, search_k=100000):
    """Return the fraction of true top-``n`` items found, averaged over rounds.

    Each round builds a fresh 10-dimensional angular index in which item ``j``
    is ``[1000] + j * (random unit vector)``, then queries with
    ``[1000, 0, ..., 0]``. Items with an id below ``n`` count as hits.
    """
    hits = 0
    for _ in range(n_rounds):
        # create random points at distance x from (1000, 0, 0, ...)
        f = 10
        index = AnnoyIndex(f, "angular")
        for j in range(n_points):
            offset = [random.gauss(0, 1) for _ in range(f - 1)]
            norm = sum([coord**2 for coord in offset]) ** 0.5
            index.add_item(j, [1000] + [coord / norm * j for coord in offset])

        index.build(n_trees)

        query = [1000] + [0] * (f - 1)
        nns = index.get_nns_by_vector(query, n, search_k)
        assert nns == sorted(nns)  # should be in order
        # Count how many of the returned ids are among the n closest items.
        hits += len([item for item in nns if item < n])

    return 1.0 * hits / (n * n_rounds)
def test_precision_1():
    """Top-1 precision is at least 98%."""
    assert precision(n=1) >= 0.98
def test_precision_10():
    """Top-10 precision is at least 98%."""
    assert precision(n=10) >= 0.98
def test_precision_100():
    """Top-100 precision is at least 98%."""
    assert precision(n=100) >= 0.98
def test_precision_1000():
    """Top-1000 precision is at least 98%."""
    assert precision(n=1000) >= 0.98
def test_load_save_get_item_vector():
    """Item vectors are readable before build, after save, and after load."""
    f = 3
    vectors = [[1.1, 2.2, 3.3], [4.4, 5.5, 6.6], [7.7, 8.8, 9.9]]
    check = numpy.testing.assert_array_almost_equal

    original = AnnoyIndex(f, "angular")
    for item, vector in enumerate(vectors):
        original.add_item(item, vector)
    check(original.get_item_vector(0), vectors[0])

    assert original.build(10)
    assert original.save("blah.ann")
    check(original.get_item_vector(1), vectors[1])

    reloaded = AnnoyIndex(f, "angular")
    assert reloaded.load("blah.ann")
    check(reloaded.get_item_vector(2), vectors[2])
def test_get_nns_search_k():
    """Passing an explicit search_k of 10 still yields the exact ranking."""
    search_k = 10
    index = AnnoyIndex(3, "angular")
    for item, axis in enumerate([[0, 0, 1], [0, 1, 0], [1, 0, 0]]):
        index.add_item(item, axis)
    index.build(10)

    assert index.get_nns_by_item(0, 3, search_k) == [0, 1, 2]
    assert index.get_nns_by_vector([3, 2, 1], 3, search_k) == [2, 1, 0]
def test_include_dists():
    """A vector and its negation are at angular distances 0 and 2 from it."""
    # Double checking issue 112
    f = 40
    index = AnnoyIndex(f, "angular")
    vector = numpy.random.normal(size=f)
    index.add_item(0, vector)
    index.add_item(1, -vector)
    index.build(10)

    indices, dists = index.get_nns_by_item(0, 2, 10, True)
    assert indices == [0, 1]
    nearest, farthest = dists[0], dists[1]
    assert nearest == pytest.approx(0.0)
    assert farthest == pytest.approx(2.0)
def test_include_dists_check_ranges():
    """All returned angular distances lie in [0, 2], with the minimum at 0."""
    f = 3
    n_items = 100000
    index = AnnoyIndex(f, "angular")
    for item in range(n_items):
        index.add_item(item, numpy.random.normal(size=f))
    index.build(10)

    _, dists = index.get_nns_by_item(0, n_items, include_distances=True)
    assert max(dists) <= 2.0
    assert min(dists) == pytest.approx(0.0)
def test_distance_consistency():
    """Distances from queries agree with get_distance and a manual computation."""
    n, f = 1000, 3
    index = AnnoyIndex(f, "angular")
    for item in range(n):
        # Reject vectors that are too close to the origin.
        vector = numpy.random.normal(size=f)
        while not numpy.dot(vector, vector) > 0.1:
            vector = numpy.random.normal(size=f)
        index.add_item(item, vector)
    index.build(10)

    tolerance = dict(rel=1e-3, abs=1e-3)
    for a in random.sample(range(n), 100):
        indices, dists = index.get_nns_by_item(a, 100, include_distances=True)
        for b, dist in zip(indices, dists):
            u = index.get_item_vector(a)
            v = index.get_item_vector(b)
            assert dist == pytest.approx(index.get_distance(a, b), **tolerance)

            # Angular distance is the Euclidean distance of the unit vectors.
            u_norm = numpy.array(u) * numpy.dot(u, u) ** -0.5
            v_norm = numpy.array(v) * numpy.dot(v, v) ** -0.5
            diff = u_norm - v_norm
            assert dist**2 == pytest.approx(numpy.dot(diff, diff), **tolerance)

            squared = sum([(x - y) ** 2 for x, y in zip(u_norm, v_norm)])
            assert dist**2 == pytest.approx(squared, **tolerance)
def test_only_one_item():
    """An index holding a single item survives a save/load round trip."""
    # reported to annoy-user by Kireet Reddy
    dim = 100
    path = "foo.idx"

    idx = AnnoyIndex(dim, "angular")
    idx.add_item(0, numpy.random.randn(dim))
    idx.build(n_trees=10)
    idx.save(path)

    # Rebind the same name so the first index is released before loading.
    idx = AnnoyIndex(dim, "angular")
    idx.load(path)
    assert idx.get_n_items() == 1

    query = numpy.random.randn(dim)
    found = idx.get_nns_by_vector(vector=query, n=50, include_distances=False)
    assert found == [0]
def test_no_items():
    """An empty index survives a save/load round trip and returns no hits."""
    dim = 100
    path = "foo.idx"

    idx = AnnoyIndex(dim, "angular")
    idx.build(n_trees=10)
    idx.save(path)

    # Rebind the same name so the first index is released before loading.
    idx = AnnoyIndex(dim, "angular")
    idx.load(path)
    assert idx.get_n_items() == 0

    query = numpy.random.randn(dim)
    found = idx.get_nns_by_vector(vector=query, n=50, include_distances=False)
    assert found == []
def test_single_vector():
    """Querying a one-item index for 3 neighbours returns just that item."""
    # https://github.com/spotify/annoy/issues/194
    only_vector = [1, 0, 0]
    index = AnnoyIndex(3, "angular")
    index.add_item(0, only_vector)
    index.build(10)
    index.save("1.ann")

    indices, dists = index.get_nns_by_vector(
        only_vector, 3, include_distances=True
    )
    assert indices == [0]
    assert dists[0] ** 2 == pytest.approx(0.0)
================================================
FILE: test/annoy_test.go
================================================
/*
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
*/
package annoy_test
import (
"math"
"math/rand"
"os"
"testing"
"github.com/spotify/annoy"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/stretchr/testify/suite"
)
// AnnoyTestSuite groups the Annoy Go binding tests; it embeds testify's
// suite.Suite and carries no state of its own.
type AnnoyTestSuite struct {
	suite.Suite
}
// Round returns floor(f + 0.5), i.e. f rounded to the nearest integer with
// halves going towards positive infinity.
func Round(f float64) float64 {
	const half = 0.5
	shifted := f + half
	return math.Floor(shifted)
}
// RoundPlus rounds f to the given number of decimal places by scaling with
// 10^places, applying Round, and scaling back.
func RoundPlus(f float64, places int) float64 {
	scale := math.Pow(10, float64(places))
	scaled := Round(f * scale)
	return scaled / scale
}
// SetupTest is intentionally empty: the tests in this suite each create and
// delete their own index.
func (suite *AnnoyTestSuite) SetupTest() {
}
// TestFileHandling saves and reloads a small angular index three times:
// with the default arguments, with the extra boolean argument set to false
// ("without prefault"), and with it set to true ("allowing prefault").
// After each save the file must exist and be non-empty, and each load must
// report success.
func (suite *AnnoyTestSuite) TestFileHandling() {
	t := suite.T()

	// requireNonEmptyFile stops the test if path cannot be stat'ed (so the
	// nil FileInfo is never dereferenced) and records a failure if the file
	// is empty.
	requireNonEmptyFile := func(path, failure string) {
		info, err := os.Stat(path)
		require.NoError(t, err, failure+", file not found")
		assert.NotZero(t, info.Size(), failure+", file size zero")
	}

	index := annoy.NewAnnoyIndexAngular(3)
	index.AddItem(0, []float32{0, 0, 1})
	index.AddItem(1, []float32{0, 1, 0})
	index.AddItem(2, []float32{1, 0, 0})
	index.Build(10)

	// Round 1: default arguments.
	index.Save("go_test.ann")
	requireNonEmptyFile("go_test.ann", "Failed to create file")
	annoy.DeleteAnnoyIndexAngular(index)

	index = annoy.NewAnnoyIndexAngular(3)
	assert.True(t, index.Load("go_test.ann"), "Failed to load file")
	os.Remove("go_test.ann")

	// Round 2: prefault explicitly disabled.
	index.Save("go_test2.ann", false)
	requireNonEmptyFile("go_test2.ann", "Failed to create file without prefault")
	annoy.DeleteAnnoyIndexAngular(index)

	index = annoy.NewAnnoyIndexAngular(3)
	assert.True(t, index.Load("go_test2.ann", false), "Failed to load file without prefault")
	os.Remove("go_test2.ann")

	// Round 3: prefault explicitly allowed.
	index.Save("go_test3.ann", true)
	requireNonEmptyFile("go_test3.ann", "Failed to create file allowing prefault")
	annoy.DeleteAnnoyIndexAngular(index)

	index = annoy.NewAnnoyIndexAngular(3)
	assert.True(t, index.Load("go_test3.ann", true), "Failed to load file allowing prefault")
	annoy.DeleteAnnoyIndexAngular(index)

	os.Remove("go_test3.ann")
}
// TestOnDiskBuild builds an angular index directly into a file, unloads it,
// loads it back from that file and checks nearest-neighbour queries.
func (suite *AnnoyTestSuite) TestOnDiskBuild() {
	t := suite.T()

	index := annoy.NewAnnoyIndexAngular(3)
	index.OnDiskBuild("go_test.ann")

	// Stop here if the file is missing: info would be nil and calling
	// info.Size() on it would panic instead of reporting a test failure.
	info, err := os.Stat("go_test.ann")
	require.NoError(t, err, "Failed to create file, file not found")
	assert.NotZero(t, info.Size(), "Failed to create file, file size zero")

	index.AddItem(0, []float32{0, 0, 1})
	index.AddItem(1, []float32{0, 1, 0})
	index.AddItem(2, []float32{1, 0, 0})
	index.Build(10)

	index.Unload()
	index.Load("go_test.ann")

	result := annoy.NewAnnoyVectorInt()
	defer result.Free()

	index.GetNnsByVector([]float32{3, 2, 1}, 3, -1, result)
	assert.Equal(t, []int32{2, 1, 0}, result.ToSlice())

	index.GetNnsByVector([]float32{1, 2, 3}, 3, -1, result)
	assert.Equal(t, []int32{0, 1, 2}, result.ToSlice())

	index.GetNnsByVector([]float32{2, 0, 1}, 3, -1, result)
	assert.Equal(t, []int32{2, 0, 1}, result.ToSlice())

	annoy.DeleteAnnoyIndexAngular(index)

	os.Remove("go_test.ann")
}
func (suite *AnnoyTestSuite) TestGetNnsByVector() {
t := suite.T()
index := annoy.NewAnnoyIndexAngular(3)
index.AddItem(0, []float32{0, 0, 1})
index.AddItem(1, []float32{0, 1, 0})
index.AddItem(2, []float32{1, 0, 0})
index.Build(10)
t.Run("regular", func(t *testing.T) {
result := annoy.NewAnnoyVectorInt()
defer result.Free()
index.GetNnsByVector([]float32{3, 2, 1}, 3, -1, result)
assert.Equal(t, []int32{2, 1, 0}, result.ToSlice())
index.GetNnsByVector([]float32{1, 2, 3}, 3, -1, result)
assert.Equal(t, []int32{0, 1, 2}, result.ToSlice())
index.GetNnsByVector([]float32{2, 0, 1}, 3, -1, result)
assert.Equal(t, []int32{2, 0, 1}, result.ToSlice())
})
t.Run("with copying", func(t *testing.T) {
result := annoy.NewAnnoyVectorInt()
defer result.Free()
var notAllocated []int32
index.GetNnsByVector([]float32{3, 2, 1}, 3, -1, result)
result.Copy(¬Allocated)
assert.Equal(t, []int32{2, 1, 0}, notAllocated)
// to make sure it will be overwritten
var alreadyAllocated = make([]int32, 10)
for i := 0; i < len(alreadyAllocated); i++ {
alreadyAllocated[i] = -1
}
index.GetNnsByVector([]float32{3, 2, 1}, 3, -1, result)
result.Copy(&alreadyAllocated)
assert.Equal(t, []int32{2, 1, 0}, alreadyAllocated)
var alreadyAllocatedCap = make([]int32, 0, 00)
index.GetNnsByVector([]float32{3, 2, 1}, 3, -1, result)
result.Copy(&alreadyAllocatedCap)
assert.Equal(t, []int32{2, 1, 0}, alreadyAllocatedCap)
})
t.Run("with inner array", func(t *testing.T) {
result := annoy.NewAnnoyVectorInt()
defer result.Free()
index.GetNnsByVector([]float32{3, 2, 1}, 3, -1, result)
assert.Equal(t, []int32{2, 1, 0}, result.InnerArray())
})
annoy.DeleteAnnoyIndexAngular(index)
}
// TestGetNnsByItem checks that querying by item id returns the item itself
// first, followed by the remaining items in order of angular distance.
func (suite *AnnoyTestSuite) TestGetNnsByItem() {
	t := suite.T()

	index := annoy.NewAnnoyIndexAngular(3)
	for id, vector := range [][]float32{{2, 1, 0}, {1, 2, 0}, {0, 0, 1}} {
		index.AddItem(id, vector)
	}
	index.Build(10)

	neighbours := annoy.NewAnnoyVectorInt()
	defer neighbours.Free()

	// expected[item] is the ranking returned for that item id.
	expected := [][]int32{{0, 1, 2}, {1, 0, 2}}
	for item, ranking := range expected {
		index.GetNnsByItem(item, 3, -1, neighbours)
		assert.Equal(t, ranking, neighbours.ToSlice())
	}

	annoy.DeleteAnnoyIndexAngular(index)
}
// TestGetItem checks that every stored vector is returned unchanged by
// GetItem after the index has been built.
func (suite *AnnoyTestSuite) TestGetItem() {
	t := suite.T()

	stored := [][]float32{{2, 1, 0}, {1, 2, 0}, {0, 0, 1}}

	index := annoy.NewAnnoyIndexAngular(3)
	for id, vector := range stored {
		index.AddItem(id, vector)
	}
	index.Build(10)

	fetched := annoy.NewAnnoyVectorFloat()
	defer fetched.Free()

	for id, vector := range stored {
		index.GetItem(id, fetched)
		assert.Equal(t, vector, fetched.ToSlice())
	}

	annoy.DeleteAnnoyIndexAngular(index)
}
// TestGetDistance compares the angular distance between [0, 1] and [1, 1]
// with the closed-form value, both rounded to three decimal places.
func (suite *AnnoyTestSuite) TestGetDistance() {
	index := annoy.NewAnnoyIndexAngular(2)
	index.AddItem(0, []float32{0, 1})
	index.AddItem(1, []float32{1, 1})
	index.Build(10)

	expected := math.Pow(2*(1.0-math.Pow(2, -0.5)), 0.5)
	measured := float64(index.GetDistance(0, 1))
	assert.Equal(suite.T(), RoundPlus(expected, 3), RoundPlus(measured, 3))

	annoy.DeleteAnnoyIndexAngular(index)
}
// TestGetDotProductDistance checks that the dot-product "distance" between
// [0, 1] and [1, 1] is 1 to within 1e-5.
func (suite *AnnoyTestSuite) TestGetDotProductDistance() {
	index := annoy.NewAnnoyIndexDotProduct(2)
	index.AddItem(0, []float32{0, 1})
	index.AddItem(1, []float32{1, 1})
	index.Build(10)

	deviation := math.Abs(1.0 - float64(index.GetDistance(0, 1)))
	assert.True(suite.T(), deviation < 0.00001)

	annoy.DeleteAnnoyIndexDotProduct(index)
}
// TestLargeEuclideanIndex inserts 5000 pairs of nearly identical points
// (items j and j+1 share a base point and differ only by small noise) and
// checks that each item's nearest neighbour, after itself, is its partner.
func (suite *AnnoyTestSuite) TestLargeEuclideanIndex() {
	index := annoy.NewAnnoyIndexEuclidean(10)

	for j := 0; j < 10000; j += 2 {
		// Shared base point for the pair.
		p := make([]float32, 0, 10)
		for i := 0; i < 10; i++ {
			p = append(p, rand.Float32())
		}
		x := make([]float32, 0, 10)
		for i := 0; i < 10; i++ {
			x = append(x, 1+p[i]+rand.Float32()*1e-2)
		}
		y := make([]float32, 0, 10)
		for i := 0; i < 10; i++ {
			y = append(y, 1+p[i]+rand.Float32()*1e-2)
		}
		index.AddItem(j, x)
		index.AddItem(j+1, y)
	}
	index.Build(10)

	result := annoy.NewAnnoyVectorInt()
	defer result.Free()

	for j := 0; j < 10000; j += 2 {
		// testify expects (expected, actual); this order keeps the labels in
		// the failure output correct.
		index.GetNnsByItem(j, 2, -1, result)
		require.Equal(suite.T(), []int32{int32(j), int32(j + 1)}, result.ToSlice())

		index.GetNnsByItem(j+1, 2, -1, result)
		require.Equal(suite.T(), []int32{int32(j) + 1, int32(j)}, result.ToSlice())
	}
	annoy.DeleteAnnoyIndexEuclidean(index)
}
func TestAnnoyTestSuite(t *testing.T) {
suite.Run(t, new(AnnoyTestSuite))
}
================================================
FILE: test/annoy_test.lua
================================================
-- Copyright (c) 2016 Boris Nagaev
--
-- Licensed under the Apache License, Version 2.0 (the "License"); you may not
-- use this file except in compliance with the License. You may obtain a copy of
-- the License at
--
-- http://www.apache.org/licenses/LICENSE-2.0
--
-- Unless required by applicable law or agreed to in writing, software
-- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
-- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
-- License for the specific language governing permissions and limitations under
-- the License.
local AnnoyIndex = require 'annoy'.AnnoyIndex
-- Returns an approximately normal sample with mean `mu` and standard
-- deviation `sigma`, computed as mu + (sum of 12 math.random() draws - 6) * sigma.
local function gauss(mu, sigma)
  local acc = -6
  local remaining = 12
  while remaining > 0 do
    acc = acc + math.random()
    remaining = remaining - 1
  end
  return mu + acc * sigma
end
-- Returns an array of `f` independent gauss(mu, sigma) samples.
local function randomVector(f, mu, sigma)
  local vector = {}
  for slot = 1, f do
    vector[slot] = gauss(mu, sigma)
  end
  return vector
end
-- Formats `x` with three decimal places and returns the resulting string,
-- so "rounded" values are compared as strings.
local function round(x)
  return string.format("%.3f", x)
end
-- Returns a new array holding round(v) for every element visited by
-- ipairs(array); the input is not modified.
local function roundArray(array)
  local formatted = {}
  for position, value in ipairs(array) do
    formatted[position] = round(value)
  end
  return formatted
end
-- Returns true when `v` is in non-decreasing order (empty and one-element
-- arrays count as sorted), false as soon as a descending pair is found.
local function isSorted(v)
  local previous = v[1]
  for position = 2, #v do
    local current = v[position]
    if previous > current then
      return false
    end
    previous = current
  end
  return true
end
-- Returns the largest element of `array`. Raises an error (via assert) when
-- the first element is nil or false, i.e. for an empty array.
local function max(array)
  local largest = assert(array[1])
  for _, candidate in ipairs(array) do
    largest = math.max(largest, candidate)
  end
  return largest
end
-- Returns the smallest element of `array`. Raises an error (via assert) when
-- the first element is nil or false, i.e. for an empty array.
local function min(array)
  local smallest = assert(array[1])
  for _, candidate in ipairs(array) do
    smallest = math.min(smallest, candidate)
  end
  return smallest
end
-- Returns the fraction of the true `n` nearest items that a euclidean index
-- finds, averaged over `n_rounds` freshly built indexes.
--
-- first1000: when truthy, every point and the query get 1000 as first
--            coordinate and only the remaining f-1 coordinates are random;
--            otherwise all f coordinates are random and the query is the origin.
-- n:         number of neighbours requested.
-- n_trees:   trees per index (default 10).
-- n_points:  items per index (default 10000); item j lies at distance j
--            from the query point in the random coordinates.
-- n_rounds:  number of indexes built (default 10).
local function precision(first1000, n, n_trees, n_points, n_rounds)
  n_trees = n_trees or 10
  n_points = n_points or 10000
  n_rounds = n_rounds or 10

  local found = 0
  for _ = 1, n_rounds do
    local f = 10
    -- create random points at distance x from (1000, 0, 0, ...) when
    -- first1000 is set, otherwise at distance x from the origin
    local p_size = first1000 and (f - 1) or f

    local index = AnnoyIndex(f, 'euclidean')
    for j = 0, n_points - 1 do
      local p = randomVector(p_size, 0, 1)

      local squared = 0
      for _, pi in ipairs(p) do
        squared = squared + pi ^ 2
      end
      local norm = squared ^ 0.5

      local x = {}
      if first1000 then
        x[1] = 1000
      end
      for _, pi in ipairs(p) do
        x[#x + 1] = pi / norm * j
      end
      index:add_item(j, x)
    end
    index:build(n_trees)

    local query = {}
    for k = 1, f do
      query[k] = 0
    end
    if first1000 then
      query[1] = 1000
    end

    local nns = index:get_nns_by_vector(query, n)
    assert(isSorted(nns))
    -- Ids below n are exactly the n items closest to the query.
    for _, id in ipairs(nns) do
      if id < n then
        found = found + 1
      end
    end
  end
  return 1.0 * found / (n * n_rounds)
end
-- Tests for indexes created with the default metric (AnnoyIndex(f) with no
-- metric argument).
describe("angular annoy test", function()

  -- Vector queries against the three unit axes come back in angular order.
  it("get_nns_by_vector", function()
    local f = 3
    local i = AnnoyIndex(f)
    i:add_item(0, {0, 0, 1})
    i:add_item(1, {0, 1, 0})
    i:add_item(2, {1, 0, 0})
    i:build(10)

    assert.same({2, 1, 0}, i:get_nns_by_vector({3, 2, 1}, 3))
    assert.same({0, 1, 2}, i:get_nns_by_vector({1, 2, 3}, 3))
    assert.same({2, 0, 1}, i:get_nns_by_vector({2, 0, 1}, 3))
  end)

  -- Item queries return the item itself first.
  it("get_nns_by_item", function()
    local f = 3
    local i = AnnoyIndex(f)
    i:add_item(0, {2, 1, 0})
    i:add_item(1, {1, 2, 0})
    i:add_item(2, {0, 0, 1})
    i:build(10)

    assert.same({0, 1, 2}, i:get_nns_by_item(0, 3))
    assert.same({1, 0, 2}, i:get_nns_by_item(1, 3))
    do
      -- Items 0 and 1 are equally far from item 2; accept either order.
      local close_to_2 = i:get_nns_by_item(2, 3)
      assert.equal(close_to_2[1], 2)
      assert.truthy(
        (close_to_2[2] == 0 and close_to_2[3] == 1)
        or
        (close_to_2[2] == 1 and close_to_2[3] == 0)
      )
    end
  end)

  it("dist", function()
    local f = 2
    local i = AnnoyIndex(f)
    i:add_item(0, {0, 1})
    i:add_item(1, {1, 1})

    assert.equal(round((2 * (1.0 - 2 ^ -0.5)) ^ 0.5), round(i:get_distance(0, 1)))
  end)

  -- Parallel vectors of different length are at distance zero.
  it("dist_2", function()
    local f = 2
    local i = AnnoyIndex(f)
    i:add_item(0, {1000, 0})
    i:add_item(1, {10, 0})

    assert.equal(round(0), round(i:get_distance(0, 1)))
  end)

  it("dist_3", function()
    local f = 2
    local i = AnnoyIndex(f)
    i:add_item(0, {97, 0})
    i:add_item(1, {42, 42})

    local dist = ((1 - 2 ^ -0.5) ^ 2 + (2 ^ -0.5) ^ 2) ^ 0.5
    assert.equal(round(dist), round(i:get_distance(0, 1)))
  end)

  -- Distance to the all-zero vector is expected to be sqrt(2).
  it("dist_degen", function()
    local f = 2
    local i = AnnoyIndex(f)
    i:add_item(0, {1, 0})
    i:add_item(1, {0, 0})

    assert.equal(round(2.0 ^ 0.5), round(i:get_distance(0, 1)))
  end)

  it("large_index", function()
    -- Generate pairs of random points where the pair is super close
    local f = 10
    local i = AnnoyIndex(f)
    for j = 0, 10000 - 1, 2 do
      local p = randomVector(f, 0, 1)
      local f1 = math.random() + 1
      local f2 = math.random() + 1
      local x = {}
      local y = {}
      for k, pi in ipairs(p) do
        x[k] = f1 * pi + gauss(0, 1e-2)
        y[k] = f2 * pi + gauss(0, 1e-2)
      end
      i:add_item(j, x)
      i:add_item(j+1, y)
    end

    i:build(10)
    for j = 0, 10000 - 1, 2 do
      assert.same({j, j+1}, i:get_nns_by_item(j, 2))
      assert.same({j+1, j}, i:get_nns_by_item(j+1, 2))
    end
  end)

  it("precision_1", function()
    assert.truthy(precision(true, 1) >= 0.98)
  end)

  it("precision_10", function()
    assert.truthy(precision(true, 10) >= 0.98)
  end)

  it("precision_100", function()
    assert.truthy(precision(true, 100) >= 0.98)
  end)

  it("precision_1000", function()
    assert.truthy(precision(true, 1000) >= 0.98)
  end)

  -- Item vectors are readable before build, after save, and after load.
  it("load_save_get_item_vector", function()
    local f = 3
    local i = AnnoyIndex(f)
    i:add_item(0, {1.1, 2.2, 3.3})
    i:add_item(1, {4.4, 5.5, 6.6})
    i:add_item(2, {7.7, 8.8, 9.9})
    assert.same(roundArray({1.1, 2.2, 3.3}), roundArray(i:get_item_vector(0)))
    assert.truthy(i:build(10))
    assert.truthy(i:save('blah.ann'))
    assert.same(roundArray({4.4, 5.5, 6.6}), roundArray(i:get_item_vector(1)))
    local j = AnnoyIndex(f)
    assert.truthy(j:load('blah.ann'))
    -- Read from the freshly loaded index j (not i) so that load() itself
    -- is what gets verified.
    assert.same(roundArray({7.7, 8.8, 9.9}), roundArray(j:get_item_vector(2)))
  end)

  -- Passing an explicit search_k of 10 still yields the exact ranking.
  it("get_nns_search_k", function()
    local f = 3
    local i = AnnoyIndex(f)
    i:add_item(0, {0, 0, 1})
    i:add_item(1, {0, 1, 0})
    i:add_item(2, {1, 0, 0})
    i:build(10)

    assert.same({0, 1, 2}, i:get_nns_by_item(0, 3, 10))
    assert.same({2, 1, 0}, i:get_nns_by_vector({3, 2, 1}, 3, 10))
  end)

  it("include_dists", function()
    -- Double checking issue 112
    local f = 40
    local i = AnnoyIndex(f)
    local v = randomVector(f, 0, 1)
    i:add_item(0, v)
    local neg_v = {}
    do
      for k, value in ipairs(v) do
        neg_v[k] = -value
      end
    end
    i:add_item(1, neg_v)
    i:build(10)

    local indices, dists = i:get_nns_by_item(0, 2, 10, true)
    assert.same({0, 1}, indices)
    assert.same(roundArray({0.0, 2.0}), roundArray(dists))
  end)

  -- All returned distances stay below 2, with the minimum at 0.
  it("include_dists_check_ranges", function()
    local f = 3
    local i = AnnoyIndex(f)
    for j = 0, 100000 - 1 do
      i:add_item(j, randomVector(f, 0, 1))
    end
    i:build(10)
    local include_distances = true
    local _, dists = i:get_nns_by_item(0, 100000, -1, include_distances)
    assert.truthy(max(dists) < 2.0)
    assert.equal(round(0.0), round(min(dists)))
  end)

end)
-- Tests for indexes created with the 'euclidean' metric.
describe("euclidean annoy test", function()

  it("get_nns_by_vector", function()
    local f = 2
    local i = AnnoyIndex(f, 'euclidean')
    i:add_item(0, {2, 2})
    i:add_item(1, {3, 2})
    i:add_item(2, {3, 3})
    i:build(10)

    assert.same({2, 1, 0}, i:get_nns_by_vector({4, 4}, 3))
    assert.same({0, 1, 2}, i:get_nns_by_vector({1, 1}, 3))
    assert.same({1, 2, 0}, i:get_nns_by_vector({4, 2}, 3))
  end)

  it("get_nns_by_item", function()
    local f = 2
    local i = AnnoyIndex(f, 'euclidean')
    i:add_item(0, {2, 2})
    i:add_item(1, {3, 2})
    i:add_item(2, {3, 3})
    i:build(10)

    assert.same({0, 1, 2}, i:get_nns_by_item(0, 3))
    assert.same({2, 1, 0}, i:get_nns_by_item(2, 3))
  end)

  it("dist", function()
    local f = 2
    local i = AnnoyIndex(f, 'euclidean')
    i:add_item(0, {0, 1})
    i:add_item(1, {1, 1})

    assert.equal(round(1.0), round(i:get_distance(0, 1)))
  end)

  it("large_index", function()
    -- Generate pairs of random points where the pair is super close
    local f = 10
    -- local q = randomVector(f, 0, 10)
    local i = AnnoyIndex(f, 'euclidean')
    for j = 0, 10000 - 1, 2 do
      local p = randomVector(f, 0, 1)
      local x = {}
      local y = {}
      for k, pi in ipairs(p) do
        x[k] = 1 + pi + gauss(0, 1e-2) -- todo: should be q[i]
        y[k] = 1 + pi + gauss(0, 1e-2)
      end
      i:add_item(j, x)
      i:add_item(j+1, y)
    end

    i:build(10)
    for j = 0, 10000 - 1, 2 do
      assert.same({j, j+1}, i:get_nns_by_item(j, 2))
      assert.same({j+1, j}, i:get_nns_by_item(j+1, 2))
    end
  end)

  it("precision_1", function()
    assert.truthy(precision(false, 1) >= 0.98)
  end)

  it("precision_10", function()
    assert.truthy(precision(false, 10) >= 0.98)
  end)

  it("precision_100", function()
    assert.truthy(precision(false, 100) >= 0.98)
  end)

  it("precision_1000", function()
    assert.truthy(precision(false, 1000) >= 0.98)
  end)

  -- Returned distances are compared through their squares, rounded.
  it("get_nns_with_distances", function()
    local f = 3
    local i = AnnoyIndex(f, 'euclidean')
    i:add_item(0, {0, 0, 2})
    i:add_item(1, {0, 1, 1})
    i:add_item(2, {1, 0, 0})
    i:build(10)

    do
      local l, d = i:get_nns_by_item(0, 3, -1, true)
      assert.same({0, 1, 2}, l)
      assert.same(
        roundArray({0, 2, 5}),
        roundArray({d[1]^2, d[2]^2, d[3]^2})
      )
    end

    do
      local l, d = i:get_nns_by_vector({2, 2, 2}, 3, -1, true)
      assert.same({1, 0, 2}, l)
      assert.same(
        roundArray({6, 8, 9}),
        roundArray({d[1]^2, d[2]^2, d[3]^2})
      )
    end
  end)

  it("include_dists", function()
    local f = 40
    -- Use the euclidean metric like every other test in this suite; with no
    -- metric argument this test would exercise the default metric instead.
    local i = AnnoyIndex(f, 'euclidean')
    local v = randomVector(f, 0, 1)
    i:add_item(0, v)
    local neg_v = {}
    do
      for k, value in ipairs(v) do
        neg_v[k] = -value
      end
    end
    i:add_item(1, neg_v)
    i:build(10)

    local indices, dists = i:get_nns_by_item(0, 2, 10, true)
    assert.same({0, 1}, indices)
    assert.same(round(0.0), round(dists[1]))
  end)

end)
-- Tests for index lifecycle: loading, unloading, saving and on-disk builds.
-- Several of them read the fixture file 'test/test.tree'.
describe("index test", function()

  -- Loading a file that does not exist must raise an error.
  it("not_found_tree", function()
    local i = AnnoyIndex(10)
    assert.has_error(function()
      i:load('nonexists.tree')
    end)
  end)

  it("binary_compatibility", function()
    local i = AnnoyIndex(10)
    i:load('test/test.tree')

    -- This might change in the future if we change the search
    -- algorithm, but in that case let's update the test
    assert.same(
      {0, 85, 42, 11, 54, 38, 53, 66, 19, 31},
      i:get_nns_by_item(0, 10)
    )
  end)

  it("load_unload", function()
    -- Issue #108
    local i = AnnoyIndex(10)
    for _ = 1, 100000 do
      i:load('test/test.tree')
      i:unload()
    end
  end)

  -- Repeatedly creates and loads indexes, forcing a garbage collection
  -- every 100 iterations.
  it("construct_load_destruct", function()
    for x = 1, 100000 do
      local i = AnnoyIndex(10)
      i:load('test/test.tree')
      if x % 100 == 0 then
        collectgarbage()
      end
    end
  end)

  it("construct_destruct", function()
    for _ = 1, 100000 do
      local i = AnnoyIndex(10)
      i:add_item(1000, randomVector(10, 0, 1))
    end
  end)

  it("save_twice", function()
    -- Issue #100
    local t = AnnoyIndex(10)
    t:save("t.ann")
    t:save("t.ann")
  end)

  it("load_save", function()
    -- Issue #61
    local i = AnnoyIndex(10)
    i:load('test/test.tree')
    local u = i:get_item_vector(99)
    i:save('i.tree')
    local v = i:get_item_vector(99)
    assert.same(u, v)
    local j = AnnoyIndex(10)
    j:load('test/test.tree')
    -- Read from j, the second index loaded from the same file, so that this
    -- check covers j rather than repeating the one on i above.
    local w = j:get_item_vector(99)
    assert.same(u, w)
    -- Ensure specifying if prefault is allowed does not impact result
    j:save('j.tree', true)
    local k = AnnoyIndex(10)
    k:load('j.tree', true)
    local x = k:get_item_vector(99)
    assert.same(u, x)
    k:save('k.tree', false)
    local l = AnnoyIndex(10)
    l:load('k.tree', false)
    local y = l:get_item_vector(99)
    assert.same(u, y)
  end)

  -- Builds directly into 'x.tree', then reloads the file and queries it.
  it("on_disk_build", function()
    local f = 2
    local i = AnnoyIndex(f, 'euclidean')
    i:on_disk_build('x.tree')
    i:add_item(0, {2, 2})
    i:add_item(1, {3, 2})
    i:add_item(2, {3, 3})
    i:build(10)
    i:unload()
    i:load('x.tree')

    assert.same({2, 1, 0}, i:get_nns_by_vector({4, 4}, 3))
    assert.same({0, 1, 2}, i:get_nns_by_vector({1, 1}, 3))
    assert.same({1, 2, 0}, i:get_nns_by_vector({4, 2}, 3))
  end)

end)
describe("types test", function()
    -- Argument-validation tests: wrong vector lengths and out-of-range
    -- item indices must raise errors.
    local n_points = 1000
    local n_trees = 10

    -- tests "numpy" and "tuple" are not applicable to Lua

    it("wrong_length", function()
        local f = 10
        local i = AnnoyIndex(f, 'euclidean')
        i:add_item(0, randomVector(f, 0, 1))
        -- Build the oversized vector outside the closure so that only
        -- add_item itself can be the source of the expected error.
        local too_long = randomVector(f + 1000, 0, 1)
        assert.has_error(function()
            i:add_item(1, too_long)
        end)
        assert.has_error(function()
            i:add_item(2, {})
        end)
        i:build(n_trees)
    end)

    it("range_errors", function()
        local f = 10
        local i = AnnoyIndex(f, 'euclidean')
        for j = 0, n_points - 1 do
            i:add_item(j, randomVector(f, 0, 1))
        end
        -- The original called randomVector(f) without the two trailing
        -- arguments used everywhere else, and did so inside the closure,
        -- so a failure in the helper would have satisfied has_error
        -- without add_item(-1, ...) ever being exercised.
        local valid_vector = randomVector(f, 0, 1)
        assert.has_error(function()
            i:add_item(-1, valid_vector)
        end)
        i:build(n_trees)
        -- Indices below zero and at or beyond n_points must be rejected by
        -- every item-addressed accessor.
        for _, bad_index in ipairs({-1000, -1, n_points, n_points + 1000}) do
            assert.has_error(function()
                i:get_distance(0, bad_index)
            end)
            assert.has_error(function()
                i:get_nns_by_item(bad_index, 1)
            end)
            assert.has_error(function()
                i:get_item_vector(bad_index)
            end)
        end
    end)
end)
describe("memory leaks", function()
    -- Stress tests that repeat a single call many times. They contain no
    -- explicit memory assertions; they only need to run to completion.

    it("get_item_vector", function()
        local f = 10
        local index = AnnoyIndex(f, 'euclidean')
        index:add_item(0, randomVector(f, 0, 1))
        local rounds = 100
        local calls_per_round = 1000 * 1000
        for round_no = 0, rounds - 1 do
            -- Progress marker, one line per round.
            print(round_no, '...')
            for _ = 1, calls_per_round do
                index:get_item_vector(0)
            end
        end
    end)

    it("get_lots_of_nns", function()
        local f = 10
        local index = AnnoyIndex(f, 'euclidean')
        index:add_item(0, randomVector(f, 0, 1))
        index:build(10)
        -- Ask for far more neighbours than exist; only item 0 can be
        -- returned.
        local huge_n = 999999999
        for _ = 1, 100 do
            assert.same({0}, index:get_nns_by_item(0, huge_n))
        end
    end)
end)
================================================
FILE: test/dot_index_test.py
================================================
# Copyright (c) 2018 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
import random
import numpy
import pytest
from annoy import AnnoyIndex
def dot_metric(a, b):
    """Return the negated dot product of ``a`` and ``b``.

    Negation makes an ascending sort put the largest dot product first.
    """
    product = numpy.dot(a, b)
    return -product
def recall(retrieved, relevant):
    """Return the fraction of distinct ``relevant`` items found in ``retrieved``.

    Duplicates in either argument are ignored. Raises ``ZeroDivisionError``
    when ``relevant`` is empty.
    """
    relevant_ids = set(relevant)
    hits = relevant_ids.intersection(retrieved)
    return float(len(hits)) / float(len(relevant_ids))
def test_get_nns_by_vector():
    """Every query vector below must rank the three items as 2, 1, 0."""
    index = AnnoyIndex(2, "dot")
    for item, vector in enumerate([[2, 2], [3, 2], [3, 3]]):
        index.add_item(item, vector)
    index.build(10)

    for query in ([4, 4], [1, 1], [4, 2]):
        assert index.get_nns_by_vector(query, 3) == [2, 1, 0]
def test_get_nns_by_item():
    """Querying by item 0 or item 2 must rank the three items as 2, 1, 0."""
    index = AnnoyIndex(2, "dot")
    for item, vector in enumerate([[2, 2], [3, 2], [3, 3]]):
        index.add_item(item, vector)
    index.build(10)

    for query_item in (0, 2):
        assert index.get_nns_by_item(query_item, 3) == [2, 1, 0]
def test_dist():
    """Check ``get_distance`` on a built "dot" index against known values."""
    index = AnnoyIndex(2, "dot")
    for item, vector in enumerate([[0, 1], [1, 1], [0, 0]]):
        index.add_item(item, vector)
    index.build(10)

    first_pair = index.get_distance(0, 1)
    assert first_pair == pytest.approx(1.0)
    second_pair = index.get_distance(1, 2)
    assert second_pair == pytest.approx(0.0)
def recall_at(n, n_trees=10, n_points=1000, n_rounds=5):
    """Return the mean recall of top-``n`` "dot" queries on random data.

    Each round draws ``n_points`` Gaussian vectors of dimension 10, computes
    the exact top-``n`` for every point by sorting on ``dot_metric``, builds
    an index with ``n_trees`` trees, and compares the index answer for every
    point with the exact one. The result is averaged over all points and
    rounds.
    """
    dim = 10
    recall_sum = 0.0
    for _ in range(n_rounds):
        rows = []
        for _row in range(n_points):
            rows.append([random.gauss(0, 1) for _ in range(dim)])
        data = numpy.array(rows)

        def exact_top_n(query):
            # Brute-force ranking of all points for one query point.
            ranked = sorted(
                range(n_points), key=lambda j: dot_metric(data[query], data[j])
            )
            return ranked[:n]

        ground_truth = [exact_top_n(query) for query in range(n_points)]

        idx = AnnoyIndex(dim, "dot")
        for item, vec in enumerate(data):
            idx.add_item(item, vec)
        idx.build(n_trees)

        for query in range(n_points):
            approximate = idx.get_nns_by_vector(data[query], n)
            recall_sum += recall(approximate, ground_truth[query])

    return recall_sum / float(n_rounds * n_points)
def test_recall_at_10():
    """Top-10 recall with default settings must reach 0.65."""
    assert recall_at(10) >= 0.65


def test_recall_at_100():
    """Top-100 recall with default settings must reach 0.95."""
    assert recall_at(100) >= 0.95


def test_recall_at_1000():
    """Top-1000 recall with default settings must reach 0.99."""
    assert recall_at(1000) >= 0.99


def test_recall_at_1000_fewer_trees():
    """Top-1000 recall must still reach 0.99 with only 4 trees."""
    assert recall_at(1000, n_trees=4) >= 0.99
def test_get_nns_with_distances():
    """Check ids and distances returned when distances are requested."""
    index = AnnoyIndex(3, "dot")
    for item, vector in enumerate([[0, 0, 2], [0, 1, 1], [1, 0, 0]]):
        index.add_item(item, vector)
    index.build(10)

    # Query by stored item.
    ids, dists = index.get_nns_by_item(0, 3, -1, True)
    assert ids == [0, 1, 2]
    for position, expected in enumerate((4, 2, 0)):
        assert dists[position] == pytest.approx(expected)

    # Query by an arbitrary vector.
    ids, dists = index.get_nns_by_vector([2, 2, 2], 3, -1, True)
    assert ids == [0, 1, 2]
    for position, expected in enumerate((4, 4, 2)):
        assert dists[position] == pytest.approx(expected)
def test_include_dists():
    """Index ``v`` and ``-v``; item 0's own distance must equal ``v . v``."""
    dim = 40
    index = AnnoyIndex(dim, "dot")
    v = numpy.random.normal(size=dim)
    index.add_item(0, v)
    index.add_item(1, -v)
    index.build(10)

    indices, dists = index.get_nns_by_item(0, 2, 10, True)
    assert indices == [0, 1]
    self_product = numpy.dot(v, v)
    assert dists[0] == pytest.approx(self_product)
def test_distance_consistency():
    """Distances from queries must match ``get_distance`` and a direct dot.

    For 100 randomly sampled items, every returned neighbour distance is
    compared with the dot product of the two stored vectors and with
    ``get_distance`` for the same pair.
    """
    n, dim = 1000, 3
    index = AnnoyIndex(dim, "dot")
    for item in range(n):
        index.add_item(item, numpy.random.normal(size=dim))
    index.build(10)

    for a in random.sample(range(n), 100):
        neighbours, dists = index.get_nns_by_item(a, 100, include_distances=True)
        for b, dist in zip(neighbours, dists):
            direct = numpy.dot(index.get_item_vector(a), index.get_item_vector(b))
            assert dist == pytest.approx(direct)
            pairwise = index.get_distance(a, b)
            assert dist == pytest.approx(pairwise)
================================================
FILE: test/euclidean_index_test.py
================================================
# Copyright (c) 2013 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
import random
import numpy
import pytest
from annoy import AnnoyIndex
def test_get_nns_by_vector():
    """Check neighbour order for three query vectors on a 3-item index."""
    index = AnnoyIndex(2, "euclidean")
    for item, vector in enumerate([[2, 2], [3, 2], [3, 3]]):
        index.add_item(item, vector)
    index.build(10)

    expectations = [
        ([4, 4], [2, 1, 0]),
        ([1, 1], [0, 1, 2]),
        ([4, 2], [1, 2, 0]),
    ]
    for query, expected in expectations:
        assert index.get_nns_by_vector(query, 3) == expected
def test_get_nns_by_item():
    """Check neighbour order when querying by items 0 and 2."""
    index = AnnoyIndex(2, "euclidean")
    for item, vector in enumerate([[2, 2], [3, 2], [3, 3]]):
        index.add_item(item, vector)
    index.build(10)

    for query_item, expected in ((0, [0, 1, 2]), (2, [2, 1, 0])):
        assert index.get_nns_by_item(query_item, 3) == expected
def test_dist():
    """Check ``get_distance`` on an index that has not been built."""
    index = AnnoyIndex(2, "euclidean")
    for item, vector in enumerate([[0, 1], [1, 1], [0, 0]]):
        index.add_item(item, vector)

    # Expected values are square roots of the squared distances 1 and 2.
    first_pair = index.get_distance(0, 1)
    assert first_pair == pytest.approx(1.0**0.5)
    second_pair = index.get_distance(1, 2)
    assert second_pair == pytest.approx(2.0**0.5)
def test_large_index():
    """Each point's nearest neighbour (after itself) must be its close twin.

    Inserts 5000 pairs of points. Both members of a pair are the same random
    base vector shifted by 1 plus a small (sigma = 1e-2) jitter, so the pair
    members are much closer to each other than to any other point.
    """
    f = 10
    # The original built and discarded a random vector here (left over from a
    # removed ``q`` variable); that dead statement has been dropped.
    # TODO (from the original): the constant offset 1 was meant to be q[i].
    i = AnnoyIndex(f, "euclidean")
    for j in range(0, 10000, 2):
        p = [random.gauss(0, 1) for _ in range(f)]
        x = [1 + pi + random.gauss(0, 1e-2) for pi in p]
        y = [1 + pi + random.gauss(0, 1e-2) for pi in p]
        i.add_item(j, x)
        i.add_item(j + 1, y)

    i.build(10)
    # Every item must return itself first and then its partner.
    for j in range(0, 10000, 2):
        assert i.get_nns_by_item(j, 2) == [j, j + 1]
        assert i.get_nns_by_item(j + 1, 2) == [j + 1, j]
def precision(n, n_trees=10, n_points=10000, n_rounds=10):
    """Return the fraction of the true ``n`` nearest items that were found.

    In every round, item ``j`` is a random direction scaled to length ``j``,
    so the true nearest neighbours of the origin are items ``0 .. n-1``. The
    function counts how many returned ids are below ``n`` and averages over
    ``n_rounds``. It also asserts that the returned ids are in ascending
    order.
    """
    dim = 10
    hits = 0
    for _ in range(n_rounds):
        index = AnnoyIndex(dim, "euclidean")
        for j in range(n_points):
            direction = [random.gauss(0, 1) for _ in range(dim)]
            length = sum(c**2 for c in direction) ** 0.5
            index.add_item(j, [c / length * j for c in direction])
        index.build(n_trees)

        nns = index.get_nns_by_vector([0] * dim, n)
        assert nns == sorted(nns)  # should be in order
        # An id below n is one of the true n nearest items.
        hits += sum(1 for item in nns if item < n)

    return 1.0 * hits / (n * n_rounds)
def test_precision_1():
    """Precision for the single nearest neighbour must reach 0.98."""
    score = precision(1)
    assert score >= 0.98


def test_precision_10():
    """Precision for the 10 nearest neighbours must reach 0.98."""
    score = precision(10)
    assert score >= 0.98


def test_precision_100():
    """Precision for the 100 nearest neighbours must reach 0.98."""
    score = precision(100)
    assert score >= 0.98


def test_precision_1000():
    """Precision for the 1000 nearest neighbours must reach 0.98."""
    score = precision(1000)
    assert score >= 0.98
def test_get_nns_with_distances():
    """Check ids and squared distances when distances are requested."""
    index = AnnoyIndex(3, "euclidean")
    for item, vector in enumerate([[0, 0, 2], [0, 1, 1], [1, 0, 0]]):
        index.add_item(item, vector)
    index.build(10)

    # Query by stored item; distances are squared before comparison.
    ids, dists = index.get_nns_by_item(0, 3, -1, True)
    assert ids == [0, 1, 2]
    for position, squared in enumerate((0, 2, 5)):
        assert dists[position] ** 2 == pytest.approx(squared)

    # Query by an arbitrary vector.
    ids, dists = index.get_nns_by_vector([2, 2, 2], 3, -1, True)
    assert ids == [1, 0, 2]
    for position, squared in enumerate((6, 8, 9)):
        assert dists[position] ** 2 == pytest.approx(squared)
def test_include_dists():
    """Index ``v`` and ``-v``; item 0's distance to itself must be zero."""
    dim = 40
    index = AnnoyIndex(dim, "euclidean")
    v = numpy.random.normal(size=dim)
    index.add_item(0, v)
    index.add_item(1, -v)
    index.build(10)

    indices, dists = index.get_nns_by_item(0, 2, 10, True)
    assert indices == [0, 1]
    nearest_dist = dists[0]
    assert nearest_dist == pytest.approx(0)
def test_distance_consistency():
    """Query distances must agree with three independent computations.

    For 100 randomly sampled items, every returned neighbour distance is
    compared with ``get_distance``, with a numpy-based Euclidean norm, and
    with a pure-Python Euclidean norm of the stored vectors.
    """
    n, dim = 1000, 3
    index = AnnoyIndex(dim, "euclidean")
    for item in range(n):
        index.add_item(item, numpy.random.normal(size=dim))
    index.build(10)

    for a in random.sample(range(n), 100):
        neighbours, dists = index.get_nns_by_item(a, 100, include_distances=True)
        for b, dist in zip(neighbours, dists):
            assert dist == pytest.approx(index.get_distance(a, b))

            u = numpy.array(index.get_item_vector(a))
            v = numpy.array(index.get_item_vector(b))
            diff = u - v
            assert dist == pytest.approx(numpy.dot(diff, diff) ** 0.5)

            squared_sum = sum([(x - y) ** 2 for x, y in zip(u, v)])
            assert dist == pytest.approx(squared_sum**0.5)
def test_rounding_error():
    """The distance between two nearly equal 1-d points must not be negative.

    See https://github.com/spotify/annoy/issues/314
    """
    index = AnnoyIndex(1, "euclidean")
    index.add_item(0, [0.7125930])
    index.add_item(1, [0.7123166])
    distance = index.get_distance(0, 1)
    assert distance >= 0.0
================================================
FILE: test/examples_test.py
================================================
def execfile(fn):
    """Read the Python source file at path ``fn`` and execute it with ``exec``.

    ``exec`` is called without explicit namespaces, so the script runs
    against this function's globals and locals. The file stays open while the
    script runs.
    """
    with open(fn) as f:
        exec(f.read())
# NOTE(review): these three names do not start with ``test_``, so pytest's
# default collection rules may not pick them up -- confirm whether that is
# intended.
def simple_test():
    """Execute the ``examples/simple_test.py`` script."""
    script = "examples/simple_test.py"
    execfile(script)


def mmap_test():
    """Execute the ``examples/mmap_test.py`` script."""
    script = "examples/mmap_test.py"
    execfile(script)


def precision_test():
    """Execute the ``examples/precision_test.py`` script."""
    script = "examples/precision_test.py"
    execfile(script)
================================================
FILE: test/hamming_index_test.py
================================================
# Copyright (c) 2013 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
import numpy
import pytest
from annoy import AnnoyIndex
def test_basic_conversion():
    """Stored bit vectors must read back unchanged, with consistent distances.

    The expected distance between the two items is the count of differing
    positions, computed here as ``(u - v) . (u - v)`` on 0/1 vectors.
    """
    dim = 100
    index = AnnoyIndex(dim, "hamming")
    u = numpy.random.binomial(1, 0.5, dim)
    v = numpy.random.binomial(1, 0.5, dim)
    index.add_item(0, u)
    index.add_item(1, v)

    # Each vector must read back exactly as it was stored.
    for item, original in ((0, u), (1, v)):
        delta = original - index.get_item_vector(item)
        assert numpy.dot(delta, delta) == pytest.approx(0.0)

    # Self-distances are zero.
    for item in (0, 1):
        assert index.get_distance(item, item) == pytest.approx(0.0)

    # The cross distance is the same in both directions.
    for a, b in ((0, 1), (1, 0)):
        assert index.get_distance(a, b) == pytest.approx(numpy.dot(u - v, u - v))
def test_basic_nns():
    """Neighbour queries on a two-item "hamming" index.

    Asking for 99 neighbours must return just the two stored items, each
    query item first, and the returned distances must match the count of
    differing positions.
    """
    dim = 100
    index = AnnoyIndex(dim, "hamming")
    u = numpy.random.binomial(1, 0.5, dim)
    v = numpy.random.binomial(1, 0.5, dim)
    index.add_item(0, u)
    index.add_item(1, v)
    index.build(10)

    for query_item, expected in ((0, [0, 1]), (1, [1, 0])):
        assert index.get_nns_by_item(query_item, 99) == expected

    ids, dists = index.get_nns_by_item(0, 99, include_distances=True)
    assert ids == [0, 1]
    assert dists[0] == pytest.approx(0)
    differing = numpy.dot(u - v, u - v)
    assert dists[1] == pytest.approx(differing)
def test_save_load():
    """A "hamming" index saved to ``blah.ann`` must answer the same after reload."""
    dim = 100
    original = AnnoyIndex(dim, "hamming")
    u = numpy.random.binomial(1, 0.5, dim)
    v = numpy.random.binomial(1, 0.5, dim)
    original.add_item(0, u)
    original.add_item(1, v)
    original.build(10)
    original.save("blah.ann")

    # Query a second index object that was loaded from the saved file.
    reloaded = AnnoyIndex(dim, "hamming")
    reloaded.load("blah.ann")
    ids, dists = reloaded.get_nns_by_item(0, 99, include_distances=True)
    assert ids == [0, 1]
    assert dists[0] == pytest.approx(0)
    differing = numpy.dot(u - v, u - v)
    assert dists[1] == pytest.approx(differing)
def test_many_vectors():
    """Sanity checks on a 100000-item, 10-bit "hamming" index.

    All returned distances must lie in ``[0, f]``, and the average
    nearest-neighbour distance over 1000 random queries must be at most 0.42.
    """
    f = 10
    index = AnnoyIndex(f, "hamming")
    for item in range(100000):
        index.add_item(item, numpy.random.binomial(1, 0.5, f))
    index.build(10)

    _, all_dists = index.get_nns_by_vector([0] * f, 10000, include_distances=True)
    assert min(all_dists) >= 0
    assert max(all_dists) <= f

    nearest = []
    for _ in range(1000):
        query = numpy.random.binomial(1, 0.5, f)
        _, ds = index.get_nns_by_vector(
            query, 1, search_k=1000, include_distances=True
        )
        nearest.append(ds[0])
    avg_dist = 1.0 * sum(nearest) / len(nearest)
    assert avg_dist <= 0.42
@pytest.mark.skip  # will fix later
def test_zero_vectors():
    """Regression case from the annoy-user list (currently skipped).

    Builds a 64-bit "hamming" index from fixed bit strings, saves it to
    ``idx.ann``, reloads it, and checks the nearest neighbours of item 0.
    """
    bitstrings = [
        "0000000000011000001110000011111000101110111110000100000100000000",
        "0000000000011000001110000011111000101110111110000100000100000001",
        "0000000000011000001110000011111000101110111110000100000100000010",
        "0010010100011001001000010001100101011110000000110000011110001100",
        "1001011010000110100101101001111010001110100001101000111000001110",
        "0111100101111001011110010010001100010111000111100001101100011111",
        "0011000010011101000011010010111000101110100101111000011101001011",
        "0011000010011100000011010010111000101110100101111000011101001011",
        "1001100000111010001010000010110000111100100101001001010000000111",
        "0000000000111101010100010001000101101001000000011000001101000000",
        "1000101001010001011100010111001100110011001100110011001111001100",
        "1110011001001111100110010001100100001011000011010010111100100111",
    ]
    f = 64
    built = AnnoyIndex(f, "hamming")
    for item, bitstring in enumerate(bitstrings):
        built.add_item(item, [int(bit) for bit in bitstring])
    built.build(10)
    built.save("idx.ann")

    # Query a fresh index object loaded from the saved file.
    reloaded = AnnoyIndex(f, "hamming")
    reloaded.load("idx.ann")
    js, ds = reloaded.get_nns_by_item(0, 5, include_distances=True)
    assert js[0] == 0
    assert ds[:4] == [0, 1, 1, 22]
================================================
FILE: test/holes_test.py
================================================
# Copyright (c) 2013 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.
import random
import numpy
from annoy import AnnoyIndex
def test_random_holes():
    """Queries on an index with gaps in its ids must return only stored ids.

    1000 ids are sampled from ``range(2000)``, so about half of the id space
    is left empty. Every id returned by item and vector queries must be one
    that was actually added.
    """
    f = 10
    index = AnnoyIndex(f, "angular")
    valid_indices = random.sample(range(2000), 1000)  # leave holes
    # Set membership keeps the checks below O(1); the original scanned the
    # 1000-element list once for every returned id.
    valid_set = set(valid_indices)
    for i in valid_indices:
        v = numpy.random.normal(size=(f,))
        index.add_item(i, v)
    index.build(10)

    # Query by every stored item.
    for i in valid_indices:
        js = index.get_nns_by_item(i, 10000)
        for j in js:
            assert j in valid_set

    # Query by fresh random vectors.
    for _ in range(1000):
        v = numpy.random.normal(size=(f,))
        js = index.get_nns_by_vector(v, 10000)
        for j in js:
            assert j in valid_set
def _test_holes_base(n, f=100, base_i=100000):
    """Store ``n`` random items at ids starting from ``base_i`` and query them.

    Querying item ``base_i`` for ``n`` neighbours must return exactly the set
    of ids that were added.
    """
    index = AnnoyIndex(f, "angular")
    for offset in range(n):
        index.add_item(base_i + offset, numpy.random.normal(size=(f,)))
    index.build(100)

    found = index.get_nns_by_item(base_i, n)
    expected_ids = set(range(base_i, base_i + n))
    assert set(found) == expected_ids
def test_root_one_child():
    """Run the holes check with a single item.

    See https://github.com/spotify/annoy/issues/223
    """
    _test_holes_base(n=1)


def test_root_two_children():
    """Run the holes check with two items."""
    _test_holes_base(n=2)


def test_root_some_children():
    """Run the holes check with ten items.

    See https://github.com/spotify/annoy/issues/295
    """
    _test_holes_base(n=10)


def test_root_many_children():
    """Run the holes check with 1000 items."""
    _test_holes_base(n=1000)
================================================
FILE: test/index_test.py
================================================
# Copyright (c) 2013 Spotify AB
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the sp
gitextract__vrufcg9/ ├── .github/ │ └── workflows/ │ ├── ci.yml │ └── publish.yml ├── .gitignore ├── CMakeLists.txt ├── LICENSE ├── MANIFEST.in ├── README.rst ├── README_GO.rst ├── README_Lua.md ├── RELEASE.md ├── annoy/ │ ├── __init__.py │ ├── __init__.pyi │ └── py.typed ├── annoy-dev-1.rockspec ├── debian/ │ ├── changelog │ ├── compat │ ├── control │ └── rules ├── examples/ │ ├── mmap_test.py │ ├── precision_test.cpp │ ├── precision_test.py │ ├── s_compile_cpp.sh │ └── simple_test.py ├── setup.cfg ├── setup.py ├── src/ │ ├── annoygomodule.h │ ├── annoygomodule.i │ ├── annoylib.h │ ├── annoyluamodule.cc │ ├── annoymodule.cc │ ├── kissrandom.h │ └── mman.h ├── test/ │ ├── accuracy_test.py │ ├── angular_index_test.py │ ├── annoy_test.go │ ├── annoy_test.lua │ ├── dot_index_test.py │ ├── euclidean_index_test.py │ ├── examples_test.py │ ├── hamming_index_test.py │ ├── holes_test.py │ ├── index_test.py │ ├── manhattan_index_test.py │ ├── memory_leak_test.py │ ├── multithreaded_build_test.py │ ├── on_disk_build_test.py │ ├── seed_test.py │ ├── threading_test.py │ └── types_test.py └── tox.ini
SYMBOL INDEX (294 symbols across 24 files)
FILE: annoy/__init__.pyi
class _Vector (line 5) | class _Vector(Protocol, Sized):
method __getitem__ (line 6) | def __getitem__(self, __index: int) -> float: ...
class AnnoyIndex (line 8) | class AnnoyIndex:
method __init__ (line 10) | def __init__(self, f: int, metric: Literal["angular", "euclidean", "ma...
method load (line 11) | def load(self, fn: str, prefault: bool = ...) -> Literal[True]: ...
method save (line 12) | def save(self, fn: str, prefault: bool = ...) -> Literal[True]: ...
method get_nns_by_item (line 14) | def get_nns_by_item(self, i: int, n: int, search_k: int = ..., include...
method get_nns_by_item (line 16) | def get_nns_by_item(
method get_nns_by_item (line 20) | def get_nns_by_item(
method get_nns_by_vector (line 24) | def get_nns_by_vector(
method get_nns_by_vector (line 28) | def get_nns_by_vector(
method get_nns_by_vector (line 32) | def get_nns_by_vector(
method get_item_vector (line 35) | def get_item_vector(self, __i: int) -> list[float]: ...
method add_item (line 36) | def add_item(self, i: int, vector: _Vector) -> None: ...
method on_disk_build (line 37) | def on_disk_build(self, fn: str) -> Literal[True]: ...
method build (line 38) | def build(self, n_trees: int, n_jobs: int = ...) -> Literal[True]: ...
method unbuild (line 39) | def unbuild(self) -> Literal[True]: ...
method unload (line 40) | def unload(self) -> Literal[True]: ...
method get_distance (line 41) | def get_distance(self, __i: int, __j: int) -> float: ...
method get_n_items (line 42) | def get_n_items(self) -> int: ...
method get_n_trees (line 43) | def get_n_trees(self) -> int: ...
method verbose (line 44) | def verbose(self, __v: bool) -> Literal[True]: ...
method set_seed (line 45) | def set_seed(self, __s: int) -> None: ...
FILE: examples/precision_test.cpp
function precision (line 20) | int precision(int f=40, int n=1000000){
function help (line 132) | void help(){
function feedback (line 140) | void feedback(int f, int n){
function main (line 148) | int main(int argc, char **argv) {
FILE: src/annoygomodule.h
function namespace (line 6) | namespace GoAnnoy {
function class (line 174) | class AnnoyIndexEuclidean : public AnnoyIndex {
function class (line 182) | class AnnoyIndexManhattan : public AnnoyIndex {
function class (line 190) | class AnnoyIndexDotProduct : public AnnoyIndex {
FILE: src/annoylib.h
function namespace (line 125) | namespace Annoy {
function normalize (line 435) | inline void normalize(Node* node, int f) {
function update_mean (line 444) | inline void update_mean(Node* mean, Node* new_node, T norm, int c, int f) {
function Base (line 450) | struct Angular : Base {
function Angular (line 538) | struct DotProduct : Angular {
function Base (line 706) | struct Hamming : Base {
function Base (line 806) | struct Minkowski : Base {
function Minkowski (line 843) | struct Euclidean : Minkowski {
function Minkowski (line 874) | struct Manhattan : Minkowski {
function virtual (line 918) | virtual S get_n_items() const = 0;
function add_item (line 982) | bool add_item(S item, const T* w, char** error=NULL) {
function on_disk_build (line 1012) | bool on_disk_build(const char* file, char** error=NULL) {
function unbuild (line 1080) | bool unbuild(char** error=NULL) {
function reinitialize (line 1129) | void reinitialize() {
function unload (line 1141) | void unload() {
function T (line 1224) | T get_distance(S i, S j) const {
function get_nns_by_item (line 1228) | void get_nns_by_item(S item, size_t n, int search_k, vector<S>* result, ...
function get_nns_by_vector (line 1234) | void get_nns_by_vector(const T* w, size_t n, int search_k, vector<S>* re...
function verbose (line 1246) | void verbose(bool v) {
function get_item (line 1250) | void get_item(S item, T* v) const {
function set_seed (line 1256) | void set_seed(R seed) {
function thread_build (line 1260) | void thread_build(int q, int thread_idx, ThreadedBuildPolicy& threaded_b...
function _allocate_size (line 1319) | void _allocate_size(S n, ThreadedBuildPolicy& threaded_build_policy) {
function _allocate_size (line 1327) | void _allocate_size(S n) {
function Node (line 1333) | Node* _get(const S i) const {
function _split_imbalance (line 1337) | double _split_imbalance(const vector<S>& left_indices, const vector<S>& ...
function S (line 1344) | S _make_tree(const vector<S>& indices, bool is_root, Random& _random, Th...
function _get_all_nns (line 1447) | void _get_all_nns(const T* v, size_t n, int search_k, vector<S>* result,...
function class (line 1507) | class AnnoyIndexSingleThreadedBuildPolicy {
function class (line 1529) | class AnnoyIndexMultiThreadedBuildPolicy {
function lock_n_nodes (line 1564) | void lock_n_nodes() {
function unlock_n_nodes (line 1567) | void unlock_n_nodes() {
function lock_nodes (line 1571) | void lock_nodes() {
function unlock_nodes (line 1574) | void unlock_nodes() {
function lock_shared_nodes (line 1578) | void lock_shared_nodes() {
function unlock_shared_nodes (line 1581) | void unlock_shared_nodes() {
function lock_roots (line 1585) | void lock_roots() {
function unlock_roots (line 1588) | void unlock_roots() {
FILE: src/annoyluamodule.cc
class LuaAnnoy (line 34) | class LuaAnnoy {
class LuaArrayProxy (line 41) | class LuaArrayProxy {
method LuaArrayProxy (line 43) | LuaArrayProxy(lua_State* L, int object, int f)
method toVector (line 64) | static void toVector(lua_State* L, int object, int f, AnnoyT* dst) {
method pushVector (line 72) | static void pushVector(lua_State* L, const Vector& v) {
method Impl (line 84) | static Impl* getAnnoy(lua_State* L, int object) {
method getItemIndex (line 90) | static int getItemIndex(lua_State* L, int object, int size = -1) {
method gc (line 99) | static int gc(lua_State* L) {
method tostring (line 105) | static int tostring(lua_State* L) {
method add_item (line 115) | static int add_item(lua_State* L) {
method build (line 122) | static int build(lua_State* L) {
method on_disk_build (line 131) | static int on_disk_build(lua_State* L) {
method save (line 139) | static int save(lua_State* L) {
method load (line 152) | static int load(lua_State* L) {
method unload (line 167) | static int unload(lua_State* L) {
type Searcher (line 174) | struct Searcher {
method Searcher (line 182) | Searcher(lua_State* L) {
method pushResults (line 196) | int pushResults(lua_State* L) {
method get_nns_by_item (line 205) | static int get_nns_by_item(lua_State* L) {
method get_nns_by_vector (line 213) | static int get_nns_by_vector(lua_State* L) {
method get_item_vector (line 223) | static int get_item_vector(lua_State* L) {
method get_distance (line 233) | static int get_distance(lua_State* L) {
method get_n_items (line 242) | static int get_n_items(lua_State* L) {
method luaL_Reg (line 248) | static const luaL_Reg* getMetatable() {
method luaL_Reg (line 257) | static const luaL_Reg* getMethods() {
method createNew (line 275) | static void createNew(lua_State* L, int f) {
function lua_an_make (line 288) | static int lua_an_make(lua_State* L) {
function luaopen_annoy (line 314) | int luaopen_annoy(lua_State* L) {
FILE: src/annoymodule.cc
class Annoy::AnnoyIndexInterface<int32_t, float> (line 65) | class Annoy::AnnoyIndexInterface<int32_t, float>
class HammingWrapper (line 67) | class HammingWrapper : public AnnoyIndexInterface<int32_t, float> {
method _pack (line 74) | void _pack(const float* src, uint64_t* dst) const {
method _unpack (line 82) | void _unpack(const uint64_t* src, float* dst) const {
method HammingWrapper (line 88) | HammingWrapper(int f) : _f_external(f), _f_internal((f + 63) / 64), _i...
method add_item (line 89) | bool add_item(int32_t item, const float* w, char**error) {
method build (line 94) | bool build(int q, int n_threads, char** error) { return _index.build(q...
method unbuild (line 95) | bool unbuild(char** error) { return _index.unbuild(error); }
method save (line 96) | bool save(const char* filename, bool prefault, char** error) { return ...
method unload (line 97) | void unload() { _index.unload(); }
method load (line 98) | bool load(const char* filename, bool prefault, char** error) { return ...
method get_distance (line 99) | float get_distance(int32_t i, int32_t j) const { return _index.get_dis...
method get_nns_by_item (line 100) | void get_nns_by_item(int32_t item, size_t n, int search_k, vector<int3...
method get_nns_by_vector (line 109) | void get_nns_by_vector(const float* w, size_t n, int search_k, vector<...
method get_n_items (line 120) | int32_t get_n_items() const { return _index.get_n_items(); }
method get_n_trees (line 121) | int32_t get_n_trees() const { return _index.get_n_trees(); }
method verbose (line 122) | void verbose(bool v) { _index.verbose(v); }
method get_item (line 123) | void get_item(int32_t item, float* v) const {
method set_seed (line 128) | void set_seed(uint64_t q) { _index.set_seed(q); }
method on_disk_build (line 129) | bool on_disk_build(const char* filename, char** error) { return _index...
function PyObject (line 140) | static PyObject *
function py_an_init (line 175) | static int
function py_an_dealloc (line 187) | static void
function PyObject (line 201) | static PyObject *
function PyObject (line 220) | static PyObject *
function PyObject (line 239) | PyObject*
function check_constraints (line 286) | bool check_constraints(py_annoy *self, int32_t item, bool building) {
function PyObject (line 298) | static PyObject*
function convert_list_to_vector (line 323) | bool
function PyObject (line 354) | static PyObject*
function PyObject (line 381) | static PyObject*
function PyObject (line 415) | static PyObject*
function PyObject (line 443) | static PyObject *
function PyObject (line 460) | static PyObject *
function PyObject (line 485) | static PyObject *
function PyObject (line 501) | static PyObject *
function PyObject (line 512) | static PyObject *
function PyObject (line 529) | static PyObject *
function PyObject (line 538) | static PyObject *
function PyObject (line 547) | static PyObject *
function PyObject (line 561) | static PyObject *
type PyModuleDef (line 641) | struct PyModuleDef
function PyObject (line 654) | PyObject *create_module(void) {
function PyMODINIT_FUNC (line 675) | PyMODINIT_FUNC PyInit_annoylib(void) {
function PyMODINIT_FUNC (line 679) | PyMODINIT_FUNC initannoylib(void) {
FILE: src/kissrandom.h
function namespace (line 11) | namespace Annoy {
function kiss (line 68) | struct Kiss64Random {
function flip (line 104) | inline int flip() {
function index (line 108) | inline size_t index(size_t n) {
function set_seed (line 112) | inline void set_seed(uint64_t seed) {
FILE: src/mman.h
function __map_mman_error (line 42) | static int __map_mman_error(const DWORD err, const int deferr)
function DWORD (line 50) | static DWORD __map_mmap_prot_page(const int prot)
function DWORD (line 71) | static DWORD __map_mmap_prot_file(const int prot)
function munmap (line 159) | inline int munmap(void *addr, size_t len)
function mprotect (line 169) | inline int mprotect(void *addr, size_t len, int prot)
function msync (line 182) | inline int msync(void *addr, size_t len, int flags)
function mlock (line 192) | inline int mlock(const void *addr, size_t len)
function munlock (line 202) | inline int munlock(const void *addr, size_t len)
function ftruncate (line 213) | inline int ftruncate(const int fd, const int64_t size) {
FILE: test/accuracy_test.py
function _get_index (line 29) | def _get_index(dataset, custom_distance=None, custom_dim=None):
function _test_index (line 67) | def _test_index(dataset, exp_accuracy, custom_metric=None, custom_dim=No...
function test_glove_25 (line 92) | def test_glove_25():
function test_nytimes_16 (line 96) | def test_nytimes_16():
function test_lastfm_dot (line 100) | def test_lastfm_dot():
function test_lastfm_angular (line 104) | def test_lastfm_angular():
FILE: test/angular_index_test.py
function test_get_nns_by_vector (line 23) | def test_get_nns_by_vector():
function test_get_nns_by_item (line 36) | def test_get_nns_by_item():
function test_dist (line 49) | def test_dist():
function test_dist_2 (line 58) | def test_dist_2():
function test_dist_3 (line 67) | def test_dist_3():
function test_dist_degen (line 78) | def test_dist_degen():
function test_large_index (line 87) | def test_large_index():
function precision (line 106) | def precision(n, n_trees=10, n_points=10000, n_rounds=10, search_k=100000):
function test_precision_1 (line 128) | def test_precision_1():
function test_precision_10 (line 132) | def test_precision_10():
function test_precision_100 (line 136) | def test_precision_100():
function test_precision_1000 (line 140) | def test_precision_1000():
function test_load_save_get_item_vector (line 144) | def test_load_save_get_item_vector():
function test_get_nns_search_k (line 160) | def test_get_nns_search_k():
function test_include_dists (line 172) | def test_include_dists():
function test_include_dists_check_ranges (line 187) | def test_include_dists_check_ranges():
function test_distance_consistency (line 198) | def test_distance_consistency():
function test_only_one_item (line 228) | def test_only_one_item():
function test_no_items (line 242) | def test_no_items():
function test_single_vector (line 257) | def test_single_vector():
FILE: test/annoy_test.go
type AnnoyTestSuite (line 29) | type AnnoyTestSuite struct
method SetupTest (line 42) | func (suite *AnnoyTestSuite) SetupTest() {
method TestFileHandling (line 45) | func (suite *AnnoyTestSuite) TestFileHandling() {
method TestOnDiskBuild (line 109) | func (suite *AnnoyTestSuite) TestOnDiskBuild() {
method TestGetNnsByVector (line 146) | func (suite *AnnoyTestSuite) TestGetNnsByVector() {
method TestGetNnsByItem (line 203) | func (suite *AnnoyTestSuite) TestGetNnsByItem() {
method TestGetItem (line 222) | func (suite *AnnoyTestSuite) TestGetItem() {
method TestGetDistance (line 244) | func (suite *AnnoyTestSuite) TestGetDistance() {
method TestGetDotProductDistance (line 255) | func (suite *AnnoyTestSuite) TestGetDotProductDistance() {
method TestLargeEuclideanIndex (line 267) | func (suite *AnnoyTestSuite) TestLargeEuclideanIndex() {
function Round (line 33) | func Round(f float64) float64 {
function RoundPlus (line 37) | func RoundPlus(f float64, places int) float64 {
function TestAnnoyTestSuite (line 300) | func TestAnnoyTestSuite(t *testing.T) {
FILE: test/dot_index_test.py
function dot_metric (line 23) | def dot_metric(a, b):
function recall (line 27) | def recall(retrieved, relevant):
function test_get_nns_by_vector (line 31) | def test_get_nns_by_vector():
function test_get_nns_by_item (line 44) | def test_get_nns_by_item():
function test_dist (line 56) | def test_dist():
function recall_at (line 68) | def recall_at(n, n_trees=10, n_points=1000, n_rounds=5):
function test_recall_at_10 (line 98) | def test_recall_at_10():
function test_recall_at_100 (line 103) | def test_recall_at_100():
function test_recall_at_1000 (line 108) | def test_recall_at_1000():
function test_recall_at_1000_fewer_trees (line 113) | def test_recall_at_1000_fewer_trees():
function test_get_nns_with_distances (line 118) | def test_get_nns_with_distances():
function test_include_dists (line 139) | def test_include_dists():
function test_distance_consistency (line 152) | def test_distance_consistency():
FILE: test/euclidean_index_test.py
function test_get_nns_by_vector (line 23) | def test_get_nns_by_vector():
function test_get_nns_by_item (line 36) | def test_get_nns_by_item():
function test_dist (line 48) | def test_dist():
function test_large_index (line 59) | def test_large_index():
function precision (line 77) | def precision(n, n_trees=10, n_points=10000, n_rounds=10):
function test_precision_1 (line 99) | def test_precision_1():
function test_precision_10 (line 103) | def test_precision_10():
function test_precision_100 (line 107) | def test_precision_100():
function test_precision_1000 (line 111) | def test_precision_1000():
function test_get_nns_with_distances (line 115) | def test_get_nns_with_distances():
function test_include_dists (line 136) | def test_include_dists():
function test_distance_consistency (line 149) | def test_distance_consistency():
function test_rounding_error (line 167) | def test_rounding_error():
FILE: test/examples_test.py
function execfile (line 1) | def execfile(fn):
function simple_test (line 6) | def simple_test():
function mmap_test (line 10) | def mmap_test():
function precision_test (line 14) | def precision_test():
FILE: test/hamming_index_test.py
function test_basic_conversion (line 22) | def test_basic_conversion():
function test_basic_nns (line 39) | def test_basic_nns():
function test_save_load (line 55) | def test_save_load():
function test_many_vectors (line 72) | def test_many_vectors():
function test_zero_vectors (line 94) | def test_zero_vectors():
FILE: test/holes_test.py
function test_random_holes (line 22) | def test_random_holes():
function _test_holes_base (line 41) | def _test_holes_base(n, f=100, base_i=100000):
function test_root_one_child (line 50) | def test_root_one_child():
function test_root_two_children (line 55) | def test_root_two_children():
function test_root_some_children (line 59) | def test_root_some_children():
function test_root_many_children (line 64) | def test_root_many_children():
FILE: test/index_test.py
function test_not_found_tree (line 23) | def test_not_found_tree():
function test_binary_compatibility (line 29) | def test_binary_compatibility():
function test_load_unload (line 37) | def test_load_unload():
function test_construct_load_destruct (line 45) | def test_construct_load_destruct():
function test_construct_destruct (line 51) | def test_construct_destruct():
function test_save_twice (line 57) | def test_save_twice():
function test_load_save (line 67) | def test_load_save():
function test_save_without_build (line 92) | def test_save_without_build():
function test_unbuild_with_loaded_tree (line 101) | def test_unbuild_with_loaded_tree():
function test_seed (line 108) | def test_seed():
function test_unknown_distance (line 114) | def test_unknown_distance():
function test_metric_kwarg (line 119) | def test_metric_kwarg():
function test_metric_f_kwargs (line 128) | def test_metric_f_kwargs():
function test_item_vector_after_save (line 132) | def test_item_vector_after_save():
function test_prefault (line 149) | def test_prefault():
function test_fail_save (line 155) | def test_fail_save():
function test_overwrite_index (line 161) | def test_overwrite_index():
function test_get_n_trees (line 194) | def test_get_n_trees():
function test_write_failed (line 200) | def test_write_failed():
function test_dimension_mismatch (line 219) | def test_dimension_mismatch():
function test_add_after_save (line 234) | def test_add_after_save():
function test_build_twice (line 248) | def test_build_twice():
function test_very_large_index (line 259) | def test_very_large_index():
FILE: test/manhattan_index_test.py
function test_get_nns_by_vector (line 23) | def test_get_nns_by_vector():
function test_get_nns_by_item (line 36) | def test_get_nns_by_item():
function test_dist (line 48) | def test_dist():
function test_large_index (line 59) | def test_large_index():
function precision (line 76) | def precision(n, n_trees=10, n_points=10000, n_rounds=10):
function test_precision_1 (line 98) | def test_precision_1():
function test_precision_10 (line 102) | def test_precision_10():
function test_precision_100 (line 106) | def test_precision_100():
function test_precision_1000 (line 110) | def test_precision_1000():
function test_get_nns_with_distances (line 114) | def test_get_nns_with_distances():
function test_include_dists (line 135) | def test_include_dists():
function test_distance_consistency (line 148) | def test_distance_consistency():
FILE: test/memory_leak_test.py
function test_get_item_vector (line 21) | def test_get_item_vector():
function test_get_lots_of_nns (line 31) | def test_get_lots_of_nns():
function test_build_unbuid (line 40) | def test_build_unbuid():
function test_include_distances (line 54) | def test_include_distances():
FILE: test/multithreaded_build_test.py
function _test_building_with_threads (line 6) | def _test_building_with_threads(n_jobs):
function test_one_thread (line 16) | def test_one_thread():
function test_two_threads (line 20) | def test_two_threads():
function test_four_threads (line 24) | def test_four_threads():
function test_eight_threads (line 28) | def test_eight_threads():
FILE: test/on_disk_build_test.py
function setUp (line 23) | def setUp():
function add_items (line 28) | def add_items(i):
function check_nns (line 34) | def check_nns(i):
function test_on_disk (line 40) | def test_on_disk():
FILE: test/seed_test.py
function test_seeding (line 20) | def test_seeding():
FILE: test/threading_test.py
function test_threads (line 22) | def test_threads():
FILE: test/types_test.py
function test_numpy (line 23) | def test_numpy(n_points=1000, n_trees=10):
function test_tuple (line 36) | def test_tuple(n_points=1000, n_trees=10):
function test_wrong_length (line 45) | def test_wrong_length(n_points=1000, n_trees=10):
function test_range_errors (line 57) | def test_range_errors(n_points=1000, n_trees=10):
function test_missing_len (line 74) | def test_missing_len():
function test_missing_getitem (line 89) | def test_missing_getitem():
function test_short (line 105) | def test_short():
function test_non_float (line 122) | def test_non_float():
Condensed preview — 50 files, each showing path, character count, and a content snippet. Download the .json file or copy for the full structured content (228K chars).
[
{
"path": ".github/workflows/ci.yml",
"chars": 575,
"preview": "name: Annoy\n\non:\n push:\n branches:\n - main\n pull_request:\n\njobs:\n unit-tests:\n runs-on: ubuntu-22.04\n s"
},
{
"path": ".github/workflows/publish.yml",
"chars": 1940,
"preview": "name: Publish\n\non:\n push:\n tags:\n - 'v*.*.*'\n\njobs:\n build:\n runs-on: ${{ matrix.os }}\n strategy:\n "
},
{
"path": ".gitignore",
"chars": 128,
"preview": "*.egg-info/\n*.egg/\n*.so\n*.o\nbuild/\ndist/\n.vscode/\n*.pdb\n\nMANIFEST\n*.py[cod]\n*.idea\n\n# testing\n*.ann\n*.tree\n*.annoy\n*.idx"
},
{
"path": "CMakeLists.txt",
"chars": 917,
"preview": "cmake_minimum_required(VERSION 3.15...3.25 FATAL_ERROR)\n\nproject(Annoy\n DESCRIPTION \"Approximate Nearest Neighbors Oh Y"
},
{
"path": "LICENSE",
"chars": 11362,
"preview": "\n Apache License\n Version 2.0, January 2004\n "
},
{
"path": "MANIFEST.in",
"chars": 102,
"preview": "include README.rst LICENSE ann.png\ninclude src/annoylib.h\ninclude src/kissrandom.h\ninclude src/mman.h\n"
},
{
"path": "README.rst",
"chars": 12483,
"preview": "Annoy\n-----\n\n\n\n.. figure:: https://raw.github.com/spotify/annoy/master/ann.png\n :alt: Annoy example\n :align: center\n"
},
{
"path": "README_GO.rst",
"chars": 2028,
"preview": "Install\n-------\n\nTo install, you'll need Swig (tested with Swig 4.2.1 on Ubuntu 24.04), and then just::\n\n swig -go -int"
},
{
"path": "README_Lua.md",
"chars": 2150,
"preview": "Install\n-------\n\nTo install, you'll need Lua (binary + library) and LuaRocks.\n\nIf you have Python and Pip, you can get L"
},
{
"path": "RELEASE.md",
"chars": 517,
"preview": "How to release\n--------------\n\n1. Make sure you're on master. `git checkout master && git fetch && git reset --hard orig"
},
{
"path": "annoy/__init__.py",
"chars": 684,
"preview": "# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use th"
},
{
"path": "annoy/__init__.pyi",
"chars": 2030,
"preview": "\nfrom typing import Sized, overload\nfrom typing_extensions import Literal, Protocol\n\nclass _Vector(Protocol, Sized):\n "
},
{
"path": "annoy/py.typed",
"chars": 0,
"preview": ""
},
{
"path": "annoy-dev-1.rockspec",
"chars": 1712,
"preview": "-- Copyright (c) 2016 Boris Nagaev\n--\n-- Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n-- "
},
{
"path": "debian/changelog",
"chars": 147,
"preview": "spotify-annoy (1.0.0) unstable; urgency=low\n\n * Initial release.\n\n -- Erik Bernhardsson <erikbern@spotify.com> Wed, 20"
},
{
"path": "debian/compat",
"chars": 2,
"preview": "7\n"
},
{
"path": "debian/control",
"chars": 407,
"preview": "Source: spotify-annoy\nSection: non-free/net\nPriority: extra\nMaintainer: Erik Bernhardsson <erikbern@spotify.com>\nBuild-D"
},
{
"path": "debian/rules",
"chars": 30,
"preview": "#!/usr/bin/make -f\n\n%:\n\tdh $@\n"
},
{
"path": "examples/mmap_test.py",
"chars": 288,
"preview": "from annoy import AnnoyIndex\n\na = AnnoyIndex(3, 'angular')\na.add_item(0, [1, 0, 0])\na.add_item(1, [0, 1, 0])\na.add_item("
},
{
"path": "examples/precision_test.cpp",
"chars": 4835,
"preview": "/*\n * precision_test.cpp\n\n *\n * Created on: Jul 13, 2016\n * Author: Claudio Sanhueza\n * Contact: csanhuezalob"
},
{
"path": "examples/precision_test.py",
"chars": 1074,
"preview": "from __future__ import print_function\nimport random, time\nfrom annoy import AnnoyIndex\n\ntry:\n xrange\nexcept NameError"
},
{
"path": "examples/s_compile_cpp.sh",
"chars": 172,
"preview": "#!/bin/bash\n\n\necho \"compiling precision example...\"\ncmd=\"g++ precision_test.cpp -DANNOYLIB_MULTITHREADED_BUILD -o precis"
},
{
"path": "examples/simple_test.py",
"chars": 229,
"preview": "from annoy import AnnoyIndex\n\na = AnnoyIndex(3, 'angular')\na.add_item(0, [1, 0, 0])\na.add_item(1, [0, 1, 0])\na.add_item("
},
{
"path": "setup.cfg",
"chars": 36,
"preview": "[nosetests]\nattr=!slow\nnocapture=1\n\n"
},
{
"path": "setup.py",
"chars": 4249,
"preview": "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\n#\n# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, V"
},
{
"path": "src/annoygomodule.h",
"chars": 4894,
"preview": "#include \"annoylib.h\"\n#include \"kissrandom.h\"\n\nusing namespace Annoy;\n\nnamespace GoAnnoy {\n\n\nclass AnnoyVectorFloat {\n "
},
{
"path": "src/annoygomodule.i",
"chars": 2975,
"preview": "%module annoy\n\nnamespace Annoy {}\n\n%{\n#include \"annoygomodule.h\"\n%}\n\n\n// const float *\n%typemap(gotype) (const float *) "
},
{
"path": "src/annoylib.h",
"chars": 49435,
"preview": "// Copyright (c) 2013 Spotify AB\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n// us"
},
{
"path": "src/annoyluamodule.cc",
"chars": 8493,
"preview": "// Copyright (c) 2016 Boris Nagaev\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n// "
},
{
"path": "src/annoymodule.cc",
"chars": 21226,
"preview": "// Copyright (c) 2013 Spotify AB\n//\n// Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n// us"
},
{
"path": "src/kissrandom.h",
"chars": 2633,
"preview": "#ifndef ANNOY_KISSRANDOM_H\n#define ANNOY_KISSRANDOM_H\n\n#if defined(_MSC_VER) && _MSC_VER == 1500\ntypedef unsigned __int3"
},
{
"path": "src/mman.h",
"chars": 5791,
"preview": "\n// This is from https://code.google.com/p/mman-win32/\n// \n// Licensed under MIT\n\n#ifndef _MMAN_WIN32_H\n#define _MMAN_WI"
},
{
"path": "test/accuracy_test.py",
"chars": 2981,
"preview": "# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use th"
},
{
"path": "test/angular_index_test.py",
"chars": 7595,
"preview": "# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use th"
},
{
"path": "test/annoy_test.go",
"chars": 8277,
"preview": "/*\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use this file except in compliance wi"
},
{
"path": "test/annoy_test.lua",
"chars": 16466,
"preview": "-- Copyright (c) 2016 Boris Nagaev\n--\n-- Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n-- "
},
{
"path": "test/dot_index_test.py",
"chars": 4209,
"preview": "# Copyright (c) 2018 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use th"
},
{
"path": "test/euclidean_index_test.py",
"chars": 4874,
"preview": "# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use th"
},
{
"path": "test/examples_test.py",
"chars": 250,
"preview": "def execfile(fn):\n with open(fn) as f:\n exec(f.read())\n\n\ndef simple_test():\n execfile(\"examples/simple_test"
},
{
"path": "test/hamming_index_test.py",
"chars": 4238,
"preview": "# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use th"
},
{
"path": "test/holes_test.py",
"chars": 1837,
"preview": "# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use th"
},
{
"path": "test/index_test.py",
"chars": 7093,
"preview": "# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use th"
},
{
"path": "test/manhattan_index_test.py",
"chars": 4554,
"preview": "# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use th"
},
{
"path": "test/memory_leak_test.py",
"chars": 1785,
"preview": "# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use th"
},
{
"path": "test/multithreaded_build_test.py",
"chars": 560,
"preview": "import numpy\n\nfrom annoy import AnnoyIndex\n\n\ndef _test_building_with_threads(n_jobs):\n n, f = 10000, 10\n n_trees ="
},
{
"path": "test/on_disk_build_test.py",
"chars": 1333,
"preview": "# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use th"
},
{
"path": "test/seed_test.py",
"chars": 1108,
"preview": "# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use th"
},
{
"path": "test/threading_test.py",
"chars": 950,
"preview": "# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use th"
},
{
"path": "test/types_test.py",
"chars": 3611,
"preview": "# Copyright (c) 2013 Spotify AB\n#\n# Licensed under the Apache License, Version 2.0 (the \"License\"); you may not\n# use th"
},
{
"path": "tox.ini",
"chars": 1120,
"preview": "[tox]\nenvlist=py{26,27,33,34,35,36,37,38,39,310,311,312,313}, go, lua\n\n[testenv]\nsetenv =\n TRAVIS = {env:TRAVIS:}\ncomma"
}
]
About this extraction
This page contains the full source code of the spotify/annoy GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction includes 50 files (211.3 KB), approximately 64.6k tokens, and a symbol index with 294 extracted functions, classes, methods, constants, and types. Use this with OpenClaw, Claude, ChatGPT, Cursor, Windsurf, or any other AI tool that accepts text input. You can copy the full output to your clipboard or download it as a .txt file.
Extracted by GitExtract — free GitHub repo to text converter for AI. Built by Nikandr Surkov.