Repository: jina-ai/annlite
Branch: main
Commit: f0174dee0af6
Files: 97
Total size: 1.1 MB
Directory structure:
gitextract_ifbk854h/
├── .gitattributes
├── .github/
│ ├── release-template.ejs
│ ├── requirements-test.txt
│ └── workflows/
│ ├── cd.yml
│ ├── ci.yml
│ ├── force-release.yml
│ └── tag.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CHANGELOG.md
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── annlite/
│ ├── __init__.py
│ ├── container.py
│ ├── core/
│ │ ├── __init__.py
│ │ ├── codec/
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── pq.py
│ │ │ ├── projector.py
│ │ │ └── vq.py
│ │ └── index/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── flat_index.py
│ │ ├── hnsw/
│ │ │ ├── __init__.py
│ │ │ └── index.py
│ │ └── pq_index.py
│ ├── enums.py
│ ├── executor.py
│ ├── filter.py
│ ├── helper.py
│ ├── hubble_tools.py
│ ├── index.py
│ ├── math.py
│ ├── profile.py
│ ├── storage/
│ │ ├── __init__.py
│ │ ├── base.py
│ │ ├── kv.py
│ │ └── table.py
│ └── utils.py
├── benchmarks/
│ ├── filtering_bench.py
│ └── hnsw_bench.py
├── bindings/
│ ├── hnsw_bindings.cpp
│ └── pq_bindings.pyx
├── examples/
│ ├── annlite_vs_simpleindexer.py
│ ├── filter_example.py
│ ├── hnsw_example.py
│ ├── pq_benchmark.py
│ ├── pqlinearscann_benchmark_with_filtering.py
│ └── utils.py
├── executor/
│ ├── Dockerfile
│ ├── README.md
│ ├── benchmark.py
│ ├── config.yml
│ ├── executor.py
│ └── requirements.txt
├── include/
│ └── hnswlib/
│ ├── bruteforce.h
│ ├── fusefilter.h
│ ├── hnswalg.h
│ ├── hnswlib.h
│ ├── space_ip.h
│ ├── space_l2.h
│ ├── space_pq.h
│ └── visited_list_pool.h
├── notebooks/
│ └── fashion_product_search.ipynb
├── pyproject.toml
├── requirements.txt
├── scripts/
│ ├── black.sh
│ ├── get-all-test-paths.sh
│ ├── get-last-release-note.py
│ ├── release.sh
│ └── update-version.sh
├── setup.py
└── tests/
├── __init__.py
├── conftest.py
├── docarray/
│ ├── __init__.py
│ ├── test_add.py
│ ├── test_del.py
│ ├── test_find.py
│ ├── test_get.py
│ └── test_save_load.py
├── executor/
│ ├── __init__.py
│ └── test_executor.py
├── test_codec.py
├── test_crud.py
├── test_dump.py
├── test_enums.py
├── test_filter.py
├── test_hnsw_load_save.py
├── test_index.py
├── test_pq_bind.py
├── test_pq_index.py
├── test_projector.py
├── test_projector_index.py
├── test_store.py
└── test_table.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitattributes
================================================
notebooks/* linguist-vendored
include/* linguist-vendored
bindings/* linguist-vendored
*.h linguist-detectable=false
*.cpp linguist-detectable=false
================================================
FILE: .github/release-template.ejs
================================================
<% var groupCommits = [
{
name: 'breaking',
show: true,
list: []
}, {
name: 'feat',
show: true,
list: []
}, {
name: 'perf',
show: true,
list: []
}, {
name: 'fix',
show: true,
list: []
}, {
name: 'refactor',
show: true,
list: []
}, {
name: 'docs',
show: true,
list: []
}, {
name: 'test',
show: true,
list: []
}, {
name: 'other',
show: true,
list: []
}
]
var all_titles = {};
var all_commiters = {};
var commitHref = "https://github.com/jina-ai/docarray/commit/"
commits.forEach(function (commit) {
var result = (commit.title).match(/^(\w*)(\((.*)\))?\: (.*)$/);
var type = result && result[1];
var scope = result && result[3];
var title = result && result[4];
var committer = commit.authorName
if (!(committer in all_commiters)) {
all_commiters[committer] = 1
}
if (!(title in all_titles)) {
all_titles[title] = 1
if( title != null && (title.indexOf('💥')>-1 || title.indexOf(':boom:')>-1) ){
groupCommits.find(item => item.name === 'breaking').list.push({
type: type,
scope: scope,
title: title,
commit: commit
})
} else if(type == 'fix' || type == 'fixed'){
groupCommits.find(item => item.name === 'fix').list.push({
type: type,
scope: scope,
title: title,
commit: commit
})
} else if(type == 'perf' || type == 'performance'){
groupCommits.find(item => item.name === 'perf').list.push({
type: type,
scope: scope,
title: title,
commit: commit
})
} else if(type == 'feat' || type == 'feature'){
groupCommits.find(item => item.name === 'feat').list.push({
type: type,
scope: scope,
title: title,
commit: commit
})
} else if(type == 'refactor'){
groupCommits.find(item => item.name === 'refactor').list.push({
type: type,
scope: scope,
title: title,
commit: commit
})
} else if(type == 'docs' || type == 'doc'){
groupCommits.find(item => item.name === 'docs').list.push({
type: type,
scope: scope,
title: title,
commit: commit
})
} else if(type == 'test' || type == 'tests' || type == 'ci'){
groupCommits.find(item => item.name === 'test').list.push({
type: type,
scope: scope,
title: title,
commit: commit
})
} else {
groupCommits.find(item => item.name === 'other').list.push({
type: type,
scope: scope,
title: title,
commit: commit
})
}
}
});
var listCommits = function(list, key){
list.forEach(function (ct) {
var type = ct.type;
var scope = ct.scope;
var title = '';
var commit = ct.commit;
if(type){
if(key != 'other'){
title = (scope? '__'+scope+'__: ':'') + ct.title;
}else{
title = '__' + type + (scope? '('+scope+')':'') + '__ : ' + ct.title;
}
}else{
title = commit.title;
}
%> - <% if(typeof commitHref === 'undefined' || commitHref === '') { %>[```<%=commit.sha1.slice(0,8)%>```]<% } else { %>[[```<%=commit.sha1.slice(0,8)%>```](<%=commitHref%><%=commit.sha1%>)]<%}%> __-__ <%=title%> (*<%= commit.authorName %>*)
<% })} %>
🙇 We'd like to thank all contributors for this new release! In particular,
<% Object.keys(all_commiters).forEach(function (key) { %> <%= key %>, <% }) %> 🙇
<%
for(var i of groupCommits){
if(i.list.length == 0) continue;
if (i.name === 'breaking' && i.show) { %>
### 💥 Breaking changes
<% } else if (i.name === 'fix' && i.show) { %>
### 🐞 Bug fixes
<% } else if( i.name === 'feat' && i.show) { %>
### 🆕 New Features
<% } else if(i.name === 'perf' && i.show) { %>
### ⚡ Performance Improvements
<% } else if(i.name === 'refactor' && i.show) { %>
### 🧼 Code Refactoring
<% } else if(i.name === 'docs' && i.show) { %>
### 📗 Documentation
<% } else if(i.name === 'test' && i.show) { %>
### 🏁 Unit Test and CICD
<% } else if (i.name === 'other' && i.show) { %>
### 🍹 Other Improvements
<% }
i.show && listCommits(i.list, i);
} %>
================================================
FILE: .github/requirements-test.txt
================================================
pytest
pytest-custom_exit_code
================================================
FILE: .github/workflows/cd.yml
================================================
name: CD
on:
push:
branches:
- main
jobs:
prep-testbed:
if: |
!startsWith(github.event.head_commit.message, 'chore') &&
!startsWith(github.event.head_commit.message, 'build: hotfix') &&
!endsWith(github.event.head_commit.message, 'reformatted by jina-dev-bot')
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- id: set-matrix
run: |
sudo apt-get install jq
echo "::set-output name=matrix::$(bash scripts/get-all-test-paths.sh)"
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
build-wheels:
needs: [prep-testbed]
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, windows-latest, macos-latest]
cibw_arch: ["auto64"]
python-version: [['3.7', "cp37-*"], ['3.8', "cp38-*"], ['3.9', "cp39-*"], ['3.10', "cp310-*"]]
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 100
- name: Set up Python ${{ matrix.python-version[0] }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version[0] }}
- name: Update version
shell: bash
run: |
git fetch --depth=1 origin +refs/tags/*:refs/tags/*
./scripts/update-version.sh
- name: Build sdist
if: ${{ matrix.os == 'ubuntu-latest' && matrix.python-version[0] == '3.7' }}
run: |
pip install build
python -m build --sdist
- name: Build wheels
uses: pypa/cibuildwheel@v2.10.2
with:
package-dir: ./
env:
CIBW_ENVIRONMENT: >
STAN_BACKEND="${{ env.STAN_BACKEND }}"
CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014
CIBW_BUILD: ${{ matrix.python-version[1] }}
CIBW_SKIP: "*musllinux*"
CIBW_ARCHS: ${{ matrix.cibw_arch }}
# CIBW_ARCH_MACOS: x86_64 arm64
CIBW_BUILD_FRONTEND: build
- uses: actions/upload-artifact@v3
with:
path: |
./wheelhouse/*.whl
./dist/*.tar.gz
core-test:
needs: [ prep-testbed, build-wheels ]
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ ubuntu-latest, windows-latest, macos-latest ]
cibw_arch: [ "auto64" ]
python-version: [ [ '3.7', "cp37-*" ] ]
test-path: ${{fromJson(needs.prep-testbed.outputs.matrix)}}
env:
JINA_HIDE_SURVEY: "1"
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 100
- name: Set up Python ${{ matrix.python-version[0] }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version[0] }}
- name: Prepare environment
run: |
python -m pip install --upgrade pip
pip install jina
pip install --pre docarray
pip install pytest pytest-html pytest-cov pytest-mock pytest-repeat pytest-custom-exit-code pytest-timeout pytest-reraise
- uses: actions/download-artifact@v3
with:
name: artifact
- name: Install annlite linux
if: ${{ matrix.os == 'ubuntu-latest' }}
run: |
pip install wheelhouse/*${{ matrix.python-version[1] }}**linux*.whl
- name: Install annlite macos
if: ${{ matrix.os == 'macos-latest' }}
run: |
pip install wheelhouse/*${{ matrix.python-version[1] }}**macos**x86_64*.whl
- name: Install annlite win
if: ${{ matrix.os == 'windows-latest'}}
run: |
pip install --pre --find-links=wheelhouse/ annlite
- name: Test unix
id: test_unix
if: ${{ matrix.os == 'ubuntu-latest' || matrix.os == 'macos-latest' }}
run: |
cd ..
mv annlite/tests/ ./
pytest --suppress-no-test-exit-code --cov=annlite --cov-report=xml \
-v -s -m "not gpu" ${{ matrix.test-path }}
echo "::set-output name=codecov_flag::annlite"
timeout-minutes: 30
- name: Test win
id: test_win
if: ${{ matrix.os == 'windows-latest'}}
env:
PYTHONIOENCODING: 'utf-8'
run: |
cd ..
move annlite/tests/ ./
cd tests/
pytest -v -s -m "not gpu" -k "test"
timeout-minutes: 30
- name: Check codecov file
id: check_files
uses: andstor/file-existence-action@v1
with:
files: "coverage.xml"
- name: Upload coverage from test to Codecov
uses: codecov/codecov-action@v2
if: steps.check_files.outputs.files_exists == 'true' && ${{ matrix.python-version[0] }} == '3.7' && ${{ matrix.os }} == 'ubuntu-latest'
with:
file: coverage.xml
flags: ${{ steps.test.outputs.codecov_flag }}
fail_ci_if_error: false
token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
prerelease:
needs: [core-test]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 100
- uses: actions/download-artifact@v3
with:
name: artifact
- name: Pre-release (.devN)
run: |
git fetch --depth=1 origin +refs/tags/*:refs/tags/*
pip install twine
./scripts/release.sh
env:
TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
JINA_SLACK_WEBHOOK: ${{ secrets.JINA_SLACK_WEBHOOK }}
================================================
FILE: .github/workflows/ci.yml
================================================
name: CI
on:
pull_request:
jobs:
commit-lint:
runs-on: ubuntu-latest
steps:
- name: find the prev warning if exist
uses: peter-evans/find-comment@v1
id: fc
with:
issue-number: ${{ github.event.pull_request.number }}
comment-author: "github-actions[bot]"
body-includes: "bad commit message"
- name: Delete comment if exist
if: ${{ steps.fc.outputs.comment-id != 0 }}
uses: actions/github-script@v3
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
github.issues.deleteComment({
owner: context.repo.owner,
repo: context.repo.repo,
comment_id: ${{ steps.fc.outputs.comment-id }},
})
- uses: actions/checkout@v2
with:
fetch-depth: 0
- run: 'echo "module.exports = {extends: [''@commitlint/config-conventional'']}" > commitlint.config.js'
- uses: wagoid/commitlint-github-action@v1
env:
GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}"
- name: if lint failed
if: ${{ failure() }}
uses: peter-evans/create-or-update-comment@v1
with:
issue-number: ${{ github.event.pull_request.number }}
body: |
Thanks for your contribution :heart:
:broken_heart: Unfortunately, this PR has one or more **bad commit messages**, so it cannot be merged. To fix this problem, please refer to:
- [Commit Message Guideline for the First Time Contributor](https://github.com/jina-ai/jina/issues/553)
- [Contributing Guideline](https://github.com/jina-ai/jina/blob/master/CONTRIBUTING.md)
Note that other CI tests will *not* start until the commit messages are fixed.
This message will be deleted automatically once the commit messages are fixed.
reaction-type: "eyes"
# lint-flake-8:
# runs-on: ubuntu-latest
# steps:
# - uses: actions/checkout@v2
# - name: Set up Python 3.7
# uses: actions/setup-python@v2
# with:
# python-version: 3.7
# - name: Lint with flake8
# run: |
# pip install flake8
# # stop the build if there are Python syntax errors or undefined names
# flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --exclude .git,__pycache__,docs/source/conf.py,old,build,dist,tests/,jina/resources/,bindings
# # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
# flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude .git,__pycache__,docs/source/conf.py,old,build,dist,tests/,jina/resources/
check-black:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
- name: Set up Python 3.7
uses: actions/setup-python@v2
with:
python-version: 3.7
- id: file_changes
uses: Ana06/get-changed-files@v1.2
- name: check black
run: ./scripts/black.sh
env:
CHANGED_FILES: ${{ steps.file_changes.outputs.added_modified }}
prep-testbed:
runs-on: ubuntu-latest
needs: [commit-lint, check-black]
steps:
- uses: actions/checkout@v2
- id: set-matrix
run: |
sudo apt-get install jq
echo "::set-output name=matrix::$(bash scripts/get-all-test-paths.sh)"
outputs:
matrix: ${{ steps.set-matrix.outputs.matrix }}
core-test:
needs: [prep-testbed]
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, windows-latest]
cpython-version: ["cp37-*"]
python-version: [3.7]
cibw_arch: ["auto64"]
test-path: ${{fromJson(needs.prep-testbed.outputs.matrix)}}
env:
JINA_HIDE_SURVEY: "1"
steps:
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- uses: actions/checkout@v3
- name: Prepare environment
run: |
python -m pip install --upgrade pip
pip install jina
pip install --pre docarray
pip install pytest pytest-html pytest-cov pytest-mock pytest-repeat pytest-custom-exit-code pytest-timeout pytest-reraise
- name: Build wheel
uses: pypa/cibuildwheel@v2.10.2
with:
package-dir: ./
env:
CIBW_ENVIRONMENT: >
STAN_BACKEND="${{ env.STAN_BACKEND }}"
CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014
CIBW_BUILD: ${{ matrix.cpython-version }}
CIBW_SKIP: "*musllinux*"
CIBW_ARCHS: ${{ matrix.cibw_arch }}
CIBW_BUILD_FRONTEND: build
- name: Install annlite unix
if: ${{ matrix.os == 'ubuntu-latest' }}
run: |
pip install wheelhouse/*linux*.whl
- name: Install annlite win
if: ${{ matrix.os == 'windows-latest'}}
run: |
pip install --pre --find-links=wheelhouse/ annlite
- name: Test unix
id: test_unix
if: ${{ matrix.os == 'ubuntu-latest' || matrix.os == 'macos-latest' }}
run: |
cd ..
mv annlite/tests/ ./
pytest --suppress-no-test-exit-code --cov=annlite --cov-report=xml \
-v -s -m "not gpu" ${{ matrix.test-path }}
echo "::set-output name=codecov_flag::annlite"
timeout-minutes: 30
- name: Test win
id: test_win
if: ${{ matrix.os == 'windows-latest'}}
env:
PYTHONIOENCODING: 'utf-8'
run: |
cd ..
move annlite/tests/ ./
cd tests/
pytest -v -s -m "not gpu" -k "test"
echo "::set-output name=codecov_flag::annlite"
timeout-minutes: 30
- name: Check codecov file
id: check_files
uses: andstor/file-existence-action@v1
with:
files: "coverage.xml"
- name: Upload coverage from test to Codecov
uses: codecov/codecov-action@v2
if: steps.check_files.outputs.files_exists == 'true' && ${{ matrix.python-version }} == '3.7'
with:
file: coverage.xml
flags: ${{ steps.test.outputs.codecov_flag }}
fail_ci_if_error: false
token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
# just for blocking the merge until all parallel core-test are successful
success-all-test:
needs: [core-test]
if: always()
runs-on: ubuntu-latest
steps:
- uses: technote-space/workflow-conclusion-action@v2
- name: Check Failure
if: env.WORKFLOW_CONCLUSION == 'failure'
run: exit 1
- name: Success
if: ${{ success() }}
run: echo "All Done"
================================================
FILE: .github/workflows/force-release.yml
================================================
name: Manual Release
on:
workflow_dispatch:
inputs:
release_token:
description: 'Your release token'
required: true
release_reason:
description: 'Short reason for this manual release'
required: true
jobs:
token-check:
runs-on: ubuntu-latest
steps:
- run: echo "success!"
if: "${{ github.event.inputs.release_token }} == ${{ env.release_token }}"
env:
release_token: ${{ secrets.ANNLITE_RELEASE_TOKEN }}
build-wheels:
needs: [token-check]
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ ubuntu-latest, windows-latest, macos-latest ]
cibw_arch: [ "auto64" ]
python-version: [ [ '3.7', "cp37-*" ], [ '3.8', "cp38-*" ], [ '3.9', "cp39-*" ], [ '3.10', "cp310-*" ] ]
steps:
- uses: actions/checkout@v3
with:
fetch-depth: 100
- name: Set up Python ${{ matrix.python-version[0] }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version[0] }}
- name: Build sdist
if: ${{ matrix.os == 'ubuntu-latest' && matrix.python-version[0] == '3.7' }}
run: |
pip install build
python -m build --sdist
- name: Build wheels
uses: pypa/cibuildwheel@v2.10.2
with:
package-dir: ./
env:
CIBW_ENVIRONMENT: >
STAN_BACKEND="${{ env.STAN_BACKEND }}"
CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014
CIBW_BUILD: ${{ matrix.python-version[1] }}
CIBW_SKIP: "*musllinux*"
CIBW_ARCHS: ${{ matrix.cibw_arch }}
# CIBW_ARCH_MACOS: x86_64 arm64
CIBW_BUILD_FRONTEND: build
- uses: actions/upload-artifact@v3
with:
path: |
./wheelhouse/*.whl
./dist/*.tar.gz
regular-release:
needs: build-wheels
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
with:
token: ${{ secrets.JINA_DEV_BOT }}
fetch-depth: 100 # limit the fetched commit history to the last 100 commits
# submodules: true
- uses: actions/setup-python@v2
with:
python-version: 3.7
- uses: actions/download-artifact@v3
with:
name: artifact
- run: |
git fetch --depth=1 origin +refs/tags/*:refs/tags/*
npm install git-release-notes
pip install twine wheel
./scripts/release.sh final "${{ github.event.inputs.release_reason }}" "${{github.actor}}"
env:
TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
JINA_SLACK_WEBHOOK: ${{ secrets.JINA_SLACK_WEBHOOK }}
- if: failure()
run: echo "nothing to release"
- name: bump main version
uses: ad-m/github-push-action@v0.6.0
with:
github_token: ${{ secrets.JINA_DEV_BOT }}
tags: true
branch: main
================================================
FILE: .github/workflows/tag.yml
================================================
name: Release CD
on:
push:
tags:
- "v*" # push to version tags trigger the build
#on:
# push:
# branches-ignore:
# - '**' # temporally disable this action
jobs:
create-release:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v2
with:
ref: 'main'
- uses: actions/setup-python@v2
with:
python-version: 3.7
- run: |
python scripts/get-last-release-note.py
- name: Create Release
id: create_release
uses: actions/create-release@v1
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # This token is provided by Actions, you do not need to create your own token
with:
tag_name: ${{ github.ref }}
release_name: 💫 Patch ${{ github.ref }}
body_path: 'tmp.md'
draft: false
prerelease: false
================================================
FILE: .gitignore
================================================
# Initially taken from Github's Python gitignore file
# local dev
.vscode
bindings/pq_bindings.cpp
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
docs/.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
.idea/
toy*.py
.DS_Store
post/
toy*.ipynb
data/
*.c
.nes_cache
toy*.yml
*.tmp
/junit/
/tests/junit/
/docs/chapters/proto/docs.md
# IntelliJ IDEA
*.iml
.idea
# Rust
/target
Cargo.lock
toy*.py
================================================
FILE: .pre-commit-config.yaml
================================================
repos:
#- repo: https://github.com/terrencepreilly/darglint
# rev: v1.5.8
# hooks:
# - id: darglint
# files: annlite/
# exclude: docs/
# args:
# - --message-template={path}:{line} {msg_id} {msg}
# - -s=sphinx
# - -z=full
# - -v=2
#- repo: https://github.com/pycqa/pydocstyle
# rev: 5.1.1 # pick a git hash / tag to point to
# hooks:
# - id: pydocstyle
# files: annlite/
# exclude: docs/
# args:
# - --select=D101,D102,D103
- repo: https://github.com/timothycrosley/isort
rev: 5.12.0
hooks:
- id: isort
args: ["--profile", "black"]
- repo: https://github.com/psf/black
rev: 22.3.0
hooks:
- id: black
types: [python]
exclude: docs/
args:
- -S
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.0.1
hooks:
- id: trailing-whitespace
- id: check-yaml
- id: end-of-file-fixer
- id: requirements-txt-fixer
- id: double-quote-string-fixer
- id: check-merge-conflict
- id: fix-encoding-pragma
args: ["--remove"]
- id: mixed-line-ending
args: ["--fix=lf"]
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: "v14.0.3"
hooks:
- id: clang-format
================================================
FILE: CHANGELOG.md
================================================
## Release Note (`0.3.0`)
> Release time: 2022-03-04 11:49:06
🙇 We'd like to thank all contributors for this new release! In particular,
numb3r3, felix-wang, David Buchaca Prats, 🙇
### 🆕 New Features
- [[```e4afbe17```](https://github.com/jina-ai/docarray/commit/e4afbe17c724f48de19bfbe6d6c127e62a8bcd5a)] __-__ add cd workflow (*numb3r3*)
### 🐞 Bug fixes
- [[```2aeb5377```](https://github.com/jina-ai/docarray/commit/2aeb537779df6649fab143affa5c5decf8b2e364)] __-__ format (*numb3r3*)
- [[```27ab16c2```](https://github.com/jina-ai/docarray/commit/27ab16c2622606e7bf77af5286620f0cdec4abb0)] __-__ build dep (*numb3r3*)
- [[```7887ac95```](https://github.com/jina-ai/docarray/commit/7887ac95a7954c783264151cd26ae47319a47baa)] __-__ turn on upload pypi (#109) (*felix-wang*)
- [[```2e4739ac```](https://github.com/jina-ai/docarray/commit/2e4739ac2f19e78bc23889ee2a1baef79be65e5d)] __-__ version (*numb3r3*)
- [[```293a0dbf```](https://github.com/jina-ai/docarray/commit/293a0dbfe8b063c66ec4b8b710f44eb1c8ff4041)] __-__ release script (*numb3r3*)
- [[```e77c95d9```](https://github.com/jina-ai/docarray/commit/e77c95d97fe54bb409ce39931de86fb45fdc6f41)] __-__ cov (*numb3r3*)
- [[```287de379```](https://github.com/jina-ai/docarray/commit/287de379686fc4eb91afd99f32a85e3dcbc6bf27)] __-__ install in cd (*numb3r3*)
- [[```88b8e0b6```](https://github.com/jina-ai/docarray/commit/88b8e0b6c033f33d76e5ec3e7e95f3de14d7ed70)] __-__ pip install in ci (*numb3r3*)
- [[```348e0444```](https://github.com/jina-ai/docarray/commit/348e044463039c1a4ed60c43c1f83353378a86f7)] __-__ release (*numb3r3*)
- [[```beb35f29```](https://github.com/jina-ai/docarray/commit/beb35f29c1f695728e7c43b235abb97a71f797ca)] __-__ setup pytest (*numb3r3*)
- [[```e9ea4b89```](https://github.com/jina-ai/docarray/commit/e9ea4b8942d48034a7a471e2b91a6b542913f7c9)] __-__ workflow yml (*numb3r3*)
- [[```5f0e21ec```](https://github.com/jina-ai/docarray/commit/5f0e21ec85a09a1dabc618841eb9fcc4c56c34bb)] __-__ ci yml (*numb3r3*)
- [[```db58e283```](https://github.com/jina-ai/docarray/commit/db58e2835f758c94ec73fa3ccc795c286fdb4e86)] __-__ setup.py (*numb3r3*)
- [[```bdc1c7a2```](https://github.com/jina-ai/docarray/commit/bdc1c7a276086b702c807112d0603c245476944a)] __-__ __cicd__: add scripts (*numb3r3*)
### 🧼 Code Refactoring
- [[```9c693adb```](https://github.com/jina-ai/docarray/commit/9c693adb438742b633d04f554e442ceff9da923d)] __-__ remove pqlite namings (*David Buchaca Prats*)
- [[```793f4711```](https://github.com/jina-ai/docarray/commit/793f4711c42d3a717e9e60e3548c222f1ecc7de2)] __-__ rename project (*numb3r3*)
### 🍹 Other Improvements
- [[```f0c6d809```](https://github.com/jina-ai/docarray/commit/f0c6d809d47a6142f5ccf40c7cc7cf24045ad2e5)] __-__ bump version (*numb3r3*)
## Release Note (`0.3.1`)
> Release time: 2022-03-04 16:29:52
🙇 We'd like to thank all contributors for this new release! In particular,
felix-wang, Jina Dev Bot, 🙇
### 🆕 New Features
- [[```44298038```](https://github.com/jina-ai/docarray/commit/44298038e4e79f3dd0c3532ba6ad5b64d1a35caf)] __-__ add tag release workflow (#110) (*felix-wang*)
### 🍹 Other Improvements
- [[```d15289f9```](https://github.com/jina-ai/docarray/commit/d15289f9912f158c7b9a7ec7d5f58188ecb19cf2)] __-__ __version__: the next version will be 0.3.1 (*Jina Dev Bot*)
## Release Note (`0.3.2`)
> Release time: 2022-06-09 10:04:30
🙇 We'd like to thank all contributors for this new release! In particular,
David Buchaca Prats, Gustavo Ye, felix-wang, Han Xiao, numb3r3, Jina Dev Bot, 🙇
### 🆕 New Features
- [[```a7804baf```](https://github.com/jina-ai/docarray/commit/a7804bafc90064f929afff62192f19e44406b5d1)] __-__ add filter method (#121) (*David Buchaca Prats*)
### 🐞 Bug fixes
- [[```9448fcab```](https://github.com/jina-ai/docarray/commit/9448fcab060efe5b59108f3cac183aeb432a81d6)] __-__ only apply black with changed py files (#118) (*felix-wang*)
- [[```e4e706e3```](https://github.com/jina-ai/docarray/commit/e4e706e313ba5cbfb7083a5dea9e75b8d2813394)] __-__ upgrade traverse path (#113) (*felix-wang*)
- [[```a0fc7cac```](https://github.com/jina-ai/docarray/commit/a0fc7cac7722d108143f6590c3435c77376f76bd)] __-__ tag yml (*numb3r3*)
### 🍹 Other Improvements
- [[```2ce1ec22```](https://github.com/jina-ai/docarray/commit/2ce1ec2283b381f5153ea60141a6bb474bbf0f0c)] __-__ __cpp/h__: clang-format (#117) (*Gustavo Ye*)
- [[```a6168400```](https://github.com/jina-ai/docarray/commit/a61684000af4518aafa41d1d9dea47766bedf247)] __-__ update readme (*Han Xiao*)
- [[```aec33c75```](https://github.com/jina-ai/docarray/commit/aec33c75b8ce883e044a828fbae7142f71dbb05a)] __-__ __version__: the next version will be 0.3.2 (*Jina Dev Bot*)
## Release Note (`0.3.3`)
> Release time: 2022-07-04 13:57:38
🙇 We'd like to thank all contributors for this new release! In particular,
felix-wang, Gustavo Ye, Jina Dev Bot, 🙇
### 🐞 Bug fixes
- [[```e240128f```](https://github.com/jina-ai/docarray/commit/e240128f403bde04bc21e1ac5ce3baa5507db687)] __-__ proto version bump (#126) (*felix-wang*)
- [[```f62809b5```](https://github.com/jina-ai/docarray/commit/f62809b5d90f4fca1762c6a1c13beee40e972212)] __-__ change input type to `data_t` (#123) (*Gustavo Ye*)
### 🍹 Other Improvements
- [[```f7f7b751```](https://github.com/jina-ai/docarray/commit/f7f7b75104a28f1860d3cdc94c87f526e742db51)] __-__ __version__: the next version will be 0.3.3 (*Jina Dev Bot*)
## Release Note (`0.3.4`)
> Release time: 2022-07-20 07:12:22
🙇 We'd like to thank all contributors for this new release! In particular,
Jie Fu, Jina Dev Bot, 🙇
### 🆕 New Features
- [[```1a9de05c```](https://github.com/jina-ai/docarray/commit/1a9de05cdcbb861cd036b2ed9a3a66fa84e707f9)] __-__ copy executor.py to annlite folder (#129) (*Jie Fu*)
### 🍹 Other Improvements
- [[```5225e6cb```](https://github.com/jina-ai/docarray/commit/5225e6cbec7df3fbf4dbd7ab404d34bb7d899a29)] __-__ __version__: the next version will be 0.3.4 (*Jina Dev Bot*)
## Release Note (`0.3.5`)
> Release time: 2022-07-29 08:17:02
🙇 We'd like to thank all contributors for this new release! In particular,
Jie Fu, Jina Dev Bot, 🙇
### 🆕 New Features
- [[```534eed81```](https://github.com/jina-ai/docarray/commit/534eed810e6b56238f0b71cc4123783236eac7fd)] __-__ support load and save hnsw (#133) (*Jie Fu*)
### 🍹 Other Improvements
- [[```1f76fb5d```](https://github.com/jina-ai/docarray/commit/1f76fb5dedc79d331dc655d6409718457c6c05d7)] __-__ Feat task queue (#131) (*Jie Fu*)
- [[```103b5bf3```](https://github.com/jina-ai/docarray/commit/103b5bf31b59ca041d7ca70b8e0442421bcdf279)] __-__ __version__: the next version will be 0.3.5 (*Jina Dev Bot*)
## Release Note (`0.3.6`)
> Release time: 2022-08-23 10:45:36
🙇 We'd like to thank all contributors for this new release! In particular,
felix-wang, Jie Fu, Gustavo Ye, Jina Dev Bot, 🙇
### 🆕 New Features
- [[```690098be```](https://github.com/jina-ai/docarray/commit/690098be576fd5d6149e0502b9d2bf100b726f27)] __-__ add pq support for hnsw searching (#122) (*Gustavo Ye*)
- [[```ad5a5fe3```](https://github.com/jina-ai/docarray/commit/ad5a5fe39293f771b9945102b45f05bebfaf3ad6)] __-__ indexer dumploader (#137) (*felix-wang*)
- [[```29019e3d```](https://github.com/jina-ai/docarray/commit/29019e3da94d0a6025c91919a8978966f073b608)] __-__ integrate annlite with projector (#136) (*Jie Fu*)
- [[```14c95986```](https://github.com/jina-ai/docarray/commit/14c9598602741a5558c02a98bd123b34cddc32b8)] __-__ implement pca (#135) (*Jie Fu*)
### 🐞 Bug fixes
- [[```9f74de8f```](https://github.com/jina-ai/docarray/commit/9f74de8f34bc51ab740a05e9bfabf0be4ec77b87)] __-__ reload duplicate storage (#143) (*felix-wang*)
- [[```7010d778```](https://github.com/jina-ai/docarray/commit/7010d77869dfd2a44ebaeb4a697bbac3e01d6970)] __-__ fix update/delete (#140) (*Jie Fu*)
- [[```8a05887d```](https://github.com/jina-ai/docarray/commit/8a05887dc68d4219190de8767c98fc5a1740e3a2)] __-__ composite pca and pq (#139) (*felix-wang*)
### 🍹 Other Improvements
- [[```157a1a9c```](https://github.com/jina-ai/docarray/commit/157a1a9c189f73f5757d2ec60c4427e41cd130bd)] __-__ __version__: the next version will be 0.3.6 (*Jina Dev Bot*)
## Release Note (`0.3.7`)
> Release time: 2022-08-26 04:48:47
🙇 We'd like to thank all contributors for this new release! In particular,
Jie Fu, Jina Dev Bot, 🙇
### 🐞 Bug fixes
- [[```01cf7a89```](https://github.com/jina-ai/docarray/commit/01cf7a8960f27abcbc414faf2d274e0d8da470db)] __-__ fix np.int64 issue (#146) (*Jie Fu*)
### 🍹 Other Improvements
- [[```b602acce```](https://github.com/jina-ai/docarray/commit/b602acce8f7f2ac085f9cbccae6fadeee1ebcb85)] __-__ __version__: the next version will be 0.3.7 (*Jina Dev Bot*)
## Release Note (`0.3.8`)
> Release time: 2022-08-26 11:12:19
🙇 We'd like to thank all contributors for this new release! In particular,
felix-wang, Jina Dev Bot, 🙇
### 🐞 Bug fixes
- [[```a5eaa53e```](https://github.com/jina-ai/docarray/commit/a5eaa53e84aa374e061ca0bfddc7797fe369f8ec)] __-__ insert when not found at update (#151) (*felix-wang*)
- [[```18af2d4c```](https://github.com/jina-ai/docarray/commit/18af2d4c42ced2ca9d393456ba9722953edce383)] __-__ normalize training data for cosine metric (#150) (*felix-wang*)
### 🍹 Other Improvements
- [[```f0fb1c4b```](https://github.com/jina-ai/docarray/commit/f0fb1c4b37c89fea31fc24fb47b60181f0cd3218)] __-__ __version__: the next version will be 0.3.8 (*Jina Dev Bot*)
## Release Note (`0.3.9`)
> Release time: 2022-08-31 03:46:53
🙇 We'd like to thank all contributors for this new release! In particular,
felix-wang, Gustavo Ye, Jina Dev Bot, 🙇
### 🐞 Bug fixes
- [[```27589456```](https://github.com/jina-ai/docarray/commit/27589456fe72c3e9a1fe7d6c8d198e0e71fd9a89)] __-__ reload indexer (#154) (*felix-wang*)
- [[```cdbe7256```](https://github.com/jina-ai/docarray/commit/cdbe725691cf5cc19db5df250d6d9d384bc94437)] __-__ __py__: roll back the original decorator (#142) (*Gustavo Ye*)
### 🍹 Other Improvements
- [[```f34062b2```](https://github.com/jina-ai/docarray/commit/f34062b2249a3fce8cb51dc9e8b8497d33a54a2f)] __-__ __version__: the next version will be 0.3.9 (*Jina Dev Bot*)
## Release Note (`0.3.10`)
> Release time: 2022-09-06 09:40:07
🙇 We'd like to thank all contributors for this new release! In particular,
felix-wang, numb3r3, Jie Fu, Jina Dev Bot, 🙇
### 🐞 Bug fixes
- [[```bc7f5ae2```](https://github.com/jina-ai/docarray/commit/bc7f5ae2e8e83cef919052a9d8a3a954a1cd813f)] __-__ allow columns as dict format (#160) (*felix-wang*)
- [[```25c3ec3d```](https://github.com/jina-ai/docarray/commit/25c3ec3d616a9ece53c710d734ba71d0eb6fc19c)] __-__ thread safe index and search (#157) (*felix-wang*)
- [[```d235cb83```](https://github.com/jina-ai/docarray/commit/d235cb83a2d8399f1120b3ebf6e54392ecb37efa)] __-__ executor unittest (#156) (*felix-wang*)
### 🏁 Unit Test and CICD
- [[```e6543a28```](https://github.com/jina-ai/docarray/commit/e6543a28894aad5481d1a6a5bbd8f070ca979a90)] __-__ add docarray tests (#153) (*Jie Fu*)
### 🍹 Other Improvements
- [[```6a774648```](https://github.com/jina-ai/docarray/commit/6a774648d772ad98f42641c6094192bf4e7d1877)] __-__ update logo (*numb3r3*)
- [[```33fd66be```](https://github.com/jina-ai/docarray/commit/33fd66be1135ebe63f2a1e1366196c8712e2bc63)] __-__ update readme (*numb3r3*)
- [[```e1ad3a55```](https://github.com/jina-ai/docarray/commit/e1ad3a55d4732a115e6a925075b7b5ed768cd9f8)] __-__ update log (*numb3r3*)
- [[```c2630257```](https://github.com/jina-ai/docarray/commit/c26302572de4e755d64ffee71b017e798f790e9e)] __-__ update readme (#149) (*felix-wang*)
- [[```c1fd8cfe```](https://github.com/jina-ai/docarray/commit/c1fd8cfe746d94de2ebef843f6e13384f266773b)] __-__ __version__: the next version will be 0.3.10 (*Jina Dev Bot*)
## Release Note (`0.3.11`)
> Release time: 2022-09-08 06:17:25
🙇 We'd like to thank all contributors for this new release! In particular,
felix-wang, Jina Dev Bot, 🙇
### 🐞 Bug fixes
- [[```0b435eb6```](https://github.com/jina-ai/docarray/commit/0b435eb6c5a737d370b389ba3c2395796febe95f)] __-__ bump pybind11 (#162) (*felix-wang*)
### 🍹 Other Improvements
- [[```9c43f02c```](https://github.com/jina-ai/docarray/commit/9c43f02c5ec9face32ee58858b8ab25d65edbf12)] __-__ __version__: the next version will be 0.3.11 (*Jina Dev Bot*)
## Release Note (`0.3.12`)
> Release time: 2022-09-19 05:15:49
🙇 We'd like to thank all contributors for this new release! In particular,
felix-wang, YangXiuyu, numb3r3, Jina Dev Bot, 🙇
### 🆕 New Features
- [[```4eb61d3d```](https://github.com/jina-ai/docarray/commit/4eb61d3dea5a86d5212dafaaa0c1ecd87abc399a)] __-__ add limit and offset for filtering (#167) (*felix-wang*)
### 📗 Documentation
- [[```6ecfa833```](https://github.com/jina-ai/docarray/commit/6ecfa833e3d0f69a1a1b4992724753c89fa8d96f)] __-__ fix typo (#168) (*YangXiuyu*)
### 🍹 Other Improvements
- [[```6902a8b1```](https://github.com/jina-ai/docarray/commit/6902a8b15763e73fd1cd90827ccfdc758f2fce32)] __-__ add description about hnsw parameters (#169) (*felix-wang*)
- [[```aec2d605```](https://github.com/jina-ai/docarray/commit/aec2d605d43394638168282c78e13f7f2150200a)] __-__ update readme (*numb3r3*)
- [[```95541bad```](https://github.com/jina-ai/docarray/commit/95541bad119dd2bb03e9894f32f57578cd3d8a7a)] __-__ __version__: the next version will be 0.3.12 (*Jina Dev Bot*)
## Release Note (`0.3.13`)
> Release time: 2022-09-26 02:04:59
🙇 We'd like to thank all contributors for this new release! In particular,
felix-wang, Jie Fu, numb3r3, Jina Dev Bot, 🙇
### 🆕 New Features
- [[```f98a8336```](https://github.com/jina-ai/docarray/commit/f98a83368e9fc81d82abb2be9c8a6569ee45e177)] __-__ change sqlite in_memory to false (#173) (*Jie Fu*)
### 🐞 Bug fixes
- [[```b5f8694f```](https://github.com/jina-ai/docarray/commit/b5f8694fb5c33af15cc9b4a140a0165bf3aa01f4)] __-__ __hnswlib__: cannot find sufficient data with filtering (#176) (*felix-wang*)
- [[```0aa07863```](https://github.com/jina-ai/docarray/commit/0aa078633d7b6b850a3ea0ffe651dbad06240cc0)] __-__ __ci__: unittest (#175) (*felix-wang*)
### 🧼 Code Refactoring
- [[```896c5006```](https://github.com/jina-ai/docarray/commit/896c5006a6cac5f6b144bcb452ec5b2169ca4c88)] __-__ use rockdb as the storage backend (#171) (*felix-wang*)
### 🍹 Other Improvements
- [[```66257b41```](https://github.com/jina-ai/docarray/commit/66257b4147836a8e8b7c17dd12537143509e78a2)] __-__ Revert "refactor: use rockdb as the storage backend (#171)" (*numb3r3*)
- [[```e56aae75```](https://github.com/jina-ai/docarray/commit/e56aae757e65e5f8294a15b6ba998e00bda97e1a)] __-__ __version__: the next version will be 0.3.13 (*Jina Dev Bot*)
## Release Note (`0.4.0`)
> Release time: 2022-10-24 09:53:20
🙇 We'd like to thank all contributors for this new release! In particular,
numb3r3, Jie Fu, YangXiuyu, felix-wang, Ziniu Yu, Gustavo Ye, Jina Dev Bot, 🙇
### 🆕 New Features
- [[```1cd07f2c```](https://github.com/jina-ai/docarray/commit/1cd07f2c33e4f1f4e756baab4a6cd0a1452996cf)] __-__ hub as remote storage (#177) (*YangXiuyu*)
- [[```90db6006```](https://github.com/jina-ai/docarray/commit/90db60067205d8e64a63c0c7eb4e782ac04d1033)] __-__ add flag for close (#191) (*Jie Fu*)
- [[```3b782900```](https://github.com/jina-ai/docarray/commit/3b7829004edfb4f6cf4b77048a747a0d338c21b3)] __-__ cibuildwheels (#186) (*YangXiuyu*)
### 🐞 Bug fixes
- [[```8c807937```](https://github.com/jina-ai/docarray/commit/8c80793733e96680d2ca846371ebc3c684f1d5d7)] __-__ restore when initializing annlite (#201) (*Jie Fu*)
- [[```98f94d0f```](https://github.com/jina-ai/docarray/commit/98f94d0f4ff1bd0b1bc1b99d828d6fd21932dc2b)] __-__ set is_train=true after loading pca (#199) (*Jie Fu*)
- [[```0b9bb413```](https://github.com/jina-ai/docarray/commit/0b9bb413abe631ea07bd7358160060b16cafa827)] __-__ unittest (#198) (*YangXiuyu*)
- [[```0d1d5a2e```](https://github.com/jina-ai/docarray/commit/0d1d5a2e58eb043397f24a1d6efcee07c22e2cb1)] __-__ pre-release (*numb3r3*)
- [[```fdcf3818```](https://github.com/jina-ai/docarray/commit/fdcf3818f1b8a2b11df96f4c6c72e972bab91238)] __-__ cd release version (#197) (*YangXiuyu*)
- [[```f4bbc495```](https://github.com/jina-ai/docarray/commit/f4bbc49529a2ac4174bbb0cc7c88ab8666bd034c)] __-__ clean codes (*numb3r3*)
- [[```6bedafe4```](https://github.com/jina-ai/docarray/commit/6bedafe4c49ce8c4831b4cc01c1b8702924e4b8f)] __-__ test-paths (*numb3r3*)
- [[```beb79b23```](https://github.com/jina-ai/docarray/commit/beb79b23a1b1412bad402410a0b90393f0e535b6)] __-__ __cd__: combine build and test (*numb3r3*)
- [[```19652ed0```](https://github.com/jina-ai/docarray/commit/19652ed0b6ef0ffc9dc1c3d2dc2640075e4d6d13)] __-__ cd workflow (#196) (*YangXiuyu*)
- [[```4fe2b3be```](https://github.com/jina-ai/docarray/commit/4fe2b3be51ed0b571c6f0b9c9f20a68c18b7486c)] __-__ cd workflow (#195) (*YangXiuyu*)
- [[```4fdacde2```](https://github.com/jina-ai/docarray/commit/4fdacde2a22a5e6fb0edc15bb6f77bf4d8f25834)] __-__ cd workflow (#193) (*YangXiuyu*)
- [[```494fbfcb```](https://github.com/jina-ai/docarray/commit/494fbfcbb96257168542f56ec617a9dc9082c084)] __-__ cd release (#192) (*felix-wang*)
- [[```d2803ce0```](https://github.com/jina-ai/docarray/commit/d2803ce00e41a69470a3001e4677927b407c3282)] __-__ cd workflow (#190) (*felix-wang*)
- [[```fe9db3d9```](https://github.com/jina-ai/docarray/commit/fe9db3d9f7e04cbad96832409dd1d3159195c060)] __-__ build on apple silicon (#188) (*Ziniu Yu*)
- [[```bb402ae9```](https://github.com/jina-ai/docarray/commit/bb402ae9674a8410686b8dde3aea82f1b86fc10b)] __-__ __bindings__: build on windows (#183) (*YangXiuyu*)
- [[```efc18f80```](https://github.com/jina-ai/docarray/commit/efc18f80228ef2371c19fbb205cabb94afd47385)] __-__ update executor parameter (#180) (*YangXiuyu*)
- [[```f142089a```](https://github.com/jina-ai/docarray/commit/f142089a419bba3d7e23b2c34b16455ddbcb805d)] __-__ executor tests (#179) (*felix-wang*)
- [[```79e171d4```](https://github.com/jina-ai/docarray/commit/79e171d411f31dad7231bce1682ac78ff7b3e1b2)] __-__ __annliteindexer__: annlite executor integration (#170) (*YangXiuyu*)
### 🧼 Code Refactoring
- [[```e8c59907```](https://github.com/jina-ai/docarray/commit/e8c59907540bada5282a413b43d64f11596678d3)] __-__ use rocksdb as the docs storage engine (#178) (*felix-wang*)
### 🍹 Other Improvements
- [[```e6dfcd2a```](https://github.com/jina-ai/docarray/commit/e6dfcd2a7ff0fdd149dd606487f7aa0107775c63)] __-__ bump to v0.4.0 (*numb3r3*)
- [[```6ce03fd0```](https://github.com/jina-ai/docarray/commit/6ce03fd04e5bb63da3c3d2cf1c620ac77daf6c2a)] __-__ release (*numb3r3*)
- [[```02857ec4```](https://github.com/jina-ai/docarray/commit/02857ec4103e32139f695adac4f7a40c4e65c67a)] __-__ Add pq dist table support (#158) (*Gustavo Ye*)
- [[```ff54290b```](https://github.com/jina-ai/docarray/commit/ff54290bfb19d07b953aa00d5d52cb1e8f805d3a)] __-__ __version__: the next version will be 0.3.14 (*Jina Dev Bot*)
## Release Note (`0.5.0`)
> Release time: 2022-10-26 07:12:58
🙇 We'd like to thank all contributors for this new release! In particular,
numb3r3, felix-wang, Jina Dev Bot, 🙇
### 🐞 Bug fixes
- [[```de5273ab```](https://github.com/jina-ai/docarray/commit/de5273ab7cf81ba26bb4307cd66f3fc50974b8ef)] __-__ undo wheels uploading (#203) (*felix-wang*)
### 🍹 Other Improvements
- [[```5faf1dfa```](https://github.com/jina-ai/docarray/commit/5faf1dfa3326213c3b9e20de0d8b7aca4dbdf6ba)] __-__ regular release (*numb3r3*)
- [[```95e51f26```](https://github.com/jina-ai/docarray/commit/95e51f269ac130b9d54b004734ff68afc07a9cf1)] __-__ __version__: the next version will be 0.4.1 (*Jina Dev Bot*)
## Release Note (`0.5.1`)
> Release time: 2022-11-03 03:29:55
🙇 We'd like to thank all contributors for this new release! In particular,
numb3r3, Jie Fu, Jina Dev Bot, 🙇
### 🆕 New Features
- [[```dd08f6f2```](https://github.com/jina-ai/docarray/commit/dd08f6f28008aaf020cb972f45e154afaf99cb28)] __-__ add file splitter and merger to support big file transfer to hubble (#202) (*Jie Fu*)
### 🍹 Other Improvements
- [[```83e45001```](https://github.com/jina-ai/docarray/commit/83e4500180e98bb967ab2ec17ca202c1702fb411)] __-__ bump 0.5.1 (*numb3r3*)
- [[```557c624b```](https://github.com/jina-ai/docarray/commit/557c624b3419b8bf70386e66a75d1958ef427738)] __-__ fix force release (*numb3r3*)
- [[```77312214```](https://github.com/jina-ai/docarray/commit/773122144a9ec39663829c64090d6116585fb6e1)] __-__ __version__: the next version will be 0.5.1 (*Jina Dev Bot*)
## Release Note (`0.5.2`)
> Release time: 2022-11-03 07:10:10
🙇 We'd like to thank all contributors for this new release! In particular,
felix-wang, Jina Dev Bot, 🙇
### 🐞 Bug fixes
- [[```3f886f10```](https://github.com/jina-ai/docarray/commit/3f886f10d17195913cea839f3e4ac8631426ad5c)] __-__ restore from latest snapshot (#204) (*felix-wang*)
### 🍹 Other Improvements
- [[```2462afa4```](https://github.com/jina-ai/docarray/commit/2462afa4451c66603cc1e25bf755c97a97b7eec1)] __-__ __version__: the next version will be 0.5.2 (*Jina Dev Bot*)
## Release Note (`0.5.4`)
> Release time: 2022-11-04 04:27:28
🙇 We'd like to thank all contributors for this new release! In particular,
felix-wang, Jina Dev Bot, 🙇
### 🐞 Bug fixes
- [[```f2ce3a9a```](https://github.com/jina-ai/docarray/commit/f2ce3a9a4b35a71a492fbe63203114e0d7fa6b52)] __-__ bump rocksdict (#206) (*felix-wang*)
### 🍹 Other Improvements
- [[```c84635c9```](https://github.com/jina-ai/docarray/commit/c84635c9854b7b3c2b5718941979720b673ed7eb)] __-__ __version__: the next version will be 0.5.3 (*Jina Dev Bot*)
## Release Note (`0.5.5`)
> Release time: 2022-12-22 03:27:38
🙇 We'd like to thank all contributors for this new release! In particular,
Jie Fu, Jina Dev Bot, 🙇
### 🐞 Bug fixes
- [[```61e0a57f```](https://github.com/jina-ai/docarray/commit/61e0a57f901b087fbe6cab16aefb1d46d5fd54a2)] __-__ fix offset factory (#208) (*Jie Fu*)
### 🍹 Other Improvements
- [[```f6fdc9d3```](https://github.com/jina-ai/docarray/commit/f6fdc9d34fd8e8de47e7ac68e6813956c2aeebfe)] __-__ __version__: the next version will be 0.5.5 (*Jina Dev Bot*)
## Release Note (`0.5.6`)
> Release time: 2023-02-08 09:06:21
🙇 We'd like to thank all contributors for this new release! In particular,
Jie Fu, Jina Dev Bot, 🙇
### 🐞 Bug fixes
- [[```906ed7d4```](https://github.com/jina-ai/docarray/commit/906ed7d4159527e8b5f00cdc9106433925f04a43)] __-__ save and load offset2ids for list_like feature (#210) (*Jie Fu*)
### 🏁 Unit Test and CICD
- [[```a2519f0f```](https://github.com/jina-ai/docarray/commit/a2519f0f323d0c3cb2d68ad1cd3eeed4beb40ce6)] __-__ add test for offset2ids saving and loading (#212) (*Jie Fu*)
### 🍹 Other Improvements
- [[```68215807```](https://github.com/jina-ai/docarray/commit/68215807953eed366e1d6f48fbbbbc1abebf13ff)] __-__ __version__: the next version will be 0.5.6 (*Jina Dev Bot*)
## Release Note (`0.5.7`)
> Release time: 2023-02-21 12:34:55
🙇 We'd like to thank all contributors for this new release! In particular,
Jie Fu, Jina Dev Bot, 🙇
### 🐞 Bug fixes
- [[```4a66d0c0```](https://github.com/jina-ai/docarray/commit/4a66d0c06e7497cb94cd18da505a2e309594b20c)] __-__ remove delete tags for cell_table and meta_table (#215) (*Jie Fu*)
### 🍹 Other Improvements
- [[```eee752fe```](https://github.com/jina-ai/docarray/commit/eee752fe091dc5b5b519bb2aa866cb7d5dc48822)] __-__ __version__: the next version will be 0.5.7 (*Jina Dev Bot*)
## Release Note (`0.5.8`)
> Release time: 2023-03-24 11:13:35
🙇 We'd like to thank all contributors for this new release! In particular,
felix-wang, YangXiuyu, Jina Dev Bot, 🙇
### 🐞 Bug fixes
- [[```bfeb7e94```](https://github.com/jina-ai/docarray/commit/bfeb7e94c879db263dc4931b54b6b87dac09df81)] __-__ delete and update (#222) (*felix-wang*)
- [[```7783b30f```](https://github.com/jina-ai/docarray/commit/7783b30fd55a10bd2dd0c75dc0451f0fe1834927)] __-__ __hnsw__: bump hnswlib to v0.6.2 (#185) (*YangXiuyu*)
### 🍹 Other Improvements
- [[```4c145ddd```](https://github.com/jina-ai/docarray/commit/4c145ddd19abb4caec479941d1c0ffb03c4cfcf3)] __-__ __version__: the next version will be 0.5.8 (*Jina Dev Bot*)
## Release Note (`0.5.9`)
> Release time: 2023-04-07 04:16:28
🙇 We'd like to thank all contributors for this new release! In particular,
felix-wang, Jina Dev Bot, 🙇
### 🐞 Bug fixes
- [[```ab807ff6```](https://github.com/jina-ai/docarray/commit/ab807ff69e533ec90e0b0f6c782749267324d177)] __-__ bump rocksdb >= 0.3.9 (#225) (*felix-wang*)
### 🍹 Other Improvements
- [[```3668c3fa```](https://github.com/jina-ai/docarray/commit/3668c3fa6f7a2d2af39549987637e11e385a75bb)] __-__ __version__: the next version will be 0.5.9 (*Jina Dev Bot*)
## Release Note (`0.5.10`)
> Release time: 2023-04-19 03:12:50
🙇 We'd like to thank all contributors for this new release! In particular,
Jie Fu, Jina Dev Bot, 🙇
### 🍹 Other Improvements
- [[```d55d544e```](https://github.com/jina-ai/docarray/commit/d55d544e876653bb47e79e2dd00f859c3ef00ffa)] __-__ bumping docarray (#227) (*Jie Fu*)
- [[```86057c69```](https://github.com/jina-ai/docarray/commit/86057c69401e2b6d63822be2c3c3a0f63d4661a6)] __-__ __version__: the next version will be 0.5.10 (*Jina Dev Bot*)
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: MANIFEST.in
================================================
include setup.py
include requirements.txt
include pyproject.toml
global-include *.pyx
recursive-include include/hnswlib/ *
================================================
FILE: Makefile
================================================
pypi: dist
twine upload dist/*
dist:
rm -rf dist/*
pip install build
python -m build --sdist
test:
python -m unittest discover --start-directory tests --pattern "*_test*.py"
clean:
rm -rf *.egg-info build dist tmp var tests/__pycache__ annlite/*.so
.PHONY: dist
================================================
FILE: README.md
================================================
A fast embedded library for approximate nearest neighbor search
## What is AnnLite?
`AnnLite` is a *lightweight* and *embeddable* library for **fast** and **filterable** *approximate nearest neighbor search* (ANNS).
It lets you search for nearest neighbors in a dataset of millions of points with a Pythonic API.
**Highlighted features:**
- 🐥 **Easy-to-use**: a simple, Pythonic API that is intuitive to use and straightforward to take to production.
- 🐎 **Fast**: the library uses a highly optimized approximate nearest neighbor search algorithm (*HNSW*) to search for nearest neighbors.
- 🔎 **Filterable**: the library allows you to search for nearest neighbors within a subset of the dataset.
- 🍱 **Integration**: smooth integration with the neural search ecosystem, including [Jina](https://github.com/jina-ai/jina) and [DocArray](https://github.com/jina-ai/docarray),
so that users can easily expose a search API over **gRPC** and/or **HTTP**.
The library is easy to install and is designed to be used directly from Python.
## Installation
The easiest way to install AnnLite is using `pip`:
```bash
pip install -U annlite
```
or install from source:
```bash
python setup.py install
```
## Quick start
Before you start, you should have some experience with [DocArray](https://github.com/jina-ai/docarray):
`AnnLite` is designed to be used together with [DocArray](https://github.com/jina-ai/docarray), so make sure you know how to use it first.
For example, you can create a `DocumentArray` with `1000` random vectors of `128` dimensions:
```python
from docarray import DocumentArray
import numpy as np
docs = DocumentArray.empty(1000)
docs.embeddings = np.random.random([1000, 128]).astype(np.float32)
```
### Index
Then you can create an `AnnLite` index for the created `docs` and search for nearest neighbors:
```python
from annlite import AnnLite
ann = AnnLite(128, metric='cosine', data_path="/tmp/annlite_data")
ann.index(docs)
```
Note that this will create a directory `/tmp/annlite_data` to persist the documents indexed.
If this directory already exists, the index will be loaded from the directory.
And if you want to create a new index from scratch, you can delete the directory first, as shown below.
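For example, a minimal way to make sure a fresh index is built is to remove the directory (the same `data_path` as above) before creating the `AnnLite` instance:
```python
import shutil

# remove any previously persisted index so that a new one is created from scratch
shutil.rmtree('/tmp/annlite_data', ignore_errors=True)
```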
### Search
Then you can search for nearest neighbors for some query docs with `ann.search()`:
```python
query = DocumentArray.empty(5)
query.embeddings = np.random.random([5, 128]).astype(np.float32)
result = ann.search(query)
```
Then, you can inspect the retrieved docs for each query doc via its `matches`:
```python
for q in query:
print(f'Query {q.id}')
for k, m in enumerate(q.matches):
print(f'{k}: {m.id} {m.scores["cosine"]}')
```
```bash
Query ddbae2073416527bad66ff186543eff8
0: 47dcf7f3fdbe3f0b8d73b87d2a1b266f {'value': 0.17575037}
1: 7f2cbb8a6c2a3ec7be024b750964f317 {'value': 0.17735684}
2: 2e7eed87f45a87d3c65c306256566abb {'value': 0.17917466}
Query dda90782f6514ebe4be4705054f74452
0: 6616eecba99bd10d9581d0d5092d59ce {'value': 0.14570713}
1: d4e3147fc430de1a57c9883615c252c6 {'value': 0.15338594}
2: 5c7b8b969d4381f405b8f07bc68f8148 {'value': 0.15743542}
...
```
Or shorten the loop to a one-liner using the element & attribute selector:
```python
print(query['@m', ('id', 'scores__cosine')])
```
### Query
You can get a specific document by its id:
```python
doc = ann.get_doc_by_id('')
```
And you can also get the documents with `limit` and `offset`, which is useful for pagination:
```python
docs = ann.get_docs(limit=10, offset=0)
```
Furthermore, you can also get the documents ordered by a specific column from the index:
```python
docs = ann.get_docs(limit=10, offset=0, order_by='x', ascending=True)
```
**Note**: the `order_by` column must be one of the `columns` in the index.
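For example, a minimal sketch (the column name `x` is only for illustration; columns are declared via the `columns` parameter described in the filtering section below):
```python
from annlite import AnnLite

# 'x' must be declared as a filterable column so that it can be used with `order_by`
ann = AnnLite(128, columns=[('x', float)], data_path='/tmp/annlite_order_by')
docs = ann.get_docs(limit=10, offset=0, order_by='x', ascending=True)
```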
### Update
After you have indexed the `docs`, you can update the docs in the index by calling `ann.update()`:
```python
updated_docs = docs.sample(10)
updated_docs.embeddings = np.random.random([10, 128]).astype(np.float32)
ann.update(updated_docs)
```
### Delete
And finally, you can delete the docs from the index by calling `ann.delete()`:
```python
to_delete = docs.sample(10)
ann.delete(to_delete)
```
## Search with filters
To support search with filters, the `AnnLite` index must be created with the `columns` parameter, which lists the fields you want to filter by.
At query time, AnnLite filters the dataset according to the conditions provided for those fields.
```python
import annlite
# the column schema: (name:str, dtype:type, create_index: bool)
ann = annlite.AnnLite(128, columns=[('price', float)], data_path="/tmp/annlite_data")
```
Then you can insert the docs, each of which has a `price` field with a float value stored in its `tags`:
```python
import random

from docarray import Document, DocumentArray

docs = DocumentArray(
    [
        Document(id=f'{i}', tags={'price': random.random()})
        for i in range(1000)
    ]
)
docs.embeddings = np.random.random([1000, 128]).astype(np.float32)
ann.index(docs)
```
Then you can search for nearest neighbors with filtering conditions:
```python
query = DocumentArray.empty(5)
query.embeddings = np.random.random([5, 128]).astype(np.float32)
ann.search(query, filter={"price": {"$lte": 0.5}}, limit=10)

print('the result with filtering:')
for i, q in enumerate(query):
print(f'query [{i}]:')
for m in q.matches:
print(f'\t{m.id} {m.scores["euclidean"].value} (price={m.tags["price"]})')
```
The `filter` parameter is a dictionary of conditions: each key is a field name and each value is a dictionary of conditions on that field.
The query language is the same as the [MongoDB Query Language](https://docs.mongodb.com/manual/reference/operator/query/).
We currently support a subset of its selectors:
- `$eq` - Equal to (number, string)
- `$ne` - Not equal to (number, string)
- `$gt` - Greater than (number)
- `$gte` - Greater than or equal to (number)
- `$lt` - Less than (number)
- `$lte` - Less than or equal to (number)
- `$in` - Included in an array
- `$nin` - Not included in an array
A document is returned only if the conditions on its fields are satisfied. The following are examples of queries:
1. Nike shoes in white:
```python
{
"brand": {"$eq": "Nike"},
"category": {"$eq": "Shoes"},
"color": {"$eq": "White"}
}
```
We also support boolean operators `$or` and `$and`:
```python
{
"$and":
{
"brand": {"$eq": "Nike"},
"category": {"$eq": "Shoes"},
"color": {"$eq": "White"}
}
}
```
2. Nike shoes, or a price no higher than `$100`:
```python
{
"$or":
{
"brand": {"$eq": "Nike"},
"price": {"$lte": 100}
}
}
```
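The `$in` / `$nin` selectors take a list of values, for example (a sketch with hypothetical field values):
```python
{
    "brand": {"$in": ["Nike", "Adidas"]},
    "color": {"$ne": "Black"}
}
```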
## Dump and Load
By default, the HNSW index is kept in memory. You can dump it to `data_path` by calling `.dump()`:
```python
from annlite import AnnLite
ann = AnnLite(128, metric='cosine', data_path="/path/to/data_path")
ann.index(docs)
ann.dump()
```
And you can restore the hnsw index from `data_path` if it exists:
```python
new_ann = AnnLite(128, metric='cosine', data_path="/path/to/data_path")
```
If the HNSW index was not dumped, it will be rebuilt from scratch, which can take a while.
## Supported distance metrics
AnnLite supports the following distance metrics:
| Distance | parameter | Equation |
|----------------------------------------------------------------------|----------------:|--------------------------------------------------------:|
| [Euclidean](https://en.wikipedia.org/wiki/Euclidean_distance) | `euclidean` | d = sqrt(sum((Ai-Bi)^2)) |
| [Inner product](https://en.wikipedia.org/wiki/Inner_product_space) | `inner_product` | d = 1.0 - sum(Ai\*Bi) |
| [Cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity) | `cosine` | d = 1.0 - sum(Ai\*Bi) / sqrt(sum(Ai\*Ai) * sum(Bi\*Bi)) |
Note that inner product is not an actual metric. An element can be closer to some other element than to itself.
That allows some speedup if you remove all elements that are not the closest to themselves from the index, e.g.,
`inner_product([1.0, 1.0], [1.0, 1.0]) < inner_product([1.0, 1.0], [2.0, 2.0])`
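The metric is selected with the `metric` parameter when the index is created, using the parameter names from the table above (a minimal sketch):
```python
from annlite import AnnLite

# any of 'euclidean', 'inner_product' or 'cosine' can be passed as `metric`
ann_l2 = AnnLite(128, metric='euclidean', data_path='/tmp/annlite_l2')
ann_ip = AnnLite(128, metric='inner_product', data_path='/tmp/annlite_ip')
```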
## HNSW algorithm parameters
The HNSW algorithm has several parameters that can be tuned to improve the search performance.
### Search parameters
- `ef_search` - The size of the dynamic list for the nearest neighbors during search (default: `50`).
The larger the value, the more accurate the search results, but the slower the search speed.
The `ef_search` value must be larger than the `limit` parameter passed to `search(..., limit)` (see the sketch below).
- `limit` - The maximum number of results to return (default: `10`).
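A minimal sketch of setting both values, using the `query` documents from the quick start and assuming `ef_search` can be passed at construction time like the construction parameters below:
```python
from annlite import AnnLite

# `ef_search` controls the candidate list size; `limit` caps the number of returned results
ann = AnnLite(128, metric='cosine', data_path='/tmp/annlite_data', ef_search=100)
result = ann.search(query, limit=10)
```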
### Construction parameters
- `max_connection` - The number of bi-directional links created for every new element during construction (default: `16`).
Reasonable range is from `2` to `100`. Higher values work better for datasets with higher dimensionality and/or high recall requirements.
This parameter also affects the memory consumption during construction, which is roughly `max_connection * 8-10` bytes per stored element.
As an example, for `n_dim=4` random vectors the optimal `max_connection` for search is somewhere around `6`,
while for high-dimensional datasets a higher `max_connection` is required (e.g. `M=48-64`) for optimal performance at high recall.
The range `max_connection=12-48` is fine for most use cases.
When `max_connection` is changed, the other parameters should be updated as well.
Nonetheless, `ef_search` and `ef_construction` can be roughly estimated by assuming that `max_connection * ef_construction` stays constant.
- `ef_construction`: The size of the dynamic list for the nearest neighbors during construction (default: `200`).
Higher values give better accuracy, but increase construction time and memory consumption.
At some point, increasing `ef_construction` does not give any more accuracy.
To set `ef_construction` to a reasonable value, one can measure the recall: if the recall is lower than 0.9, then increase `ef_construction` and re-run the search.
To set these parameters, you can pass them when creating the `AnnLite` index:
```python
from annlite import AnnLite
ann = AnnLite(128, columns=[('price', float)], data_path="/tmp/annlite_data", ef_construction=200, max_connection=16)
```
## Benchmark
One can run `executor/benchmark.py` to get a quick performance overview (times are reported in seconds).
|Stored data| Indexing time | Query size=1 | Query size=8 | Query size=64|
|---|---|---|---|---|
|10000 | 2.970 | 0.002 | 0.013 | 0.100|
|100000 | 76.474 | 0.011 | 0.078 | 0.649|
|500000 | 467.936 | 0.046 | 0.356 | 2.823|
|1000000 | 1025.506 | 0.091 | 0.695 | 5.778|
Results with filtering can be generated from `examples/benchmark_with_filtering.py`. This script should produce a table similar to:
| Stored data |% same filter| Indexing time | Query size=1 | Query size=8 | Query size=64|
|-----|-----|-----|-----|-----|-----|
| 10000 | 5 | 2.869 | 0.004 | 0.030 | 0.270 |
| 10000 | 15 | 2.869 | 0.004 | 0.035 | 0.294 |
| 10000 | 20 | 3.506 | 0.005 | 0.038 | 0.287 |
| 10000 | 30 | 3.506 | 0.005 | 0.044 | 0.356 |
| 10000 | 50 | 3.506 | 0.008 | 0.064 | 0.484 |
| 10000 | 80 | 2.869 | 0.013 | 0.098 | 0.910 |
| 100000 | 5 | 75.960 | 0.018 | 0.134 | 1.092 |
| 100000 | 15 | 75.960 | 0.026 | 0.211 | 1.736 |
| 100000 | 20 | 78.475 | 0.034 | 0.265 | 2.097 |
| 100000 | 30 | 78.475 | 0.044 | 0.357 | 2.887 |
| 100000 | 50 | 78.475 | 0.068 | 0.565 | 4.383 |
| 100000 | 80 | 75.960 | 0.111 | 0.878 | 6.815 |
| 500000 | 5 | 497.744 | 0.069 | 0.561 | 4.439 |
| 500000 | 15 | 497.744 | 0.134 | 1.064 | 8.469 |
| 500000 | 20 | 440.108 | 0.152 | 1.199 | 9.472 |
| 500000 | 30 | 440.108 | 0.212 | 1.650 | 13.267 |
| 500000 | 50 | 440.108 | 0.328 | 2.637 | 21.961 |
| 500000 | 80 | 497.744 | 0.580 | 4.602 | 36.986 |
| 1000000 | 5 | 1052.388 | 0.131 | 1.031 | 8.212 |
| 1000000 | 15 | 1052.388 | 0.263 | 2.191 | 16.643 |
| 1000000 | 20 | 980.598 | 0.351 | 2.659 | 21.193 |
| 1000000 | 30 | 980.598 | 0.461 | 3.713 | 29.794 |
| 1000000 | 50 | 980.598 | 0.732 | 5.975 | 47.356 |
| 1000000 | 80 | 1052.388 | 1.151 | 9.255 | 73.552 |
Note that:
- query times are reported in seconds.
- `% same filter` indicates the share of stored data that satisfies the filter.
- For example, if `% same filter = 10` and `Stored data = 1_000_000`, then `100_000` examples satisfy the filter.
## Next steps
If you already have experience with Jina and DocArray, you can start using `AnnLite` right away.
Otherwise, you can check out the advanced tutorial [here]() to learn how to use `AnnLite` in practice.
## 🙋 FAQ
**1. Why should I use `AnnLite`?**
`AnnLite` is easy to use and intuitive to set up in production. It is also very fast and memory efficient, making it a great choice for approximate nearest neighbor search.
**2. How do I use `AnnLite` with Jina?**
We have implemented an executor for `AnnLite` that can be used with Jina.
```python
from jina import Flow
with Flow().add(uses='jinahub://AnnLiteIndexer', uses_with={'n_dim': 128}) as f:
f.post('/index', inputs=docs)
```
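Searching through the same Flow works analogously; a sketch, assuming the executor exposes a `/search` endpoint and reusing the `docs` and `query` DocumentArrays from the quick start above:
```python
from jina import Flow

with Flow().add(uses='jinahub://AnnLiteIndexer', uses_with={'n_dim': 128}) as f:
    f.post('/index', inputs=docs)
    result = f.post('/search', inputs=query)
    # inspect the ids and scores of the retrieved matches
    print(result['@m', ('id', 'scores__cosine')])
```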
**3. Does `AnnLite` support search with filters?**

Yes, see the [Search with filters](#search-with-filters) section above.
## Documentation
You can find the documentation on [GitHub]() and [ReadTheDocs]().
## 🤝 Contribute and spread the word
We are also looking for contributors who want to help us improve: code, documentation, issues, feedback! Here is how you can get started:
- Have a look through GitHub issues labeled "Good first issue".
- Read our Contributor Covenant Code of Conduct
- Open an issue or submit your pull request!
## License
`AnnLite` is licensed under the [Apache License 2.0]().
================================================
FILE: annlite/__init__.py
================================================
__version__ = '0.5.11'
from .index import AnnLite
================================================
FILE: annlite/container.py
================================================
import warnings
from pathlib import Path
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
import numpy as np
from docarray import Document, DocumentArray
from loguru import logger
if TYPE_CHECKING:
from .core.codec.pq import PQCodec
from .core.codec.projector import ProjectorCodec
from .core.index.hnsw import HnswIndex
from .enums import Metric
from .storage.base import ExpandMode
from .storage.kv import DocStorage
from .storage.table import CellTable, MetaTable
VALID_FILTERABLE_DATA_TYPES = [int, str, float]
class CellContainer:
def __init__(
self,
n_dim: int,
metric: Metric = Metric.COSINE,
n_cells: int = 1,
projector_codec: Optional['ProjectorCodec'] = None,
pq_codec: Optional['PQCodec'] = None,
initial_size: Optional[int] = None,
expand_step_size: int = 50000,
expand_mode: 'ExpandMode' = ExpandMode.STEP,
filterable_attrs: Optional[Dict] = None,
serialize_config: Optional[Dict] = None,
data_path: 'Path' = Path('./data'),
**kwargs,
):
self.n_dim = n_dim
self.metric = metric
self.n_cells = n_cells
self.n_components = projector_codec.n_components if projector_codec else None
self.data_path = data_path
self.serialize_config = serialize_config
self._pq_codec = pq_codec
self._projector_codec = projector_codec
self._vec_indexes = [
HnswIndex(
dim=self.n_components or n_dim,
metric=metric,
initial_size=initial_size,
expand_step_size=expand_step_size,
expand_mode=expand_mode,
pq_codec=pq_codec,
**kwargs,
)
for _ in range(n_cells)
]
self._doc_stores = [
DocStorage(
data_path / f'cell_{_}',
serialize_config=serialize_config or {},
lock=True,
)
for _ in range(n_cells)
]
columns = []
if filterable_attrs:
for attr_name, attr_type in filterable_attrs.items():
if isinstance(attr_type, str):
attr_type = eval(attr_type)
if attr_type not in VALID_FILTERABLE_DATA_TYPES:
raise ValueError(
f'Invalid filterable attribute type `{attr_type}` for attribute `{attr_name}`. '
)
columns.append((attr_name, attr_type))
self._cell_tables = [
CellTable(f'table_{c}', columns=columns) for c in range(n_cells)
]
self._meta_table = MetaTable('metas', data_path=data_path, in_memory=True)
def ivf_search(
self,
x: 'np.ndarray',
cells: 'np.ndarray',
where_clause: str = '',
where_params: Tuple = (),
limit: int = 10,
):
dists = []
doc_idx = []
cell_ids = []
count = 0
for cell_id in cells:
cell_table = self.cell_table(cell_id)
cell_size = cell_table.count()
if cell_size == 0:
continue
indices = None
if where_clause:
indices = cell_table.query(
where_clause=where_clause, where_params=where_params
)
if len(indices) == 0:
continue
indices = np.array(indices, dtype=np.int64)
_dists, _doc_idx = self.vec_index(cell_id).search(
x, limit=min(limit, cell_size), indices=indices
)
if count >= limit and _dists[0] > dists[-1][-1]:
continue
dists.append(_dists)
doc_idx.append(_doc_idx)
cell_ids.extend([cell_id] * len(_dists))
count += len(_dists)
cell_ids = np.array(cell_ids, dtype=np.int64)
if len(dists) != 0:
dists = np.hstack(dists)
doc_idx = np.hstack(doc_idx)
indices = dists.argsort(axis=0)[:limit]
dists = dists[indices]
cell_ids = cell_ids[indices]
doc_idx = doc_idx[indices]
doc_ids = []
for cell_id, offset in zip(cell_ids, doc_idx):
doc_id = self.cell_table(cell_id).get_docid_by_offset(offset)
doc_ids.append(doc_id)
return dists, doc_ids, cell_ids
def filter_cells(
self,
cells: 'np.ndarray',
where_clause: str = '',
where_params: Tuple = (),
limit: int = -1,
offset: int = 0,
order_by: Optional[str] = None,
ascending: bool = True,
include_metadata: bool = False,
):
result = DocumentArray()
if len(cells) > 1 and offset > 0:
raise ValueError('Offset is not supported for multiple cells')
for cell_id in cells:
cell_table = self.cell_table(cell_id)
cell_size = cell_table.count()
if cell_size == 0:
continue
indices = cell_table.query(
where_clause=where_clause,
where_params=where_params,
order_by=order_by,
limit=limit,
offset=offset,
ascending=ascending,
)
if len(indices) == 0:
continue
for offset in indices:
doc_id = self.cell_table(cell_id).get_docid_by_offset(offset)
doc = Document(id=doc_id)
if include_metadata or (len(cells) > 1 and order_by):
doc = self.doc_store(cell_id).get([doc_id])[0]
result.append(doc)
if not order_by and len(result) >= limit > 0:
break
# reordering the results from multiple cells
if order_by and len(cells) > 1:
result = sorted(
result, key=lambda d: d.tags.get(order_by), reverse=not ascending
)
if limit > 0:
result = result[:limit]
result = DocumentArray(result)
return result
def search_cells(
self,
query: 'np.ndarray',
cells: 'np.ndarray',
where_clause: str = '',
where_params: Tuple = (),
limit: int = 10,
include_metadata: bool = False,
):
if self._projector_codec:
query = self._projector_codec.encode(query)
topk_dists, topk_docs = [], []
for x, cell_idx in zip(query, cells):
# x.shape = (self.n_dim,)
dists, doc_ids, cells = self.ivf_search(
x,
cells=cell_idx,
where_clause=where_clause,
where_params=where_params,
limit=limit,
)
topk_dists.append(dists)
match_docs = DocumentArray()
for dist, doc_id, cell_id in zip(dists, doc_ids, cells):
doc = Document(id=doc_id)
if include_metadata:
doc = self.doc_store(cell_id).get([doc_id])[0]
doc.scores[self.metric.name.lower()].value = dist
match_docs.append(doc)
topk_docs.append(match_docs)
return topk_dists, topk_docs
def _search_cells(
self,
query: 'np.ndarray',
cells: 'np.ndarray',
where_clause: str = '',
where_params: Tuple = (),
limit: int = 10,
):
if self._projector_codec:
query = self._projector_codec.encode(query)
topk_dists, topk_ids = [], []
for x, cell_idx in zip(query, cells):
dists, ids, cells = self.ivf_search(
x,
cells=cell_idx,
where_clause=where_clause,
where_params=where_params,
limit=limit,
)
topk_dists.append(dists)
topk_ids.append(ids)
return topk_dists, [np.array(ids, dtype=int) for ids in topk_ids]
def insert(
self,
data: 'np.ndarray',
cells: 'np.ndarray',
docs: 'DocumentArray',
only_index: bool = False,
):
assert len(docs) == len(data)
if self._projector_codec:
data = self._projector_codec.encode(data)
unique_cells, unique_cell_counts = np.unique(cells, return_counts=True)
if len(unique_cells) == 1:
cell_id = unique_cells[0]
offsets = self.cell_table(cell_id).insert(docs)
offsets = np.array(offsets, dtype=np.int64)
self.vec_index(cell_id).add_with_ids(data, offsets)
self._meta_table.bulk_add_address([d.id for d in docs], cells, offsets)
if not only_index:
self.doc_store(cell_id).insert(docs)
else:
for cell_id, cell_count in zip(unique_cells, unique_cell_counts):
# TODO: Jina should allow boolean filtering in docarray to avoid this
# and simply use cells == cell_index
indices = np.where(cells == cell_id)[0]
cell_docs = docs[indices.tolist()]
cell_offsets = self.cell_table(cell_id).insert(cell_docs)
cell_offsets = np.array(cell_offsets, dtype=np.int64)
cell_data = data[indices, :]
self.vec_index(cell_id).add_with_ids(cell_data, cell_offsets)
self._meta_table.bulk_add_address(
[d.id for d in cell_docs], [cell_id] * cell_count, cell_offsets
)
if not only_index:
self.doc_store(cell_id).insert(cell_docs)
logger.debug(f'{len(docs)} new docs added')
def _add_vecs(self, data: 'np.ndarray', cells: 'np.ndarray', offsets: 'np.ndarray'):
assert data.shape[0] == cells.shape[0]
assert data.shape[1] == self.n_dim
unique_cells, _ = np.unique(cells, return_counts=True)
for cell_id in unique_cells:
indices = cells == cell_id
x = data[indices, :]
ids = offsets[indices]
self.vec_index(cell_id).add_with_ids(x, ids)
def update(
self,
data: 'np.ndarray',
cells: 'np.ndarray',
docs: 'DocumentArray',
insert_if_not_found: bool = True,
raise_errors_on_not_found: bool = False,
):
update_success = 0
new_data = []
new_cells = []
new_docs = []
for (
x,
doc,
cell_id,
) in zip(data, docs, cells):
_cell_id, _offset = self._meta_table.get_address(doc.id)
if cell_id == _cell_id:
self.vec_index(cell_id).add_with_ids(x.reshape(1, -1), [_offset])
self.doc_store(cell_id).update([doc])
self.meta_table.add_address(doc.id, cell_id, _offset)
update_success += 1
elif _cell_id is None:
if raise_errors_on_not_found and not insert_if_not_found:
raise Exception(
                        f'The document (id={doc.id}) cannot be updated as '
f'it is not found in the index'
)
elif not (raise_errors_on_not_found or insert_if_not_found):
warnings.warn(
f'The document (id={doc.id}) cannot be updated as '
f'it is not found in the index',
RuntimeWarning,
)
elif insert_if_not_found:
new_data.append(x)
new_cells.append(cell_id)
new_docs.append(doc)
update_success += 1
else:
continue
else:
# DELETE and INSERT
self.vec_index(_cell_id).delete(_offset)
self.cell_table(_cell_id).delete_by_offset(_offset)
self.doc_store(_cell_id).delete([doc.id])
new_data.append(x)
new_cells.append(cell_id)
new_docs.append(doc)
update_success += 1
if len(new_data) > 0:
new_data = np.stack(new_data)
new_cells = np.array(new_cells, dtype=np.int64)
self.insert(new_data, new_cells, new_docs)
logger.debug(
f'total items for updating: {len(docs)}, ' f'success: {update_success}'
)
def delete(
self,
ids: List[str],
raise_errors_on_not_found: bool = False,
):
delete_success = 0
for doc_id in ids:
cell_id, offset = self._meta_table.get_address(doc_id)
if cell_id is not None:
self.vec_index(cell_id).delete([offset])
self.cell_table(cell_id).delete_by_offset(offset)
self.doc_store(cell_id).delete([doc_id])
self.meta_table.delete_address(doc_id)
delete_success += 1
else:
if raise_errors_on_not_found:
raise Exception(
                        f'The document (id={doc_id}) cannot be deleted as '
f'it is not found in the index'
)
else:
continue
logger.debug(
            f'total items for deletion: {len(ids)}, ' f'success: {delete_success}'
)
def _rebuild_database(self):
"""rebuild doc_store and meta_table after annlite download databse from hubble"""
self._doc_stores = [
DocStorage(
self.data_path / f'cell_{_}',
serialize_config=self.serialize_config or {},
lock=True,
)
for _ in range(self.n_cells)
]
# self._meta_table = MetaTable('metas', data_path=self.data_path, in_memory=False)
def _get_doc_by_id(self, doc_id: str):
cell_id = 0
if self.n_cells > 1:
cell_id, _ = self._meta_table.get_address(doc_id)
da = self.doc_store(cell_id).get([doc_id])
return da[0] if len(da) > 0 else None
def documents_generator(self, cell_id: int, batch_size: int = 1000):
for docs in self.doc_store(cell_id).batched_iterator(batch_size=batch_size):
yield docs
@property
def cell_tables(self):
return self._cell_tables
@property
def cell_indexes(self):
return self._vec_indexes
def cell_table(self, cell_id: int):
return self._cell_tables[cell_id]
def doc_store(self, cell_id: int):
return self._doc_stores[cell_id]
def vec_index(self, cell_id: int):
return self._vec_indexes[cell_id]
@property
def meta_table(self):
return self._meta_table
@property
def total_docs(self):
return sum([store.size for store in self._doc_stores])
@property
def index_size(self):
return sum([table.size for table in self._cell_tables])
================================================
FILE: annlite/core/__init__.py
================================================
from .codec import PQCodec, ProjectorCodec, VQCodec
================================================
FILE: annlite/core/codec/__init__.py
================================================
from .pq import PQCodec
from .projector import ProjectorCodec
from .vq import VQCodec
================================================
FILE: annlite/core/codec/base.py
================================================
import pickle
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from pathlib import Path
class BaseCodec(ABC):
def __init__(self, require_train: bool = True):
self.require_train = require_train
self._is_trained = False if require_train else True
@abstractmethod
def fit(self, *args, **kwargs):
pass
@abstractmethod
def encode(self):
pass
@abstractmethod
def decode(self):
pass
def dump(self, target_path: 'Path'):
pickle.dump(self, target_path.open('wb'), protocol=4)
@staticmethod
def load(from_path: 'Path'):
return pickle.load(from_path.open('rb'))
@property
def is_trained(self):
return self._is_trained
def _check_trained(self):
assert self.is_trained is True, f'{self.__class__.__name__} requires training'
================================================
FILE: annlite/core/codec/pq.py
================================================
from argparse import ArgumentError
import numpy as np
from scipy.cluster.vq import vq
from annlite import pq_bind
from ...enums import Metric
from ...math import l2_normalize
from ...profile import time_profile
from .base import BaseCodec
# from pqlite.pq_bind import precompute_adc_table, dist_pqcodes_to_codebooks
class PQCodec(BaseCodec):
"""Implementation of Product Quantization (PQ) [Jegou11]_.
For the indexing phase of database vectors,
a `D`-dim input vector is divided into `M` `D`/`M`-dim sub-vectors.
Each sub-vector is quantized into a small integer via `Ks` codewords.
For the querying phase, given a new `D`-dim query vector, the distance between the query
and the database PQ-codes are efficiently approximated via Asymmetric Distance.
All vectors must be np.ndarray with np.float32
.. [Jegou11] H. Jegou et al., "Product Quantization for Nearest Neighbor Search", IEEE TPAMI 2011
    :param dim: the dimensionality of input vectors
    :param n_subvectors: The number of sub-spaces
    :param n_clusters: The number of codewords for each subspace
        (typically 256, so that each sub-vector is quantized
        into 8 bits = 1 byte = uint8)
:param n_init: Number of times K-Means is trained with different centroid seeds. Best result of
the `n_init` consecutive runs is selected.
"""
def __init__(
self,
dim: int,
n_subvectors: int = 8,
n_clusters: int = 256,
metric: Metric = Metric.EUCLIDEAN,
n_init: int = 4,
):
super(PQCodec, self).__init__(require_train=True)
self.dim = dim
self.n_subvectors = n_subvectors
self.n_clusters = n_clusters
assert (
dim % n_subvectors == 0
        ), 'input dimension must be divisible by the number of sub-spaces'
self.d_subvector = dim // n_subvectors
self.code_dtype = (
np.uint8
if n_clusters <= 2**8
else (np.uint16 if n_clusters <= 2**16 else np.uint32)
)
# assert (
# metric == Metric.EUCLIDEAN
# ), f'The distance metric `{metric.name}` is not supported yet!'
self.metric = metric
self.normalize_input = False
if self.metric == Metric.COSINE:
self.normalize_input = True
self._codebooks = np.zeros(
(self.n_subvectors, self.n_clusters, self.d_subvector), dtype=np.float32
)
self.kmeans = []
self.n_init = n_init
def __hash__(self):
return hash(
(
self.__class__.__name__,
self.dim,
self.n_subvectors,
self.n_clusters,
self.metric,
self.code_dtype,
)
)
def fit(self, x: 'np.ndarray', iter: int = 100):
"""Train the K-Means for each cartesian product
:param x: Training vectors with shape=(N, D)
:param iter: Number of iterations in Kmeans
"""
from sklearn.cluster import KMeans
assert x.dtype == np.float32
assert x.ndim == 2
if self.normalize_input:
x = l2_normalize(x)
# [m][ks][ds]: m-th subspace, ks-the codeword, ds-th dim
self._codebooks = np.zeros(
(self.n_subvectors, self.n_clusters, self.d_subvector), dtype=np.float32
)
for m in range(self.n_subvectors):
kmeans = KMeans(
n_clusters=self.n_clusters, max_iter=iter, n_init=self.n_init
)
self.kmeans.append(kmeans)
self.kmeans[m].fit(x[:, m * self.d_subvector : (m + 1) * self.d_subvector])
self._codebooks[m] = self.kmeans[m].cluster_centers_
self._is_trained = True
def partial_fit(self, x: 'np.ndarray'):
"""Given a batch of training vectors, update the internal MiniBatchKMeans.
This method is specially designed to be used when data does not fit in memory.
:param x: Training vectors with shape=(N, D)
"""
assert x.ndim == 2
if self.normalize_input:
x = l2_normalize(x)
if len(self.kmeans) > 0:
for m in range(self.n_subvectors):
self.kmeans[m].partial_fit(
x[:, m * self.d_subvector : (m + 1) * self.d_subvector]
)
else:
from sklearn.cluster import MiniBatchKMeans
for m in range(self.n_subvectors):
self.kmeans.append(MiniBatchKMeans(n_clusters=self.n_clusters))
for m in range(self.n_subvectors):
self.kmeans[m].partial_fit(
x[:, m * self.d_subvector : (m + 1) * self.d_subvector]
)
def build_codebook(self):
"""Constructs sub-codebooks from the current parameters of the models in `self.kmeans`
        This step is not necessary if a full KMeans was already trained by calling `.fit`.
"""
self._codebooks = np.zeros(
(self.n_subvectors, self.n_clusters, self.d_subvector), dtype=np.float32
)
for m in range(self.n_subvectors):
self._codebooks[m] = self.kmeans[m].cluster_centers_
self._is_trained = True
def encode(self, x: 'np.ndarray'):
"""Encode input vectors into PQ-codes.
:param x: Input vectors with shape=(N, D) and dtype=np.float32.
:return: np.ndarray: PQ codes with shape=(N, M) and dtype=self.code_dtype
"""
assert x.dtype == np.float32
assert x.ndim == 2
N, D = x.shape
assert (
D == self.d_subvector * self.n_subvectors
), 'input dimension must be Ds * M'
# codes[n][m] : code of n-th vec, m-th subspace
codes = np.empty((N, self.n_subvectors), dtype=self.code_dtype)
for m in range(self.n_subvectors):
sub_vecs = x[:, m * self.d_subvector : (m + 1) * self.d_subvector]
codes[:, m], _ = vq(sub_vecs, self.codebooks[m])
return codes
def decode(self, codes: 'np.ndarray'):
"""Given PQ-codes, reconstruct original D-dimensional vectors
approximately by fetching the codewords.
        :param codes: PQ-codes with shape=(N, M) and dtype=self.code_dtype.
Each row is a PQ-code
:return: Reconstructed vectors with shape=(N, D) and dtype=np.float32
"""
assert codes.ndim == 2
N, M = codes.shape
assert M == self.n_subvectors
assert codes.dtype == self.code_dtype
vecs = np.empty((N, self.d_subvector * self.n_subvectors), dtype=np.float32)
for m in range(self.n_subvectors):
vecs[:, m * self.d_subvector : (m + 1) * self.d_subvector] = self.codebooks[
m
][codes[:, m], :]
return vecs
def precompute_adc(self, query: object) -> object:
"""Compute a distance table for a query vector.
The distances are computed by comparing each sub-vector of the query
to the codewords for each sub-subspace.
`dtable[m][ks]` contains the squared Euclidean distance between
the `m`-th sub-vector of the query and the `ks`-th codeword
for the `m`-th sub-space (`self.codewords[m][ks]`).
:param query: Input vector with shape=(D, ) and dtype=np.float32
:return: Distance table. which contains dtable with shape=(M, Ks)
and dtype=np.float32
"""
assert query.dtype == np.float32
assert query.ndim == 1, 'input must be a single vector'
# dtable[m] : distance between m-th subvec and m-th codewords (m-th subspace)
# dtable[m][ks] : distance between m-th subvec and ks-th codeword of m-th codewords
# Warning: the following line produces `ValueError: buffer source array is read-only`
# if no `const` is used in the cython implementation using a memoryview
dtable = pq_bind.precompute_adc_table(
query, self.d_subvector, self.n_clusters, self.codebooks
)
return DistanceTable(dtable)
@property
def codebooks(self):
return self._codebooks
# trained pq interface ----------------
def get_codebook(self) -> 'np.ndarray':
"""Return the codebook parameters.
Expect a 3-dimensional matrix is returned,
with shape (`n_subvectors`, `n_clusters`, `d_subvector`) and dtype float32
"""
return np.ascontiguousarray(self.codebooks, dtype='float32')
def get_subspace_splitting(self):
"""Return subspace splitting setting
:return: tuple of (`n_subvectors`, `n_clusters`, `d_subvector`)
"""
return (self.n_subvectors, self.n_clusters, self.d_subvector)
# def get_dist_mat(self, x: np.ndarray):
# """Return the distance tables in form of matrix for multiple queries
# :param query: shape('N', 'D'),
# :return: ndarray with shape('N', `n_subvectors`, `n_clusters`)
# .. note::
# _description_
# """
# assert x.dtype == np.float32
# assert x.ndim == 2
# N, D = x.shape
# assert (
# D == self.d_subvector * self.n_subvectors
# ), 'input dimension must be Ds * M'
# if self.normalize_input:
# x = l2_normalize(x)
# x = x.reshape(
# N,
# self.n_subvectors,
# 1,
# self.d_subvector,
# )
# if self.metric == Metric.EUCLIDEAN:
# # (1, n_subvectors, n_clusters, d_subvector)
# codebook = self.codebooks[np.newaxis, ...]
# # broadcast to (N, n_subvectors, n_clusters, d_subvector)
# dist_vector = (x - codebook) ** 2
# # reduce to (N, n_subvectors, n_clusters)
# dist_mat = np.sum(dist_vector, axis=3)
# elif self.metric in [Metric.INNER_PRODUCT, Metric.COSINE]:
# # (1, n_subvectors, n_clusters, d_subvector)
# codebook = self.codebooks[np.newaxis, ...]
# # broadcast to (N, n_subvectors, n_clusters, d_subvector)
# dist_vector = x * codebook
# # reduce to (N, n_subvectors, n_clusters)
# dist_mat = 1 / self.n_clusters - np.sum(dist_vector, axis=3)
# else:
# raise ArgumentError(f'Unable support metrics {self.metric}')
# return np.ascontiguousarray(dist_mat, dtype='float32')
def get_dist_mat(self, x: np.ndarray):
"""Return the distance tables in form of matrix for multiple queries
:param query: shape('N', 'D'),
:return: ndarray with shape('N', `n_subvectors`, `n_clusters`)
.. note::
_description_
"""
assert x.dtype == np.float32
assert x.ndim == 2
N, D = x.shape
assert (
D == self.d_subvector * self.n_subvectors
), 'input dimension must be Ds * M'
if self.normalize_input:
x = l2_normalize(x)
if self.metric == Metric.EUCLIDEAN:
dist_mat = pq_bind.batch_precompute_adc_table(
x, self.d_subvector, self.n_clusters, self.codebooks
)
elif self.metric in [Metric.INNER_PRODUCT, Metric.COSINE]:
dist_mat = 1 / self.n_clusters - np.array(
pq_bind.batch_precompute_adc_table_ip(
x, self.d_subvector, self.n_clusters, self.codebooks
),
dtype='float32',
)
else:
raise ArgumentError(f'Unable support metrics {self.metric}')
return np.ascontiguousarray(dist_mat, dtype='float32')
# -------------------------------------
class DistanceTable(object):
"""Distance table from query to codeworkds.
Given a query vector, a PQ/OPQ instance compute this DistanceTable class
using :func:`PQ.dtable` or :func:`OPQ.dtable`.
The Asymmetric Distance from query to each database codes can be computed
by :func:`DistanceTable.adist`.
Args:
dtable (np.ndarray): Distance table with shape=(M, Ks) and dtype=np.float32
computed by :func:`PQ.dtable` or :func:`OPQ.dtable`
Attributes:
dtable (np.ndarray): Distance table with shape=(M, Ks) and dtype=np.float32.
Note that dtable[m][ks] contains the squared Euclidean distance between
(1) m-th sub-vector of query and (2) ks-th codeword for m-th subspace.
"""
def __init__(self, dtable: 'np.ndarray'):
assert dtable.ndim == 2
self.dtable = dtable
def adist(self, codes):
"""Given PQ-codes, compute Asymmetric Distances between the query (self.dtable)
and the PQ-codes.
Args:
codes (np.ndarray): PQ codes with shape=(N, M) and
dtype=pq.code_dtype where pq is a pq instance that creates the codes
Returns:
np.ndarray: Asymmetric Distances with shape=(N, ) and dtype=np.float32
"""
assert codes.ndim == 2
dists = pq_bind.dist_pqcodes_to_codebooks(self.dtable, codes)
# The above line is equivalent to the followings:
# dists = np.zeros((N, )).astype(np.float32)
# for n in range(N):
# for m in range(M):
# dists[n] += self.dtable[m][codes[n][m]]
return dists
================================================
FILE: annlite/core/codec/projector.py
================================================
from typing import Optional
import numpy as np
from .base import BaseCodec
class ProjectorCodec(BaseCodec):
"""Implementation of Projector.
:param n_components: number of components to keep.
:param whiten: when True (False by default) the components_ vectors are multiplied
by the square root of n_samples and then divided by the singular
values to ensure uncorrelated outputs with unit component-wise variances.
:param svd_solver:
If auto: The solver is selected by a default policy based on X.shape and
n_components: if the input data is larger than 500x500 and the number of
components to extract is lower than 80% of the smallest dimension of the
data, then the more efficient ‘randomized’ method is enabled. Otherwise
the exact full SVD is computed and optionally truncated afterwards.
If full: run exact full SVD calling the standard LAPACK solver via scipy.
linalg.svd and select the components by postprocessing.
If arpack: run SVD truncated to n_components calling ARPACK solver via
scipy.sparse.linalg.svds. It requires strictly 0 < n_components < min(X.shape).
"""
def __init__(
self,
dim: int,
n_components: int = 128,
whiten: Optional[bool] = False,
svd_solver: Optional[str] = 'auto',
):
super(ProjectorCodec, self).__init__(require_train=True)
self.dim = dim
self.n_components = n_components
assert self.dim >= self.n_components, (
            f'the dimension after the projector should be no larger than the original dimension, got '
f'original dimension: {self.dim} and projector dimension: {self.n_components}'
)
self.whiten = whiten
self.svd_solver = svd_solver
self.pca = None
def __hash__(self):
return hash(
(
self.__class__.__name__,
self.dim,
self.n_components,
self.whiten,
self.svd_solver,
)
)
def fit(self, x: 'np.ndarray'):
"""Train projector model
:param x: Training vectors with shape=(N, D)
"""
assert x.ndim == 2
        assert x.shape[1] == self.dim, 'dimension of input data must be equal to "dim"'
assert (
x.shape[0] > self.n_components
), 'number of input data must be larger than or equal to n_components'
if self.pca is None:
from sklearn.decomposition import PCA
self.pca = PCA(
n_components=self.n_components,
whiten=self.whiten,
svd_solver=self.svd_solver,
)
self.pca.fit(x)
self._is_trained = True
def partial_fit(self, x: 'np.ndarray'):
"""Given a batch of training vectors, update the internal projector.
This method is specially designed to be used when data does not fit in memory.
:param x: Training vectors with shape=(N, D)
"""
assert x.ndim == 2
assert x.shape[1] == self.dim, 'dimension of input data must be equal to "dim"'
assert (
x.shape[0] > self.n_components
), 'number of input data must be larger than or equal to n_components'
if self.pca is None:
from sklearn.decomposition import IncrementalPCA
self.pca = IncrementalPCA(
n_components=self.n_components,
whiten=self.whiten,
)
self.pca.partial_fit(x)
self._is_trained = True
def encode(self, x: 'np.ndarray'):
"""Encode input vectors using projector.
:param x: Input vectors with shape=(N, D)
:return: np.ndarray: transformed vectors using projector.
"""
assert x.ndim == 2
assert x.shape[1] == self.dim, 'dimension of input data must be equal to "dim"'
return self.pca.transform(x)
def decode(self, x: 'np.ndarray'):
"""Given transformed vectors, reconstruct original D-dimensional vectors
approximately.
:param x: vectors with shape=(N, self.n_components).
:return: Reconstructed vectors with shape=(N, D)
"""
assert x.ndim == 2
assert x.shape[1] == self.n_components
return self.pca.inverse_transform(x)
@property
def components(self):
"""Principal axes in feature space, representing the directions of maximum
variance in the data.
"""
self._check_trained()
return self.pca.components_
@property
def explained_variance_ratio(self):
"""Percentage of variance explained by each of the selected components."""
self._check_trained()
return self.pca.explained_variance_ratio_
@property
def mean(self):
"""Per-feature empirical mean."""
self._check_trained()
return self.pca.mean_
@property
def var(self):
"""Per-feature empirical variance"""
self._check_trained()
return self.pca.var_
================================================
FILE: annlite/core/codec/vq.py
================================================
import numpy as np
from scipy.cluster.vq import vq
from ...enums import Metric
from .base import BaseCodec
class VQCodec(BaseCodec):
def __init__(
self,
n_clusters: int,
metric: Metric = Metric.EUCLIDEAN,
iter: int = 100,
n_init: int = 4,
*args,
**kwargs
):
super(VQCodec, self).__init__(require_train=True)
self.n_clusters = n_clusters
# assert (
# metric == Metric.EUCLIDEAN
# ), f'The distance metric `{metric.name}` is not supported yet!'
self.metric = metric
self._codebook = None
self.iter = iter
self.kmeans = None
self.n_init = n_init
def __hash__(self):
return hash((self.__class__.__name__, self.n_clusters, self.metric))
def fit(self, x: 'np.ndarray'):
"""Given training vectors, run k-means for each sub-space and create
codewords for each sub-space.
:param x: Training vectors with shape=(N, D) and dtype=np.float32.
:param iter: The number of iteration for k-means
"""
from sklearn.cluster import KMeans
assert x.dtype == np.float32
assert x.ndim == 2
self.kmeans = KMeans(self.n_clusters, max_iter=self.iter, n_init=self.n_init)
self.kmeans.fit(x)
self._codebook = self.kmeans.cluster_centers_
self._is_trained = True
def partial_fit(self, x: 'np.ndarray'):
"""Given a batch of training vectors, update the internal MiniBatchKMeans.
This method is specially designed to be used when data does not fit in memory.
:param x: Training vectors with shape=(N, D)
"""
assert x.ndim == 2
if self.kmeans:
self.kmeans.partial_fit(x)
else:
from sklearn.cluster import MiniBatchKMeans
self.kmeans = MiniBatchKMeans(
n_clusters=self.n_clusters, max_iter=self.iter
)
self.kmeans.partial_fit(x)
def build_codebook(self):
"""Constructs a codebook from the current MiniBatchKmeans
        This step is not necessary if a full KMeans was already trained by calling `.fit`.
"""
self._codebook = self.kmeans.cluster_centers_
self._is_trained = True
def encode(self, x: 'np.ndarray'):
"""Encodes each row of the input array `x` it's closest cluster id."""
self._check_trained()
assert x.dtype == np.float32
assert x.ndim == 2
codes, _ = vq(x, self.codebook)
return codes
def decode(self, x: 'np.ndarray'):
return None
@property
def codebook(self):
self._check_trained()
return self._codebook
================================================
FILE: annlite/core/index/__init__.py
================================================
================================================
FILE: annlite/core/index/base.py
================================================
import abc
from typing import List, Optional, Union
import numpy as np
from ...enums import ExpandMode, Metric
from ...helper import str2dtype
class BaseIndex(abc.ABC):
def __init__(
self,
dim: int,
dtype: Union[np.dtype, str] = np.float32,
metric: Metric = Metric.COSINE,
initial_size: Optional[int] = None,
expand_step_size: int = 10240,
expand_mode: ExpandMode = ExpandMode.STEP,
*args,
**kwargs
):
assert expand_step_size > 0
self.initial_size = initial_size or expand_step_size
self.expand_step_size = expand_step_size
self.expand_mode = expand_mode
self.dim = dim
self.dtype = str2dtype(dtype) if isinstance(dtype, str) else dtype
self.metric = metric
self._size = 0
self._capacity = self.initial_size
@property
def capacity(self) -> int:
return self._capacity
@property
def size(self):
return self._size
@abc.abstractmethod
def add_with_ids(self, x: np.ndarray, ids: List[int], **kwargs):
...
@abc.abstractmethod
def delete(self, ids: List[int]):
...
@abc.abstractmethod
def update_with_ids(self, x: np.ndarray, ids: List[int], **kwargs):
...
def reset(self, capacity: Optional[int] = None):
self._size = 0
self._capacity = capacity or self.initial_size
================================================
FILE: annlite/core/index/flat_index.py
================================================
from typing import List, Optional
import numpy as np
from loguru import logger
from ...math import cdist, top_k
from .base import BaseIndex
class FlatIndex(BaseIndex):
def __init__(self, *args, **kwargs):
super(FlatIndex, self).__init__(*args, **kwargs)
self._data = np.zeros((self.initial_size, self.dim), dtype=self.dtype)
def search(
self, x: np.ndarray, limit: int = 10, indices: Optional[np.ndarray] = None
):
_dim = x.shape[-1]
assert (
_dim == self.dim
), f'the query embedding dimension does not match with index dimension: {_dim} vs {self.dim}'
x = x.reshape((-1, self.dim))
data = self._data[: self.size]
data_ids = np.arange(self.size)
if indices is not None:
data = self._data[indices]
data_ids = data_ids[indices]
dists = cdist(x, data, metric=self.metric.name.lower())
dists, idx = top_k(dists, limit, descending=False)
# TODO: change the shape of return
dists = dists[0]
data_ids = data_ids[idx[0]]
return dists, data_ids
def add_with_ids(self, x: np.ndarray, ids: List[int]):
for idx in ids:
if idx >= self._capacity:
self._expand_capacity()
start = self._size
end = start + len(x)
self._data[ids, :] = x
self._size = end
def _expand_capacity(self):
new_block = np.zeros((self.expand_step_size, self.dim), dtype=self.dtype)
self._data = np.concatenate((self._data, new_block), axis=0)
self._capacity += self.expand_step_size
logger.debug(
f'total storage capacity is expanded by {self.expand_step_size}',
)
def reset(self, capacity: Optional[int] = None):
super().reset(capacity=capacity)
self._data = np.zeros((self.capacity, self.dim), dtype=self.dtype)
def delete(self, ids: List[int]):
raise RuntimeError(
f'the deletion operation is not allowed for {self.__class__.__name__}!'
)
def update_with_ids(self, x: np.ndarray, ids: List[int], **kwargs):
self._data[ids, :] = x
================================================
FILE: annlite/core/index/hnsw/__init__.py
================================================
from .index import HnswIndex
================================================
FILE: annlite/core/index/hnsw/index.py
================================================
import math
import os.path
from functools import wraps
from pathlib import Path
from typing import TYPE_CHECKING, List, Optional, Union
import numpy as np
from loguru import logger
from annlite.hnsw_bind import Index
from ....enums import Metric
from ....math import l2_normalize
from ..base import BaseIndex
if TYPE_CHECKING:
from ...codec.base import BaseCodec
def pre_process(f):
@wraps(f)
def pre_processed(self: 'HnswIndex', x: np.ndarray, *args, **kwargs):
if x.ndim == 1:
x = x.reshape((1, -1))
if x.dtype != self.dtype:
x = x.astype(self.dtype)
if self.normalization_enable:
x = l2_normalize(x)
if self.pq_enable:
if not self.pq_codec.is_trained:
raise RuntimeError(
'Please train the PQ before using HNSW quantization backend'
)
elif not self._set_backend_pq:
self._index.loadPQ(self.pq_codec)
self._set_backend_pq = True
kwargs['pre_process_dtables'] = self.pq_codec.get_dist_mat(x)
x = self.pq_codec.encode(x)
assert kwargs['pre_process_dtables'].dtype == 'float32'
assert kwargs['pre_process_dtables'].flags['C_CONTIGUOUS']
return f(self, x, *args, **kwargs)
else:
return f(self, x, *args, **kwargs)
return pre_processed
class HnswIndex(BaseIndex):
def __init__(
self,
dim: int,
dtype: np.dtype = np.float32,
metric: Metric = Metric.COSINE,
ef_construction: int = 200,
ef_search: int = 50,
max_connection: int = 16,
pq_codec: Optional['BaseCodec'] = None,
index_file: Optional[Union[str, Path]] = None,
**kwargs,
):
"""
:param dim: The dimensionality of vectors to index
:param index_file: A file-like object or a string containing a file name.
:param metric: Distance metric type, can be 'euclidean', 'inner_product', or 'cosine'
:param ef_construction: the size of the dynamic list for the nearest neighbors (used during the building).
:param ef_search: the size of the dynamic list for the nearest neighbors (used during the search).
:param max_connection: The number of bi-directional links created for every new element during construction.
Reasonable range for M is 2-100.
"""
super().__init__(dim, dtype=dtype, metric=metric, **kwargs)
self.ef_construction = ef_construction
self.ef_search = ef_search
self.max_connection = max_connection
self.pq_codec = pq_codec
self._set_backend_pq = False
self.index_file = index_file
self._init_hnsw_index()
def _init_hnsw_index(self):
self._index = Index(space=self.space_name, dim=self.dim)
if self.index_file:
if os.path.exists(self.index_file):
logger.info(
f'indexer will be loaded from {self.index_file}',
)
self.load(self.index_file)
else:
raise FileNotFoundError(
f'index path: {self.index_file} does not exist',
)
else:
if self.pq_codec is not None and self.pq_codec.is_trained:
self._index.init_index(
max_elements=self.capacity,
ef_construction=self.ef_construction,
M=self.max_connection,
pq_codec=self.pq_codec,
)
self._set_backend_pq = True
else:
self._index.init_index(
max_elements=self.capacity,
ef_construction=self.ef_construction,
M=self.max_connection,
pq_codec=None,
)
self._set_backend_pq = False
self._index.set_ef(self.ef_search)
def load(self, index_file: Union[str, Path]):
self._index.load_index(str(index_file))
if self.pq_codec:
self._index.loadPQ(self.pq_codec)
def dump(self, index_file: Union[str, Path]):
self._index.save_index(str(index_file))
@pre_process
def add_with_ids(
self,
x: 'np.ndarray',
ids: List[int],
# kwargs maybe used by pre_process
pre_process_dtables=None,
):
max_id = max(ids) + 1
if max_id > self.capacity:
expand_steps = math.ceil(max_id / self.expand_step_size)
self._expand_capacity(expand_steps * self.expand_step_size)
self._index.add_items(x, ids=ids, dtables=pre_process_dtables)
@pre_process
def search(
self,
query: 'np.ndarray',
limit: int = 10,
indices: Optional['np.ndarray'] = None,
# kwargs maybe used by pre_process
pre_process_dtables=None,
):
ef_search = max(self.ef_search, limit)
self._index.set_ef(ef_search)
if indices is not None:
# TODO: add a smart strategy to speed up this case (bruteforce search would be better)
if len(indices) < limit:
limit = len(indices)
ids, dists = self._index.knn_query_with_filter(
query, filters=indices, k=limit, dtables=pre_process_dtables
)
else:
ids, dists = self._index.knn_query(
query, k=limit, dtables=pre_process_dtables
)
# convert squared l2 into euclidean distance
if self.metric == Metric.EUCLIDEAN:
dists = np.sqrt(dists)
return dists[0], ids[0]
def delete(self, ids: List[int]):
for i in ids:
self._index.mark_deleted(i)
def update_with_ids(self, x: 'np.ndarray', ids: List[int], **kwargs):
raise RuntimeError(
f'the update operation is not allowed for {self.__class__.__name__}!'
)
def _expand_capacity(self, new_capacity: int):
self._capacity = new_capacity
self._index.resize_index(new_capacity)
logger.debug(
f'HNSW index capacity is expanded by {self.expand_step_size}',
)
def reset(self, capacity: Optional[int] = None):
super().reset(capacity=capacity)
self._init_hnsw_index()
@property
def size(self):
return self._index.element_count
@property
def space_name(self):
if self.metric == Metric.EUCLIDEAN:
return 'l2'
elif self.metric == Metric.INNER_PRODUCT:
return 'ip'
return 'cosine'
@property
def pq_enable(self):
return self.pq_codec is not None
@property
def normalization_enable(self):
return self.metric == Metric.COSINE
================================================
FILE: annlite/core/index/pq_index.py
================================================
from typing import List, Optional
import numpy as np
from ...math import top_k
from ..codec.pq import PQCodec
from .flat_index import FlatIndex
# TODO: deprecate this index
class PQIndex(FlatIndex): # pragma: no cover
def __init__(
self,
dim: int,
pq_codec: PQCodec,
**kwargs,
):
assert pq_codec is not None
self._dense_dim = dim
super(PQIndex, self).__init__(
pq_codec.n_subvectors, dtype=pq_codec.code_dtype, **kwargs
)
self._pq_codec = pq_codec
def add_with_ids(self, x: np.ndarray, ids: List[int]):
x = self._pq_codec.encode(x)
super(PQIndex, self).add_with_ids(x, ids)
def search(
self, x: np.ndarray, limit: int = 10, indices: Optional[np.ndarray] = None
):
_dim = x.shape[-1]
assert (
_dim == self._pq_codec.dim
), f'the query embedding dimension does not match with index dimension: {_dim} vs {self.dim}'
precomputed = self._pq_codec.precompute_adc(x)
codes = self._data
data_idx = np.arange(self._capacity)
if indices is not None:
codes = self._data[indices]
data_idx = data_idx[indices]
dists = precomputed.adist(codes) # (10000, )
dists = np.expand_dims(dists, axis=0)
dists, ids = top_k(dists, limit, descending=False)
# TODO: change the shape of return
ids = ids[0]
if indices is not None:
ids = data_idx[ids]
return dists[0], ids
================================================
FILE: annlite/enums.py
================================================
from enum import IntEnum
class BetterEnum(IntEnum):
"""The base class of Enum."""
def __str__(self):
return self.name
@classmethod
def from_string(cls, s: str):
"""
Parse the enum from a string.
:param s: string representation of the enum value
:return: enum value
"""
try:
return cls[s.upper()]
except KeyError:
raise ValueError(
f'{s.upper()} is not a valid enum for {cls!r}, must be one of {list(cls)}'
)
class Metric(BetterEnum):
EUCLIDEAN = 1
INNER_PRODUCT = 2
COSINE = 3
class ExpandMode(BetterEnum):
STEP = 1
DOUBLE = 2
ADAPTIVE = 3
================================================
FILE: annlite/executor.py
================================================
import threading
import time
import traceback
import warnings
from threading import Thread
from typing import Dict, List, Optional, Tuple, Union
from docarray import Document, DocumentArray
from jina import Executor, requests
from jina.logging.logger import JinaLogger
INDEX_BATCH_SIZE = 1024
class AnnLiteIndexer(Executor):
"""A simple indexer that wraps the AnnLite indexer and adds a simple interface for indexing and searching.
:param n_dim: Dimensionality of vectors to index
:param metric: Distance metric type. Can be 'euclidean', 'inner_product', or 'cosine'
:param limit: Number of results to get for each query document in search
:param n_components: Number of components to use for dimensionality reduction
:param match_args: the arguments to `DocumentArray`'s match function
:param data_path: the workspace of the AnnLiteIndexer; not supported when shards > 1.
:param ef_construction: The construction time/accuracy trade-off
:param ef_search: The query time accuracy/speed trade-off
:param max_connection: The maximum number of outgoing connections in the
graph (the "M" parameter)
:param include_metadata: If True, return the document metadata in response
:param index_access_paths: Default traversal paths on docs
(used for indexing, delete and update), e.g. '@r', '@c', '@r,c'
:param search_access_paths: Default traversal paths on docs
(used for search), e.g. '@r', '@c', '@r,c'
:param columns: A list or dict of column names to index.
:param dim: Deprecated, use n_dim instead
"""
def __init__(
self,
n_dim: int = 0,
metric: str = 'cosine',
limit: int = 10,
n_components: Optional[int] = None,
match_args: Optional[Dict] = None,
data_path: Optional[str] = None,
ef_construction: Optional[int] = None,
ef_search: Optional[int] = None,
max_connection: Optional[int] = None,
include_metadata: bool = True,
index_access_paths: str = '@r',
search_access_paths: str = '@r',
columns: Optional[Union[List[Tuple[str, str]], Dict[str, str]]] = None,
list_like: Optional[bool] = False,
dim: int = None,
*args,
**kwargs,
):
super().__init__(*args, **kwargs)
self.logger = JinaLogger(self.__class__.__name__)
n_dim = n_dim or dim
if not n_dim:
raise ValueError('Please specify the dimension of the vectors to index!')
self.n_components = n_components
self.metric = metric
self.match_args = match_args or {}
self.include_metadata = include_metadata
if limit:
self.match_args.update({'limit': limit})
self.index_access_paths = index_access_paths
if 'index_traversal_paths' in kwargs:
warnings.warn(
f'`index_traversal_paths` is deprecated. Use `index_access_paths` instead.'
)
self.index_access_paths = kwargs['index_traversal_paths']
self.search_access_paths = search_access_paths
if 'search_traversal_paths' in kwargs:
warnings.warn(
f'`search_traversal_paths` is deprecated. Use `search_access_paths` instead.'
)
self.search_access_paths = kwargs['search_traversal_paths']
self._data_buffer = DocumentArray()
self._index_batch_size = INDEX_BATCH_SIZE
self._max_length_queue = 2 * self._index_batch_size
self._index_lock = threading.Lock()
self.logger = JinaLogger(getattr(self.metas, 'name', self.__class__.__name__))
if getattr(self.runtime_args, 'shards', 1) > 1 and data_path:
raise ValueError(
'`data_path` is not supported when shards > 1, please use `workspace` instead'
)
config = {
'n_dim': n_dim,
'n_components': n_components,
'metric': metric,
'ef_construction': ef_construction,
'ef_search': ef_search,
'max_connection': max_connection,
'data_path': data_path or self.workspace or './workspace',
'columns': columns,
'list_like': list_like,
}
self._index = DocumentArray(storage='annlite', config=config)
# start indexing thread in background to group indexing requests
# together and perform batch indexing at once
self._start_index_loop()
@requests(on='/index')
def index(
self, docs: Optional[DocumentArray] = None, parameters: dict = {}, **kwargs
):
"""Index new documents
:param docs: the Documents to index
:param parameters: dictionary with options for indexing
Keys accepted:
- 'access_paths': traversal paths on docs, e.g. '@r', '@c', '@r,c'
"""
if not docs:
return
access_paths = parameters.get('access_paths', self.index_access_paths)
flat_docs = docs[access_paths]
if len(flat_docs) == 0:
return
while len(self._data_buffer) >= self._max_length_queue:
time.sleep(0.001)
with self._index_lock:
self._data_buffer.extend(flat_docs)
def _start_index_loop(self):
"""Start the indexing loop in background.
This loop is responsible for batch indexing the documents in the buffer.
"""
def _index_loop():
try:
while True:
# if the buffer is none, will break the loop
if self._data_buffer is None:
break
# if the buffer is empty, will wait for new documents to be added
if len(self._data_buffer) == 0:
time.sleep(0.1) # sleep for 100ms
continue
# acquire the lock to prevent threading issues
with self._index_lock:
batch_docs = self._data_buffer.pop(
range(
self._index_batch_size
if len(self._data_buffer) > self._index_batch_size
else len(self._data_buffer)
)
)
self._index.extend(batch_docs)
self.logger.debug(f'indexing {len(batch_docs)} docs done...')
except Exception as e:
self.logger.error(traceback.format_exc())
raise e
self._index_thread = Thread(target=_index_loop, daemon=False)
self._index_thread.start()
@requests(on='/update')
def update(
self, docs: Optional[DocumentArray] = None, parameters: dict = {}, **kwargs
):
"""Update existing documents
:param docs: the Documents to update
:param parameters: dictionary with options for updating
Keys accepted:
- 'access_paths': traversal paths on docs, e.g. '@r', '@c', '@r,c'
- 'raise_errors_on_not_found': if True, raise an error if a document is not found. Default is False.
"""
if not docs:
return
access_paths = parameters.get('access_paths', self.index_access_paths)
raise_errors_on_not_found = parameters.get('raise_errors_on_not_found', False)
flat_docs = docs[access_paths]
if len(flat_docs) == 0:
return
with self._index_lock:
if len(self._data_buffer) > 0:
raise RuntimeError(
f'Cannot update documents while the pending documents in the buffer are not indexed yet. '
'Please wait for the pending documents to be indexed.'
)
for doc in flat_docs:
try:
self._index[doc.id] = doc
except IndexError:
if raise_errors_on_not_found:
raise Exception(
f'The document (id={doc.id}) cannot be updated as '
f'it is not found in the index'
)
else:
self.logger.warning(
f'cannot update doc {doc.id} as it does not exist in storage'
)
@requests(on='/delete')
def delete(self, parameters: dict = {}, **kwargs):
"""Delete existing documents
Delete entries from the index by id
:param parameters: parameters to the request
"""
delete_ids = parameters.get('ids', [])
if len(delete_ids) == 0:
return
with self._index_lock:
if len(self._data_buffer) > 0:
raise RuntimeError(
f'Cannot delete documents while the pending documents in the buffer are not indexed yet. '
'Please wait for the pending documents to be indexed.'
)
del self._index[delete_ids]
@requests(on='/search')
def search(
self, docs: Optional[DocumentArray] = None, parameters: dict = {}, **kwargs
):
"""Perform a vector similarity search and retrieve Document matches
Search can be performed with candidate filtering. Each filter is a triplet (column, operator, value).
More than one filter can be applied during search. Therefore, the conditions for a filter are specified as a list of triplets.
Each triplet contains:
- column: Column used to filter.
- operator: Binary operation between two values. Some supported operators include `['>','<','=','<=','>=']`.
- value: value used to compare a candidate.
:param docs: the Documents to search with
:param parameters: dictionary for parameters for the search operation
Keys accepted:
- 'access_paths' (str): traversal paths on docs, e.g. '@r', '@c', '@r,c'
- 'filter' (dict): the filtering conditions on document tags
- 'limit' (int): nr of matches to get per Document
"""
if not docs:
return
access_paths = parameters.get('access_paths', self.search_access_paths)
flat_docs = docs[access_paths]
match_args = (
{**self.match_args, **parameters}
if parameters is not None
else self.match_args
)
with self._index_lock:
# if len(self._data_buffer) > 0:
# raise RuntimeError(
# f'Cannot search documents while the pending documents in the buffer are not indexed yet. '
# 'Please wait for the pending documents to be indexed.'
# )
flat_docs.match(self._index, **match_args)
@requests(on='/backup')
def backup(self, parameters: Optional[Dict] = {}, **kwargs):
"""
Backup the index data to local storage or to a remote artifact store.
:param parameters: dictionary with options for the backup operation
Keys accepted:
- 'target_name' (str): the name under which the index is backed up
- 'token' (str): the hubble token used for the remote upload
"""
target_name = parameters.get('target_name', None)
token = parameters.get('token', None)
if target_name:
target_name = f'{target_name}_{self.runtime_args.shard_id}'
with self._index_lock:
if len(self._data_buffer) > 0:
raise RuntimeError(
f'Cannot backup documents while the pending documents in the buffer are not indexed yet. '
'Please wait for the pending documents to be indexed.'
)
self._index._annlite.backup(target_name, token)
if self._index._list_like:
self._index._save_offset2ids()
@requests(on='/restore')
def restore(self, parameters: Optional[Dict] = {}, **kwargs):
"""
Restore the index data from local storage or from a remote artifact store.
:param parameters: dictionary with options for the restore operation
Keys accepted:
- 'source_name' (str): the name of the backup to restore from
- 'token' (str): the hubble token used for the remote download
"""
source_name = parameters.get('source_name', None)
token = parameters.get('token', None)
if source_name:
source_name = f'{source_name}_{self.runtime_args.shard_id}'
self._index._annlite.restore(source_name, token)
if self._index._list_like:
self._index._load_offset2ids()
@requests(on='/filter')
def filter(self, parameters: Dict, **kwargs):
"""
Query documents from the indexer by the filter `query` object in parameters. The `query` object must follow the
specifications in the `find` method of `DocumentArray` using annlite: https://docarray.jina.ai/fundamentals/documentarray/find/#filter-with-query-operators
:param parameters: Dictionary to define the `filter` that you want to use.
"""
return self._index.find(parameters.get('filter', None))
@requests(on='/fill_embedding')
def fill_embedding(self, docs: DocumentArray, **kwargs):
"""
retrieve embedding of Documents by id
:param docs: DocumentArray to search with
"""
for doc in docs:
doc.embedding = self._index[doc.id].embedding
@requests(on='/status')
def status(self, **kwargs) -> DocumentArray:
"""Return the document containing status information about the indexer.
The status will contain information on the total number of indexed and deleted
documents, and on the number of (searchable) documents currently in the index.
"""
status = Document(
tags={
'appending_size': len(self._data_buffer),
'total_docs': len(self._index),
'index_size': len(self._index),
}
)
return DocumentArray([status])
def flush(self):
"""Flush all the data in the buffer to the index"""
while len(self._data_buffer) > 0:
time.sleep(0.1)
@requests(on='/clear')
def clear(self, **kwargs):
"""Clear the index of all entries."""
self.flush()
with self._index_lock:
self._data_buffer = None
self._index_thread.join()
self._data_buffer = DocumentArray()
self._index.clear()
self._start_index_loop()
def close(self, **kwargs):
"""Close the index."""
super().close()
self.flush()
# wait for the index thread to finish
with self._index_lock:
self._data_buffer = None
self._index_thread.join()
# WARNING: the commented code below hangs the close in pytest `pytest tests/test_*.py`
# But don't know why. It works fine in `pytest tests/test_executor.py` and normal python execution
del self._index
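# A minimal local-run sketch (assuming a Jina 3.x environment with annlite
# installed); it serves the executor in a Flow, indexes a few random
# documents and searches with one of them. `n_dim=32` and the sleep are
# illustrative choices, not defaults of this executor.
if __name__ == '__main__':
    import numpy as np
    from jina import Flow

    with Flow().add(uses=AnnLiteIndexer, uses_with={'n_dim': 32}) as f:
        docs = DocumentArray(
            Document(embedding=np.random.random(32).astype(np.float32))
            for _ in range(100)
        )
        f.post(on='/index', inputs=docs)
        # indexing happens in a background thread; give the buffer a moment to drain
        time.sleep(1)
        resp = f.post(on='/search', inputs=docs[:1])
        print(resp[0].matches[:3])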
================================================
FILE: annlite/filter.py
================================================
from typing import Dict
LOGICAL_OPERATORS = {'$and': 'AND', '$or': 'OR'}
COMPARISON_OPERATORS = {
'$lt': '<',
'$gt': '>',
'$lte': '<=',
'$gte': '>=',
'$eq': '=',
'$neq': '!=',
}
MEMBERSHIP_OPERATORS = {'$in': 'IN', '$nin': 'NOT IN'}
def _sql_parsing(data, default_logic: str = 'AND'):
"""
:param data: the filter query as a JSON-like object (dict, list, or str).
:param default_logic: the logical operator ('AND' or 'OR') used to join sibling conditions.
:return: a tuple of the WHERE clause (str) and its bound parameters (tuple).
"""
where_clause = ''
parameters = []
if isinstance(data, dict):
for i, (key, value) in enumerate(data.items()):
if key in LOGICAL_OPERATORS:
clause, params = _sql_parsing(
value, default_logic=LOGICAL_OPERATORS[key]
)
if i == 0:
where_clause += clause
else:
where_clause += f' {LOGICAL_OPERATORS[key]} {clause}'
parameters.extend(params)
elif key.startswith('$'):
raise ValueError(
f'The operator {key} is not supported yet, please double check the given filters!'
)
else:
if i > 0:
where_clause += f' {default_logic} '
items = list(value.items())
if len(items) == 0:
raise ValueError(f'The query expression is illegal: {data}')
elif len(items) > 1:
clause_list, params_list = [], []
for op, val in items:
_clause, _params = _sql_parsing({key: {op: val}})
clause_list.append(_clause)
params_list.extend(_params)
where_clause += f' AND '.join(clause_list)
parameters.extend(params_list)
else:
op, val = items[0]
if op in LOGICAL_OPERATORS:
clause, params = _sql_parsing(
val, default_logic=LOGICAL_OPERATORS[op]
)
where_clause += clause
parameters.extend(params)
elif op in COMPARISON_OPERATORS:
parameters.append(val)
where_clause += f'({key} {COMPARISON_OPERATORS[op]} ?)'
elif op in MEMBERSHIP_OPERATORS:
parameters.extend(val)
where_clause += f'({key} {MEMBERSHIP_OPERATORS[op]}({", ".join(["?"]*len(val))}))'
else:
raise ValueError(
f'The operator {op} is not supported yet, please double check the given filters!'
)
elif isinstance(data, list):
clause_list, params_list = [], []
for d in data:
_clause, _params = _sql_parsing(d)
clause_list.append(_clause)
params_list.extend(_params)
where_clause += '(' + f' {default_logic} '.join(clause_list) + ')'
parameters.extend(params_list)
elif isinstance(data, str):
return data, parameters
else:
raise ValueError(f'The query expression is illegal: {data}')
return where_clause, tuple(parameters)
class Filter(object):
"""A class to parse query language to SQL where clause."""
def __init__(self, tree_data: Dict = {}):
self.tree_data = tree_data
def parse_where_clause(self):
return _sql_parsing(self.tree_data or {})
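# A minimal sketch of how a MongoDB-style filter is turned into a
# parameterized SQL WHERE clause by the Filter class above; the column
# names `price` and `color` are illustrative only.
if __name__ == '__main__':
    clause, params = Filter({'price': {'$lte': 50}}).parse_where_clause()
    print(clause, params)  # (price <= ?) (50,)

    clause, params = Filter(
        {'$or': [{'color': {'$eq': 'red'}}, {'price': {'$lt': 100}}]}
    ).parse_where_clause()
    print(clause, params)  # ((color = ?) OR (price < ?)) ('red', 100)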
================================================
FILE: annlite/helper.py
================================================
import sys
import numpy as np
from loguru import logger
def setup_logging(debug: bool):
"""
Set up the log formatter for AnnLite.
"""
log_level = 'INFO'
if debug:
log_level = 'DEBUG'
logger.remove()
logger.add(
sys.stdout,
colorize=True,
level=log_level,
)
def str2dtype(dtype_str: str):
if dtype_str in ['double', 'float64']:
dtype = np.float64
elif dtype_str in ['half', 'float16']:
dtype = np.float16
elif dtype_str in ['float', 'float32']:
dtype = np.float32
elif dtype_str in ['bfloat16']:
dtype = np.bfloat16
elif dtype_str in ['long', 'int64']:
dtype = np.int64
elif dtype_str in ['int', 'int32']:
dtype = np.int32
elif dtype_str in ['int16']:
dtype = np.int16
elif dtype_str in ['int8']:
dtype = np.int8
elif dtype_str in ['uint8']:
dtype = np.uint8
elif dtype_str in ['bool']:
dtype = np.bool_
else:
raise TypeError(f'Unrecognized dtype string: {dtype_str}')
return dtype
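# A small sketch of the helpers above; the dtype strings shown are
# illustrative inputs only.
if __name__ == '__main__':
    setup_logging(debug=True)
    logger.debug('debug logging enabled')
    assert str2dtype('float32') is np.float32
    assert str2dtype('int64') is np.int64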
================================================
FILE: annlite/hubble_tools.py
================================================
import os
import platform
import shutil
import time
from pathlib import Path
from typing import Optional, Union
from filesplit.merge import Merge
from filesplit.split import Split
from loguru import logger
ignored_extn = ['.DS_Store']
def get_size(input: Path) -> float:
import os
return os.stat(str(input)).st_size / (1024 * 1024)
def make_archive(input: Path, output_name: str) -> Path:
"""
This function creates a zip archive of the input file in the same
folder as the input path.
"""
output_path = shutil.make_archive(
os.path.join(str(input.parent), output_name),
'zip',
str(input.parent),
str(input.name),
)
return Path(output_path)
class Uploader:
def __init__(self, size_limit=1024, client=None):
"""
This class creates an uploader that splits large files into smaller pieces
and uploads them to hubble.
:param size_limit: the maximum size (in MB) of each split file.
:param client: the hubble client used for uploading.
"""
self.size_limit = size_limit
self.client = client
def upload_file(
self, input: Path, target_name: str, type: str, cell_id: Union[int, str]
):
logger.info(f'Start to upload single file: {input} to hubble ...')
size = get_size(input)
if size > self.size_limit:
split_list = self._split_file(input)
self.upload_directory(split_list, target_name, type, cell_id, merge=False)
shutil.rmtree(split_list)
else:
if self._check_exists(target_name, type, input.name):
return
self._upload_hubble(input, target_name, type, input.name, cell_id)
def upload_directory(
self,
input: Path,
target_name: str,
type: str,
cell_id: Union[int, str],
merge: bool = True,
):
def _upload():
if self._check_exists(target_name, type, str(idx) + '.zip'):
return
Path.mkdir(input.parent / str(idx))
for f in split_list:
shutil.copy(f, input.parent / str(idx))
output_path = make_archive(input.parent / str(idx), str(idx) + '.zip')
self._upload_hubble(
output_path, target_name, type, str(idx) + '.zip', cell_id
)
Path(output_path).unlink()
shutil.rmtree(input.parent / str(idx))
logger.info(f'Start to upload directory: {input} to hubble ...')
if merge:
size_list = list(
zip(list(input.iterdir()), [get_size(f) for f in list(input.iterdir())])
)
sorted_size_list = sorted(size_list, key=lambda x: x[1])
split_list = []
total_size = 0
idx = 0
for file_name, file_size in sorted_size_list:
if any(extn in str(file_name) for extn in ignored_extn):
continue
if total_size + file_size > self.size_limit:
if len(split_list) == 0:
raise Exception(
f'The smallest file: {file_size} is bigger '
f'than size_limit. Please set a larger value '
f'of size_limit, now is {self.size_limit}MB.'
)
_upload()
idx += 1
total_size = 0
split_list = [file_name]
else:
split_list.append(file_name)
total_size += file_size
if len(split_list) > 0:
_upload()
else:
for idx, file_name in enumerate(list(input.glob('*'))):
if self._check_exists(target_name, type, str(file_name.name)):
continue
self._upload_hubble(
file_name, target_name, type, str(file_name.name), cell_id
)
def archive_and_upload(
self,
target_name: str,
type: str,
file_name: str,
cell_id: Union[int, str],
root_path: Path,
upload_folder: str,
):
if self._check_exists(target_name, type, file_name):
return
upload_file = shutil.make_archive(
os.path.join(str(root_path), f'{target_name}_{type}'),
'zip',
str(root_path),
upload_folder,
)
logger.info(
f'Start to upload: {upload_file} to hubble. '
f'[target_name: {target_name}, '
f'type: {type}, '
f'file_name: {file_name}, '
f'cell_id: {cell_id}].'
)
self.client.upload_artifact(
f=upload_file,
metadata={
'name': target_name,
'type': type,
'file_name': file_name,
'cell': cell_id,
},
)
Path(upload_file).unlink()
def _check_exists(self, target_name: str, type: str, file_name: str) -> bool:
art_list = self.client.list_artifacts(
filter={
'metaData.name': target_name,
'metaData.type': f'{type}',
'metaData.file_name': f'{file_name}',
}
)
if len(art_list['data']) != 0:
logger.info(
f'[target_name: {target_name}, type: {type}, file_name: {file_name}] '
f'already exists on hubble, will skip it ...'
)
return True
else:
return False
def _split_file(self, input: Path) -> Path:
output_dir = input.parent / f'{input.name}_split'
if output_dir.exists():
logger.info(
f'Origin file: {str(input)} has already been split to: {output_dir}, will skip ...'
)
return output_dir
Path.mkdir(output_dir)
Split(str(input), str(output_dir)).bysize(size=self.size_limit * 1024 * 1024)
num_files = len(list(output_dir.glob('*')))
logger.info(
f'Origin file: {str(input)} has been split '
f'into {num_files} parts. Output file: {output_dir}'
)
return output_dir
def _upload_hubble(
self,
upload_file: Path,
target_name: str,
type: str,
file_name: str,
cell_id: Union[str, int],
):
logger.info(
f'Start to upload: {upload_file} to hubble. '
f'[target_name: {target_name}, '
f'type: {type}, '
f'file_name: {file_name}, '
f'cell_id: {cell_id}].'
)
start_time = time.time()
failed_times = 0
while True:
try:
self.client.upload_artifact(
f=str(upload_file),
metadata={
'name': target_name,
'type': type,
'file_name': file_name,
'cell': cell_id,
},
show_progress=True,
)
break
except Exception as e:
logger.info(e)
failed_times += 1
if failed_times == 3:
logger.info(
f'Tried more than 3 times to upload {upload_file}, type is: {type}, will exit...'
)
return
else:
continue
logger.info(
f'Takes {time.time() - start_time} seconds to upload {upload_file}.'
)
class Merger:
def __init__(self, restore_path, client):
"""
This class creates an object to download and merge the split files from hubble.
:param restore_path: tmp directory for downloading and merging files.
:param client: hubble client used for merging files.
"""
self.restore_path = restore_path
self.restore_path.mkdir(parents=True)
self.client = client
def merge_file(self, inputdir: Path, outputdir: Path, outputfilename: Path):
Merge(
inputdir=str(inputdir),
outputdir=str(outputdir),
outputfilename=str(outputfilename),
).merge()
def get_artifact_ids(self, art_list, type: str, cell_id: Optional[int] = None):
ids = [
[
art['_id'],
art['metaData']['type'],
art['metaData']['file_name'],
art['metaData']['cell'],
]
for art in art_list['data']
if type == art['metaData']['type']
]
if cell_id:
ids = [item for item in ids if int(item[3]) == cell_id]
ids = [[item[0], item[1], item[2]] for item in ids]
return ids
def download(self, ids, download_folder):
Path.mkdir(self.restore_path / download_folder)
for ids, type, file_name in ids:
self.client.download_artifact(
id=ids,
f=str(self.restore_path / download_folder / file_name),
show_progress=True,
)
================================================
FILE: annlite/index.py
================================================
import hashlib
import logging
import os
import platform
import warnings
from pathlib import Path
from typing import TYPE_CHECKING, Dict, List, Optional, Union
import numpy as np
from docarray.math.ndarray import to_numpy_array
from loguru import logger
if TYPE_CHECKING:
from docarray import DocumentArray
from .container import CellContainer
from .core import PQCodec, ProjectorCodec, VQCodec
from .enums import Metric
from .filter import Filter
from .helper import setup_logging
from .math import cdist, top_k
MAX_TRAINING_DATA_SIZE = 10240
class AnnLite(CellContainer):
""":class:`AnnLite` is an approximate nearest neighbor search library.
To create a :class:`AnnLite` object, simply:
.. highlight:: python
.. code-block:: python
ann = AnnLite(256, metric='cosine')
:param n_dim: dimensionality of input vectors. there are 2 constraints on dim:
(1) it needs to be divisible by n_subvectors; (2) it needs to be a multiple of 4.
:param metric: distance metric type, can be 'euclidean', 'inner_product', or 'cosine'.
:param n_subvectors: number of sub-quantizers, essentially this is the byte size of
each quantized vector, default is None.
:param n_cells: number of coarse quantizer clusters, default is 1.
:param n_probe: number of cells to search for each query, default is 16.
:param n_components: number of components to keep.
:param initial_size: initial capacity assigned to each voronoi cell of coarse quantizer.
``n_cells * initial_size`` is the number of vectors that can be stored initially.
if any cell has reached its capacity, that cell will be automatically expanded.
If you need to add vectors frequently, a larger value for ``initial_size`` is recommended.
:param columns: the columns to be indexed for fast filtering, default is None.
:param filterable_attrs: a dict of attributes to be indexed for fast filtering, default is None.
The key is the attribute name, and the value is the attribute type. And it only works when ``columns`` is None.
:param data_path: path to the directory where the data is stored.
:param create_if_missing: if False, do not create the directory path if it is missing.
:param read_only: if True, the index is not writable.
:param verbose: if True, will print the debug logging info.
.. note::
Remember that the shape of any tensor that contains data points has to be `[n_data, dim]`.
"""
def __init__(
self,
n_dim: int,
metric: Union[str, Metric] = 'cosine',
n_cells: int = 1,
n_subvectors: Optional[int] = None,
n_clusters: Optional[int] = 256,
n_probe: int = 16,
n_components: Optional[int] = None,
initial_size: Optional[int] = None,
expand_step_size: int = 10240,
columns: Optional[Union[Dict, List]] = None,
filterable_attrs: Optional[Dict] = None,
data_path: Union[Path, str] = Path('./data'),
create_if_missing: bool = True,
read_only: bool = False,
verbose: bool = False,
**kwargs,
):
setup_logging(verbose)
if 'dim' in kwargs:
warnings.warn(
'The argument `dim` will be deprecated, please use `n_dim` instead.'
)
n_dim = kwargs['dim']
if n_subvectors:
assert (
n_dim % n_subvectors == 0
), '"n_dim" needs to be divisible by "n_subvectors"'
self.n_dim = n_dim
self.n_components = n_components
self.n_subvectors = n_subvectors
self.n_clusters = n_clusters
self.n_probe = max(n_probe, n_cells)
self.n_cells = n_cells
self.size_limit = 2048
if isinstance(metric, str):
metric = Metric.from_string(metric)
self.metric = metric
self._use_smart_probing = True
self.read_only = read_only
data_path = Path(data_path)
if create_if_missing:
data_path.mkdir(parents=True, exist_ok=True)
self.data_path = data_path
self._projector_codec = None
if self._projector_codec_path.exists():
logger.info(
f'Load pre-trained projector codec (n_components={self.n_components}) from {self.model_path}'
)
self._projector_codec = ProjectorCodec.load(self._projector_codec_path)
elif n_components:
logger.info(
f'Initialize Projector codec (n_components={self.n_components})'
)
self._projector_codec = ProjectorCodec(
n_dim, n_components=self.n_components
)
self._vq_codec = None
if self._vq_codec_path.exists():
logger.info(
f'Load trained VQ codec (K={self.n_cells}) from {self.model_path}'
)
self._vq_codec = VQCodec.load(self._vq_codec_path)
elif n_cells > 1:
logger.info(f'Initialize VQ codec (K={self.n_cells})')
self._vq_codec = VQCodec(self.n_cells, metric=self.metric)
self._pq_codec = None
if self._pq_codec_path.exists():
logger.info(
f'Load trained PQ codec (n_subvectors={self.n_subvectors}) from {self.model_path}'
)
self._pq_codec = PQCodec.load(self._pq_codec_path)
elif n_subvectors:
logger.info(f'Initialize PQ codec (n_subvectors={self.n_subvectors})')
self._pq_codec = PQCodec(
dim=n_dim
if not self._projector_codec
else self._projector_codec.n_components,
n_subvectors=self.n_subvectors,
n_clusters=self.n_clusters,
metric=self.metric,
)
if columns is not None:
if filterable_attrs:
logger.warning('`filterable_attrs` will be overwritten by `columns`.')
filterable_attrs = {}
for n, t in columns.items() if isinstance(columns, dict) else columns:
filterable_attrs[n] = t
super(AnnLite, self).__init__(
n_dim,
metric=metric,
projector_codec=self._projector_codec,
pq_codec=self._pq_codec,
n_cells=n_cells,
initial_size=initial_size,
expand_step_size=expand_step_size,
filterable_attrs=filterable_attrs,
data_path=data_path,
**kwargs,
)
if not self.is_trained and self.total_docs > 0:
# train the index from scratch based on the data in the data_path
logger.info(f'Train the index by reading data from {self.data_path}')
total_size = 0
# TODO: add a progress bar
for docs in self.documents_generator(0, batch_size=1024):
x = to_numpy_array(docs.embeddings)
total_size += x.shape[0]
self.partial_train(x, auto_save=True, force_train=True)
if total_size >= MAX_TRAINING_DATA_SIZE:
break
logger.info(f'Total training data size: {total_size}')
if self.total_docs > 0:
self.restore()
def _sanity_check(self, x: 'np.ndarray'):
assert x.ndim == 2, 'inputs must be a 2D array'
assert (
x.shape[1] == self.n_dim
), f'inputs must have the same dimension as the index, got {x.shape[1]}, expected {self.n_dim}'
return x.shape
def train(self, x: 'np.ndarray', auto_save: bool = True, force_train: bool = False):
"""Train the index with the given data.
:param x: the ndarray data for training.
:param auto_save: if False, will not dump the trained model to ``model_path``.
:param force_train: if True, enforce to retrain the model, and overwrite the model if ``auto_save=True``.
"""
n_data, _ = self._sanity_check(x)
if self.is_trained and not force_train:
logger.warning(
'The indexer has been trained or is not trainable. Please use ``force_train=True`` to retrain.'
)
return
if self._projector_codec:
logger.info(
f'Start training Projector codec (n_components={self.n_components}) with {n_data} data...'
)
self._projector_codec.fit(x)
if self._vq_codec:
logger.info(
f'Start training VQ codec (K={self.n_cells}) with {n_data} data...'
)
self._vq_codec.fit(x)
if self._pq_codec:
logger.info(
f'Start training PQ codec (n_subvectors={self.n_subvectors}) with {n_data} data...'
)
self._pq_codec.fit(x)
logger.info(f'The annlite is successfully trained!')
if auto_save:
self.dump_model()
def partial_train(
self, x: np.ndarray, auto_save: bool = True, force_train: bool = False
):
"""Partially train the index with the given data.
:param x: the ndarray data for training.
:param auto_save: if False, will not dump the trained model to ``model_path``.
:param force_train: if True, enforce to retrain the model, and overwrite the model if ``auto_save=True``.
"""
n_data, _ = self._sanity_check(x)
if self.is_trained and not force_train:
logger.warning(
'The annlite has been trained or is not trainable. Please use ``force_train=True`` to retrain.'
)
return
if self._projector_codec:
logger.info(
f'Partial training Projector codec (n_components={self.n_components}) with {n_data} data...'
)
self._projector_codec.partial_fit(x)
if self._vq_codec:
logger.info(
f'Partial training VQ codec (K={self.n_cells}) with {n_data} data...'
)
self._vq_codec.partial_fit(x)
if self._pq_codec:
logger.info(
f'Partial training PQ codec (n_subvectors={self.n_subvectors}) with {n_data} data...'
)
self._pq_codec.partial_fit(x)
if auto_save:
self.dump_model()
def index(self, docs: 'DocumentArray', **kwargs):
"""Add the documents to the index.
:param docs: the document array to be indexed.
"""
if self.read_only:
logger.error('The indexer is readonly, cannot add new documents')
return
if not self.is_trained:
raise RuntimeError(f'The indexer is not trained, cannot add new documents')
x = to_numpy_array(docs.embeddings)
n_data, _ = self._sanity_check(x)
assigned_cells = (
self._vq_codec.encode(x)
if self._vq_codec
else np.zeros(n_data, dtype=np.int64)
)
return super(AnnLite, self).insert(x, assigned_cells, docs)
def update(
self,
docs: 'DocumentArray',
raise_errors_on_not_found: bool = False,
insert_if_not_found: bool = True,
**kwargs,
):
"""Update the documents in the index.
:param insert_if_not_found: whether to insert the document when its id is not found in the index.
:param raise_errors_on_not_found: whether to raise an exception when an id is not found.
:param docs: the document array to be updated.
"""
if self.read_only:
logger.error('The indexer is readonly, cannot update documents')
return
if not self.is_trained:
raise RuntimeError('The indexer is not trained, cannot update documents')
x = to_numpy_array(docs.embeddings)
n_data, _ = self._sanity_check(x)
assigned_cells = (
self._vq_codec.encode(x)
if self._vq_codec
else np.zeros(n_data, dtype=np.int64)
)
return super(AnnLite, self).update(
x,
assigned_cells,
docs,
raise_errors_on_not_found=raise_errors_on_not_found,
insert_if_not_found=insert_if_not_found,
)
def search(
self,
docs: 'DocumentArray',
filter: Optional[dict] = None,
limit: int = 10,
include_metadata: bool = True,
**kwargs,
):
"""Search the index, and attach matches to the query Documents in `docs`
:param docs: the document array to be searched.
:param filter: the filter to be applied to the search.
:param limit: the number of results to get for each query document in search
:param include_metadata: whether to return document metadata in response.
"""
if not self.is_trained:
raise RuntimeError('The indexer is not trained, cannot search documents')
query_np = to_numpy_array(docs.embeddings)
match_dists, match_docs = self.search_by_vectors(
query_np, filter=filter, limit=limit, include_metadata=include_metadata
)
for doc, matches in zip(docs, match_docs):
doc.matches = matches
def search_by_vectors(
self,
query_np: 'np.ndarray',
filter: Optional[dict] = None,
limit: int = 10,
include_metadata: bool = True,
):
"""Search the index by vectors, and return the matches.
:param query_np: the query vectors.
:param filter: the filter to be applied to the search.
:param limit: the number of results to get for each query document in search
:param include_metadata: whether to return document metadata in response.
"""
cells = self._cell_selection(query_np, limit)
where_clause, where_params = Filter(filter or {}).parse_where_clause()
match_dists, match_docs = self.search_cells(
query=query_np,
cells=cells,
where_clause=where_clause,
where_params=where_params,
limit=limit,
include_metadata=include_metadata,
)
return match_dists, match_docs
def filter(
self,
filter: Dict,
limit: int = 10,
offset: int = 0,
order_by: Optional[str] = None,
ascending: bool = True,
include_metadata: bool = True,
):
"""Find the documents by the filter.
:param filter: the filter to be applied to the search.
:param limit: the number of results.
:param offset: the offset of the results.
:param order_by: the field to order the results.
:param ascending: whether to order the results in ascending order.
:param include_metadata: whether to return document metadata in response.
"""
cells = [x for x in range(self.n_cells)]
where_clause, where_params = Filter(filter or {}).parse_where_clause()
match_docs = self.filter_cells(
cells=cells,
where_clause=where_clause,
where_params=where_params,
limit=limit,
offset=offset,
order_by=order_by,
ascending=ascending,
include_metadata=include_metadata,
)
if limit > 0:
return match_docs[:limit]
return match_docs
def get_doc_by_id(self, doc_id: str):
"""Get the document by id.
:param doc_id: the document id.
"""
return self._get_doc_by_id(doc_id)
def get_docs(
self,
filter: Optional[dict] = None,
limit: int = 10,
offset: int = 0,
order_by: Optional[str] = None,
ascending: bool = True,
):
"""Get the documents.
:param filter: the filter to be applied to the search.
:param limit: the number of results.
:param offset: the offset of the results.
:param order_by: the field to order the results.
:param ascending: whether to order the results in ascending order. It only works when `order_by` is specified.
"""
return self.filter(
filter=filter,
limit=limit,
offset=offset,
order_by=order_by,
ascending=ascending,
include_metadata=True,
)
def _cell_selection(self, query_np, limit):
n_data, _ = self._sanity_check(query_np)
if self._vq_codec:
dists = cdist(
query_np, self._vq_codec.codebook, metric=self.metric.name.lower()
)
dists, cells = top_k(dists, k=self.n_probe)
else:
cells = np.zeros((n_data, 1), dtype=np.int64)
# if self.use_smart_probing and self.n_probe > 1:
# p = -topk_sims.abs().sqrt()
# p = torch.softmax(p / self.smart_probing_temperature, dim=-1)
#
# # p_norm = p.norm(dim=-1)
# # sqrt_d = self.n_probe ** 0.5
# # score = 1 - (p_norm * sqrt_d - 1) / (sqrt_d - 1) - 1e-6
# # n_probe_list = torch.ceil(score * (self.n_probe) ).long()
#
# max_n_probe = torch.tensor(self.n_probe, device=self.device)
# normalized_entropy = - torch.sum(p * torch.log2(p) / torch.log2(max_n_probe), dim=-1)
# n_probe_list = torch.ceil(normalized_entropy * max_n_probe).long()
# else:
# n_probe_list = None
return cells
def search_numpy(
self,
query_np: 'np.ndarray',
filter: Dict = {},
limit: int = 10,
**kwargs,
):
"""Search the index and return distances to the query and ids of the closest documents.
:param query_np: matrix containing query vectors as rows
:param filter: the filtering conditions
:param limit: the number of results to get for each query document in search
"""
if not self.is_trained:
raise RuntimeError('The indexer is not trained, cannot search documents')
dists, doc_ids = self._search_numpy(query_np, filter, limit)
return dists, doc_ids
def _search_numpy(self, query_np: 'np.ndarray', filter: Dict = {}, limit: int = 10):
"""Search approximate nearest vectors in different cells, returns distances and ids
:param query_np: matrix containing query vectors as rows
:param filter: the filtering conditions
:param limit: the number of results to get for each query document in search
"""
cells = self._cell_selection(query_np, limit)
where_clause, where_params = Filter(filter).parse_where_clause()
dists, ids = self._search_cells(
query=query_np,
cells=cells,
where_clause=where_clause,
where_params=where_params,
limit=limit,
)
return dists, ids
def delete(
self,
docs: Union['DocumentArray', List[str]],
raise_errors_on_not_found: bool = False,
):
"""Delete entries from the index by id
:param raise_errors_on_not_found: whether to raise exception when id not found.
:param docs: the documents to delete
"""
super().delete(
docs if isinstance(docs, list) else docs[:, 'id'], raise_errors_on_not_found
)
def clear(self):
"""Clear the whole database"""
for cell_id in range(self.n_cells):
self.vec_index(cell_id).reset()
self.cell_table(cell_id).clear()
self.doc_store(cell_id).clear()
self.meta_table.clear()
def close(self):
for cell_id in range(self.n_cells):
self.doc_store(cell_id).close()
def encode(self, x: 'np.ndarray'):
n_data, _ = self._sanity_check(x)
if self._projector_codec:
x = self._projector_codec.encode(x)
if self._pq_codec:
x = self._pq_codec.encode(x)
return x
def decode(self, x: 'np.ndarray'):
assert len(x.shape) == 2
assert x.shape[1] == self.n_subvectors
if self._pq_codec:
x = self._pq_codec.decode(x)
if self._projector_codec:
x = self._projector_codec.decode(x)
return x
@property
def params_hash(self):
model_metas = (
f'n_dim: {self.n_dim} '
f'metric: {self.metric} '
f'n_cells: {self.n_cells} '
f'n_components: {self.n_components} '
f'n_subvectors: {self.n_subvectors}'
)
return hashlib.md5(f'{model_metas}'.encode()).hexdigest()
@property
def model_path(self):
return self.data_path / f'parameters-{self.params_hash}'
@property
def _vq_codec_path(self):
return self.model_path / f'vq_codec.params'
@property
def _pq_codec_path(self):
return self.model_path / f'pq_codec.params'
@property
def _projector_codec_path(self):
return self.model_path / f'projector_codec.params'
@property
def index_hash(self):
latest_commit = self.meta_table.get_latest_commit()
date_time = latest_commit[-1] if latest_commit else None
if date_time:
if platform.system() == 'Windows':
return date_time.isoformat('#', 'hours')
return date_time.isoformat('#', 'seconds')
else:
import datetime
return (
datetime.datetime.utcnow().isoformat('#', 'hours')
if platform.system() == 'Windows'
else datetime.datetime.utcnow().isoformat('#', 'seconds')
)
@property
def index_path(self):
if self.index_hash:
return (
self.data_path
/ f'snapshot-{self.params_hash}'
/ f'{self.index_hash}-SNAPSHOT'
)
return None
@property
def snapshot_path(self):
paths = list(
(self.data_path / f'snapshot-{self.params_hash}').glob(f'*-SNAPSHOT')
)
if paths:
paths = sorted(paths, key=lambda x: x.name)
return paths[-1]
return None
@property
def remote_store_client(self):
try:
import hubble
os.environ['JINA_AUTH_TOKEN'] = self.token
client = hubble.Client(max_retries=None, jsonify=True)
client.get_user_info()
return client
except Exception as ex:
logger.error('Not logged in to hubble yet.')
raise ex
def backup(self, target_name: Optional[str] = None, token: Optional[str] = None):
# the file lock must be released before backing up to remote; this is
# only needed on Windows, where the rocksdb files cannot be accessed
# while the lock is held.
if not target_name:
logger.info('dump to local ...')
self.dump()
else:
if token is None:
logger.error('backing up to remote requires a token')
logger.info(f'dump to remote: {target_name}')
self.close()
self._backup_index_to_remote(target_name, token)
def restore(self, source_name: Optional[str] = None, token: Optional[str] = None):
# file lock will be released when restore from remote
if not source_name:
if self.total_docs > 0:
logger.info(f'restore Annlite from local')
self._rebuild_index_from_local()
else:
if token is None:
logger.error('restoring from remote requires a token')
logger.info(f'restore Annlite from artifact: {source_name}')
self.close()
self._rebuild_index_from_remote(source_name, token)
def dump_model(self):
logger.info(f'Save the parameters to {self.model_path}')
self.model_path.mkdir(parents=True, exist_ok=True)
if self._projector_codec:
self._projector_codec.dump(self._projector_codec_path)
if self._vq_codec:
self._vq_codec.dump(self._vq_codec_path)
if self._pq_codec:
self._pq_codec.dump(self._pq_codec_path)
def dump_index(self):
import shutil
logger.info(f'Save the indexer to {self.index_path}')
try:
if Path.exists(self.index_path):
logger.info(
f'Index path {self.index_path} already exists, will be '
f'overwritten'
)
shutil.rmtree(self.index_path)
self.index_path.mkdir(parents=True)
for cell_id in range(self.n_cells):
self.vec_index(cell_id).dump(self.index_path / f'cell_{cell_id}.hnsw')
self.cell_table(cell_id).dump(self.index_path / f'cell_{cell_id}.db')
self.meta_table.dump(self.index_path / f'meta.db')
except Exception as ex:
logger.error(f'Failed to dump the indexer, {ex!r}')
if self.index_path:
shutil.rmtree(self.index_path)
def dump(self):
self.dump_model()
self.dump_index()
def _backup_index_to_remote(self, target_name: str, token: str):
self.dump()
from .hubble_tools import Uploader
self.token = token
client = self.remote_store_client
uploader = Uploader(size_limit=self.size_limit, client=client)
for cell_id in range(self.n_cells):
# upload database
uploader.upload_directory(
Path(self.data_path) / f'cell_{cell_id}',
target_name=target_name,
type='database',
cell_id=cell_id,
)
# upload hnsw file
uploader.upload_file(
Path(self.index_path) / f'cell_{cell_id}.hnsw',
target_name=target_name,
type='hnsw',
cell_id=cell_id,
)
# upload cell_table
uploader.upload_file(
Path(self.index_path) / f'cell_{cell_id}.db',
target_name=target_name,
type='cell_table',
cell_id=cell_id,
)
# upload meta table
uploader.upload_file(
Path(self.index_path) / 'meta.db',
target_name=target_name,
type='meta_table',
cell_id=0,
)
# upload training model
uploader.archive_and_upload(
target_name,
'model',
'model.zip',
'all',
self.model_path.parent,
str(self.model_path.name),
)
def _rebuild_index_from_local(self):
if self.snapshot_path:
logger.info(f'Load the indexer from snapshot {self.snapshot_path}')
for cell_id in range(self.n_cells):
self.vec_index(cell_id).load(
self.snapshot_path / f'cell_{cell_id}.hnsw'
)
self.cell_table(cell_id).load(self.snapshot_path / f'cell_{cell_id}.db')
self.meta_table.load(self.snapshot_path / f'meta.db')
else:
logger.info(f'Rebuild the indexer from scratch')
for cell_id in range(self.n_cells):
cell_size = self.doc_store(cell_id).size
if cell_size == 0:
continue # skip empty cell
logger.debug(
f'Rebuild the index of cell-{cell_id} ({cell_size} docs)...'
)
for docs in self.documents_generator(cell_id, batch_size=10240):
x = to_numpy_array(docs.embeddings)
assigned_cells = np.ones(len(docs), dtype=np.int64) * cell_id
super().insert(x, assigned_cells, docs, only_index=True)
logger.debug(f'Rebuild the index of cell-{cell_id} done')
if self.model_path:
logger.info(f'Load the model from {self.model_path}')
self._reload_models()
def _rebuild_index_from_remote(self, source_name: str, token: str):
import shutil
from .hubble_tools import Merger
self.token = token
client = self.remote_store_client
art_list = client.list_artifacts(
filter={'metaData.name': source_name}, pageSize=100
)
if len(art_list['data']) == 0:
logger.info(f'The indexer `{source_name}` not found. ')
else:
logger.info(f'Load the indexer `{source_name}` from remote store')
restore_path = self.data_path / 'restore'
merger = Merger(restore_path=restore_path, client=client)
for cell_id in range(self.n_cells):
# download hnsw files and merge and load
logger.info(f'Load the hnsw `{source_name}` from remote store')
hnsw_ids = merger.get_artifact_ids(
art_list, type='hnsw', cell_id=cell_id
)
merger.download(ids=hnsw_ids, download_folder=f'hnsw_{cell_id}')
if len(hnsw_ids) > 1:
merger.merge_file(
inputdir=restore_path / f'hnsw_{cell_id}',
outputdir=restore_path / f'hnsw_{cell_id}',
outputfilename=Path(f'cell_{cell_id}.hnsw'),
)
self.vec_index(cell_id).load(
restore_path / f'hnsw_{cell_id}' / f'cell_{cell_id}.hnsw'
)
shutil.rmtree(restore_path / f'hnsw_{cell_id}')
# download cell_table files and merge and load
logger.info(f'Load the cell_table `{source_name}` from remote store')
cell_table_ids = merger.get_artifact_ids(
art_list, type='cell_table', cell_id=cell_id
)
merger.download(
ids=cell_table_ids, download_folder=f'cell_table_{cell_id}'
)
if len(cell_table_ids) > 1:
merger.merge_file(
inputdir=restore_path / f'cell_table_{cell_id}',
outputdir=restore_path / f'cell_table_{cell_id}',
outputfilename=Path(f'cell_{cell_id}.db'),
)
self.cell_table(cell_id).load(
restore_path / f'cell_table_{cell_id}' / f'cell_{cell_id}.db'
)
shutil.rmtree(restore_path / f'cell_table_{cell_id}')
# download database files and rebuild
logger.info(f'Load the database `{source_name}` from remote store')
database_ids = merger.get_artifact_ids(
art_list, type='database', cell_id=cell_id
)
merger.download(ids=database_ids, download_folder='database')
for zip_file in list((restore_path / 'database').iterdir()):
# default has only one cell
shutil.unpack_archive(zip_file, self.data_path / f'cell_{cell_id}')
for f in list(
(
self.data_path
/ f'cell_{cell_id}'
/ zip_file.name.split('.zip')[0]
).iterdir()
):
origin_database_path = (
self.data_path / f'cell_{cell_id}' / f.name
)
if origin_database_path.exists():
origin_database_path.unlink()
f.rename(self.data_path / f'cell_{cell_id}' / f.name)
shutil.rmtree(
self.data_path
/ f'cell_{cell_id}'
/ zip_file.name.split('.zip')[0]
)
Path(zip_file).unlink()
self._rebuild_database()
# download meta_table files
logger.info(f'Load the meta_table `{source_name}` from remote store')
meta_table_ids = merger.get_artifact_ids(
art_list, type='meta_table', cell_id=0
)
merger.download(ids=meta_table_ids, download_folder='meta_table')
if len(meta_table_ids) > 1:
merger.merge_file(
inputdir=restore_path / 'meta_table',
outputdir=restore_path / 'meta_table',
outputfilename=Path('meta.db'),
)
self._meta_table.load(restore_path / 'meta_table' / 'meta.db')
shutil.rmtree(restore_path / 'meta_table')
# download model files
logger.info(f'Load the model `{source_name}` from remote store')
file_name = str(self.model_path.parent / f'{source_name}_model.zip')
model_id = [
art['_id']
for art in art_list['data']
if 'model' in art['metaData']['type']
]
assert len(model_id) == 1
client.download_artifact(
id=model_id[0],
f=file_name,
show_progress=True,
)
shutil.unpack_archive(file_name, self.model_path.parent)
self._reload_models()
Path(file_name).unlink()
shutil.rmtree(restore_path)
@property
def is_trained(self):
if self._projector_codec and (not self._projector_codec.is_trained):
return False
if self._vq_codec and (not self._vq_codec.is_trained):
return False
if self._pq_codec and (not self._pq_codec.is_trained):
return False
return True
def _reload_models(self):
if self._projector_codec_path.exists():
self._projector_codec = ProjectorCodec.load(self._projector_codec_path)
if self._vq_codec_path.exists():
self._vq_codec = VQCodec.load(self._vq_codec_path)
if self._pq_codec_path.exists():
self._pq_codec = PQCodec.load(self._pq_codec_path)
@property
def use_smart_probing(self):
return self._use_smart_probing
@use_smart_probing.setter
def use_smart_probing(self, value):
assert type(value) is bool
self._use_smart_probing = value
@property
def stat(self):
"""Get information on status of the indexer."""
return {
'total_docs': self.total_docs,
'index_size': self.index_size,
'n_cells': self.n_cells,
'n_dim': self.n_dim,
'n_components': self.n_components,
'metric': self.metric.name,
'is_trained': self.is_trained,
}
# @property
# def smart_probing_temperature(self):
# return self._smart_probing_temperature
#
# @smart_probing_temperature.setter
# def smart_probing_temperature(self, value):
# assert value > 0
# assert self.use_smart_probing, 'set use_smart_probing to True first'
# self._smart_probing_temperature = value
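# A minimal end-to-end sketch of AnnLite with the default flat configuration
# (no PQ/VQ training required); the dimensionality, document count and
# data_path below are illustrative only.
if __name__ == '__main__':
    from docarray import Document, DocumentArray

    ann = AnnLite(64, metric='cosine', data_path='./example_data')
    docs = DocumentArray(
        Document(id=f'doc_{i}', embedding=x)
        for i, x in enumerate(np.random.random((1000, 64)).astype(np.float32))
    )
    ann.index(docs)

    queries = DocumentArray(
        [Document(embedding=np.random.random(64).astype(np.float32))]
    )
    ann.search(queries, limit=5)
    for match in queries[0].matches:
        print(match.id, match.scores)
    ann.close()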
================================================
FILE: annlite/math.py
================================================
from typing import Tuple
import numpy as np
def l2_normalize(x: 'np.ndarray', eps: float = np.finfo(np.float32).eps):
"""Scale input vectors individually to unit norm.
:param x: The data to normalize
:param eps: a small jitter to avoid divide by zero
:return: Normalized input X
"""
norms = np.einsum('ij,ij->i', x, x)
np.sqrt(norms, norms)
constant_mask = norms < 10 * eps
norms[constant_mask] = 1.0
return x / norms[:, np.newaxis]
def cosine(
x_mat: 'np.ndarray', y_mat: 'np.ndarray', eps: float = np.finfo(np.float32).eps
) -> 'np.ndarray':
"""Cosine distance between each row in x_mat and each row in y_mat.
:param x_mat: np.ndarray with ndim=2
:param y_mat: np.ndarray with ndim=2
:param eps: a small jitter to avoid divide by zero
:return: np.ndarray with ndim=2
"""
return 1 - np.clip(
(np.dot(x_mat, y_mat.T) + eps)
/ (
np.outer(np.linalg.norm(x_mat, axis=1), np.linalg.norm(y_mat, axis=1)) + eps
),
-1,
1,
)
def sqeuclidean(x_mat: 'np.ndarray', y_mat: 'np.ndarray') -> 'np.ndarray':
"""Squared Euclidean distance between each row in x_mat and each row in y_mat.
:param x_mat: np.ndarray with ndim=2
:param y_mat: np.ndarray with ndim=2
:return: np.ndarray with ndim=2
"""
return (
np.sum(y_mat**2, axis=1)
+ np.sum(x_mat**2, axis=1)[:, np.newaxis]
- 2 * np.dot(x_mat, y_mat.T)
)
def euclidean(x_mat: 'np.ndarray', y_mat: 'np.ndarray') -> 'np.ndarray':
"""Euclidean distance between each row in x_mat and each row in y_mat.
:param x_mat: scipy.sparse like array with ndim=2
:param y_mat: scipy.sparse like array with ndim=2
:return: np.ndarray with ndim=2
"""
return np.sqrt(sqeuclidean(x_mat, y_mat))
def pdist(
x_mat: 'np.ndarray',
metric: str,
) -> 'np.ndarray':
"""Computes Pairwise distances between observations in n-dimensional space.
:param x_mat: Union['np.ndarray','scipy.sparse.csr_matrix', 'scipy.sparse.coo_matrix'] of ndim 2
:param metric: string describing the metric type
:return: np.ndarray of ndim 2
"""
return cdist(x_mat, x_mat, metric)
def cdist(x_mat: 'np.ndarray', y_mat: 'np.ndarray', metric: str) -> 'np.ndarray':
"""Computes the pairwise distance between each row of X and each row on Y according to `metric`.
- Let `n_x = x_mat.shape[0]`
- Let `n_y = y_mat.shape[0]`
- Returns a matrix `dist` of shape `(n_x, n_y)` with `dist[i,j] = metric(x_mat[i], y_mat[j])`.
:param x_mat: numpy or scipy array of ndim 2
:param y_mat: numpy or scipy array of ndim 2
:param metric: string describing the metric type
:return: np.ndarray of ndim 2
"""
dists = {'cosine': cosine, 'sqeuclidean': sqeuclidean, 'euclidean': euclidean}[
metric
](x_mat, y_mat)
return dists
def top_k(
values: 'np.ndarray', k: int, descending: bool = False
) -> Tuple['np.ndarray', 'np.ndarray']:
"""Finds values and indices of the k largest entries for the last dimension.
:param values: array of distances
:param k: number of values to retrieve
:param descending: find top k biggest values
:return: indices and distances
"""
if descending:
values = -values
if k >= values.shape[1]:
idx = values.argsort(axis=1)[:, :k]
values = np.take_along_axis(values, idx, axis=1)
else:
idx_ps = values.argpartition(kth=k, axis=1)[:, :k]
values = np.take_along_axis(values, idx_ps, axis=1)
idx_fs = values.argsort(axis=1)
idx = np.take_along_axis(idx_ps, idx_fs, axis=1)
values = np.take_along_axis(values, idx_fs, axis=1)
if descending:
values = -values
return values, idx
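# A small sketch of top_k on a toy distance matrix; by default it returns the
# k smallest values per row together with their column indices.
if __name__ == '__main__':
    dists = np.array([[3.0, 1.0, 2.0]])
    values, idx = top_k(dists, k=2)
    print(values, idx)  # [[1. 2.]] [[1 2]]
    values, idx = top_k(dists, k=2, descending=True)
    print(values, idx)  # [[3. 2.]] [[0 2]]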
================================================
FILE: annlite/profile.py
================================================
import cProfile
import pstats
import random
from functools import wraps
random.seed(20)
try:
import builtins
line_profile = builtins.profile
except AttributeError:
# No line profiler, provide a pass-through version
def profile(func):
return func
line_profile = profile
def time_profile(
output_file=None, sort_by='cumulative', lines_to_print=None, strip_dirs=False
):
"""A time profiler decorator.
Inspired by and adapted from the profile decorator of Giampaolo Rodola:
https://github.com/ekhoda/profile_decorator
Args:
output_file: str or None. Default is None
Path of the output file. If only name of the file is given, it's
saved in the current directory.
If it's None, the name of the decorated function is used.
sort_by: str or SortKey enum or tuple/list of str/SortKey enum
Sorting criteria for the Stats object.
For a list of valid string and SortKey refer to:
https://docs.python.org/3/library/profile.html#pstats.Stats.sort_stats
lines_to_print: int or None
Number of lines to print. Default (None) is for all the lines.
This is useful for reducing the size of the printout, especially
since, when sorting by 'cumulative', the most time-consuming
operations are printed toward the top of the file.
strip_dirs: bool
Whether to remove the leading path info from file names.
This is also useful in reducing the size of the printout
Returns:
Profile of the decorated function
"""
def inner(func):
@wraps(func)
def wrapper(*args, **kwargs):
_output_file = output_file or func.__name__ + '.prof'
pr = cProfile.Profile()
pr.enable()
retval = func(*args, **kwargs)
pr.disable()
pr.dump_stats(_output_file)
with open(_output_file, 'w') as f:
ps = pstats.Stats(pr, stream=f)
if strip_dirs:
ps.strip_dirs()
if isinstance(sort_by, (tuple, list)):
ps.sort_stats(*sort_by)
else:
ps.sort_stats(sort_by)
ps.print_stats(lines_to_print)
return retval
return wrapper
return inner
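# A minimal usage sketch of the time_profile decorator above; the output file
# name 'demo.prof' and the toy workload are illustrative only.
if __name__ == '__main__':

    @time_profile(output_file='demo.prof', lines_to_print=10)
    def slow_sum(n):
        return sum(i * i for i in range(n))

    slow_sum(1_000_000)  # writes a pstats report of the call to demo.prof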
================================================
FILE: annlite/storage/__init__.py
================================================
================================================
FILE: annlite/storage/base.py
================================================
import abc
from typing import TYPE_CHECKING, List, Optional
if TYPE_CHECKING:
import numpy as np
from ..enums import ExpandMode
class Storage(abc.ABC):
def __init__(
self,
initial_size: Optional[int] = None,
expand_step_size: int = 10240,
expand_mode: ExpandMode = ExpandMode.ADAPTIVE,
):
if initial_size is None:
initial_size = expand_step_size
assert initial_size >= 0
assert expand_step_size > 0
self.initial_size = initial_size
self.expand_step_size = expand_step_size
self.expand_mode = expand_mode
@property
@abc.abstractmethod
def capacity(self) -> int:
...
@property
@abc.abstractmethod
def size(self):
...
@abc.abstractmethod
def clean(self):
...
@abc.abstractmethod
def add(
self,
data: 'np.ndarray',
cells: 'np.ndarray',
ids: List[str],
doc_tags: Optional[List[dict]] = None,
):
...
@abc.abstractmethod
def delete(self, ids: List[str]):
...
@abc.abstractmethod
def update(
self,
data: 'np.ndarray',
cells: 'np.ndarray',
ids: List[str],
doc_tags: Optional[List[dict]] = None,
):
...
================================================
FILE: annlite/storage/kv.py
================================================
import time
import warnings
from pathlib import Path
from typing import Dict, List, Union
from docarray import Document, DocumentArray
from rocksdict import Options, Rdict, ReadOptions, WriteBatch, WriteOptions
class DocStorage:
"""The backend storage engine of Documents"""
def __init__(
self,
path: Union[str, Path],
serialize_config: Dict = {},
create_if_missing: bool = True,
**kwargs,
):
self._path = str(path)
self._serialize_config = serialize_config
self._kwargs = kwargs
self._init_db(create_if_missing=create_if_missing, **self._kwargs)
def _init_db(self, create_if_missing: bool = True, **kwargs):
opt = Options(raw_mode=True)
opt.optimize_for_point_lookup(1024)
opt.set_inplace_update_support(True)
opt.set_allow_concurrent_memtable_write(False)
# configure mem-table to a large value (256 MB)
opt.set_write_buffer_size(0x10000000)
# 256 MB file size
opt.set_target_file_size_base(0x10000000)
# # set to plain-table for better performance
# opt.set_plain_table_factory(PlainTableFactoryOptions())
opt.create_if_missing(create_if_missing)
self._db = Rdict(path=self._path, options=opt)
# count the existing entries in the database (0 for a freshly created one)
self._size = len(list(self._db.keys()))
self._is_closed = False
def insert(self, docs: 'DocumentArray'):
write_batch = WriteBatch(raw_mode=True)
write_opt = WriteOptions()
write_opt.sync = True
batch_size = 0
for doc in docs:
write_batch.put(doc.id.encode(), doc.to_bytes(**self._serialize_config))
batch_size += 1
self._db.write(write_batch, write_opt=write_opt)
self._size += batch_size
def update(self, docs: 'DocumentArray'):
write_batch = WriteBatch(raw_mode=True)
write_opt = WriteOptions()
write_opt.sync = True
for doc in docs:
key = doc.id.encode()
if key not in self._db:
raise ValueError(f'The Doc ({doc.id}) does not exist in database!')
write_batch.put(key, doc.to_bytes(**self._serialize_config))
self._db.write(write_batch, write_opt=write_opt)
def delete(self, doc_ids: List[str]):
write_batch = WriteBatch(raw_mode=True)
write_opt = WriteOptions()
write_opt.sync = True
for doc_id in doc_ids:
write_batch.delete(doc_id.encode())
self._db.write(write_batch, write_opt=write_opt)
self._size -= len(doc_ids)
def get(self, doc_ids: Union[str, list]) -> DocumentArray:
docs = DocumentArray()
if isinstance(doc_ids, str):
doc_ids = [doc_ids]
for doc_bytes in self._db[[k.encode() for k in doc_ids]]:
if doc_bytes:
docs.append(Document.from_bytes(doc_bytes, **self._serialize_config))
return docs
def clear(self):
if self._is_closed:
warnings.warn(
'`DocStorage` has already been closed, will skip this clear operation.'
)
else:
self._db.close()
self._db.destroy(self._path)
# re-initialize the database for the next usage
self._init_db(create_if_missing=True, **self._kwargs)
def close(self):
if self._is_closed:
warnings.warn(
                '`DocStorage` has already been closed, will skip this close operation.'
)
return
try:
self._db.flush(wait=True)
self._db.close()
        except Exception as ex:
            # 'No such file or directory' here is a known harmless error and can
            # be safely ignored; re-raise anything else
            if 'No such file or directory' not in str(ex):
                raise ex
self._is_closed = True
def __len__(self):
return self._size
@property
def stat(self):
return {'entries': len(self)}
@property
def size(self):
return self.stat['entries']
@property
def last_transaction_id(self):
return self._db.latest_sequence_number()
def batched_iterator(self, batch_size: int = 1, **kwargs) -> 'DocumentArray':
count = 0
docs = DocumentArray()
read_opt = ReadOptions()
for value in self._db.values(read_opt=read_opt):
doc = Document.from_bytes(value, **self._serialize_config)
docs.append(doc)
count += 1
if count == batch_size:
yield docs
count = 0
docs = DocumentArray()
if count > 0:
yield docs
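A short usage sketch of `DocStorage` (illustrative only; the path is hypothetical and `serialize_config` follows DocArray's `to_bytes`/`from_bytes` options):

from docarray import Document, DocumentArray

storage = DocStorage(path='/tmp/annlite_docs')
storage.insert(DocumentArray([Document(id='a', text='hello'), Document(id='b', text='world')]))

print(len(storage))              # -> 2
print(storage.get('a')[0].text)  # -> 'hello'

for batch in storage.batched_iterator(batch_size=1):
    ...  # each `batch` is a DocumentArray with at most `batch_size` Documents

storage.delete(['b'])
storage.close()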
================================================
FILE: annlite/storage/table.py
================================================
import datetime
import sqlite3
import threading
from pathlib import Path
from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union
import numpy as np
if TYPE_CHECKING:
from docarray import DocumentArray
sqlite3.register_adapter(np.int64, lambda x: int(x))
sqlite3.register_adapter(np.int32, lambda x: int(x))
COLUMN_TYPE_MAPPING = {
float: 'FLOAT',
int: 'INTEGER',
bool: 'INTEGER',
str: 'TEXT',
bytes.__class__: 'BLOB',
bytes: 'BLOB',
memoryview: 'BLOB',
# datetime.datetime: 'TEXT',
# datetime.date: 'TEXT',
# datetime.time: 'TEXT',
None.__class__: 'TEXT',
# SQLite explicit types
'TEXT': 'TEXT',
'INTEGER': 'INTEGER',
'FLOAT': 'FLOAT',
'BLOB': 'BLOB',
'text': 'TEXT',
'integer': 'INTEGER',
'float': 'FLOAT',
'blob': 'BLOB',
}
# If numpy is available, add more types
if np:
COLUMN_TYPE_MAPPING.update(
{
np.int8: 'INTEGER',
np.int16: 'INTEGER',
np.int32: 'INTEGER',
np.int64: 'INTEGER',
np.uint8: 'INTEGER',
np.uint16: 'INTEGER',
np.uint32: 'INTEGER',
np.uint64: 'INTEGER',
np.float16: 'FLOAT',
np.float32: 'FLOAT',
np.float64: 'FLOAT',
}
)
def _converting(value: Any) -> str:
if isinstance(value, bool):
if value:
return 1
else:
return 0
return str(value)
def time_now():
return datetime.datetime.utcnow()
def _get_table_names(
conn: 'sqlite3.Connection', fts4: bool = False, fts5: bool = False
) -> List[str]:
"""A list of string table names in this database."""
where = ["type = 'table'"]
if fts4:
where.append("sql like '%USING FTS4%'")
if fts5:
where.append("sql like '%USING FTS5%'")
sql = 'select name from sqlite_master where {}'.format(' AND '.join(where))
return [r[0] for r in conn.execute(sql).fetchall()]
class Table:
def __init__(
self,
name: str,
data_path: Optional[Union[Path, str]] = None,
detect_types: int = 0,
in_memory: bool = True,
):
if in_memory:
self._conn_name = ':memory:'
else:
if isinstance(data_path, str):
data_path = Path(data_path)
self._conn_name = data_path / f'{name}.db'
self._name = name
self.detect_types = detect_types
self._conn = sqlite3.connect(
self._conn_name, detect_types=detect_types, check_same_thread=False
)
self._conn_lock = threading.Lock()
def execute(self, sql: str, commit: bool = True):
self._conn.execute(sql)
if commit:
self.commit()
def execute_many(self, sql: str, parameters: List[Tuple], commit: bool = True):
self._conn.executemany(sql, parameters)
if commit:
self.commit()
def commit(self):
self._conn.commit()
def create_table(self):
...
def drop_table(self):
self._conn.execute(f'DROP table {self.name}')
self._conn.commit()
def clear(self):
"""Drop the table and create a new one"""
self.drop_table()
self.create_table()
def load(self, data_file: Union[str, Path]):
disk_db = sqlite3.connect(data_file, detect_types=self.detect_types)
disk_db.backup(self._conn)
disk_db.close()
def dump(self, data_file: Union[str, Path]):
backup_db = sqlite3.connect(data_file, detect_types=self.detect_types)
self._conn.backup(backup_db)
backup_db.close()
def close(self):
self._conn.close()
@property
def name(self):
return self._name
@property
def schema(self):
"""SQL schema for this database"""
result = []
for row in self._conn.execute(
f'''PRAGMA table_info("{self.name}")'''
).fetchall():
result.append(', '.join([str(_) for _ in row]))
return '\n'.join(result)
class CellTable(Table):
def __init__(
self,
name: str,
columns: Optional[List[tuple]] = None,
in_memory: bool = True,
data_path: Optional[Path] = None,
lazy_create: bool = False,
):
super().__init__(name, data_path=data_path, in_memory=in_memory)
self._columns = []
self._indexed_keys = set()
if columns is not None:
for name, dtype in columns:
self.add_column(name, dtype, True)
if not lazy_create:
self.create_table()
@property
def columns(self) -> List[str]:
return ['_id', '_doc_id'] + [c.split()[0] for c in self._columns]
def existed(self):
return self.name in _get_table_names(self._conn)
def add_column(self, name: str, dtype: str, create_index: bool = True):
self._columns.append(f'{name} {COLUMN_TYPE_MAPPING[dtype]}')
if create_index:
self._indexed_keys.add(name)
def create_index(self, column: str, commit: bool = True):
sql_statement = f'''CREATE INDEX idx_{column}_
ON {self.name}({column})'''
self._conn.execute(sql_statement)
if commit:
self._conn.commit()
def create_table(self):
sql = f'''CREATE TABLE {self.name}
(_id INTEGER PRIMARY KEY AUTOINCREMENT,
_doc_id TEXT NOT NULL UNIQUE'''
if len(self._columns) > 0:
sql += ', ' + ', '.join(self._columns)
sql += ')'
self._conn.execute(sql)
for name in self._indexed_keys:
self.create_index(name, commit=False)
self._conn.commit()
def insert(
self,
docs: 'DocumentArray',
commit: bool = True,
) -> List[int]:
"""Add a single record into the table.
:param docs: The list of dict docs
:param commit: If set, commit is applied
"""
sql_template = 'INSERT INTO {table}({columns}) VALUES ({placeholders});'
column_names = self.columns[1:]
columns = ', '.join(column_names)
placeholders = ', '.join('?' for c in column_names)
sql = sql_template.format(
table=self.name, columns=columns, placeholders=placeholders
)
values = []
docs_size = 0
for doc in docs:
doc_value = tuple(
[doc.id]
+ [
_converting(doc.tags[c]) if c in doc.tags else None
for c in self.columns[2:]
]
)
values.append(doc_value)
docs_size += 1
with self._conn_lock:
cursor = self._conn.cursor()
if docs_size > 1:
cursor.executemany(sql, values[:-1])
cursor.execute(sql, values[-1])
last_row_id = cursor.lastrowid
row_ids = list(range(last_row_id - len(docs), last_row_id))
if commit:
self._conn.commit()
return row_ids
def query(
self,
where_clause: str = '',
where_params: Tuple = (),
limit: int = -1,
offset: int = 0,
order_by: Optional[str] = None,
ascending: bool = True,
) -> List[int]:
"""Query the records which matches the given conditions
:param where_clause: where clause for query
:param where_params: where parameters for query
:param limit: limit the number of results
:param offset: offset the number of results
:param order_by: order the results by the given column
:param ascending: order the results in ascending or descending order
:return: offsets list of matched docs
"""
where_conds = []
where = None
if where_clause:
where_conds.append(where_clause)
where = ' and '.join(where_conds)
_order_by = f'{order_by or "_id"} {"ASC" if ascending else "DESC"}'
_limit = f'LIMIT {limit}' if limit > 0 else ''
_offset = f'OFFSET {offset}' if offset > 0 else ''
if where:
sql = f'SELECT _id from {self.name} WHERE {where} ORDER BY {_order_by} {_limit} {_offset}'
else:
sql = f'SELECT _id from {self.name} ORDER BY {_order_by} {_limit} {_offset}'
params = tuple([_converting(p) for p in where_params])
# # EXPLAIN SQL query
# for row in self._conn.execute('EXPLAIN QUERY PLAN ' + sql, params):
# print(row)
# Use `row_factor`
# https://docs.python.org/3.6/library/sqlite3.html#sqlite3.Connection.row_factory
def _offset_factory(_, record):
return record[0] - 1
self._conn.row_factory = _offset_factory
cursor = self._conn.cursor()
try:
if where:
offsets = cursor.execute(sql, params).fetchall()
else:
offsets = cursor.execute(sql).fetchall()
self._conn.row_factory = None
return offsets if offsets else []
except Exception as e:
self._conn.row_factory = None
raise e
def delete(self, doc_ids: List[str]):
"""Delete the docs
:param doc_ids: The IDs of docs
"""
sql = f'DELETE from {self.name} WHERE _doc_id = ?'
        self._conn.executemany(sql, [(doc_id,) for doc_id in doc_ids])
self._conn.commit()
def get_docid_by_offset(self, offset: int):
sql = f'SELECT _doc_id from {self.name} WHERE _id = ? LIMIT 1;'
result = self._conn.execute(sql, (offset + 1,)).fetchone()
if result:
return result[0]
return None
def delete_by_offset(self, offset: int):
"""Delete the doc with specific offset
:param offset: The offset of the doc
"""
sql = f'DELETE FROM {self.name} WHERE _id = ?'
self._conn.execute(sql, (offset + 1,))
self._conn.commit()
def exist(self, doc_id: str):
sql = f'SELECT count(*) from {self.name} WHERE _doc_id = ?;'
return self._conn.execute(sql, (doc_id,)).fetchone()[0] > 0
def count(self, where_clause: str = '', where_params: Tuple = ()):
"""Return the total number of records which match with the given conditions.
:param where_clause: where clause for query
:param where_params: where parameters for query
:return: the total number of matched records
"""
if where_clause:
sql = 'SELECT count(_id) from {table} WHERE {where} LIMIT 1;'
where = where_clause
sql = sql.format(table=self.name, where=where)
params = tuple([_converting(p) for p in where_params])
# # EXPLAIN SQL query
# for row in self._conn.execute('EXPLAIN QUERY PLAN ' + sql, params):
# print(row)
return self._conn.execute(sql, params).fetchone()[0]
else:
sql = f'SELECT count(_id) from {self.name};'
result = self._conn.execute(sql).fetchone()
if result[0]:
return result[0]
return 0
@property
def size(self):
return self.count()
class MetaTable(Table):
def __init__(
self,
name: str = 'meta',
data_path: Optional[Path] = None,
in_memory: bool = False,
):
super().__init__(
name,
data_path=data_path,
detect_types=sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES,
in_memory=in_memory,
)
self.create_table()
def create_table(self):
sql = f'''CREATE TABLE if not exists {self.name}
(_doc_id TEXT NOT NULL PRIMARY KEY,
cell_id INTEGER NOT NULL,
offset INTEGER NOT NULL,
time_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)'''
self._conn.execute(sql)
self._conn.execute(
f'CREATE INDEX if not exists idx_time_at_ ON {self.name}(time_at)'
)
self._conn.commit()
def iter_addresses(
self, time_since: 'datetime.datetime' = datetime.datetime(2020, 2, 2, 0, 0)
):
sql = f'SELECT _doc_id, cell_id, offset from {self.name} WHERE time_at >= ? ORDER BY time_at ASC;'
cursor = self._conn.cursor()
for doc_id, cell_id, offset in cursor.execute(sql, (time_since,)):
yield doc_id, cell_id, offset
def get_latest_commit(self):
sql = f'SELECT _doc_id, cell_id, offset, time_at from {self.name} ORDER BY time_at DESC LIMIT 1;'
cursor = self._conn.execute(sql)
row = cursor.fetchone()
return row
def get_address(self, doc_id: str):
sql = f'SELECT cell_id, offset from {self.name} WHERE _doc_id = ? LIMIT 1;'
cursor = self._conn.execute(sql, (doc_id,))
row = cursor.fetchone()
return (row[0], row[1]) if row else (None, None)
def delete_address(self, doc_id: str, commit: bool = True):
sql = f'DELETE from {self.name} WHERE _doc_id = ?'
self._conn.execute(sql, (doc_id,))
if commit:
self._conn.commit()
def add_address(self, doc_id: str, cell_id: int, offset: int, commit: bool = True):
sql = f'INSERT OR REPLACE INTO {self.name}(_doc_id, cell_id, offset, time_at) VALUES (?, ?, ?, ?);'
self._conn.execute(
sql,
(doc_id, cell_id, offset, time_now()),
)
if commit:
self._conn.commit()
def bulk_add_address(
self,
doc_ids: List[str],
cell_ids: Union[List[int], np.ndarray],
offsets: Union[List[int], np.ndarray],
commit: bool = True,
):
sql = f'INSERT OR REPLACE INTO {self.name}(_doc_id, cell_id, offset, time_at) VALUES (?, ?, ?, ?);'
self._conn.executemany(
sql,
[
(doc_id, cell_id, offset, time_now())
for doc_id, cell_id, offset in zip(doc_ids, cell_ids, offsets)
],
)
if commit:
self._conn.commit()
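A brief usage sketch combining `CellTable` and `MetaTable` (illustrative only; the table and column names are chosen for this example):

from docarray import Document, DocumentArray

cell = CellTable('cell_0', columns=[('category', str)], in_memory=True)
docs = DocumentArray([
    Document(id='d1', tags={'category': 'comic'}),
    Document(id='d2', tags={'category': 'movie'}),
])
offsets = cell.insert(docs)

# offsets of rows whose `category` column equals 'comic'
matches = cell.query(where_clause='category = ?', where_params=('comic',))

# the meta table maps a doc id to its (cell_id, offset) address
meta = MetaTable(name='example_meta', in_memory=True)
meta.bulk_add_address([d.id for d in docs], [0, 0], offsets)
print(meta.get_address('d1'))  # -> (0, 0)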
================================================
FILE: annlite/utils.py
================================================
import os
import shutil
import numpy as np
from docarray import Document, DocumentArray
def clean_workspace():
if os.path.exists('./data'):
shutil.rmtree('./data')
if os.path.exists('./workspace'):
shutil.rmtree('./workspace')
def docs_with_tags(N, D, probs, categories):
all_docs = []
start_current = 0
for k, prob in enumerate(probs):
n_current = int(N * prob)
X = np.random.random((n_current, D)).astype(np.float32)
docs = [
Document(
embedding=X[i],
id=f'{i+start_current}',
tags={
'category': categories[k],
},
)
for i in range(n_current)
]
all_docs.extend(docs)
start_current += n_current
return DocumentArray(all_docs)
def _precision(predicted, relevant, eval_at):
"""
fraction of retrieved documents that are relevant to the query
"""
if eval_at == 0:
return 0.0
predicted_at_k = predicted[:eval_at]
n_predicted_and_relevant = len(set(predicted_at_k).intersection(set(relevant)))
return n_predicted_and_relevant / len(predicted)
def _recall(predicted, relevant, eval_at):
"""
fraction of the relevant documents that are successfully retrieved
"""
if eval_at == 0:
return 0.0
predicted_at_k = predicted[:eval_at]
n_predicted_and_relevant = len(set(predicted_at_k).intersection(set(relevant)))
return n_predicted_and_relevant / len(relevant)
def evaluate(predicts, relevants, top_k):
recall = 0
precision = 0
for _predict, _relevant in zip(predicts, relevants):
_predict = np.array([int(x) for x in _predict])
recall += _recall(_predict, _relevant, top_k)
precision += _precision(_predict, _relevant, top_k)
return recall / len(predicts), precision / len(predicts)
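A tiny worked example of the evaluation helpers above (numbers chosen by hand):

predicts = [['1', '2', '3'], ['4', '5', '6']]   # retrieved doc ids per query
relevants = [[1, 2, 9], [7, 8, 9]]              # ground-truth ids per query

recall, precision = evaluate(predicts, relevants, top_k=3)
# query 1 retrieves 2 of its 3 relevant docs, query 2 retrieves none,
# so recall = (2/3 + 0) / 2 = 1/3 and precision = (2/3 + 0) / 2 = 1/3
print(recall, precision)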
================================================
FILE: benchmarks/filtering_bench.py
================================================
import os
import shutil
import tempfile
import numpy as np
from jina import Document, DocumentArray
from jina.logging.profile import TimeContext
from annlite import AnnLite
n_index = [10_000, 100_000, 500_000, 1_000_000]
n_query = [1, 8, 64]
D = 768
R = 5
B = 5000
n_cells = 1
probs = [[0.20, 0.30, 0.50], [0.05, 0.15, 0.80]]
categories = ['comic', 'movie', 'audiobook']
def docs_with_tags(N, D, probs, categories):
all_docs = []
for k, prob in enumerate(probs):
n_current = int(N * prob)
X = np.random.random((n_current, D)).astype(np.float32)
docs = [
Document(
embedding=X[i],
tags={
'category': categories[k],
},
)
for i in range(n_current)
]
all_docs.extend(docs)
return DocumentArray(all_docs)
results = []
for n_i in n_index:
results_ni = []
for current_probs in probs:
with tempfile.TemporaryDirectory() as tmpdir:
columns = [('category', str)]
idxer = AnnLite(
D,
initial_size=n_i,
n_cells=n_cells,
columns=columns,
data_path=tmpdir,
)
da = docs_with_tags(n_i, D, current_probs, categories)
with TimeContext(f'indexing {n_i} docs') as t_i:
for i, _batch in enumerate(da.batch(batch_size=B)):
idxer.index(_batch)
for cat, prob in zip(categories, current_probs):
f = {'category': {'$eq': cat}}
query_times = []
for n_q in n_query:
qa = DocumentArray.empty(n_q)
q_embs = np.random.random([n_q, D]).astype(np.float32)
qa.embeddings = q_embs
t_qs = []
for _ in range(R):
with TimeContext(f'searching {n_q} docs') as t_q:
idxer.search(qa, filter=f)
t_qs.append(t_q.duration)
query_times.append(np.mean(t_qs[1:]))
print(f'\n\nprob={prob}, current_probs={current_probs}, n_i={n_i}\n\n')
results_ni.append([n_i, int(100 * prob), t_i.duration] + query_times)
results.append(results_ni)
title = '| Stored data |% same filter| Indexing time | Query size=1 | Query size=8 | Query size=64|'
print(title)
print('|-----' * 6 + '|')
for block in results:
sorted_elements_in_block = np.argsort([b[1] for b in block])
for pos in sorted_elements_in_block:
res = block[pos]
print(
''.join(
[f'| {x} ' for x in res[0:2]] + [f'| {x:.3f} ' for x in res[2:]] + ['|']
)
)
================================================
FILE: benchmarks/hnsw_bench.py
================================================
import tempfile
import time
from datetime import date
import numpy as np
import pandas as pd
from docarray import Document, DocumentArray
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from annlite import AnnLite
from annlite.math import cdist
from annlite.math import top_k as _top_k
def _precision(predicted, relevant, eval_at):
"""
fraction of retrieved documents that are relevant to the query
"""
if eval_at == 0:
return 0.0
predicted_at_k = predicted[:eval_at]
n_predicted_and_relevant = len(set(predicted_at_k).intersection(set(relevant)))
return n_predicted_and_relevant / len(predicted)
def _recall(predicted, relevant, eval_at):
"""
fraction of the relevant documents that are successfully retrieved
"""
if eval_at == 0:
return 0.0
predicted_at_k = predicted[:eval_at]
n_predicted_and_relevant = len(set(predicted_at_k).intersection(set(relevant)))
return n_predicted_and_relevant / len(relevant)
def evaluate(predicts, relevants, eval_at):
recall = 0
precision = 0
for _predict, _relevant in zip(predicts, relevants):
_predict = np.array([int(x) for x in _predict])
        recall += _recall(_predict, _relevant, eval_at)
        precision += _precision(_predict, _relevant, eval_at)
return recall / len(predicts), precision / len(predicts)
# N = 100_000 # number of data points
Nt = 125_000
Nq = 1
D = 128  # dimensionality / number of features
top_k = 10
n_cells = 64
n_subvectors = 64
n_queries = 1000
# generate Nt 128-dim vectors and hold out 20 of them as test queries
np.random.seed(123)
Xtr, Xte = train_test_split(
make_blobs(n_samples=Nt, n_features=D)[0].astype(np.float32), test_size=20
)
print(f'Xtr: {Xtr.shape} vs Xte: {Xte.shape}')
def get_documents(nr=10, index_start=0, embeddings=None):
for i in range(index_start, nr + index_start):
d = Document()
d.id = f'{i}' # to test it supports non-int ids
d.embedding = embeddings[i - index_start]
yield d
precision_per_query = []
recall_per_query = []
results = []
for n_cells in [1, 8, 16, 32, 64, 128]:
with tempfile.TemporaryDirectory() as tmpdir:
pq = AnnLite(
D,
metric='euclidean',
n_cells=n_cells,
data_path=tmpdir,
)
t0 = time.time()
pq.train(Xtr[:20480])
train_time = abs(time.time() - t0)
t0 = time.time()
pq.index(DocumentArray(get_documents(len(Xtr), embeddings=Xtr)))
index_time = abs(t0 - time.time())
dists = cdist(Xte, Xtr, metric='euclidean')
true_dists, true_ids = _top_k(dists, top_k, descending=False)
t0 = time.time()
docs = DocumentArray(get_documents(len(Xte), embeddings=Xte))
pq.search(docs, limit=top_k)
query_time = abs(t0 - time.time())
pq_ids = []
for doc in docs:
pq_ids.append([m.id for m in doc.matches])
recall, precision = evaluate(pq_ids, true_ids, top_k)
results_dict = {
'precision': precision,
'recall': recall,
'train_time': train_time,
'index_time': index_time,
'query_time': query_time,
'query_qps': len(Xte) / query_time,
'index_qps': len(Xtr) / index_time,
'indexer_hyperparams': {'n_cells': n_cells},
}
print(results_dict)
results.append(results_dict)
pq.clear()
pq.close()
today = date.today()
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('recall', ascending=False)
results_df.to_csv(f'bench-results-{today.strftime("%b-%d-%Y")}.csv')
================================================
FILE: bindings/hnsw_bindings.cpp
================================================
#include "hnswlib.h"
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
namespace py = pybind11;
using namespace pybind11::literals; // needed to bring in _a literal
/*
* replacement for the openmp '#pragma omp parallel for' directive
* only handles a subset of functionality (no reductions etc)
* Process ids from start (inclusive) to end (EXCLUSIVE)
*
* The method is borrowed from nmslib
*/
template <class Function>
inline void ParallelFor(size_t start, size_t end, size_t numThreads,
Function fn) {
if (numThreads <= 0) {
numThreads = std::thread::hardware_concurrency();
}
if (numThreads == 1) {
for (size_t id = start; id < end; id++) {
fn(id, 0);
}
} else {
    std::vector<std::thread> threads;
    std::atomic<size_t> current(start);
// keep track of exceptions in threads
// https://stackoverflow.com/a/32428427/1713196
std::exception_ptr lastException = nullptr;
std::mutex lastExceptMutex;
for (size_t threadId = 0; threadId < numThreads; ++threadId) {
threads.push_back(std::thread([&, threadId] {
while (true) {
size_t id = current.fetch_add(1);
if ((id >= end)) {
break;
}
try {
fn(id, threadId);
} catch (...) {
            std::unique_lock<std::mutex> lastExcepLock(lastExceptMutex);
lastException = std::current_exception();
/*
* This will work even when current is the largest value that
* size_t can fit, because fetch_add returns the previous value
* before the increment (what will result in overflow
* and produce 0 instead of current + 1).
*/
current = end;
break;
}
}
}));
}
for (auto &thread : threads) {
thread.join();
}
if (lastException) {
std::rethrow_exception(lastException);
}
}
}
inline void assert_true(bool expr, const std::string &msg) {
if (expr == false)
throw std::runtime_error("Unpickle Error: " + msg);
return;
}
template class Index {
public:
Index(const std::string &space_name, const int dim)
: space_name(space_name), dim(dim) {
normalize = false;
if (space_name == "l2") {
l2space = new hnswlib::L2Space(dim);
} else if (space_name == "ip") {
l2space = new hnswlib::InnerProductSpace(dim);
} else if (space_name == "cosine") {
l2space = new hnswlib::InnerProductSpace(dim);
normalize = true;
} else {
      throw std::runtime_error(
"Space name must be one of l2, ip, or cosine.");
}
appr_alg = NULL;
ep_added = true;
index_inited = false;
num_threads_default = std::thread::hardware_concurrency();
default_ef = 10;
pq_enable = false;
pq_n_clusters = -1;
pq_n_subvectors = -1;
pq_d_subvector = -1;
pq_codec = py::none();
}
static const int ser_version = 1; // serialization version
std::string space_name;
int dim;
size_t seed;
size_t default_ef;
bool index_inited;
bool ep_added;
bool normalize;
size_t maxElements, M, efConstruction;
int num_threads_default;
hnswlib::labeltype cur_l;
hnswlib::HierarchicalNSW *appr_alg;
hnswlib::SpaceInterface *l2space;
// quantization setting
bool pq_enable;
size_t pq_n_subvectors;
size_t pq_n_clusters;
size_t pq_d_subvector;
py::object pq_codec;
~Index() {
delete l2space;
if (appr_alg)
delete appr_alg;
// WARNING: will python release the pq_codec?
}
void init_new_index(const size_t maxElements, const size_t M,
const size_t efConstruction, const size_t random_seed,
const py::object &pq_codec) {
if (appr_alg) {
      throw std::runtime_error("The index is already initiated.");
}
if (!pq_codec.is_none()) {
_loadPQ(pq_codec);
}
this->cur_l = 0;
this->appr_alg = new hnswlib::HierarchicalNSW(
l2space, maxElements, M, efConstruction, random_seed);
this->index_inited = true;
this->ep_added = false;
this->appr_alg->ef_ = default_ef;
this->maxElements = maxElements;
this->M = M;
this->efConstruction = efConstruction;
this->seed = random_seed;
}
void loadPQ(const py::object &pq_codec) {
if (this->appr_alg) {
delete this->appr_alg;
}
if (pq_codec.is_none()) {
      throw std::runtime_error("Passed PQ class is none");
}
_loadPQ(pq_codec);
this->cur_l = 0;
this->ep_added = false;
this->appr_alg = new hnswlib::HierarchicalNSW(
l2space, maxElements, M, efConstruction, seed);
this->appr_alg->ef_ = default_ef;
}
void set_ef(size_t ef) {
default_ef = ef;
if (appr_alg)
appr_alg->ef_ = ef;
}
void set_num_threads(int num_threads) {
this->num_threads_default = num_threads;
}
void saveIndex(const std::string &path_to_index) {
appr_alg->saveIndex(path_to_index);
}
void loadIndex(const std::string &path_to_index, size_t max_elements) {
if (appr_alg) {
std::cerr << "Warning: Calling load_index for an already inited index. "
"Old index is being deallocated.";
delete appr_alg;
}
appr_alg = new hnswlib::HierarchicalNSW(l2space, path_to_index,
false, max_elements);
cur_l = appr_alg->cur_element_count;
index_inited = true;
}
void normalize_vector(float *data, float *norm_array) {
float norm = 0.0f;
for (int i = 0; i < dim; i++)
norm += data[i] * data[i];
norm = 1.0f / (sqrtf(norm) + 1e-30f);
for (int i = 0; i < dim; i++)
norm_array[i] = data[i] * norm;
}
template
void addRows_(int dim, int num_threads, const py::object &ids_,
const py::object &input, const py::object dtables) {
py::array_t items(input);
auto buffer = items.request();
size_t rows, features;
rows = buffer.shape[0];
features = buffer.shape[1];
if (!dtables.is_none()) {
hnswlib::pq_local_data_t pq_param;
py::array_t
dtable_items(dtables);
auto dtable_buffer = dtable_items.request();
pq_param.data = (float *)dtable_buffer.ptr;
pq_param.batch_len = rows;
this->l2space->attach_local_data(&pq_param);
}
if (features != dim)
throw std::runtime_error("wrong dimensionality of the vectors");
if (num_threads <= 0)
num_threads = num_threads_default;
// avoid using threads when the number of searches is small:
if (rows <= num_threads * 4) {
num_threads = 1;
}
std::vector ids;
if (!ids_.is_none()) {
py::array_t items(
ids_);
auto ids_numpy = items.request();
if (ids_numpy.ndim == 1 && ids_numpy.shape[0] == rows) {
std::vector ids1(ids_numpy.shape[0]);
for (size_t i = 0; i < ids1.size(); i++) {
ids1[i] = items.data()[i];
}
ids.swap(ids1);
} else if (ids_numpy.ndim == 0 && rows == 1) {
ids.push_back(*items.data());
} else
throw std::runtime_error("wrong dimensionality of the labels");
}
{
int start = 0;
if (!ep_added) {
size_t id = ids.size() ? ids.at(0) : (cur_l);
appr_alg->addPoint((void *)items.data(0), (size_t)id, 0);
start = 1;
ep_added = true;
}
py::gil_scoped_release l;
ParallelFor(start, rows, num_threads, [&](size_t row, size_t threadId) {
size_t id = ids.size() ? ids.at(row) : (cur_l + row);
appr_alg->addPoint((void *)items.data(row), (size_t)id, row);
});
cur_l += rows;
}
if (!dtables.is_none()) {
this->l2space->detach_local_data();
}
}
void addItems(const py::object &input, py::object ids_ = py::none(),
int num_threads = -1, py::object dtables = py::none()) {
// move dim check and normalization to python
if (!pq_enable) {
addRows_(this->dim, num_threads, ids_, input, dtables);
} else {
if (pq_n_clusters <= (UINT8_MAX + 1)) {
addRows_(pq_n_subvectors, num_threads, ids_, input, dtables);
} else if (pq_n_clusters <= (UINT16_MAX + 1)) {
addRows_(pq_n_subvectors, num_threads, ids_, input, dtables);
} else if (pq_n_clusters <= (UINT32_MAX + 1)) {
addRows_(pq_n_subvectors, num_threads, ids_, input, dtables);
}
}
}
template
py::object knnQuery_return_numpy_(size_t k, int num_threads,
const py::object &input,
const py::object dtables) {
py::array_t items(input);
auto buffer = items.request();
hnswlib::labeltype *data_numpy_l;
dist_t *data_numpy_d;
size_t rows, features;
{
py::gil_scoped_release l;
rows = buffer.shape[0];
features = buffer.shape[1];
if (!dtables.is_none()) {
hnswlib::pq_local_data_t pq_param;
py::array_t
dtable_items(dtables);
auto dtable_buffer = dtable_items.request();
pq_param.data = (float *)dtable_buffer.ptr;
pq_param.batch_len = rows;
this->l2space->attach_local_data(&pq_param);
}
if (num_threads <= 0)
num_threads = num_threads_default;
// avoid using threads when the number of searches is small:
if (rows <= num_threads * 4) {
num_threads = 1;
}
data_numpy_l = new hnswlib::labeltype[rows * k];
data_numpy_d = new dist_t[rows * k];
ParallelFor(0, rows, num_threads, [&](size_t row, size_t threadId) {
std::priority_queue> result =
appr_alg->searchKnn((void *)items.data(row), k, row);
if (result.size() != k)
throw std::runtime_error(
"Cannot return the results in a contigious 2D array. Probably "
"ef or M is too small");
for (int i = k - 1; i >= 0; i--) {
auto &result_tuple = result.top();
data_numpy_d[row * k + i] = result_tuple.first;
data_numpy_l[row * k + i] = result_tuple.second;
result.pop();
}
});
}
if (!dtables.is_none()) {
this->l2space->detach_local_data();
}
py::capsule free_when_done_l(data_numpy_l, [](void *f) { delete[] f; });
py::capsule free_when_done_d(data_numpy_d, [](void *f) { delete[] f; });
return py::make_tuple(
py::array_t(
{rows, k}, // shape
{k * sizeof(hnswlib::labeltype),
sizeof(
hnswlib::labeltype)}, // C-style contiguous strides for double
data_numpy_l, // the data pointer
free_when_done_l),
py::array_t(
{rows, k}, // shape
{k * sizeof(dist_t),
sizeof(dist_t)}, // C-style contiguous strides for double
data_numpy_d, // the data pointer
free_when_done_d));
}
py::object knnQuery_return_numpy(const py::object &input, size_t k = 1,
int num_threads = -1,
py::object dtables = py::none()) {
// move dim check and normalization to python
if (!pq_enable) {
return knnQuery_return_numpy_(k, num_threads, input, dtables);
} else {
if (pq_n_clusters <= (UINT8_MAX + 1)) {
return knnQuery_return_numpy_(k, num_threads, input, dtables);
} else if (pq_n_clusters <= (UINT16_MAX + 1)) {
return knnQuery_return_numpy_(k, num_threads, input, dtables);
} else if (pq_n_clusters <= (UINT32_MAX + 1)) {
return knnQuery_return_numpy_(k, num_threads, input, dtables);
}
}
}
template
py::object knnQuery_with_filter_(size_t k, int num_threads,
const py::object &candidate_ids_,
const py::object &input,
const py::object dtables) {
py::array_t items(input);
auto buffer = items.request();
hnswlib::labeltype *data_numpy_l;
dist_t *data_numpy_d;
if (num_threads <= 0)
num_threads = num_threads_default;
size_t rows;
size_t features;
rows = buffer.shape[0];
features = buffer.shape[1];
if (!dtables.is_none()) {
hnswlib::pq_local_data_t pq_param;
py::array_t
dtable_items(dtables);
auto dtable_buffer = dtable_items.request();
pq_param.data = (float *)dtable_buffer.ptr;
pq_param.batch_len = rows;
this->l2space->attach_local_data(&pq_param);
}
// avoid using threads when the number of searches is small:
if (rows <= num_threads * 4) {
num_threads = 1;
}
// FuseFilter constructing
binary_fuse16_t filter(appr_alg->max_elements_);
if (!candidate_ids_.is_none()) {
py::array_t items(
candidate_ids_);
auto ids_numpy = items.request();
if (ids_numpy.ndim == 1) {
const size_t size = ids_numpy.shape[0];
std::vector big_set;
      big_set.resize(size);
for (size_t i = 0; i < size; i++) {
big_set[i] = items.data()[i]; // we use contiguous values
}
if (!binary_fuse16_populate(big_set.data(), size, &filter)) {
throw std::runtime_error("failure to populate the fuse filter");
}
} else
throw std::runtime_error("wrong dimensionality of the filter labels");
}
{
py::gil_scoped_release l;
// would like to check the ownership of this data in more detail
data_numpy_l = new hnswlib::labeltype[rows * k];
data_numpy_d = new dist_t[rows * k];
ParallelFor(0, rows, num_threads, [&](size_t row, size_t threadId) {
std::priority_queue> result =
appr_alg->searchKnnWithFilter((void *)items.data(row), &filter, k,
row);
if (result.size() != k)
throw std::runtime_error(
"Cannot return the results in a contigious 2D array. Probably "
"ef or M is too small");
for (int i = k - 1; i >= 0; i--) {
auto &result_tuple = result.top();
data_numpy_d[row * k + i] = result_tuple.first;
data_numpy_l[row * k + i] = result_tuple.second;
result.pop();
}
});
}
if (!dtables.is_none()) {
this->l2space->detach_local_data();
}
py::capsule free_when_done_l(data_numpy_l, [](void *f) { delete[] f; });
py::capsule free_when_done_d(data_numpy_d, [](void *f) { delete[] f; });
return py::make_tuple(
py::array_t(
{rows, k}, // shape
{k * sizeof(hnswlib::labeltype),
sizeof(
hnswlib::labeltype)}, // C-style contiguous strides for double
data_numpy_l, // the data pointer
free_when_done_l),
py::array_t(
{rows, k}, // shape
{k * sizeof(dist_t),
sizeof(dist_t)}, // C-style contiguous strides for double
data_numpy_d, // the data pointer
free_when_done_d));
}
py::object knnQuery_with_filter(py::object input,
py::object candidate_ids_ = py::none(),
size_t k = 1, int num_threads = -1,
py::object dtables = py::none()) {
// move dim check and normalization to python
if (!pq_enable) {
return knnQuery_with_filter_(k, num_threads, candidate_ids_, input,
dtables);
} else {
if (pq_n_clusters <= (UINT8_MAX + 1)) {
return knnQuery_with_filter_(k, num_threads, candidate_ids_,
input, dtables);
} else if (pq_n_clusters <= (UINT16_MAX + 1)) {
return knnQuery_with_filter_(k, num_threads, candidate_ids_,
input, dtables);
} else if (pq_n_clusters <= (UINT32_MAX + 1)) {
return knnQuery_with_filter_(k, num_threads, candidate_ids_,
input, dtables);
}
}
}
std::vector>
getDataReturnList(py::object ids_ = py::none()) {
std::vector ids;
if (!ids_.is_none()) {
py::array_t items(
ids_);
auto ids_numpy = items.request();
std::vector ids1(ids_numpy.shape[0]);
for (size_t i = 0; i < ids1.size(); i++) {
ids1[i] = items.data()[i];
}
ids.swap(ids1);
}
std::vector> data;
for (auto id : ids) {
data.push_back(appr_alg->template getDataByLabel(id));
}
return data;
}
std::vector getIdsList() {
std::vector ids;
for (auto kv : appr_alg->label_lookup_) {
ids.push_back(kv.first);
}
return ids;
}
py::dict getAnnData() const { /* WARNING: Index::getAnnData is not thread-safe
with Index::addItems */
std::unique_lock templock(appr_alg->global);
unsigned int level0_npy_size =
appr_alg->cur_element_count * appr_alg->size_data_per_element_;
unsigned int link_npy_size = 0;
std::vector link_npy_offsets(appr_alg->cur_element_count);
for (size_t i = 0; i < appr_alg->cur_element_count; i++) {
unsigned int linkListSize =
appr_alg->element_levels_[i] > 0
? appr_alg->size_links_per_element_ * appr_alg->element_levels_[i]
: 0;
link_npy_offsets[i] = link_npy_size;
if (linkListSize)
link_npy_size += linkListSize;
}
char *data_level0_npy = (char *)malloc(level0_npy_size);
char *link_list_npy = (char *)malloc(link_npy_size);
int *element_levels_npy =
(int *)malloc(appr_alg->element_levels_.size() * sizeof(int));
hnswlib::labeltype *label_lookup_key_npy = (hnswlib::labeltype *)malloc(
appr_alg->label_lookup_.size() * sizeof(hnswlib::labeltype));
hnswlib::tableint *label_lookup_val_npy = (hnswlib::tableint *)malloc(
appr_alg->label_lookup_.size() * sizeof(hnswlib::tableint));
memset(label_lookup_key_npy, -1,
appr_alg->label_lookup_.size() * sizeof(hnswlib::labeltype));
memset(label_lookup_val_npy, -1,
appr_alg->label_lookup_.size() * sizeof(hnswlib::tableint));
size_t idx = 0;
for (auto it = appr_alg->label_lookup_.begin();
it != appr_alg->label_lookup_.end(); ++it) {
label_lookup_key_npy[idx] = it->first;
label_lookup_val_npy[idx] = it->second;
idx++;
}
memset(link_list_npy, 0, link_npy_size);
memcpy(data_level0_npy, appr_alg->data_level0_memory_, level0_npy_size);
memcpy(element_levels_npy, appr_alg->element_levels_.data(),
appr_alg->element_levels_.size() * sizeof(int));
for (size_t i = 0; i < appr_alg->cur_element_count; i++) {
unsigned int linkListSize =
appr_alg->element_levels_[i] > 0
? appr_alg->size_links_per_element_ * appr_alg->element_levels_[i]
: 0;
if (linkListSize) {
memcpy(link_list_npy + link_npy_offsets[i], appr_alg->linkLists_[i],
linkListSize);
}
}
py::capsule free_when_done_l0(data_level0_npy, [](void *f) { delete[] f; });
py::capsule free_when_done_lvl(element_levels_npy,
[](void *f) { delete[] f; });
py::capsule free_when_done_lb(label_lookup_key_npy,
[](void *f) { delete[] f; });
py::capsule free_when_done_id(label_lookup_val_npy,
[](void *f) { delete[] f; });
py::capsule free_when_done_ll(link_list_npy, [](void *f) { delete[] f; });
/* TODO: serialize state of random generators appr_alg->level_generator_
* and appr_alg->update_probability_generator_ */
/* for full reproducibility / to avoid re-initializing generators
* inside Index::createFromParams */
return py::dict(
"offset_level0"_a = appr_alg->offsetLevel0_,
"max_elements"_a = appr_alg->max_elements_,
"cur_element_count"_a = appr_alg->cur_element_count,
"size_data_per_element"_a = appr_alg->size_data_per_element_,
"label_offset"_a = appr_alg->label_offset_,
"offset_data"_a = appr_alg->offsetData_,
"max_level"_a = appr_alg->maxlevel_,
"enterpoint_node"_a = appr_alg->enterpoint_node_,
"max_M"_a = appr_alg->maxM_, "max_M0"_a = appr_alg->maxM0_,
"M"_a = appr_alg->M_, "mult"_a = appr_alg->mult_,
"ef_construction"_a = appr_alg->ef_construction_,
"ef"_a = appr_alg->ef_, "has_deletions"_a = appr_alg->has_deletions_,
"size_links_per_element"_a = appr_alg->size_links_per_element_,
"label_lookup_external"_a = py::array_t(
{appr_alg->label_lookup_.size()}, // shape
{sizeof(
hnswlib::labeltype)}, // C-style contiguous strides for double
label_lookup_key_npy, // the data pointer
free_when_done_lb),
"label_lookup_internal"_a = py::array_t(
{appr_alg->label_lookup_.size()}, // shape
{sizeof(
hnswlib::tableint)}, // C-style contiguous strides for double
label_lookup_val_npy, // the data pointer
free_when_done_id),
"element_levels"_a = py::array_t(
{appr_alg->element_levels_.size()}, // shape
{sizeof(int)}, // C-style contiguous strides for double
element_levels_npy, // the data pointer
free_when_done_lvl),
// linkLists_,element_levels_,data_level0_memory_
"data_level0"_a = py::array_t(
{level0_npy_size}, // shape
{sizeof(char)}, // C-style contiguous strides for double
data_level0_npy, // the data pointer
free_when_done_l0),
"link_lists"_a = py::array_t(
{link_npy_size}, // shape
{sizeof(char)}, // C-style contiguous strides for double
link_list_npy, // the data pointer
free_when_done_ll)
);
}
py::dict getIndexParams() const { /* WARNING: Index::getAnnData is not
thread-safe with Index::addItems */
auto params = py::dict(
"ser_version"_a =
py::int_(Index::ser_version), // serialization version
"space"_a = space_name, "dim"_a = dim, "index_inited"_a = index_inited,
"ep_added"_a = ep_added, "normalize"_a = normalize,
"num_threads"_a = num_threads_default, "seed"_a = seed);
if (index_inited == false)
return py::dict(**params, "ef"_a = default_ef);
auto ann_params = getAnnData();
return py::dict(**params, **ann_params);
}
static Index *createFromParams(const py::dict d) {
// check serialization version
assert_true(((int)py::int_(Index::ser_version)) >=
d["ser_version"].cast(),
"Invalid serialization version!");
auto space_name_ = d["space"].cast();
auto dim_ = d["dim"].cast();
auto index_inited_ = d["index_inited"].cast();
Index *new_index = new Index(space_name_, dim_);
/* TODO: deserialize state of random generators into
* new_index->level_generator_ and new_index->update_probability_generator_
*/
/* for full reproducibility / state of generators is serialized
* inside Index::getIndexParams */
new_index->seed = d["seed"].cast();
if (index_inited_) {
new_index->appr_alg = new hnswlib::HierarchicalNSW(
new_index->l2space, d["max_elements"].cast(),
d["M"].cast(), d["ef_construction"].cast(),
new_index->seed);
new_index->cur_l = d["cur_element_count"].cast();
}
new_index->index_inited = index_inited_;
new_index->ep_added = d["ep_added"].cast();
new_index->num_threads_default = d["num_threads"].cast();
new_index->default_ef = d["ef"].cast();
if (index_inited_)
new_index->setAnnData(d);
return new_index;
}
static Index *createFromIndex(const Index &index) {
return createFromParams(index.getIndexParams());
}
void setAnnData(const py::dict d) { /* WARNING: Index::setAnnData is not
thread-safe with Index::addItems */
std::unique_lock templock(appr_alg->global);
assert_true(appr_alg->offsetLevel0_ == d["offset_level0"].cast(),
"Invalid value of offsetLevel0_ ");
assert_true(appr_alg->max_elements_ == d["max_elements"].cast(),
"Invalid value of max_elements_ ");
appr_alg->cur_element_count = d["cur_element_count"].cast();
assert_true(appr_alg->size_data_per_element_ ==
d["size_data_per_element"].cast(),
"Invalid value of size_data_per_element_ ");
assert_true(appr_alg->label_offset_ == d["label_offset"].cast(),
"Invalid value of label_offset_ ");
assert_true(appr_alg->offsetData_ == d["offset_data"].cast(),
"Invalid value of offsetData_ ");
appr_alg->maxlevel_ = d["max_level"].cast();
appr_alg->enterpoint_node_ = d["enterpoint_node"].cast();
assert_true(appr_alg->maxM_ == d["max_M"].cast(),
"Invalid value of maxM_ ");
assert_true(appr_alg->maxM0_ == d["max_M0"].cast(),
"Invalid value of maxM0_ ");
assert_true(appr_alg->M_ == d["M"].cast(), "Invalid value of M_ ");
assert_true(appr_alg->mult_ == d["mult"].cast(),
"Invalid value of mult_ ");
assert_true(appr_alg->ef_construction_ ==
d["ef_construction"].cast(),
"Invalid value of ef_construction_ ");
appr_alg->ef_ = d["ef"].cast();
appr_alg->has_deletions_ = d["has_deletions"].cast();
assert_true(appr_alg->size_links_per_element_ ==
d["size_links_per_element"].cast(),
"Invalid value of size_links_per_element_ ");
auto label_lookup_key_npy =
d["label_lookup_external"]
.cast>();
auto label_lookup_val_npy =
d["label_lookup_internal"]
.cast>();
auto element_levels_npy =
d["element_levels"]
.cast<
py::array_t>();
auto data_level0_npy =
d["data_level0"]
.cast<
py::array_t>();
auto link_list_npy =
d["link_lists"]
.cast<
py::array_t>();
for (size_t i = 0; i < appr_alg->cur_element_count; i++) {
if (label_lookup_val_npy.data()[i] < 0) {
throw std::runtime_error("internal id cannot be negative!");
} else {
appr_alg->label_lookup_.insert(std::make_pair(
label_lookup_key_npy.data()[i], label_lookup_val_npy.data()[i]));
}
}
memcpy(appr_alg->element_levels_.data(), element_levels_npy.data(),
element_levels_npy.nbytes());
unsigned int link_npy_size = 0;
std::vector link_npy_offsets(appr_alg->cur_element_count);
for (size_t i = 0; i < appr_alg->cur_element_count; i++) {
unsigned int linkListSize =
appr_alg->element_levels_[i] > 0
? appr_alg->size_links_per_element_ * appr_alg->element_levels_[i]
: 0;
link_npy_offsets[i] = link_npy_size;
if (linkListSize)
link_npy_size += linkListSize;
}
memcpy(appr_alg->data_level0_memory_, data_level0_npy.data(),
data_level0_npy.nbytes());
for (size_t i = 0; i < appr_alg->max_elements_; i++) {
unsigned int linkListSize =
appr_alg->element_levels_[i] > 0
? appr_alg->size_links_per_element_ * appr_alg->element_levels_[i]
: 0;
if (linkListSize == 0) {
appr_alg->linkLists_[i] = nullptr;
} else {
appr_alg->linkLists_[i] = (char *)malloc(linkListSize);
if (appr_alg->linkLists_[i] == nullptr)
throw std::runtime_error(
"Not enough memory: loadIndex failed to allocate linklist");
memcpy(appr_alg->linkLists_[i],
link_list_npy.data() + link_npy_offsets[i], linkListSize);
}
}
}
void markDeleted(size_t label) { appr_alg->markDelete(label); }
void resizeIndex(size_t new_size) { appr_alg->resizeIndex(new_size); }
size_t getMaxElements() const { return appr_alg->max_elements_; }
size_t getCurrentCount() const { return appr_alg->cur_element_count; }
void _loadPQ(const py::object &pq_abstract) {
int exist_attr = 1, attr_correct = 1;
PyObject *pq_raw_ptr = pq_abstract.ptr();
exist_attr *= PyObject_HasAttrString(pq_raw_ptr, "encode");
exist_attr *= PyObject_HasAttrString(pq_raw_ptr, "get_codebook");
exist_attr *= PyObject_HasAttrString(pq_raw_ptr, "get_subspace_splitting");
if (exist_attr <= 0) {
throw py::index_error(
"PQ class should at least have the following attributes:\n"
"(encode, get_codebook, get_subspace_splitting)");
}
attr_correct *=
PyMethod_Check(PyObject_GetAttrString(pq_raw_ptr, "encode"));
attr_correct *=
PyMethod_Check(PyObject_GetAttrString(pq_raw_ptr, "get_codebook"));
attr_correct *= PyMethod_Check(
PyObject_GetAttrString(pq_raw_ptr, "get_subspace_splitting"));
if (attr_correct <= 0) {
throw py::attribute_error(
"PQ class have at least one of the following attributes' type "
"INCORRECT:\n(encode: ,\n codebook: "
",\n get_subspace_splitting: )");
}
this->pq_enable = true;
this->pq_codec = pq_abstract;
py::tuple subspaces_param = pq_abstract.attr("get_subspace_splitting")();
this->pq_n_subvectors = py::cast(subspaces_param[0]);
this->pq_n_clusters = py::cast(subspaces_param[1]);
this->pq_d_subvector = py::cast(subspaces_param[2]);
size_t pq_total_dims = (this->pq_n_subvectors * this->pq_d_subvector);
if (this->dim != pq_total_dims) {
throw py::value_error(
"Initialization Error, expect HNSW.dim == "
"PQ.n_subvector*PQ.d_subvector, but got:\n"
"HNSW.dim =" +
std::to_string(this->dim) +
", PQ.n_subvector*PQ.d_subvector=" + std::to_string(pq_total_dims));
}
// reading codebook into float buffer
py::array_t items(
pq_abstract.attr("get_codebook")());
auto buffer = items.request();
if (buffer.ndim != 3 || buffer.shape[0] != pq_n_subvectors ||
buffer.shape[1] != pq_n_clusters || buffer.shape[2] != pq_d_subvector) {
py::print("Expect the codebook with shape (", pq_n_subvectors,
                pq_n_clusters, pq_d_subvector, "sep"_a = ",");
py::print(" but got shape ", buffer.shape);
throw py::attribute_error(
"PQ class returning the codebook with wrong dimension");
}
// std::shared_ptr codebook_buffer((float *)buffer.ptr);
float *codebook_buffer = (float *)buffer.ptr;
if (l2space) {
delete l2space;
}
if (pq_n_clusters <= (UINT8_MAX + 1)) {
l2space = new hnswlib::PQ_Space(space_name, pq_n_subvectors,
pq_n_clusters, pq_d_subvector,
codebook_buffer);
} else if (pq_n_clusters <= (UINT16_MAX + 1)) {
l2space = new hnswlib::PQ_Space(space_name, pq_n_subvectors,
pq_n_clusters, pq_d_subvector,
codebook_buffer);
} else if (pq_n_clusters <= (UINT32_MAX + 1)) {
l2space = new hnswlib::PQ_Space(space_name, pq_n_subvectors,
pq_n_clusters, pq_d_subvector,
codebook_buffer);
} else {
throw py::value_error(
"PQ clustering exceed the maximum, annlite set the maximum of "
"clusters = " +
std::to_string(UINT16_MAX + 1) +
", but got PQ.n_clusters=" + std::to_string(pq_n_clusters));
}
}
};
PYBIND11_PLUGIN(hnsw_bind) {
py::module m("hnsw_bind");
py::class_>(m, "Index")
.def(py::init(&Index::createFromParams), py::arg("params"))
/* WARNING: Index::createFromIndex is not thread-safe with Index::addItems
*/
.def(py::init(&Index::createFromIndex), py::arg("index"))
.def(py::init(), py::arg("space"),
py::arg("dim"))
.def("init_index", &Index::init_new_index, py::arg("max_elements"),
py::arg("M") = 16, py::arg("ef_construction") = 200,
py::arg("random_seed") = 100, py::arg("pq_codec") = py::none())
.def("knn_query", &Index::knnQuery_return_numpy, py::arg("data"),
py::arg("k") = 1, py::arg("num_threads") = -1,
py::arg("dtables") = py::none())
.def("knn_query_with_filter", &Index::knnQuery_with_filter,
py::arg("data"), py::arg("filters") = py::none(), py::arg("k") = 1,
py::arg("num_threads") = -1, py::arg("dtables") = py::none())
.def("add_items", &Index::addItems, py::arg("data"),
py::arg("ids") = py::none(), py::arg("num_threads") = -1,
py::arg("dtables") = py::none())
.def("get_items", &Index::getDataReturnList,
py::arg("ids") = py::none())
.def("get_ids_list", &Index::getIdsList)
.def("set_ef", &Index::set_ef, py::arg("ef"))
.def("set_num_threads", &Index::set_num_threads,
py::arg("num_threads"))
.def("save_index", &Index::saveIndex, py::arg("path_to_index"))
.def("load_index", &Index::loadIndex, py::arg("path_to_index"),
py::arg("max_elements") = 0)
.def("mark_deleted", &Index::markDeleted, py::arg("label"))
.def("resize_index", &Index::resizeIndex, py::arg("new_size"))
.def("get_max_elements", &Index::getMaxElements)
.def("get_current_count", &Index::getCurrentCount)
.def("loadPQ", &Index::loadPQ)
.def_readonly("space", &Index::space_name)
.def_readonly("dim", &Index::dim)
.def_readonly("pq_enable", &Index::pq_enable)
.def_readwrite("num_threads", &Index::num_threads_default)
.def_property(
"ef",
[](const Index &index) {
return index.index_inited ? index.appr_alg->ef_ : index.default_ef;
},
[](Index &index, const size_t ef_) {
index.default_ef = ef_;
if (index.appr_alg)
index.appr_alg->ef_ = ef_;
})
.def_property_readonly("max_elements",
[](const Index &index) {
return index.index_inited
? index.appr_alg->max_elements_
: 0;
})
.def_property_readonly("element_count",
[](const Index &index) {
return index.index_inited
? index.appr_alg->cur_element_count
: 0;
})
.def_property_readonly("ef_construction",
[](const Index &index) {
return index.index_inited
? index.appr_alg->ef_construction_
: 0;
})
.def_property_readonly("M",
[](const Index &index) {
return index.index_inited ? index.appr_alg->M_
: 0;
})
.def(py::pickle(
[](const Index &ind) { // __getstate__
return py::make_tuple(
ind.getIndexParams()); /* Return dict (wrapped in a tuple) that
fully encodes state of the Index
object */
},
[](py::tuple t) { // __setstate__
if (t.size() != 1)
throw std::runtime_error("Invalid state!");
return Index::createFromParams(t[0].cast());
}))
.def("__repr__", [](const Index &a) {
return "";
});
return m.ptr();
}
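A sketch (plain Python) of driving the binding defined above, using only the names registered in the PYBIND11 block; the import path is an assumption, since how the compiled extension is packaged inside annlite is not shown here:

import numpy as np
from hnsw_bind import Index  # module name taken from PYBIND11_PLUGIN above; import path is an assumption

dim = 64
index = Index(space='l2', dim=dim)
index.init_index(max_elements=10_000, M=16, ef_construction=200)

data = np.random.random((1_000, dim)).astype(np.float32)
index.add_items(data, ids=np.arange(1_000))

index.set_ef(50)  # ef should be at least k at query time
labels, dists = index.knn_query(np.random.random((4, dim)).astype(np.float32), k=10)
print(labels.shape, dists.shape)  # (4, 10) and (4, 10)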
================================================
FILE: bindings/pq_bindings.pyx
================================================
# distutils: language = c++
import numpy as np
cimport cython
from libc.stdint cimport (
int8_t,
int16_t,
int32_t,
int64_t,
uint8_t,
uint16_t,
uint32_t,
uint64_t,
)
from libcpp.vector cimport vector
ctypedef fused any_int:
uint8_t
uint16_t
uint32_t
uint64_t
int8_t
int16_t
int32_t
int64_t
@cython.boundscheck(False)
@cython.wraparound(False)
cdef inline dist_pqcode_to_codebook(long M,const float[:,:] adtable, any_int[:] pq_code):
"""Compute the distance between each codevector and the pq_code of a query.
:param M: Number of sub-vectors in the original feature space.
:param adtable: 2D Memoryview[float] containing precomputed Asymmetric Distances.
    :param pq_code: 1D Memoryview[any_int] containing a pq code.
:return: Distance between pq code and query according to the Asymmetric Distance table.
"""
cdef:
float dist = 0
int m
for m in range(M):
dist += adtable[m, pq_code[m]]
return dist
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef dist_pqcodes_to_codebooks(const float[:,:] adtable, any_int[:,:] pq_codes):
"""
    Compute the distance between each row in pq_codes and each codevector using an adtable.
:param adtable: 2D Memoryview of precomputed Asymmetric Distances.
:param pq_codes: 2D Memoryview of pq_codes.
    :return: List of Asymmetric Distances between pq_codes and the query.
This function is equivalent to:
'''
dists = np.zeros((N, )).astype(np.float32)
for n in range(N):
for m in range(M):
dists[n] += self.adtable[m][codes[n][m]]
'''
"""
cdef:
int m
int N = pq_codes.shape[0]
int M = pq_codes.shape[1]
vector[float] dists
for n in range(N):
dists.push_back(dist_pqcode_to_codebook(M, adtable, pq_codes[n,:]))
return dists
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef precompute_adc_table(const float[:] query,
long d_subvector,
long n_clusters,
const float[:,:,:] codebooks):
"""
Compute the Asymmetric Distance Table between a query and a PQ space.
    :param query: Memoryview of a query in the original feature space (not a pqcode).
:param d_subvector: Number of dimensions in a subvector.
:param n_clusters: Number of clusters per sub-space (number of prototypes per sub-space).
:param codebooks: Memoryview containing the learned codevectors for each slice.
This is a 3D view with (slice index, prototype index, vector values).
:return: Memoryview with a 2D matrix containing the Asymmetric Distance Computation.
This function is equivalent to
'''
def numpy_adc_table(query, n_subvectors, n_clusters, d_subvector, codebooks):
adtable = np.empty((n_subvectors, n_clusters), dtype=np.float32)
for m in range(n_subvectors):
query_sub = query[m * d_subvector: (m + 1) * d_subvector]
adtable[m, :] = np.linalg.norm(codebooks[m] - query_sub, axis=1) ** 2
return adtable
'''
But avoids generating views and calling numpy functions.
"""
cdef:
int D = len(query)
int M = int(D/d_subvector)
int n_subvectors = int(D/d_subvector)
int m, i, k, ind_prototype, j
float[:, ::1] adtable = np.empty((M, n_clusters), dtype=np.float32)
float[:] query_subvec = np.empty(d_subvector, dtype=np.float32)
float[:] query_subcodeword = np.empty(d_subvector, dtype=np.float32)
float dist_subprototype_to_subquery, coord_j
for m in range(n_subvectors):
# load m'th subquery
i = 0
for k in range(m * d_subvector, (m + 1) * d_subvector):
query_subvec[i] = query[k]
i += 1
for ind_prototype in range(n_clusters):
# load prototype ind_prototype for the m'th subspace
for i in range(d_subvector):
query_subcodeword[i] = codebooks[m, ind_prototype, i]
# compute the distance between subprototype and subquery
dist_subprototype_to_subquery = 0.
for j in range(d_subvector):
coord_j = query_subcodeword[j] - query_subvec[j]
dist_subprototype_to_subquery += coord_j * coord_j
adtable[m, ind_prototype] = dist_subprototype_to_subquery
return adtable
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef batch_precompute_adc_table(const float[:, :] queries,
long d_subvector,
long n_clusters,
const float[:,:,:] codebooks):
"""
    Compute the Asymmetric Distance Tables between a batch of queries and a PQ space.
    :param queries: 2D Memoryview of queries in the original feature space (not pqcodes).
:param d_subvector: Number of dimensions in a subvector.
:param n_clusters: Number of clusters per sub-space (number of prototypes per sub-space).
:param codebooks: Memoryview containing the learned codevectors for each slice.
This is a 3D view with (slice index, prototype index, vector values).
    :return: Memoryview with a 3D array containing one Asymmetric Distance table per query.
This function is equivalent to
'''
def numpy_adc_table(query, n_subvectors, n_clusters, d_subvector, codebooks):
adtable = np.empty((n_subvectors, n_clusters), dtype=np.float32)
for m in range(n_subvectors):
query_sub = query[m * d_subvector: (m + 1) * d_subvector]
adtable[m, :] = np.linalg.norm(codebooks[m] - query_sub, axis=1) ** 2
return adtable
'''
But avoids generating views and calling numpy functions.
"""
cdef:
int N = queries.shape[0]
int D = queries.shape[1]
int n_subvectors = int(D/d_subvector)
int m, i, k, ind_prototype, j
float[:, :, :] adtable = np.empty((N, n_subvectors, n_clusters), dtype=np.float32)
float[:] query_subvec = np.empty(d_subvector, dtype=np.float32)
float[:] query_subcodeword = np.empty(d_subvector, dtype=np.float32)
float dist_subprototype_to_subquery, coord_j
for index in range(N):
for m in range(n_subvectors):
# load m'th subquery
i = 0
for k in range(m * d_subvector, (m + 1) * d_subvector):
query_subvec[i] = queries[index, k]
i += 1
for ind_prototype in range(n_clusters):
# load prototype ind_prototype for the m'th subspace
for i in range(d_subvector):
query_subcodeword[i] = codebooks[m, ind_prototype, i]
# compute the distance between subprototype and subquery
dist_subprototype_to_subquery = 0.
for j in range(d_subvector):
coord_j = query_subcodeword[j] - query_subvec[j]
dist_subprototype_to_subquery += coord_j * coord_j
adtable[index, m, ind_prototype] = dist_subprototype_to_subquery
return adtable
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef batch_precompute_adc_table_ip(const float[:, :] queries,
long d_subvector,
long n_clusters,
const float[:,:,:] codebooks):
"""
    Compute the Asymmetric Distance Tables (inner-product variant) between a batch of queries and a PQ space.
    :param queries: 2D Memoryview of queries in the original feature space (not pqcodes).
:param d_subvector: Number of dimensions in a subvector.
:param n_clusters: Number of clusters per sub-space (number of prototypes per sub-space).
:param codebooks: Memoryview containing the learned codevectors for each slice.
This is a 3D view with (slice index, prototype index, vector values).
    :return: Memoryview with a 3D array containing one table of inner products per query.
This function is equivalent to
'''
def numpy_adc_table(query, n_subvectors, n_clusters, d_subvector, codebooks):
adtable = np.empty((n_subvectors, n_clusters), dtype=np.float32)
for m in range(n_subvectors):
query_sub = query[m * d_subvector: (m + 1) * d_subvector]
adtable[m, :] = np.linalg.norm(codebooks[m] - query_sub, axis=1) ** 2
return adtable
'''
But avoids generating views and calling numpy functions.
"""
cdef:
int N = queries.shape[0]
int D = queries.shape[1]
int n_subvectors = int(D/d_subvector)
int m, i, k, ind_prototype, j
float[:, :, :] adtable = np.empty((N, n_subvectors, n_clusters), dtype=np.float32)
float[:] query_subvec = np.empty(d_subvector, dtype=np.float32)
float[:] query_subcodeword = np.empty(d_subvector, dtype=np.float32)
float dist_subprototype_to_subquery
for index in range(N):
for m in range(n_subvectors):
# load m'th subquery
i = 0
for k in range(m * d_subvector, (m + 1) * d_subvector):
query_subvec[i] = queries[index, k]
i += 1
for ind_prototype in range(n_clusters):
# load prototype ind_prototype for the m'th subspace
for i in range(d_subvector):
query_subcodeword[i] = codebooks[m, ind_prototype, i]
# compute the distance between subprototype and subquery
dist_subprototype_to_subquery = 0.
for j in range(d_subvector):
dist_subprototype_to_subquery += (query_subcodeword[j] * query_subvec[j])
adtable[index, m, ind_prototype] = dist_subprototype_to_subquery
return adtable
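A sketch (plain Python) of how the kernels above combine for a single query, assuming the extension has been compiled and the two functions are importable (the module path in the comment is an assumption); array shapes follow the docstrings above:

import numpy as np
# assumed import path once the extension is built, e.g.:
# from annlite.pq_bindings import precompute_adc_table, dist_pqcodes_to_codebooks

n_subvectors, n_clusters, d_subvector = 8, 256, 16
codebooks = np.random.random((n_subvectors, n_clusters, d_subvector)).astype(np.float32)
pq_codes = np.random.randint(0, n_clusters, size=(1000, n_subvectors)).astype(np.uint8)
query = np.random.random(n_subvectors * d_subvector).astype(np.float32)

# one (n_subvectors, n_clusters) table of squared sub-distances for this query
adtable = np.asarray(precompute_adc_table(query, d_subvector, n_clusters, codebooks))
# asymmetric distance of every stored pq code to the query: one table lookup per sub-vector
dists = np.asarray(dist_pqcodes_to_codebooks(adtable, pq_codes))
nearest = np.argsort(dists)[:10]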
================================================
FILE: examples/annlite_vs_simpleindexer.py
================================================
import os
import shutil
import tempfile
import time
import numpy as np
import pandas as pd
from jina import Document, DocumentArray, Flow
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from executor.executor import AnnLiteIndexer
Nq = 1
D = 128
top_k = 10
R = 5
n_cells = 64
n_subvectors = 64
n_queries = 1
BENCHMARK_SIMPLEINDEXER = False
BENCHMARK_ANNLITE = True
def _precision(predicted, relevant, eval_at):
"""
fraction of retrieved documents that are relevant to the query
"""
if eval_at == 0:
return 0.0
predicted_at_k = predicted[:eval_at]
n_predicted_and_relevant = len(set(predicted_at_k).intersection(set(relevant)))
return n_predicted_and_relevant / len(predicted)
def _recall(predicted, relevant, eval_at):
"""
fraction of the relevant documents that are successfully retrieved
"""
if eval_at == 0:
return 0.0
predicted_at_k = predicted[:eval_at]
n_predicted_and_relevant = len(set(predicted_at_k).intersection(set(relevant)))
return n_predicted_and_relevant / len(relevant)
def evaluate(predicts, relevants, eval_at):
recall = 0
precision = 0
for _predict, _relevant in zip(predicts, relevants):
_predict = np.array([int(x) for x in _predict])
recall += _recall(_predict, _relevant, top_k)
precision += _precision(_predict, _relevant, top_k)
return recall / len(predicts), precision / len(predicts)
def create_data(n_examples, D):
np.random.seed(123)
Xtr, Xte = train_test_split(
make_blobs(n_samples=n_examples, n_features=D)[0].astype(np.float32),
test_size=1,
)
return Xtr, Xte
def create_data_online(n_examples, D, batch_size):
np.random.seed(123)
num = 0
while True:
Xtr_batch = make_blobs(n_samples=batch_size, n_features=D)[0].astype(np.float32)
yield DocumentArray([Document(embedding=x) for x in Xtr_batch])
num += batch_size
if num + batch_size >= n_examples:
break
if num < n_examples:
Xtr_batch = make_blobs(n_samples=n_examples - num, n_features=D)[0].astype(
np.float32
)
yield DocumentArray([Document(embedding=x) for x in Xtr_batch])
def create_test_data(D, Nq):
np.random.seed(123)
Xte = make_blobs(n_samples=Nq, n_features=D)[0].astype(np.float32)
return DocumentArray([Document(embedding=x) for x in Xte])
if BENCHMARK_SIMPLEINDEXER:
################ SimpleIndexer Benchmark BEGIN #################
n_datasets = [10001, 50001, 200001, 400001]
times = []
for n_examples in n_datasets:
time_taken = 0
Xtr, Xte = create_data(n_examples, D)
with tempfile.TemporaryDirectory() as tmpdir:
f = Flow().add(
uses='jinahub://SimpleIndexer',
uses_with={'match_args': {'metric': 'euclidean', 'limit': 10}},
workspace=tmpdir,
)
docs = [Document(id=f'{i}', embedding=Xtr[i]) for i in range(len(Xtr))]
with f:
resp = f.post(
on='/index',
inputs=docs,
)
with f:
t0 = time.time()
resp = f.post(
on='/search',
inputs=DocumentArray([Document(embedding=Xte[0])]),
return_results=True,
)
time_taken = time.time() - t0
times.append(time_taken)
df = pd.DataFrame({'n_examples': n_datasets, 'times': times})
df.to_csv('simpleindexer.csv')
print(df)
################ SimpleIndexer Benchmark END #################
if BENCHMARK_ANNLITE:
################ AnnLite Benchmark BEGIN ######################
n_datasets = [10_000, 100_000, 500_000, 1_000_000, 10_000_000]
# n_datasets = [10_000, 100_000]
n_queries = [1, 8, 64]
batch_size = 4096
times = []
results = {}
for n_examples in n_datasets:
print(f'\n\nWorking with n_examples={n_examples}\n\n')
time_taken = 0
with tempfile.TemporaryDirectory() as tmpdir:
f = Flow().add(
uses=AnnLiteIndexer,
uses_with={
'n_dim': D,
'limit': 10,
},
workspace=tmpdir,
)
docs = create_data_online(n_examples, D, batch_size)
results_current = {}
with f:
time_taken = 0
for batch in docs:
t0 = time.time()
resp = f.post(on='/index', inputs=batch, request_size=10240)
# time only the post call, so that batch creation time is not counted towards the index time
time_taken += time.time() - t0
results_current['index_time'] = time_taken
times_per_n_query = []
with f:
for n_query in n_queries:
da_queries = create_test_data(D, n_query)
t_qs = []
for _ in range(R):
t0 = time.time()
resp = f.post(
on='/search',
inputs=da_queries,
return_results=True,
)
time_taken = time.time() - t0
t_qs.append(time_taken)
# remove warm-up
times_per_n_query.append(np.mean(t_qs[1:]))
results_current['query_times'] = times_per_n_query
print(f'==> query_times: {times_per_n_query}')
df = pd.DataFrame({'results': results_current})
df.to_csv(f'annlite_{n_examples}.csv')
results[n_examples] = results_current
df = pd.DataFrame(results)
df.to_csv('annlite.csv')
################ AnnLite Benchmark END #########################
================================================
FILE: examples/filter_example.py
================================================
import os
import random
import shutil
import numpy as np
from jina import Document, DocumentArray
from jina.logging.profile import TimeContext
from annlite import AnnLite
n_index = [10_000, 100_000, 500_000, 1_000_000]
n_index = [100_000]
n_query = [1, 8, 64]
n_query = [1]
D = 768
R = 5
B = 5000
n_cells = 1
# probs =[[0.20, 0.30, 0.50],
# [0.05, 0.15, 0.80]]
categories = ['comic', 'movie', 'audiobook']
def clean_workspace():
if os.path.exists('./data'):
shutil.rmtree('./data')
if os.path.exists('./workspace'):
shutil.rmtree('./workspace')
def docs_with_tags(N, D, probs, categories):
all_docs = []
for k, prob in enumerate(probs):
n_current = int(N * prob)
X = np.random.random((n_current, D)).astype(np.float32)
docs = [
Document(
embedding=X[i],
tags={'category': categories[k], 'x': random.randint(0, 5)},
)
for i in range(n_current)
]
all_docs.extend(docs)
return DocumentArray(all_docs)
results = []
for n_i in n_index:
clean_workspace()
results_ni = []
current_probs = [0.05, 0.15, 0.80]
columns = [('category', str)]
idxer = AnnLite(
D,
initial_size=n_i,
n_cells=n_cells,
data_path='./workspace',
columns=columns,
)
da = docs_with_tags(n_i, D, current_probs, categories)
with TimeContext(f'indexing {n_i} docs') as t_i:
for i, _batch in enumerate(da.batch(batch_size=B)):
idxer.index(_batch)
for cat, prob in zip(categories, current_probs):
f = {'category': {'$eq': cat}}
query_times = []
for n_q in n_query:
qa = DocumentArray.empty(n_q)
q_embs = np.random.random([n_q, D]).astype(np.float32)
qa.embeddings = q_embs
t_qs = []
for _ in range(R):
with TimeContext(f'searching {n_q} docs') as t_q:
idxer.search(qa, filter=f)
t_qs.append(t_q.duration)
query_times.append(np.mean(t_qs[1:]))
print(f'\n\nprob={prob}, current_probs={current_probs}, n_i={n_i}\n\n')
results_ni.append([n_i, prob, t_i.duration] + query_times)
results.append(results_ni)
title = '| Stored data |% same filter| Indexing time | Query size=1 | Query size=8 | Query size=64|'
print(title)
print('|-----' * 6 + '|')
for block in results:
sorted_elements_in_block = np.argsort([b[1] for b in block])
for pos in sorted_elements_in_block:
res = block[pos]
print(''.join([f'| {x:.3f} ' for x in res] + ['|']))
================================================
FILE: examples/hnsw_example.py
================================================
import random
import tempfile
import numpy as np
from docarray import Document, DocumentArray
from annlite import AnnLite
N = 1000 # number of data points
Nq = 5
Nt = 2000
D = 128 # dimensionality / number of features
dirpath = tempfile.mkdtemp()
with tempfile.TemporaryDirectory() as tmpdirname:
index = AnnLite(
D, columns=[('x', float)], data_path=tmpdirname, include_metadata=True
)
X = np.random.random((N, D)).astype(
np.float32
) # 1,000 128-dim vectors to be indexed
docs = DocumentArray(
[
Document(id=f'{i}', embedding=X[i], tags={'x': random.random()})
for i in range(N)
]
)
index.index(docs)
X = np.random.random((Nq, D)).astype(np.float32)  # Nq 128-dim query vectors
query = DocumentArray([Document(embedding=X[i]) for i in range(Nq)])
index.search(query, filter={'x': {'$lt': 0.2}}, limit=10, include_metadata=True)
for m in query[0].matches:
print(f'{m.scores["euclidean"].value} -> x={m.tags["x"]}')
assert m.tags['x'] < 0.2
print(f'====')
index.search(query, filter={'x': {'$gte': 0.9}}, limit=10, include_metadata=True)
for m in query[0].matches:
print(f'{m.scores["euclidean"].value} -> x={m.tags["x"]}')
assert m.tags['x'] >= 0.9
#
# print(f'{[m.scores["euclidean"].value for m in query[0].matches]}')
# for i in range(len(query[0].matches) - 1):
# assert (
# query[0].matches[i].scores['euclidean'].value
# <= query[0].matches[i + 1].scores['euclidean'].value
# )
================================================
FILE: examples/pq_benchmark.py
================================================
import time
from datetime import date
import numpy as np
import pandas as pd
from docarray import Document, DocumentArray
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from utils import evaluate
from annlite import AnnLite
from annlite.math import cdist
from annlite.math import top_k as _top_k
# N = 100_000 # number of data points
Nt = 100_020
Nq = 1
D = 128 # dimensionality / number of features
top_k = 10
n_cells = 64
n_subvectors = 64
n_queries = 1000
# 100,020 128-dim vectors: 100,000 to index (a subset is used for training), 20 held out as queries
np.random.seed(123)
Xtr, Xte = train_test_split(
make_blobs(n_samples=Nt, n_features=D)[0].astype(np.float32), test_size=20
)
print(f'Xtr: {Xtr.shape} vs Xte: {Xte.shape}')
def get_documents(nr=10, index_start=0, embeddings=None):
for i in range(index_start, nr + index_start):
d = Document()
d.id = f'{i}' # to test it supports non-int ids
d.embedding = embeddings[i - index_start]
yield d
precision_per_query = []
recall_per_query = []
results = []
for n_cells in [1, 4, 8]:
for n_subvectors in [64, 128]:
pq = AnnLite(D, metric='euclidean', n_cells=n_cells, n_subvectors=n_subvectors)
t0 = time.time()
pq.train(Xtr[:20480])
train_time = abs(time.time() - t0)
t0 = time.time()
pq.index(DocumentArray(get_documents(len(Xtr), embeddings=Xtr)))
index_time = abs(t0 - time.time())
dists = cdist(Xte, Xtr, metric='euclidean')
true_dists, true_ids = _top_k(dists, top_k, descending=False)
docs = DocumentArray(get_documents(len(Xte), embeddings=Xte))
t0 = time.time()
pq.search(docs, limit=top_k)
query_time = abs(t0 - time.time())
pq_ids = []
for doc in docs:
pq_ids.append([m.id for m in doc.matches])
recall, precision = evaluate(pq_ids, true_ids, top_k)
results_dict = {
'precision': precision,
'recall': recall,
'train_time': train_time,
'index_time': index_time,
'query_time': query_time,
'query_qps': len(Xte) / query_time,
'index_qps': len(Xtr) / index_time,
'indexer_hyperparams': {'n_cells': n_cells, 'n_subvectors': n_subvectors},
}
print(results_dict)
results.append(results_dict)
pq.clear()
pq.close()
today = date.today()
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('recall', ascending=False)
results_df.to_csv(f'bench-results-{today.strftime("%b-%d-%Y")}.csv')
================================================
FILE: examples/pqlinearscann_benchmark_with_filtering.py
================================================
import numpy as np
from docarray import Document, DocumentArray
from docarray.math.distance import cdist
from docarray.math.helper import top_k as _top_k
from jina.logging.profile import TimeContext
from utils import clean_workspace, docs_with_tags, evaluate
from annlite import AnnLite
n_index = [10_000, 100_000, 500_000, 1_000_000]
n_query = [1, 8, 64]
D = 768
R = 5
B = 100_000
n_cells = 1
probs = [[0.20, 0.30, 0.50], [0.05, 0.15, 0.80]]
categories = ['comic', 'movie', 'audiobook']
top_k = 20
n_cells = 1
n_subvectors = D
results = []
for n_i in n_index:
results_ni = []
for current_probs in probs:
clean_workspace()
columns = [('category', str)]
indexer = AnnLite(
D,
initial_size=n_i,
n_subvectors=n_subvectors,
n_cells=n_cells,
metas={'workspace': './workspace'},
columns=columns,
)
da = docs_with_tags(n_i, D, current_probs, categories)
da_embeddings = da.embeddings
with TimeContext(f'indexing {n_i} docs') as t_i:
n_train_quantizer = min(n_i, 20_000)
row_ids = np.random.choice(range(n_i), n_train_quantizer, replace=False)
indexer.partial_train(da_embeddings[row_ids, :])
indexer.build_codebook()
for i, _batch in enumerate(da.batch(batch_size=B)):
indexer.index(_batch)
for cat, prob in zip(categories, current_probs):
f = {'category': {'$eq': cat}}
indices_cat = (
np.array([t['category'] for t in da.get_attributes('tags')]) == cat
)
ids_indices_cat = np.array(
[d.id for d in da if d.tags['category'] == cat], dtype='int'
)
da_embeddings_cat = da_embeddings[indices_cat, :]
query_times = []
for n_q in n_query:
qa = DocumentArray.empty(n_q)
q_embs = np.random.random([n_q, D]).astype(np.float32)
qa.embeddings = q_embs
t_qs = []
for _ in range(R):
with TimeContext(f'searching {n_q} docs') as t_q:
indexer.search(qa, filter=f, limit=top_k)
t_qs.append(t_q.duration)
query_times.append(np.mean(t_qs[1:]))
if n_q == 8:
dists = cdist(q_embs, da_embeddings_cat, metric='euclidean')
true_dists, true_local_ids = _top_k(dists, top_k, descending=False)
# Note:
# `true_ids` are not really positions within the original data
# but positions within the subset `da_embeddings_cat`
# We need to go from these positions to the original positions
true_ids = ids_indices_cat[true_local_ids]
ids = []
for doc in qa:
ids.append([m.id for m in doc.matches])
recall, precision = evaluate(ids, true_ids, top_k)
print(
f'\n\nprob={prob}, current_probs={current_probs}, n_i={n_i}, recall={recall}\n\n'
)
results_ni.append([n_i, prob, t_i.duration] + query_times + [recall])
results.append(results_ni)
title = '| Stored data |% same filter| Indexing time | Query size=1 | Query size=8 | Query size=64| Recall |'
print(title)
print('|-----' * 7 + '|')
for block in results:
sorted_elements_in_block = np.argsort([b[1] for b in block])
for pos in sorted_elements_in_block:
res = block[pos]
print(''.join([f'| {x:.3f} ' for x in res] + ['|']))
================================================
FILE: examples/utils.py
================================================
import os
import shutil
import numpy as np
from docarray import Document, DocumentArray
def clean_workspace():
if os.path.exists('./data'):
shutil.rmtree('./data')
if os.path.exists('./workspace'):
shutil.rmtree('./workspace')
def docs_with_tags(N, D, probs, categories):
all_docs = []
start_current = 0
for k, prob in enumerate(probs):
n_current = int(N * prob)
X = np.random.random((n_current, D)).astype(np.float32)
docs = [
Document(
embedding=X[i],
id=f'{i+start_current}',
tags={
'category': categories[k],
},
)
for i in range(n_current)
]
all_docs.extend(docs)
start_current += n_current
return DocumentArray(all_docs)
def _precision(predicted, relevant, eval_at):
"""
fraction of retrieved documents that are relevant to the query
"""
if eval_at == 0:
return 0.0
predicted_at_k = predicted[:eval_at]
n_predicted_and_relevant = len(set(predicted_at_k).intersection(set(relevant)))
return n_predicted_and_relevant / len(predicted)
def _recall(predicted, relevant, eval_at):
"""
fraction of the relevant documents that are successfully retrieved
"""
if eval_at == 0:
return 0.0
predicted_at_k = predicted[:eval_at]
n_predicted_and_relevant = len(set(predicted_at_k).intersection(set(relevant)))
return n_predicted_and_relevant / len(relevant)
def evaluate(predicts, relevants, top_k):
recall = 0
precision = 0
for _predict, _relevant in zip(predicts, relevants):
_predict = np.array([int(x) for x in _predict])
recall += _recall(_predict, _relevant, top_k)
precision += _precision(_predict, _relevant, top_k)
return recall / len(predicts), precision / len(predicts)
================================================
FILE: executor/Dockerfile
================================================
FROM jinaai/jina:3-py38-perf
RUN apt-get update && apt-get install --no-install-recommends -y gcc g++ git \
&& rm -rf /var/lib/apt/lists/*
COPY . /workspace
WORKDIR /workspace
RUN pip install -r requirements.txt --no-cache-dir
ENTRYPOINT ["jina", "executor", "--uses", "config.yml"]
================================================
FILE: executor/README.md
================================================
# AnnLiteIndexer
`AnnLiteIndexer` uses the [AnnLite](https://github.com/jina-ai/annlite) class for indexing Jina `Document` objects.
The `AnnLite` class partitions the data into cells at index time and instantiates a "sub-indexer" in each cell. Search is performed by aggregating the results retrieved from the cells.
This indexer is recommended when an application requires **search with filters** applied to `Document` tags.
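As a rough sketch of what this means for the underlying `AnnLite` class (based on the usage shown in the repository's `examples/`; the parameter values below are purely illustrative):
```python
import numpy as np
from docarray import Document, DocumentArray

from annlite import AnnLite

# the data is split into `n_cells` cells at index time; each cell gets its own sub-indexer
index = AnnLite(128, n_cells=8, columns=[('price', float)], data_path='./workspace')

docs = DocumentArray(
    [
        Document(embedding=emb, tags={'price': float(i)})
        for i, emb in enumerate(np.random.random((1_000, 128)).astype(np.float32))
    ]
)
index.index(docs)

# at search time, the results retrieved from the probed cells are merged into the final top-k
queries = DocumentArray([Document(embedding=np.random.random(128).astype(np.float32))])
index.search(queries, filter={'price': {'$lte': 50}}, limit=10)
```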
The filtering query language is based on [MongoDB's query and projection operators](https://docs.mongodb.com/manual/reference/operator/query/), of which we currently support a subset.
Tag filters can be combined with `$and` and `$or`, and support the following operators:
- `$eq` - Equal to (number, string)
- `$ne` - Not equal to (number, string)
- `$gt` - Greater than (number)
- `$gte` - Greater than or equal to (number)
- `$lt` - Less than (number)
- `$lte` - Less than or equal to (number)
- `$in` - Included in an array
- `$nin` - Not included in an array
For example, to search for products whose price is no more than `$50`:
```python
index.search(query, filter={"price": {"$lte": 50}})
```
More example filter expressions:
- Nike shoes in white:
```JSON
{
"brand": {"$eq": "Nike"},
"category": {"$eq": "Shoes"},
"color": {"$eq": "White"}
}
```
Or
```JSON
{
"$and":
{
"brand": {"$eq": "Nike"},
"category": {"$eq": "Shoes"},
"color": {"$eq": "White"}
}
}
```
- Nike shoes, or a price less than `$100`:
```JSON
{
"$or":
{
"brand": {"$eq": "Nike"},
"price": {"$lt": 100}
}
}
```
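Such a composite filter is passed to the `/search` endpoint through the request `parameters`. A minimal sketch (assuming a Flow `f` configured with `AnnLiteIndexer` as in the Quick Start below, and `query_docs` holding the query `Document`s):
```python
search_filter = {
    "$and": {
        "brand": {"$eq": "Nike"},
        "category": {"$eq": "Shoes"},
        "color": {"$eq": "White"},
    }
}

with f:
    results = f.post(
        on='/search',
        inputs=query_docs,
        parameters={'filter': search_filter},
    )
```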
## Performance
One can run `benchmark.py` to get a quick performance overview.
|Stored data| Indexing time | Query size=1 | Query size=8 | Query size=64|
|---|---|---|---|---|
|10000 | 2.970 | 0.002 | 0.013 | 0.100|
|100000 | 76.474 | 0.011 | 0.078 | 0.649|
|500000 | 467.936 | 0.046 | 0.356 | 2.823|
|1000000 | 1025.506 | 0.091 | 0.695 | 5.778|
## Getting Started
For an in-depth overview of the features of AnnLite
you can follow along with one of the examples below:
| Name | Link |
|----------------------------------------------|---|
| E-commerce product image search with AnnLite | [Open in Colab](https://colab.research.google.com/github/jina-ai/pqlite/blob/main/notebooks/fashion_product_search.ipynb)|
## Quick Start
`AnnLiteIndexer` stores `Document` objects at the `workspace` directory, specified under the [`metas`](https://docs.jina.ai/fundamentals/executor/executor-built-in-features/#meta-attributes) attribute.
#### Example: Selecting items whose 'price' is less than 50
If documents have a tag `'price'` that stores floating-point values, this indexer allows searching for documents with a filter such as `price <= 50`.
```python
from jina import Flow

columns = [('price', 'float')]
f = Flow().add(
uses='jinahub://AnnLiteIndexer/latest',
uses_with={
'n_dim': 256,
'columns': columns,
'metric': 'cosine'
},
uses_metas={'workspace': '/my/tmp_folder'},
install_requirements=True
)
search_filter = {"price": {"$lte": 50}}
with f:
f.post(on='/index', inputs=docs)
query_res = f.post(on='/search',
inputs=query_docs,
return_results=True,
parameters={'filter': search_filter})
```
## CRUD operations
You can perform all the usual operations on the respective endpoints (see the usage sketch after this list):
- `/index` Add documents
- `/search` Search with query documents.
- `/update` Update documents
- `/delete` Delete documents
- `/clear` Clear the index
- `/status` Return the status of the index
- `total_docs`: the total number of indexed documents
- `dim`: the dimension of the embeddings
- `metric`: the distance metric type
- `is_trained`: whether the index is already trained
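A minimal usage sketch for a few of these endpoints (illustrative only; it assumes a running Flow `f` configured as in the Quick Start above, and that the ids passed to `/delete` exist in the index):
```python
with f:
    # check the status of the index
    status = f.post(on='/status')
    print(status[0].tags)

    # delete documents by id
    f.post(on='/delete', parameters={'ids': ['doc_1', 'doc_2']})

    # clear the whole index
    f.post(on='/clear')
```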
================================================
FILE: executor/benchmark.py
================================================
import tempfile
import time
import numpy as np
from jina import DocumentArray
from jina.logging.profile import TimeContext
n_index = [10_000, 100_000, 500_000, 1_000_000]
n_query = [1, 8, 64]
D = 768
R = 5
B = 4096
n_cells = 1
from annlite.executor import AnnLiteIndexer
times = {}
for n_i in n_index:
with tempfile.TemporaryDirectory() as tempdir:
idxer = AnnLiteIndexer(
n_dim=D,
initial_size=n_i,
n_cells=n_cells,
data_path=str(tempdir),
)
# build index docs
i_embs = np.random.random([n_i, D]).astype(np.float32)
if n_cells > 1:
idxer._index.vq_codec.fit(i_embs)
da = DocumentArray.empty(n_i)
da.embeddings = i_embs
with TimeContext(f'indexing {n_i} docs') as t_i:
for _batch in da.batch(batch_size=B):
idxer.index(_batch)
times[n_i] = {}
times[n_i]['index'] = t_i.duration
# waiting for the index to be ready
time.sleep(5)
for n_q in n_query:
q_embs = np.random.random([n_q, D]).astype(np.float32)
qa = DocumentArray.empty(n_q)
qa.embeddings = q_embs
t_qs = []
for _ in range(R):
with TimeContext(f'searching {n_q} docs') as t_q:
idxer.search(qa)
t_qs.append(t_q.duration)
# # check if it return the full doc
# assert qa[0].matches
# assert qa[0].matches.embeddings.shape
times[n_i][f'query_{n_q}'] = np.mean(t_qs[1:]) # remove warm-up
idxer.clear()
idxer.close()
print('|Stored data| Indexing time | Query size=1 | Query size=8 | Query size=64|')
print('|---' * (len(list(times.values())[0]) + 1) + '|')
for k, v in times.items():
s = ' | '.join(f'{v[vv]:.3f}' for vv in ['index', 'query_1', 'query_8', 'query_64'])
print(f'|{k} | {s}|')
================================================
FILE: executor/config.yml
================================================
jtype: AnnLiteIndexer
py_modules:
- ./executor.py
metas:
name: AnnLiteIndexer_v3
description: A similarity search based on Annlite
url: https://github.com/jina-ai/annlite
keywords: [ann, similarity_search, indexer, pq, hnsw, pre-filtering]
================================================
FILE: executor/executor.py
================================================
import threading
import time
import traceback
import warnings
from threading import Thread
from typing import Dict, List, Optional, Tuple, Union
from docarray import Document, DocumentArray
from jina import Executor, requests
from jina.logging.logger import JinaLogger
INDEX_BATCH_SIZE = 1024
class AnnLiteIndexer(Executor):
"""A simple indexer that wraps the AnnLite indexer and adds a simple interface for indexing and searching.
:param n_dim: Dimensionality of vectors to index
:param metric: Distance metric type. Can be 'euclidean', 'inner_product', or 'cosine'
:param limit: Number of results to get for each query document in search
:param n_components: Number of components to use for dimensionality reduction
:param match_args: the arguments to `DocumentArray`'s match function
:param data_path: the data path (workspace) of the AnnLiteIndexer; not supported when shards > 1.
:param ef_construction: The construction time/accuracy trade-off
:param ef_search: The query time accuracy/speed trade-off
:param max_connection: The maximum number of outgoing connections in the
graph (the "M" parameter)
:param include_metadata: If True, return the document metadata in response
:param index_access_paths: Default traversal paths on docs
(used for indexing, delete and update), e.g. '@r', '@c', '@r,c'
:param search_access_paths: Default traversal paths on docs
(used for search), e.g. '@r', '@c', '@r,c'
:param columns: A list or dict of column names to index.
:param dim: Deprecated, use n_dim instead
"""
def __init__(
self,
n_dim: int = 0,
metric: str = 'cosine',
limit: int = 10,
n_components: Optional[int] = None,
match_args: Optional[Dict] = None,
data_path: Optional[str] = None,
ef_construction: Optional[int] = None,
ef_search: Optional[int] = None,
max_connection: Optional[int] = None,
include_metadata: bool = True,
index_access_paths: str = '@r',
search_access_paths: str = '@r',
columns: Optional[Union[List[Tuple[str, str]], Dict[str, str]]] = None,
dim: int = None,
*args,
**kwargs,
):
super().__init__(*args, **kwargs)
self.logger = JinaLogger(self.__class__.__name__)
n_dim = n_dim or dim
if not n_dim:
raise ValueError('Please specify the dimension of the vectors to index!')
self.n_components = n_components
self.metric = metric
self.match_args = match_args or {}
self.include_metadata = include_metadata
if limit:
self.match_args.update({'limit': limit})
self.index_access_paths = index_access_paths
if 'index_traversal_paths' in kwargs:
warnings.warn(
f'`index_traversal_paths` is deprecated. Use `index_access_paths` instead.'
)
self.index_access_paths = kwargs['index_traversal_paths']
self.search_access_paths = search_access_paths
if 'search_traversal_paths' in kwargs:
warnings.warn(
f'`search_traversal_paths` is deprecated. Use `search_access_paths` instead.'
)
self.search_access_paths = kwargs['search_traversal_paths']
self._data_buffer = DocumentArray()
self._index_batch_size = INDEX_BATCH_SIZE
self._max_length_queue = 2 * self._index_batch_size
self._index_lock = threading.Lock()
self.logger = JinaLogger(getattr(self.metas, 'name', self.__class__.__name__))
if getattr(self.runtime_args, 'shards', 1) > 1 and data_path:
raise ValueError(
'`data_path` is not supported when shards > 1, please use `workspace` instead'
)
config = {
'n_dim': n_dim,
'n_components': n_components,
'metric': metric,
'ef_construction': ef_construction,
'ef_search': ef_search,
'max_connection': max_connection,
'data_path': data_path or self.workspace or './workspace',
'columns': columns,
}
self._index = DocumentArray(storage='annlite', config=config)
# start indexing thread in background to group indexing requests
# together and perform batch indexing at once
self._start_index_loop()
@requests(on='/index')
def index(
self, docs: Optional[DocumentArray] = None, parameters: dict = {}, **kwargs
):
"""Index new documents
:param docs: the Documents to index
:param parameters: dictionary with options for indexing
Keys accepted:
- 'access_paths': traversal paths on docs, e.g. '@r', '@c', '@r,c'
"""
if not docs:
return
access_paths = parameters.get('access_paths', self.index_access_paths)
flat_docs = docs[access_paths]
if len(flat_docs) == 0:
return
while len(self._data_buffer) >= self._max_length_queue:
time.sleep(0.001)
with self._index_lock:
self._data_buffer.extend(flat_docs)
def _start_index_loop(self):
"""Start the indexing loop in background.
This loop is responsible for batch indexing the documents in the buffer.
"""
def _index_loop():
try:
while True:
# if the buffer is none, will break the loop
if self._data_buffer is None:
break
# if the buffer is empty, will wait for new documents to be added
if len(self._data_buffer) == 0:
time.sleep(0.1) # sleep for 100ms
continue
# acquire the lock to prevent threading issues
with self._index_lock:
batch_docs = self._data_buffer.pop(
range(
self._index_batch_size
if len(self._data_buffer) > self._index_batch_size
else len(self._data_buffer)
)
)
self._index.extend(batch_docs)
self.logger.debug(f'indexing {len(batch_docs)} docs done...')
except Exception as e:
self.logger.error(traceback.format_exc())
raise e
self._index_thread = Thread(target=_index_loop, daemon=False)
self._index_thread.start()
@requests(on='/update')
def update(
self, docs: Optional[DocumentArray] = None, parameters: dict = {}, **kwargs
):
"""Update existing documents
:param docs: the Documents to update
:param parameters: dictionary with options for updating
Keys accepted:
- 'access_paths': traversal paths on docs, e.g. '@r', '@c', '@r,c'
- 'raise_errors_on_not_found': if True, raise an error if a document is not found. Default is False.
"""
if not docs:
return
access_paths = parameters.get('access_paths', self.index_access_paths)
raise_errors_on_not_found = parameters.get('raise_errors_on_not_found', False)
flat_docs = docs[access_paths]
if len(flat_docs) == 0:
return
with self._index_lock:
if len(self._data_buffer) > 0:
raise RuntimeError(
f'Cannot update documents while the pending documents in the buffer are not indexed yet. '
'Please wait for the pending documents to be indexed.'
)
for doc in flat_docs:
try:
self._index[doc.id] = doc
except IndexError:
if raise_errors_on_not_found:
raise Exception(
f'The document (id={doc.id}) cannot be updated as '
f'it is not found in the index'
)
else:
self.logger.warning(
f'cannot update doc {doc.id} as it does not exist in storage'
)
@requests(on='/delete')
def delete(self, parameters: dict = {}, **kwargs):
"""Delete existing documents
Delete entries from the index by id
:param parameters: parameters to the request
"""
delete_ids = parameters.get('ids', [])
if len(delete_ids) == 0:
return
with self._index_lock:
if len(self._data_buffer) > 0:
raise RuntimeError(
f'Cannot delete documents while the pending documents in the buffer are not indexed yet. '
'Please wait for the pending documents to be indexed.'
)
del self._index[delete_ids]
@requests(on='/search')
def search(
self, docs: Optional[DocumentArray] = None, parameters: dict = {}, **kwargs
):
"""Perform a vector similarity search and retrieve Document matches
Search can be performed with candidate filtering. A filter condition is a triplet (column, operator, value).
More than one filter can be applied during a search; conditions are therefore specified as a list of triplets.
Each triplet contains:
- column: Column used to filter.
- operator: Binary operation between two values. Some supported operators include `['>','<','=','<=','>=']`.
- value: value used to compare a candidate.
:param docs: the Documents to search with
:param parameters: dictionary for parameters for the search operation
Keys accepted:
- 'access_paths' (str): traversal paths on docs, e.g. '@r', '@c', '@r,c'
- 'filter' (dict): the filtering conditions on document tags
- 'limit' (int): nr of matches to get per Document
"""
if not docs:
return
access_paths = parameters.get('access_paths', self.search_access_paths)
flat_docs = docs[access_paths]
match_args = (
{**self.match_args, **parameters}
if parameters is not None
else self.match_args
)
with self._index_lock:
# if len(self._data_buffer) > 0:
# raise RuntimeError(
# f'Cannot search documents while the pending documents in the buffer are not indexed yet. '
# 'Please wait for the pending documents to be indexed.'
# )
flat_docs.match(self._index, **match_args)
@requests(on='/backup')
def backup(self, parameters: Optional[Dict] = {}, **kwargs):
"""
Backup the indexed data to local or remote storage.
:param parameters: parameters to the request
Keys accepted:
- 'target_name' (str): the name to store the backup under (it is suffixed with the shard id)
- 'token' (str): optional token passed to the underlying backup call
"""
target_name = parameters.get('target_name', None)
token = parameters.get('token', None)
if target_name:
target_name = f'{target_name}_{self.runtime_args.shard_id}'
with self._index_lock:
if len(self._data_buffer) > 0:
raise RuntimeError(
f'Cannot backup documents while the pending documents in the buffer are not indexed yet. '
'Please wait for the pending documents to be indexed.'
)
self._index._annlite.backup(target_name, token)
@requests(on='/restore')
def restore(self, parameters: Optional[Dict] = {}, **kwargs):
"""
Restore data from a local or remote backup.
Keys accepted:
- 'source_name' (str): the name of the backup to restore from (it is suffixed with the shard id)
- 'token' (str): optional token passed to the underlying restore call
"""
source_name = parameters.get('source_name', None)
token = parameters.get('token', None)
if source_name:
source_name = f'{source_name}_{self.runtime_args.shard_id}'
self._index._annlite.restore(source_name, token)
@requests(on='/filter')
def filter(self, parameters: Dict, **kwargs):
"""
Query documents from the indexer by the filter `query` object in parameters. The `query` object must follow the
specifications in the `find` method of `DocumentArray` using annlite: https://docarray.jina.ai/fundamentals/documentarray/find/#filter-with-query-operators
:param parameters: Dictionary to define the `filter` that you want to use.
"""
return self._index.find(parameters.get('filter', None))
@requests(on='/fill_embedding')
def fill_embedding(self, docs: DocumentArray, **kwargs):
"""
retrieve embedding of Documents by id
:param docs: DocumentArray to search with
"""
for doc in docs:
doc.embedding = self._index[doc.id].embedding
@requests(on='/status')
def status(self, **kwargs) -> DocumentArray:
"""Return the document containing status information about the indexer.
The status contains the number of documents waiting in the indexing buffer
('appending_size') and the number of documents currently in the index ('total_docs'/'index_size').
"""
status = Document(
tags={
'appending_size': len(self._data_buffer),
'total_docs': len(self._index),
'index_size': len(self._index),
}
)
return DocumentArray([status])
def flush(self):
"""Flush all the data in the buffer to the index"""
while len(self._data_buffer) > 0:
time.sleep(0.1)
@requests(on='/clear')
def clear(self, **kwargs):
"""Clear the index of all entries."""
self.flush()
with self._index_lock:
self._data_buffer = None
self._index_thread.join()
self._data_buffer = DocumentArray()
self._index.clear()
self._start_index_loop()
def close(self, **kwargs):
"""Close the index."""
super().close()
self.flush()
# wait for the index thread to finish
with self._index_lock:
self._data_buffer = None
self._index_thread.join()
# WARNING: the commented code below hangs the close in pytest `pytest tests/test_*.py`
# But don't know why. It works fine in `pytest tests/test_executor.py` and normal python execution
del self._index
================================================
FILE: executor/requirements.txt
================================================
annlite
certifi
docarray
================================================
FILE: include/hnswlib/bruteforce.h
================================================
#pragma once
#include <unordered_map>
#include <fstream>
#include <mutex>
#include <algorithm>
namespace hnswlib {
template<typename dist_t>
class BruteforceSearch : public AlgorithmInterface<dist_t> {
public:
BruteforceSearch(SpaceInterface<dist_t> *s) {
}
BruteforceSearch(SpaceInterface<dist_t> *s, const std::string &location) {
loadIndex(location, s);
}
BruteforceSearch(SpaceInterface<dist_t> *s, size_t maxElements) {
maxelements_ = maxElements;
data_size_ = s->get_data_size();
fstdistfunc_ = s->get_dist_func();
dist_func_param_ = s->get_dist_func_param();
size_per_element_ = data_size_ + sizeof(labeltype);
data_ = (char *) malloc(maxElements * size_per_element_);
if (data_ == nullptr)
std::runtime_error("Not enough memory: BruteforceSearch failed to allocate data");
cur_element_count = 0;
}
~BruteforceSearch() {
free(data_);
}
char *data_;
size_t maxelements_;
size_t cur_element_count;
size_t size_per_element_;
size_t data_size_;
DISTFUNC<dist_t> fstdistfunc_;
void *dist_func_param_;
std::mutex index_lock;
std::unordered_map<labeltype, size_t> dict_external_to_internal;
void addPoint(const void *datapoint, labeltype label) {
int idx;
{
std::unique_lock<std::mutex> lock(index_lock);
auto search=dict_external_to_internal.find(label);
if (search != dict_external_to_internal.end()) {
idx=search->second;
}
else{
if (cur_element_count >= maxelements_) {
throw std::runtime_error("The number of elements exceeds the specified limit\n");
}
idx=cur_element_count;
dict_external_to_internal[label] = idx;
cur_element_count++;
}
}
memcpy(data_ + size_per_element_ * idx + data_size_, &label, sizeof(labeltype));
memcpy(data_ + size_per_element_ * idx, datapoint, data_size_);
};
void removePoint(labeltype cur_external) {
size_t cur_c=dict_external_to_internal[cur_external];
dict_external_to_internal.erase(cur_external);
labeltype label=*((labeltype*)(data_ + size_per_element_ * (cur_element_count-1) + data_size_));
dict_external_to_internal[label]=cur_c;
memcpy(data_ + size_per_element_ * cur_c,
data_ + size_per_element_ * (cur_element_count-1),
data_size_+sizeof(labeltype));
cur_element_count--;
}
std::priority_queue<std::pair<dist_t, labeltype>>
searchKnn(const void *query_data, size_t k, size_t batch_index) const {
std::priority_queue<std::pair<dist_t, labeltype>> topResults;
if (cur_element_count == 0) return topResults;
for (int i = 0; i < k; i++) {
dist_t dist = fstdistfunc_(query_data, data_ + size_per_element_ * i, dist_func_param_, nullptr);
topResults.push(std::pair(dist, *((labeltype *)(data_ + size_per_element_ * i +
data_size_))));
}
dist_t lastdist = topResults.top().first;
for (int i = k; i < cur_element_count; i++) {
dist_t dist = fstdistfunc_(query_data, data_ + size_per_element_ * i, dist_func_param_, nullptr);
if (dist <= lastdist) {
topResults.push(std::pair(dist, *((labeltype *) (data_ + size_per_element_ * i +
data_size_))));
if (topResults.size() > k)
topResults.pop();
lastdist = topResults.top().first;
}
}
return topResults;
};
void saveIndex(const std::string &location) {
std::ofstream output(location, std::ios::binary);
std::streampos position;
writeBinaryPOD(output, maxelements_);
writeBinaryPOD(output, size_per_element_);
writeBinaryPOD(output, cur_element_count);
output.write(data_, maxelements_ * size_per_element_);
output.close();
}
void loadIndex(const std::string &location, SpaceInterface<dist_t> *s) {
std::ifstream input(location, std::ios::binary);
std::streampos position;
readBinaryPOD(input, maxelements_);
readBinaryPOD(input, size_per_element_);
readBinaryPOD(input, cur_element_count);
data_size_ = s->get_data_size();
fstdistfunc_ = s->get_dist_func();
dist_func_param_ = s->get_dist_func_param();
size_per_element_ = data_size_ + sizeof(labeltype);
data_ = (char *) malloc(maxelements_ * size_per_element_);
if (data_ == nullptr)
std::runtime_error("Not enough memory: loadIndex failed to allocate data");
input.read(data_, maxelements_ * size_per_element_);
input.close();
}
};
}
================================================
FILE: include/hnswlib/fusefilter.h
================================================
#ifndef BINARYFUSEFILTER_H
#define BINARYFUSEFILTER_H
#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifndef XOR_MAX_ITERATIONS
#define XOR_MAX_ITERATIONS \
100 // probability of success should always be > 0.5 so 100 iterations is
// highly unlikely
#endif
#ifdef _MSC_VER
#include <intrin.h>
#include <immintrin.h>
#endif
/**
* We start with a few utilities.
***/
static inline uint64_t binary_fuse_murmur64(uint64_t h) {
h ^= h >> 33;
h *= UINT64_C(0xff51afd7ed558ccd);
h ^= h >> 33;
h *= UINT64_C(0xc4ceb9fe1a85ec53);
h ^= h >> 33;
return h;
}
static inline uint64_t binary_fuse_mix_split(uint64_t key, uint64_t seed) {
return binary_fuse_murmur64(key + seed);
}
static inline uint64_t binary_fuse_rotl64(uint64_t n, unsigned int c) {
return (n << (c & 63)) | (n >> ((-c) & 63));
}
static inline uint32_t binary_fuse_reduce(uint32_t hash, uint32_t n) {
// http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
return (uint32_t)(((uint64_t)hash * n) >> 32);
}
static inline uint64_t binary_fuse8_fingerprint(uint64_t hash) {
return hash ^ (hash >> 32);
}
/**
* We need a decent random number generator.
**/
// returns random number, modifies the seed
static inline uint64_t binary_fuse_rng_splitmix64(uint64_t *seed) {
uint64_t z = (*seed += UINT64_C(0x9E3779B97F4A7C15));
z = (z ^ (z >> 30)) * UINT64_C(0xBF58476D1CE4E5B9);
z = (z ^ (z >> 27)) * UINT64_C(0x94D049BB133111EB);
return z ^ (z >> 31);
}
typedef struct binary_fuse8_s {
uint64_t Seed;
uint32_t SegmentLength;
uint32_t SegmentLengthMask;
uint32_t SegmentCount;
uint32_t SegmentCountLength;
uint32_t ArrayLength;
uint8_t *Fingerprints;
} binary_fuse8_t;
#ifdef _MSC_VER
// Windows programmers who target 32-bit platform may need help:
static inline uint64_t binary_fuse_mulhi(uint64_t a, uint64_t b) { return __umulh(a, b); }
#else
static inline uint64_t binary_fuse_mulhi(uint64_t a, uint64_t b) {
return ((__uint128_t)a * b) >> 64;
}
#endif
typedef struct binary_hashes_s {
uint32_t h0;
uint32_t h1;
uint32_t h2;
} binary_hashes_t;
static inline binary_hashes_t binary_fuse8_hash_batch(uint64_t hash,
const binary_fuse8_t *filter) {
uint64_t hi = binary_fuse_mulhi(hash, filter->SegmentCountLength);
binary_hashes_t ans;
ans.h0 = (uint32_t)hi;
ans.h1 = ans.h0 + filter->SegmentLength;
ans.h2 = ans.h1 + filter->SegmentLength;
ans.h1 ^= (uint32_t)(hash >> 18) & filter->SegmentLengthMask;
ans.h2 ^= (uint32_t)(hash)&filter->SegmentLengthMask;
return ans;
}
static inline uint32_t binary_fuse8_hash(int index, uint64_t hash,
const binary_fuse8_t *filter) {
uint64_t h = binary_fuse_mulhi(hash, filter->SegmentCountLength);
h += index * filter->SegmentLength;
// keep the lower 36 bits
uint64_t hh = hash & ((1UL << 36) - 1);
// index 0: right shift by 36; index 1: right shift by 18; index 2: no shift
h ^= (size_t)((hh >> (36 - 18 * index)) & filter->SegmentLengthMask);
return h;
}
// Report if the key is in the set, with false positive rate.
static inline bool binary_fuse8_contain(uint64_t key,
const binary_fuse8_t *filter) {
uint64_t hash = binary_fuse_mix_split(key, filter->Seed);
uint8_t f = binary_fuse8_fingerprint(hash);
binary_hashes_t hashes = binary_fuse8_hash_batch(hash, filter);
f ^= filter->Fingerprints[hashes.h0] ^ filter->Fingerprints[hashes.h1] ^
filter->Fingerprints[hashes.h2];
return f == 0;
}
static inline uint32_t binary_fuse_calculate_segment_length(uint32_t arity,
uint32_t size) {
// These parameters are very sensitive. Replacing 'floor' by 'round' can
// substantially affect the construction time.
if (arity == 3) {
return ((uint32_t)1) << (int)(floor(log((double)(size)) / log(3.33) + 2.25));
} else if (arity == 4) {
return ((uint32_t)1) << (int)(floor(log((double)(size)) / log(2.91) - 0.5));
} else {
return 65536;
}
}
static inline double binary_fuse8_max(double a, double b) {
if (a < b) {
return b;
}
return a;
}
static inline double binary_fuse_calculate_size_factor(uint32_t arity,
uint32_t size) {
if (arity == 3) {
return binary_fuse8_max(1.125, 0.875 + 0.25 * log(1000000.0) / log((double)size));
} else if (arity == 4) {
return binary_fuse8_max(1.075, 0.77 + 0.305 * log(600000.0) / log((double)size));
} else {
return 2.0;
}
}
// allocate enough capacity for a set containing up to 'size' elements
// caller is responsible to call binary_fuse8_free(filter)
// size should be at least 2.
static inline bool binary_fuse8_allocate(uint32_t size,
binary_fuse8_t *filter) {
uint32_t arity = 3;
filter->SegmentLength = size == 0 ? 4 : binary_fuse_calculate_segment_length(arity, size);
if (filter->SegmentLength > 262144) {
filter->SegmentLength = 262144;
}
filter->SegmentLengthMask = filter->SegmentLength - 1;
double sizeFactor = binary_fuse_calculate_size_factor(arity, size);
uint32_t capacity = size <= 1 ? 0 : (uint32_t)(round((double)size * sizeFactor));
uint32_t initSegmentCount =
(capacity + filter->SegmentLength - 1) / filter->SegmentLength -
(arity - 1);
filter->ArrayLength = (initSegmentCount + arity - 1) * filter->SegmentLength;
filter->SegmentCount =
(filter->ArrayLength + filter->SegmentLength - 1) / filter->SegmentLength;
if (filter->SegmentCount <= arity - 1) {
filter->SegmentCount = 1;
} else {
filter->SegmentCount = filter->SegmentCount - (arity - 1);
}
filter->ArrayLength =
(filter->SegmentCount + arity - 1) * filter->SegmentLength;
filter->SegmentCountLength = filter->SegmentCount * filter->SegmentLength;
filter->Fingerprints = (uint8_t *)malloc(filter->ArrayLength);
return filter->Fingerprints != NULL;
}
// report memory usage
static inline size_t binary_fuse8_size_in_bytes(const binary_fuse8_t *filter) {
return filter->ArrayLength * sizeof(uint8_t) + sizeof(binary_fuse8_t);
}
// release memory
static inline void binary_fuse8_free(binary_fuse8_t *filter) {
free(filter->Fingerprints);
filter->Fingerprints = NULL;
filter->Seed = 0;
filter->SegmentLength = 0;
filter->SegmentLengthMask = 0;
filter->SegmentCount = 0;
filter->SegmentCountLength = 0;
filter->ArrayLength = 0;
}
static inline uint8_t binary_fuse_mod3(uint8_t x) {
return x > 2 ? x - 3 : x;
}
// construct the filter, returns true on success, false on failure.
// most likely, a failure is due to too high a memory usage
// size is the number of keys
// The caller is responsible for calling binary_fuse8_allocate(size,filter)
// before. The caller is responsible to ensure that there are not too many duplicated
// keys. The inner loop will run up to XOR_MAX_ITERATIONS times (default on
// 100), it should never fail, except if there are many duplicated keys. If it fails,
// a return value of false is provided.
//
//
// If there are many duplicated keys and you do not want to remove them, you can first
// sort your input, the algorithm will then work adequately.
bool binary_fuse8_populate(const uint64_t *keys, uint32_t size,
binary_fuse8_t *filter) {
uint64_t rng_counter = 0x726b2b9d438b9d4d;
filter->Seed = binary_fuse_rng_splitmix64(&rng_counter);
uint64_t *reverseOrder = (uint64_t *)calloc((size + 1), sizeof(uint64_t));
uint32_t capacity = filter->ArrayLength;
uint32_t *alone = (uint32_t *)malloc(capacity * sizeof(uint32_t));
uint8_t *t2count = (uint8_t *)calloc(capacity, sizeof(uint8_t));
uint8_t *reverseH = (uint8_t *)malloc(size * sizeof(uint8_t));
uint64_t *t2hash = (uint64_t *)calloc(capacity, sizeof(uint64_t));
uint32_t blockBits = 1;
while (((uint32_t)1 << blockBits) < filter->SegmentCount) {
blockBits += 1;
}
uint32_t block = ((uint32_t)1 << blockBits);
uint32_t *startPos = (uint32_t *)malloc((1 << blockBits) * sizeof(uint32_t));
uint32_t h012[5];
if ((alone == NULL) || (t2count == NULL) || (reverseH == NULL) ||
(t2hash == NULL) || (reverseOrder == NULL) || (startPos == NULL)) {
free(alone);
free(t2count);
free(reverseH);
free(t2hash);
free(reverseOrder);
free(startPos);
return false;
}
reverseOrder[size] = 1;
for (int loop = 0; true; ++loop) {
if (loop + 1 > XOR_MAX_ITERATIONS) {
fprintf(stderr, "Too many iterations. Are all your keys unique?");
free(alone);
free(t2count);
free(reverseH);
free(t2hash);
free(reverseOrder);
free(startPos);
return false;
}
for (uint32_t i = 0; i < block; i++) {
// important : i * size would overflow as a 32-bit number in some
// cases.
startPos[i] = ((uint64_t)i * size) >> blockBits;
}
uint64_t maskblock = block - 1;
for (uint32_t i = 0; i < size; i++) {
uint64_t hash = binary_fuse_murmur64(keys[i] + filter->Seed);
uint64_t segment_index = hash >> (64 - blockBits);
while (reverseOrder[startPos[segment_index]] != 0) {
segment_index++;
segment_index &= maskblock;
}
reverseOrder[startPos[segment_index]] = hash;
startPos[segment_index]++;
}
int error = 0;
uint32_t duplicates = 0;
for (uint32_t i = 0; i < size; i++) {
uint64_t hash = reverseOrder[i];
uint32_t h0 = binary_fuse8_hash(0, hash, filter);
t2count[h0] += 4;
t2hash[h0] ^= hash;
uint32_t h1 = binary_fuse8_hash(1, hash, filter);
t2count[h1] += 4;
t2count[h1] ^= 1;
t2hash[h1] ^= hash;
uint32_t h2 = binary_fuse8_hash(2, hash, filter);
t2count[h2] += 4;
t2hash[h2] ^= hash;
t2count[h2] ^= 2;
if ((t2hash[h0] & t2hash[h1] & t2hash[h2]) == 0) {
if (((t2hash[h0] == 0) && (t2count[h0] == 8)) ||
((t2hash[h1] == 0) && (t2count[h1] == 8)) ||
((t2hash[h2] == 0) && (t2count[h2] == 8))) {
duplicates += 1;
t2count[h0] -= 4;
t2hash[h0] ^= hash;
t2count[h1] -= 4;
t2count[h1] ^= 1;
t2hash[h1] ^= hash;
t2count[h2] -= 4;
t2count[h2] ^= 2;
t2hash[h2] ^= hash;
}
}
error = (t2count[h0] < 4) ? 1 : error;
error = (t2count[h1] < 4) ? 1 : error;
error = (t2count[h2] < 4) ? 1 : error;
}
if (error) {
memset(reverseOrder, 0, sizeof(uint64_t)*size);
memset(t2count, 0, sizeof(uint8_t)*capacity);
memset(t2hash, 0, sizeof(uint64_t)*capacity);
filter->Seed = binary_fuse_rng_splitmix64(&rng_counter);
continue;
}
// End of key addition
uint32_t Qsize = 0;
// Add sets with one key to the queue.
for (uint32_t i = 0; i < capacity; i++) {
alone[Qsize] = i;
Qsize += ((t2count[i] >> 2) == 1) ? 1 : 0;
}
uint32_t stacksize = 0;
while (Qsize > 0) {
Qsize--;
uint32_t index = alone[Qsize];
if ((t2count[index] >> 2) == 1) {
uint64_t hash = t2hash[index];
// h012[0] = binary_fuse8_hash(0, hash, filter);
h012[1] = binary_fuse8_hash(1, hash, filter);
h012[2] = binary_fuse8_hash(2, hash, filter);
h012[3] = binary_fuse8_hash(0, hash, filter); // == h012[0];
h012[4] = h012[1];
uint8_t found = t2count[index] & 3;
reverseH[stacksize] = found;
reverseOrder[stacksize] = hash;
stacksize++;
uint32_t other_index1 = h012[found + 1];
alone[Qsize] = other_index1;
Qsize += ((t2count[other_index1] >> 2) == 2 ? 1 : 0);
t2count[other_index1] -= 4;
t2count[other_index1] ^= binary_fuse_mod3(found + 1);
t2hash[other_index1] ^= hash;
uint32_t other_index2 = h012[found + 2];
alone[Qsize] = other_index2;
Qsize += ((t2count[other_index2] >> 2) == 2 ? 1 : 0);
t2count[other_index2] -= 4;
t2count[other_index2] ^= binary_fuse_mod3(found + 2);
t2hash[other_index2] ^= hash;
}
}
if (stacksize + duplicates == size) {
// success
size = stacksize;
break;
}
memset(reverseOrder, 0, sizeof(uint64_t)*size);
memset(t2count, 0, sizeof(uint8_t)*capacity);
memset(t2hash, 0, sizeof(uint64_t)*capacity);
filter->Seed = binary_fuse_rng_splitmix64(&rng_counter);
}
for (uint32_t i = size - 1; i < size; i--) {
// the hash of the key we insert next
uint64_t hash = reverseOrder[i];
uint8_t xor2 = binary_fuse8_fingerprint(hash);
uint8_t found = reverseH[i];
h012[0] = binary_fuse8_hash(0, hash, filter);
h012[1] = binary_fuse8_hash(1, hash, filter);
h012[2] = binary_fuse8_hash(2, hash, filter);
h012[3] = h012[0];
h012[4] = h012[1];
filter->Fingerprints[h012[found]] = xor2 ^
filter->Fingerprints[h012[found + 1]] ^
filter->Fingerprints[h012[found + 2]];
}
free(alone);
free(t2count);
free(reverseH);
free(t2hash);
free(reverseOrder);
free(startPos);
return true;
}
//////////////////
// fuse16
//////////////////
typedef struct binary_fuse16_s {
uint64_t Seed;
uint32_t SegmentLength;
uint32_t SegmentLengthMask;
uint32_t SegmentCount;
uint32_t SegmentCountLength;
uint32_t ArrayLength;
uint16_t *Fingerprints;
binary_fuse16_s(uint32_t size) {
uint32_t arity = 3;
SegmentLength =
size == 0 ? 4 : binary_fuse_calculate_segment_length(arity, size);
if (SegmentLength > 262144) {
SegmentLength = 262144;
}
SegmentLengthMask = SegmentLength - 1;
double sizeFactor =
size <= 1 ? 0 : binary_fuse_calculate_size_factor(arity, size);
uint32_t capacity = (uint32_t)(round((double)size * sizeFactor));
uint32_t initSegmentCount =
(capacity + SegmentLength - 1) / SegmentLength - (arity - 1);
ArrayLength = (initSegmentCount + arity - 1) * SegmentLength;
SegmentCount = (ArrayLength + SegmentLength - 1) / SegmentLength;
if (SegmentCount <= arity - 1) {
SegmentCount = 1;
} else {
SegmentCount = SegmentCount - (arity - 1);
}
ArrayLength = (SegmentCount + arity - 1) * SegmentLength;
SegmentCountLength = SegmentCount * SegmentLength;
Fingerprints = (uint16_t *)malloc(ArrayLength * sizeof(uint16_t));
if (Fingerprints == NULL) {
throw std::runtime_error("not enough memory to hold the fuse filter");
}
memset(Fingerprints, 0, ArrayLength * sizeof(uint16_t));
}
~binary_fuse16_s() {
free(Fingerprints);
Fingerprints = NULL;
Seed = 0;
SegmentLength = 0;
SegmentLengthMask = 0;
SegmentCount = 0;
SegmentCountLength = 0;
ArrayLength = 0;
}
} binary_fuse16_t;
static inline uint64_t binary_fuse16_fingerprint(uint64_t hash) {
return hash ^ (hash >> 32);
}
static inline binary_hashes_t binary_fuse16_hash_batch(uint64_t hash,
const binary_fuse16_t *filter) {
uint64_t hi = binary_fuse_mulhi(hash, filter->SegmentCountLength);
binary_hashes_t ans;
ans.h0 = (uint32_t)hi;
ans.h1 = ans.h0 + filter->SegmentLength;
ans.h2 = ans.h1 + filter->SegmentLength;
ans.h1 ^= (uint32_t)(hash >> 18) & filter->SegmentLengthMask;
ans.h2 ^= (uint32_t)(hash)&filter->SegmentLengthMask;
return ans;
}
static inline uint32_t binary_fuse16_hash(int index, uint64_t hash,
const binary_fuse16_t *filter) {
uint64_t h = binary_fuse_mulhi(hash, filter->SegmentCountLength);
h += index * filter->SegmentLength;
// keep the lower 36 bits
uint64_t hh = hash & ((1UL << 36) - 1);
// index 0: right shift by 36; index 1: right shift by 18; index 2: no shift
h ^= (size_t)((hh >> (36 - 18 * index)) & filter->SegmentLengthMask);
return h;
}
// Report if the key is in the set, with false positive rate.
static inline bool binary_fuse16_contain(uint64_t key,
const binary_fuse16_t *filter) {
uint64_t hash = binary_fuse_mix_split(key, filter->Seed);
uint16_t f = binary_fuse16_fingerprint(hash);
binary_hashes_t hashes = binary_fuse16_hash_batch(hash, filter);
f ^= filter->Fingerprints[hashes.h0] ^ filter->Fingerprints[hashes.h1] ^
filter->Fingerprints[hashes.h2];
return f == 0;
}
// allocate enough capacity for a set containing up to 'size' elements
// caller is responsible to call binary_fuse16_free(filter)
// size should be at least 2.
static inline bool binary_fuse16_allocate(uint32_t size,
binary_fuse16_t *filter) {
uint32_t arity = 3;
filter->SegmentLength = size == 0 ? 4 : binary_fuse_calculate_segment_length(arity, size);
if (filter->SegmentLength > 262144) {
filter->SegmentLength = 262144;
}
filter->SegmentLengthMask = filter->SegmentLength - 1;
double sizeFactor = size <= 1 ? 0 : binary_fuse_calculate_size_factor(arity, size);
uint32_t capacity = (uint32_t)(round((double)size * sizeFactor));
uint32_t initSegmentCount =
(capacity + filter->SegmentLength - 1) / filter->SegmentLength -
(arity - 1);
filter->ArrayLength = (initSegmentCount + arity - 1) * filter->SegmentLength;
filter->SegmentCount =
(filter->ArrayLength + filter->SegmentLength - 1) / filter->SegmentLength;
if (filter->SegmentCount <= arity - 1) {
filter->SegmentCount = 1;
} else {
filter->SegmentCount = filter->SegmentCount - (arity - 1);
}
filter->ArrayLength =
(filter->SegmentCount + arity - 1) * filter->SegmentLength;
filter->SegmentCountLength = filter->SegmentCount * filter->SegmentLength;
filter->Fingerprints = (uint16_t*)malloc(filter->ArrayLength * sizeof(uint16_t));
return filter->Fingerprints != NULL;
}
// report memory usage
static inline size_t
binary_fuse16_size_in_bytes(const binary_fuse16_t *filter) {
return filter->ArrayLength * sizeof(uint16_t) + sizeof(binary_fuse16_t);
}
// release memory
static inline void binary_fuse16_free(binary_fuse16_t *filter) {
free(filter->Fingerprints);
filter->Fingerprints = NULL;
filter->Seed = 0;
filter->SegmentLength = 0;
filter->SegmentLengthMask = 0;
filter->SegmentCount = 0;
filter->SegmentCountLength = 0;
filter->ArrayLength = 0;
}
// construct the filter, returns true on success, false on failure.
// most likely, a failure is due to too high a memory usage
// size is the number of keys
// The caller is responsible for calling binary_fuse16_allocate(size,filter)
// before. The caller is responsible to ensure that there are not too many duplicated
// keys. The inner loop will run up to XOR_MAX_ITERATIONS times (default on
// 100), it should never fail, except if there are many duplicated keys. If it fails,
// a return value of false is provided.
//
// If there are many duplicated keys and you do not want to remove them, you can first
// sort your input, the algorithm will then work adequately.
inline bool binary_fuse16_populate(const uint64_t *keys, uint32_t size,
binary_fuse16_t *filter) {
uint64_t rng_counter = 0x726b2b9d438b9d4d;
filter->Seed = binary_fuse_rng_splitmix64(&rng_counter);
uint64_t *reverseOrder = (uint64_t *)calloc((size + 1), sizeof(uint64_t));
uint32_t capacity = filter->ArrayLength;
uint32_t *alone = (uint32_t *)malloc(capacity * sizeof(uint32_t));
uint8_t *t2count = (uint8_t *)calloc(capacity, sizeof(uint8_t));
uint8_t *reverseH = (uint8_t *)malloc(size * sizeof(uint8_t));
uint64_t *t2hash = (uint64_t *)calloc(capacity, sizeof(uint64_t));
uint32_t blockBits = 1;
while (((uint32_t)1 << blockBits) < filter->SegmentCount) {
blockBits += 1;
}
uint32_t block = ((uint32_t)1 << blockBits);
uint32_t *startPos = (uint32_t *)malloc((1 << blockBits) * sizeof(uint32_t));
uint32_t h012[5];
if ((alone == NULL) || (t2count == NULL) || (reverseH == NULL) ||
(t2hash == NULL) || (reverseOrder == NULL) || (startPos == NULL)) {
free(alone);
free(t2count);
free(reverseH);
free(t2hash);
free(reverseOrder);
free(startPos);
return false;
}
reverseOrder[size] = 1;
for (int loop = 0; true; ++loop) {
if (loop + 1 > XOR_MAX_ITERATIONS) {
fprintf(stderr, "Too many iterations. Are all your keys unique?");
free(alone);
free(t2count);
free(reverseH);
free(t2hash);
free(reverseOrder);
free(startPos);
return false;
}
for (uint32_t i = 0; i < block; i++) {
// important : i * size would overflow as a 32-bit number in some
// cases.
startPos[i] = ((uint64_t)i * size) >> blockBits;
}
uint64_t maskblock = block - 1;
for (uint32_t i = 0; i < size; i++) {
uint64_t hash = binary_fuse_murmur64(keys[i] + filter->Seed);
uint64_t segment_index = hash >> (64 - blockBits);
while (reverseOrder[startPos[segment_index]] != 0) {
segment_index++;
segment_index &= maskblock;
}
reverseOrder[startPos[segment_index]] = hash;
startPos[segment_index]++;
}
int error = 0;
uint32_t duplicates = 0;
for (uint32_t i = 0; i < size; i++) {
uint64_t hash = reverseOrder[i];
uint32_t h0 = binary_fuse16_hash(0, hash, filter);
t2count[h0] += 4;
t2hash[h0] ^= hash;
uint32_t h1 = binary_fuse16_hash(1, hash, filter);
t2count[h1] += 4;
t2count[h1] ^= 1;
t2hash[h1] ^= hash;
uint32_t h2 = binary_fuse16_hash(2, hash, filter);
t2count[h2] += 4;
t2hash[h2] ^= hash;
t2count[h2] ^= 2;
if ((t2hash[h0] & t2hash[h1] & t2hash[h2]) == 0) {
if (((t2hash[h0] == 0) && (t2count[h0] == 8)) ||
((t2hash[h1] == 0) && (t2count[h1] == 8)) ||
((t2hash[h2] == 0) && (t2count[h2] == 8))) {
duplicates += 1;
t2count[h0] -= 4;
t2hash[h0] ^= hash;
t2count[h1] -= 4;
t2count[h1] ^= 1;
t2hash[h1] ^= hash;
t2count[h2] -= 4;
t2count[h2] ^= 2;
t2hash[h2] ^= hash;
}
}
error = (t2count[h0] < 4) ? 1 : error;
error = (t2count[h1] < 4) ? 1 : error;
error = (t2count[h2] < 4) ? 1 : error;
}
if (error) {
memset(reverseOrder, 0, sizeof(uint64_t)*size);
memset(t2count, 0, sizeof(uint8_t)*capacity);
memset(t2hash, 0, sizeof(uint64_t)*capacity);
filter->Seed = binary_fuse_rng_splitmix64(&rng_counter);
continue;
}
// End of key addition
uint32_t Qsize = 0;
// Add sets with one key to the queue.
for (uint32_t i = 0; i < capacity; i++) {
alone[Qsize] = i;
Qsize += ((t2count[i] >> 2) == 1) ? 1 : 0;
}
uint32_t stacksize = 0;
while (Qsize > 0) {
Qsize--;
uint32_t index = alone[Qsize];
if ((t2count[index] >> 2) == 1) {
uint64_t hash = t2hash[index];
// h012[0] = binary_fuse16_hash(0, hash, filter);
h012[1] = binary_fuse16_hash(1, hash, filter);
h012[2] = binary_fuse16_hash(2, hash, filter);
h012[3] = binary_fuse16_hash(0, hash, filter); // == h012[0];
h012[4] = h012[1];
uint8_t found = t2count[index] & 3;
reverseH[stacksize] = found;
reverseOrder[stacksize] = hash;
stacksize++;
uint32_t other_index1 = h012[found + 1];
alone[Qsize] = other_index1;
Qsize += ((t2count[other_index1] >> 2) == 2 ? 1 : 0);
t2count[other_index1] -= 4;
t2count[other_index1] ^= binary_fuse_mod3(found + 1);
t2hash[other_index1] ^= hash;
uint32_t other_index2 = h012[found + 2];
alone[Qsize] = other_index2;
Qsize += ((t2count[other_index2] >> 2) == 2 ? 1 : 0);
t2count[other_index2] -= 4;
t2count[other_index2] ^= binary_fuse_mod3(found + 2);
t2hash[other_index2] ^= hash;
}
}
if (stacksize + duplicates == size) {
// success
size = stacksize;
break;
}
memset(reverseOrder, 0, sizeof(uint64_t)*size);
memset(t2count, 0, sizeof(uint8_t)*capacity);
memset(t2hash, 0, sizeof(uint64_t)*capacity);
filter->Seed = binary_fuse_rng_splitmix64(&rng_counter);
}
for (uint32_t i = size - 1; i < size; i--) {
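// i is unsigned: when it is decremented past zero it wraps to a value >= size and the loop stops.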
// the hash of the key we insert next
uint64_t hash = reverseOrder[i];
uint16_t xor2 = binary_fuse16_fingerprint(hash);
uint8_t found = reverseH[i];
h012[0] = binary_fuse16_hash(0, hash, filter);
h012[1] = binary_fuse16_hash(1, hash, filter);
h012[2] = binary_fuse16_hash(2, hash, filter);
h012[3] = h012[0];
h012[4] = h012[1];
filter->Fingerprints[h012[found]] = xor2 ^
filter->Fingerprints[h012[found + 1]] ^
filter->Fingerprints[h012[found + 2]];
}
free(alone);
free(t2count);
free(reverseH);
free(t2hash);
free(reverseOrder);
free(startPos);
return true;
}
#endif
================================================
FILE: include/hnswlib/hnswalg.h
================================================
#pragma once
#include "hnswlib.h"
#include "visited_list_pool.h"
#include <atomic>
#include <random>
#include <stdlib.h>
#include <assert.h>
#include <unordered_set>
#include <list>
namespace hnswlib {
typedef unsigned int tableint;
typedef unsigned int linklistsizeint;
template<typename dist_t>
class HierarchicalNSW : public AlgorithmInterface<dist_t> {
public:
static const tableint max_update_element_locks = 65536;
HierarchicalNSW(SpaceInterface<dist_t> *s) {
}
HierarchicalNSW(SpaceInterface<dist_t> *s, const std::string &location, bool nmslib = false, size_t max_elements=0) {
loadIndex(location, s, max_elements);
}
HierarchicalNSW(SpaceInterface<dist_t> *s, size_t max_elements, size_t M = 16, size_t ef_construction = 200, size_t random_seed = 100) :
link_list_locks_(max_elements), link_list_update_locks_(max_update_element_locks), element_levels_(max_elements) {
max_elements_ = max_elements;
has_deletions_ = false;
num_deleted_ = 0;
data_size_ = s->get_data_size();
fstdistfunc_ = s->get_dist_func();
dist_func_param_ = s->get_dist_func_param();
M_ = M;
maxM_ = M_;
maxM0_ = M_ * 2;
ef_construction_ = std::max(ef_construction,M_);
ef_ = 10;
level_generator_.seed(random_seed);
update_probability_generator_.seed(random_seed + 1);
size_links_level0_ = maxM0_ * sizeof(tableint) + sizeof(linklistsizeint);
size_data_per_element_ = size_links_level0_ + data_size_ + sizeof(labeltype);
offsetData_ = size_links_level0_;
label_offset_ = size_links_level0_ + data_size_;
offsetLevel0_ = 0;
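// Per-element layout inside data_level0_memory_ (one contiguous block of
// size_data_per_element_ bytes per element):
//   [ level-0 links: linklistsizeint count + maxM0_ slots ][ vector data: data_size_ bytes ][ label ]
// offsetLevel0_, offsetData_ and label_offset_ are offsets into that block.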
data_level0_memory_ = (char *) malloc(max_elements_ * size_data_per_element_);
if (data_level0_memory_ == nullptr)
throw std::runtime_error("Not enough memory");
cur_element_count = 0;
visited_list_pool_ = new VisitedListPool(1, max_elements);
//initializations for special treatment of the first node
enterpoint_node_ = -1;
maxlevel_ = -1;
linkLists_ = (char **) malloc(sizeof(void *) * max_elements_);
if (linkLists_ == nullptr)
throw std::runtime_error("Not enough memory: HierarchicalNSW failed to allocate linklists");
size_links_per_element_ = maxM_ * sizeof(tableint) + sizeof(linklistsizeint);
mult_ = 1 / log(1.0 * M_);
revSize_ = 1.0 / mult_;
}
struct CompareByFirst {
constexpr bool operator()(std::pair<dist_t, tableint> const &a,
std::pair<dist_t, tableint> const &b) const noexcept {
return a.first < b.first;
}
};
~HierarchicalNSW() {
free(data_level0_memory_);
for (tableint i = 0; i < cur_element_count; i++) {
if (element_levels_[i] > 0)
free(linkLists_[i]);
}
free(linkLists_);
delete visited_list_pool_;
}
size_t max_elements_;
size_t cur_element_count;
size_t size_data_per_element_;
size_t size_links_per_element_;
size_t num_deleted_;
size_t M_;
size_t maxM_;
size_t maxM0_;
size_t ef_construction_;
double mult_, revSize_;
int maxlevel_;
VisitedListPool *visited_list_pool_;
std::mutex cur_element_count_guard_;
std::vector<std::mutex> link_list_locks_;
// Locks to prevent race condition during update/insert of an element at same time.
// Note: Locks for additions can also be used to prevent this race condition if the querying of KNN is not exposed along with update/inserts i.e multithread insert/update/query in parallel.
std::vector<std::mutex> link_list_update_locks_;
tableint enterpoint_node_;
size_t size_links_level0_;
size_t offsetData_, offsetLevel0_;
char *data_level0_memory_;
char **linkLists_;
std::vector<int> element_levels_;
size_t data_size_;
bool has_deletions_;
size_t label_offset_;
DISTFUNC<dist_t> fstdistfunc_;
void *dist_func_param_;
std::unordered_map<labeltype, tableint> label_lookup_;
std::default_random_engine level_generator_;
std::default_random_engine update_probability_generator_;
inline labeltype getExternalLabel(tableint internal_id) const {
labeltype return_label;
memcpy(&return_label,(data_level0_memory_ + internal_id * size_data_per_element_ + label_offset_), sizeof(labeltype));
return return_label;
}
inline void setExternalLabel(tableint internal_id, labeltype label) const {
memcpy((data_level0_memory_ + internal_id * size_data_per_element_ + label_offset_), &label, sizeof(labeltype));
}
inline labeltype *getExternalLabeLp(tableint internal_id) const {
return (labeltype *) (data_level0_memory_ + internal_id * size_data_per_element_ + label_offset_);
}
inline char *getDataByInternalId(tableint internal_id) const {
return (data_level0_memory_ + internal_id * size_data_per_element_ + offsetData_);
}
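// getRandomLevel draws an element's top layer from an exponential distribution:
// -log(U) * reverse_size with U ~ Uniform(0, 1). With reverse_size = mult_ = 1/ln(M_)
// (the usual hnswlib convention), roughly a 1/M_ fraction of elements reaches each
// successive layer.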
int getRandomLevel(double reverse_size) {
std::uniform_real_distribution<double> distribution(0.0, 1.0);
double r = -log(distribution(level_generator_)) * reverse_size;
return (int) r;
}
// TODO:
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst>
searchBaseLayer(tableint ep_id, const void *data_point, int layer, const local_state_t *local_state_ptr) {
VisitedList *vl = visited_list_pool_->getFreeVisitedList();
vl_type *visited_array = vl->mass;
vl_type visited_array_tag = vl->curV;
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> top_candidates;
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> candidateSet;
dist_t lowerBound;
if (!isMarkedDeleted(ep_id)) {
dist_t dist = fstdistfunc_(data_point, getDataByInternalId(ep_id), dist_func_param_, local_state_ptr);
top_candidates.emplace(dist, ep_id);
lowerBound = dist;
candidateSet.emplace(-dist, ep_id);
} else {
lowerBound = std::numeric_limits<dist_t>::max();
candidateSet.emplace(-lowerBound, ep_id);
}
visited_array[ep_id] = visited_array_tag;
while (!candidateSet.empty()) {
std::pair<dist_t, tableint> curr_el_pair = candidateSet.top();
if ((-curr_el_pair.first) > lowerBound) {
break;
}
candidateSet.pop();
tableint curNodeNum = curr_el_pair.second;
std::unique_lock<std::mutex> lock(link_list_locks_[curNodeNum]);
int *data;// = (int *)(linkList0_ + curNodeNum * size_links_per_element0_);
if (layer == 0) {
data = (int*)get_linklist0(curNodeNum);
} else {
data = (int*)get_linklist(curNodeNum, layer);
// data = (int *) (linkLists_[curNodeNum] + (layer - 1) * size_links_per_element_);
}
size_t size = getListCount((linklistsizeint*)data);
tableint *datal = (tableint *) (data + 1);
#ifdef USE_SSE
_mm_prefetch((char *) (visited_array + *(data + 1)), _MM_HINT_T0);
_mm_prefetch((char *) (visited_array + *(data + 1) + 64), _MM_HINT_T0);
_mm_prefetch(getDataByInternalId(*datal), _MM_HINT_T0);
_mm_prefetch(getDataByInternalId(*(datal + 1)), _MM_HINT_T0);
#endif
for (size_t j = 0; j < size; j++) {
tableint candidate_id = *(datal + j);
// if (candidate_id == 0) continue;
#ifdef USE_SSE
_mm_prefetch((char *) (visited_array + *(datal + j + 1)), _MM_HINT_T0);
_mm_prefetch(getDataByInternalId(*(datal + j + 1)), _MM_HINT_T0);
#endif
if (visited_array[candidate_id] == visited_array_tag) continue;
visited_array[candidate_id] = visited_array_tag;
char *currObj1 = (getDataByInternalId(candidate_id));
dist_t dist1 = fstdistfunc_(data_point, currObj1, dist_func_param_, local_state_ptr);
if (top_candidates.size() < ef_construction_ || lowerBound > dist1) {
candidateSet.emplace(-dist1, candidate_id);
#ifdef USE_SSE
_mm_prefetch(getDataByInternalId(candidateSet.top().second), _MM_HINT_T0);
#endif
if (!isMarkedDeleted(candidate_id))
top_candidates.emplace(dist1, candidate_id);
if (top_candidates.size() > ef_construction_)
top_candidates.pop();
if (!top_candidates.empty())
lowerBound = top_candidates.top().first;
}
}
}
visited_list_pool_->releaseVisitedList(vl);
return top_candidates;
}
mutable std::atomic<long> metric_distance_computations;
mutable std::atomic<long> metric_hops;
template <bool has_deletions, bool collect_metrics = false>
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst>
searchBaseLayerST(tableint ep_id, const void *data_point, size_t ef, const local_state_t *local_state_ptr) const {
VisitedList *vl = visited_list_pool_->getFreeVisitedList();
vl_type *visited_array = vl->mass;
vl_type visited_array_tag = vl->curV;
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> top_candidates;
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> candidate_set;
dist_t lowerBound;
if (!has_deletions || !isMarkedDeleted(ep_id)) {
dist_t dist = fstdistfunc_(data_point, getDataByInternalId(ep_id), dist_func_param_, local_state_ptr);
lowerBound = dist;
top_candidates.emplace(dist, ep_id);
candidate_set.emplace(-dist, ep_id);
} else {
lowerBound = std::numeric_limits<dist_t>::max();
candidate_set.emplace(-lowerBound, ep_id);
}
visited_array[ep_id] = visited_array_tag;
while (!candidate_set.empty()) {
std::pair<dist_t, tableint> current_node_pair = candidate_set.top();
if ((-current_node_pair.first) > lowerBound && (top_candidates.size() == ef || has_deletions == false)) {
break;
}
candidate_set.pop();
tableint current_node_id = current_node_pair.second;
int *data = (int *) get_linklist0(current_node_id);
size_t size = getListCount((linklistsizeint*)data);
// bool cur_node_deleted = isMarkedDeleted(current_node_id);
if(collect_metrics){
metric_hops++;
metric_distance_computations+=size;
}
#ifdef USE_SSE
_mm_prefetch((char *) (visited_array + *(data + 1)), _MM_HINT_T0);
_mm_prefetch((char *) (visited_array + *(data + 1) + 64), _MM_HINT_T0);
_mm_prefetch(data_level0_memory_ + (*(data + 1)) * size_data_per_element_ + offsetData_, _MM_HINT_T0);
_mm_prefetch((char *) (data + 2), _MM_HINT_T0);
#endif
for (size_t j = 1; j <= size; j++) {
int candidate_id = *(data + j);
// if (candidate_id == 0) continue;
#ifdef USE_SSE
_mm_prefetch((char *) (visited_array + *(data + j + 1)), _MM_HINT_T0);
_mm_prefetch(data_level0_memory_ + (*(data + j + 1)) * size_data_per_element_ + offsetData_,
_MM_HINT_T0);////////////
#endif
if (!(visited_array[candidate_id] == visited_array_tag)) {
visited_array[candidate_id] = visited_array_tag;
char *currObj1 = (getDataByInternalId(candidate_id));
dist_t dist = fstdistfunc_(data_point, currObj1, dist_func_param_, local_state_ptr);
if (top_candidates.size() < ef || lowerBound > dist) {
candidate_set.emplace(-dist, candidate_id);
#ifdef USE_SSE
_mm_prefetch(data_level0_memory_ + candidate_set.top().second * size_data_per_element_ +
offsetLevel0_,///////////
_MM_HINT_T0);////////////////////////
#endif
if (!has_deletions || !isMarkedDeleted(candidate_id))
top_candidates.emplace(dist, candidate_id);
if (top_candidates.size() > ef)
top_candidates.pop();
if (!top_candidates.empty())
lowerBound = top_candidates.top().first;
}
}
}
}
visited_list_pool_->releaseVisitedList(vl);
return top_candidates;
}
// TODO
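// Same layer-0 beam search as searchBaseLayerST, but restricted by a binary fuse filter:
// neighbours are always expanded so the traversal stays connected, while only nodes whose
// external label passes binary_fuse16_contain are admitted into top_candidates.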
template <bool collect_metrics = false>
std::priority_queue<std::pair<dist_t, tableint>,
std::vector<std::pair<dist_t, tableint>>, CompareByFirst>
searchBaseLayerSTWithFilter(tableint ep_id, const void *data_point,
const binary_fuse16_t *filter, size_t ef,
const local_state_t *local_state_ptr) const {
VisitedList *vl = visited_list_pool_->getFreeVisitedList();
vl_type *visited_array = vl->mass;
vl_type visited_array_tag = vl->curV;
std::priority_queue<std::pair<dist_t, tableint>,
std::vector<std::pair<dist_t, tableint>>,
CompareByFirst>
top_candidates;
std::priority_queue<std::pair<dist_t, tableint>,
std::vector<std::pair<dist_t, tableint>>,
CompareByFirst>
candidate_set;
dist_t lowerBound;
uint64_t label = getExternalLabel(ep_id);
if (binary_fuse16_contain(label, filter)) {
dist_t dist = fstdistfunc_(data_point, getDataByInternalId(ep_id),
dist_func_param_, local_state_ptr);
lowerBound = dist;
top_candidates.emplace(dist, ep_id);
candidate_set.emplace(-dist, ep_id);
} else {
lowerBound = std::numeric_limits<dist_t>::max();
candidate_set.emplace(-lowerBound, ep_id);
}
visited_array[ep_id] = visited_array_tag;
while (!candidate_set.empty()) {
std::pair<dist_t, tableint> current_node_pair = candidate_set.top();
if ((-current_node_pair.first) > lowerBound) {
break;
}
candidate_set.pop();
tableint current_node_id = current_node_pair.second;
int *data = (int *)get_linklist0(current_node_id);
size_t size = getListCount((linklistsizeint *)data);
// bool cur_node_deleted =
// isMarkedDeleted(current_node_id);
if (collect_metrics) {
metric_hops++;
metric_distance_computations += size;
}
#ifdef USE_SSE
_mm_prefetch((char *)(visited_array + *(data + 1)), _MM_HINT_T0);
_mm_prefetch((char *)(visited_array + *(data + 1) + 64), _MM_HINT_T0);
_mm_prefetch(data_level0_memory_ +
(*(data + 1)) * size_data_per_element_ + offsetData_,
_MM_HINT_T0);
_mm_prefetch((char *)(data + 2), _MM_HINT_T0);
#endif
for (size_t j = 1; j <= size; j++) {
int candidate_id = *(data + j);
// if (candidate_id == 0) continue;
#ifdef USE_SSE
_mm_prefetch((char *)(visited_array + *(data + j + 1)), _MM_HINT_T0);
_mm_prefetch(data_level0_memory_ +
(*(data + j + 1)) * size_data_per_element_ +
offsetData_,
_MM_HINT_T0); ////////////
#endif
if (!(visited_array[candidate_id] == visited_array_tag)) {
visited_array[candidate_id] = visited_array_tag;
char *currObj1 = (getDataByInternalId(candidate_id));
dist_t dist = fstdistfunc_(data_point, currObj1, dist_func_param_,
local_state_ptr);
if (top_candidates.size() < ef || lowerBound > dist) {
candidate_set.emplace(-dist, candidate_id);
#ifdef USE_SSE
_mm_prefetch(data_level0_memory_ +
candidate_set.top().second *
size_data_per_element_ +
offsetLevel0_, ///////////
_MM_HINT_T0); ////////////////////////
#endif
uint64_t et_label = getExternalLabel(candidate_id);
if (binary_fuse16_contain(et_label, filter))
top_candidates.emplace(dist, candidate_id);
if (top_candidates.size() > ef)
top_candidates.pop();
if (!top_candidates.empty())
lowerBound = top_candidates.top().first;
}
}
}
}
visited_list_pool_->releaseVisitedList(vl);
return top_candidates;
}
// TODO
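// Neighbour-selection heuristic: walking candidates from closest to farthest, a candidate
// is kept only if it is closer to the base point than to every neighbour already selected,
// which spreads the resulting links in different directions instead of clustering them
// around the nearest neighbours. At most M entries are returned in top_candidates.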
void getNeighborsByHeuristic2(
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> &top_candidates,
const size_t M, const local_state_t *local_state_ptr) {
if (top_candidates.size() < M) {
return;
}
std::priority_queue<std::pair<dist_t, tableint>> queue_closest;
std::vector<std::pair<dist_t, tableint>> return_list;
while (top_candidates.size() > 0) {
queue_closest.emplace(-top_candidates.top().first, top_candidates.top().second);
top_candidates.pop();
}
while (queue_closest.size()) {
if (return_list.size() >= M)
break;
std::pair<dist_t, tableint> curent_pair = queue_closest.top();
dist_t dist_to_query = -curent_pair.first;
queue_closest.pop();
bool good = true;
for (std::pair<dist_t, tableint> second_pair : return_list) {
dist_t curdist =
fstdistfunc_(getDataByInternalId(second_pair.second),
getDataByInternalId(curent_pair.second),
dist_func_param_, local_state_ptr);
if (curdist < dist_to_query) {
good = false;
break;
}
}
if (good) {
return_list.push_back(curent_pair);
}
}
for (std::pair<dist_t, tableint> curent_pair : return_list) {
top_candidates.emplace(-curent_pair.first, curent_pair.second);
}
}
linklistsizeint *get_linklist0(tableint internal_id) const {
return (linklistsizeint *) (data_level0_memory_ + internal_id * size_data_per_element_ + offsetLevel0_);
};
linklistsizeint *get_linklist0(tableint internal_id, char *data_level0_memory_) const {
return (linklistsizeint *) (data_level0_memory_ + internal_id * size_data_per_element_ + offsetLevel0_);
};
linklistsizeint *get_linklist(tableint internal_id, int level) const {
return (linklistsizeint *) (linkLists_[internal_id] + (level - 1) * size_links_per_element_);
};
linklistsizeint *get_linklist_at_level(tableint internal_id, int level) const {
return level == 0 ? get_linklist0(internal_id) : get_linklist(internal_id, level);
};
tableint mutuallyConnectNewElement(const void *data_point, tableint cur_c,
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> &top_candidates,
int level, bool isUpdate, const local_state_t *local_state_ptr) {
size_t Mcurmax = level ? maxM_ : maxM0_;
getNeighborsByHeuristic2(top_candidates, M_, local_state_ptr);
if (top_candidates.size() > M_)
throw std::runtime_error("Should be not be more than M_ candidates returned by the heuristic");
std::vector selectedNeighbors;
selectedNeighbors.reserve(M_);
while (top_candidates.size() > 0) {
selectedNeighbors.push_back(top_candidates.top().second);
top_candidates.pop();
}
tableint next_closest_entry_point = selectedNeighbors.back();
{
linklistsizeint *ll_cur;
if (level == 0)
ll_cur = get_linklist0(cur_c);
else
ll_cur = get_linklist(cur_c, level);
if (*ll_cur && !isUpdate) {
throw std::runtime_error("The newly inserted element should have blank link list");
}
setListCount(ll_cur,selectedNeighbors.size());
tableint *data = (tableint *) (ll_cur + 1);
for (size_t idx = 0; idx < selectedNeighbors.size(); idx++) {
if (data[idx] && !isUpdate)
throw std::runtime_error("Possible memory corruption");
if (level > element_levels_[selectedNeighbors[idx]])
throw std::runtime_error("Trying to make a link on a non-existent level");
data[idx] = selectedNeighbors[idx];
}
}
for (size_t idx = 0; idx < selectedNeighbors.size(); idx++) {
std::unique_lock<std::mutex> lock(link_list_locks_[selectedNeighbors[idx]]);
linklistsizeint *ll_other;
if (level == 0)
ll_other = get_linklist0(selectedNeighbors[idx]);
else
ll_other = get_linklist(selectedNeighbors[idx], level);
size_t sz_link_list_other = getListCount(ll_other);
if (sz_link_list_other > Mcurmax)
throw std::runtime_error("Bad value of sz_link_list_other");
if (selectedNeighbors[idx] == cur_c)
throw std::runtime_error("Trying to connect an element to itself");
if (level > element_levels_[selectedNeighbors[idx]])
throw std::runtime_error("Trying to make a link on a non-existent level");
tableint *data = (tableint *) (ll_other + 1);
bool is_cur_c_present = false;
if (isUpdate) {
for (size_t j = 0; j < sz_link_list_other; j++) {
if (data[j] == cur_c) {
is_cur_c_present = true;
break;
}
}
}
// If cur_c is already present in the neighboring connections of `selectedNeighbors[idx]` then no need to modify any connections or run the heuristics.
if (!is_cur_c_present) {
if (sz_link_list_other < Mcurmax) {
data[sz_link_list_other] = cur_c;
setListCount(ll_other, sz_link_list_other + 1);
} else {
// finding the "weakest" element to replace it with the new one
dist_t d_max = fstdistfunc_(getDataByInternalId(cur_c), getDataByInternalId(selectedNeighbors[idx]),
dist_func_param_, local_state_ptr);
// Heuristic:
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> candidates;
candidates.emplace(d_max, cur_c);
for (size_t j = 0; j < sz_link_list_other; j++) {
candidates.emplace(
fstdistfunc_(getDataByInternalId(data[j]), getDataByInternalId(selectedNeighbors[idx]),
dist_func_param_, local_state_ptr), data[j]);
}
getNeighborsByHeuristic2(candidates, Mcurmax, local_state_ptr);
int indx = 0;
while (candidates.size() > 0) {
data[indx] = candidates.top().second;
candidates.pop();
indx++;
}
setListCount(ll_other, indx);
// Nearest K:
/*int indx = -1;
for (int j = 0; j < sz_link_list_other; j++) {
dist_t d = fstdistfunc_(getDataByInternalId(data[j]), getDataByInternalId(rez[idx]), dist_func_param_, nullptr);
if (d > d_max) {
indx = j;
d_max = d;
}
}
if (indx >= 0) {
data[indx] = cur_c;
} */
}
}
}
return next_closest_entry_point;
}
std::mutex global;
size_t ef_;
void setEf(size_t ef) {
ef_ = ef;
}
std::priority_queue<std::pair<dist_t, tableint>> searchKnnInternal(void *query_data, int k) {
std::priority_queue<std::pair<dist_t, tableint>> top_candidates;
if (cur_element_count == 0) return top_candidates;
// FIXME: Never used, so fix index to 0.
local_state_t local_state;
local_state.batch_index = 0;
tableint currObj = enterpoint_node_;
dist_t curdist = fstdistfunc_(query_data, getDataByInternalId(enterpoint_node_), dist_func_param_, &local_state);
for (size_t level = maxlevel_; level > 0; level--) {
bool changed = true;
while (changed) {
changed = false;
int *data;
data = (int *) get_linklist(currObj,level);
int size = getListCount(data);
tableint *datal = (tableint *) (data + 1);
for (int i = 0; i < size; i++) {
tableint cand = datal[i];
if (cand < 0 || cand > max_elements_)
throw std::runtime_error("cand error");
dist_t d = fstdistfunc_(query_data, getDataByInternalId(cand), dist_func_param_, nullptr);
if (d < curdist) {
curdist = d;
currObj = cand;
changed = true;
}
}
}
}
if (num_deleted_) {
std::priority_queue<std::pair<dist_t, tableint>> top_candidates1 = searchBaseLayerST<true>(currObj, query_data,
ef_, &local_state);
top_candidates.swap(top_candidates1);
}
else{
std::priority_queue<std::pair<dist_t, tableint>> top_candidates1 = searchBaseLayerST<false>