Repository: jina-ai/annlite
Branch: main
Commit: f0174dee0af6
Files: 97
Total size: 1.1 MB
Directory structure:
gitextract_ifbk854h/
├── .gitattributes
├── .github/
│   ├── release-template.ejs
│   ├── requirements-test.txt
│   └── workflows/
│       ├── cd.yml
│       ├── ci.yml
│       ├── force-release.yml
│       └── tag.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CHANGELOG.md
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── annlite/
│   ├── __init__.py
│   ├── container.py
│   ├── core/
│   │   ├── __init__.py
│   │   ├── codec/
│   │   │   ├── __init__.py
│   │   │   ├── base.py
│   │   │   ├── pq.py
│   │   │   ├── projector.py
│   │   │   └── vq.py
│   │   └── index/
│   │       ├── __init__.py
│   │       ├── base.py
│   │       ├── flat_index.py
│   │       ├── hnsw/
│   │       │   ├── __init__.py
│   │       │   └── index.py
│   │       └── pq_index.py
│   ├── enums.py
│   ├── executor.py
│   ├── filter.py
│   ├── helper.py
│   ├── hubble_tools.py
│   ├── index.py
│   ├── math.py
│   ├── profile.py
│   ├── storage/
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── kv.py
│   │   └── table.py
│   └── utils.py
├── benchmarks/
│   ├── filtering_bench.py
│   └── hnsw_bench.py
├── bindings/
│   ├── hnsw_bindings.cpp
│   └── pq_bindings.pyx
├── examples/
│   ├── annlite_vs_simpleindexer.py
│   ├── filter_example.py
│   ├── hnsw_example.py
│   ├── pq_benchmark.py
│   ├── pqlinearscann_benchmark_with_filtering.py
│   └── utils.py
├── executor/
│   ├── Dockerfile
│   ├── README.md
│   ├── benchmark.py
│   ├── config.yml
│   ├── executor.py
│   └── requirements.txt
├── include/
│   └── hnswlib/
│       ├── bruteforce.h
│       ├── fusefilter.h
│       ├── hnswalg.h
│       ├── hnswlib.h
│       ├── space_ip.h
│       ├── space_l2.h
│       ├── space_pq.h
│       └── visited_list_pool.h
├── notebooks/
│   └── fashion_product_search.ipynb
├── pyproject.toml
├── requirements.txt
├── scripts/
│   ├── black.sh
│   ├── get-all-test-paths.sh
│   ├── get-last-release-note.py
│   ├── release.sh
│   └── update-version.sh
├── setup.py
└── tests/
    ├── __init__.py
    ├── conftest.py
    ├── docarray/
    │   ├── __init__.py
    │   ├── test_add.py
    │   ├── test_del.py
    │   ├── test_find.py
    │   ├── test_get.py
    │   └── test_save_load.py
    ├── executor/
    │   ├── __init__.py
    │   └── test_executor.py
    ├── test_codec.py
    ├── test_crud.py
    ├── test_dump.py
    ├── test_enums.py
    ├── test_filter.py
    ├── test_hnsw_load_save.py
    ├── test_index.py
    ├── test_pq_bind.py
    ├── test_pq_index.py
    ├── test_projector.py
    ├── test_projector_index.py
    ├── test_store.py
    └── test_table.py
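For orientation: the public index API lives in `annlite/index.py` (re-exported from `annlite/__init__.py`), the HNSW/PQ internals sit under `annlite/core/` and `include/hnswlib/`, and the DocArray-facing behaviour is exercised in `tests/` and `examples/`. Below is a minimal, hypothetical usage sketch; the `AnnLite` constructor arguments and the MongoDB-style `filter` syntax are assumptions inferred from the file names (e.g. `tests/test_filter.py`, `examples/filter_example.py`) and the project's public documentation, not something guaranteed by this snapshot.

```python
import numpy as np
from docarray import Document, DocumentArray
from annlite import AnnLite  # assumed to be re-exported from annlite/index.py

# Hypothetical: a 128-dim index with one filterable column and on-disk storage.
index = AnnLite(128, columns=[('price', float)], data_path='/tmp/annlite_example')

# Index 1,000 random documents carrying a 'price' tag used for filtered search.
docs = DocumentArray(
    Document(id=str(i), embedding=np.random.random(128), tags={'price': float(i)})
    for i in range(1000)
)
index.index(docs)

# Approximate nearest-neighbour search restricted to cheap items.
query = DocumentArray([Document(embedding=np.random.random(128))])
index.search(query, filter={'price': {'$lt': 50.0}}, limit=10)

for match in query[0].matches:
    print(match.id, match.tags['price'])
```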
================================================
FILE CONTENTS
================================================
================================================
FILE: .gitattributes
================================================
notebooks/* linguist-vendored
include/* linguist-vendored
bindings/* linguist-vendored
*.h linguist-detectable=false
*.cpp linguist-detectable=false
================================================
FILE: .github/release-template.ejs
================================================
<% var groupCommits = [
  {
    name: 'breaking',
    show: true,
    list: []
  }, {
    name: 'feat',
    show: true,
    list: []
  }, {
    name: 'perf',
    show: true,
    list: []
  }, {
    name: 'fix',
    show: true,
    list: []
  }, {
    name: 'refactor',
    show: true,
    list: []
  }, {
    name: 'docs',
    show: true,
    list: []
  }, {
    name: 'test',
    show: true,
    list: []
  }, {
    name: 'other',
    show: true,
    list: []
  }
]
var all_titles = {};
var all_commiters = {};
var commitHref = "https://github.com/jina-ai/docarray/commit/"

commits.forEach(function (commit) {
  var result = (commit.title).match(/^(\w*)(\((.*)\))?\: (.*)$/);
  var type = result && result[1];
  var scope = result && result[3];
  var title = result && result[4];
  var committer = commit.authorName
  if (!(committer in all_commiters)) {
    all_commiters[committer] = 1
  }
  if (!(title in all_titles)) {
    all_titles[title] = 1
    if (title != null && (title.indexOf('💥') > -1 || title.indexOf(':boom:') > -1)) {
      groupCommits.find(item => item.name === 'breaking').list.push({
        type: type,
        scope: scope,
        title: title,
        commit: commit
      })
    } else if (type == 'fix' || type == 'fixed') {
      groupCommits.find(item => item.name === 'fix').list.push({
        type: type,
        scope: scope,
        title: title,
        commit: commit
      })
    } else if (type == 'perf' || type == 'performance') {
      groupCommits.find(item => item.name === 'perf').list.push({
        type: type,
        scope: scope,
        title: title,
        commit: commit
      })
    } else if (type == 'feat' || type == 'feature') {
      groupCommits.find(item => item.name === 'feat').list.push({
        type: type,
        scope: scope,
        title: title,
        commit: commit
      })
    } else if (type == 'refactor') {
      groupCommits.find(item => item.name === 'refactor').list.push({
        type: type,
        scope: scope,
        title: title,
        commit: commit
      })
    } else if (type == 'docs' || type == 'doc') {
      groupCommits.find(item => item.name === 'docs').list.push({
        type: type,
        scope: scope,
        title: title,
        commit: commit
      })
    } else if (type == 'test' || type == 'tests' || type == 'ci') {
      groupCommits.find(item => item.name === 'test').list.push({
        type: type,
        scope: scope,
        title: title,
        commit: commit
      })
    } else {
      groupCommits.find(item => item.name === 'other').list.push({
        type: type,
        scope: scope,
        title: title,
        commit: commit
      })
    }
  }
});

var listCommits = function(list, key){
  list.forEach(function (ct) {
    var type = ct.type;
    var scope = ct.scope;
    var title = '';
    var commit = ct.commit;
    if (type) {
      if (key != 'other') {
        title = (scope ? '__' + scope + '__: ' : '') + ct.title;
      } else {
        title = '__' + type + (scope ? '(' + scope + ')' : '') + '__ : ' + ct.title;
      }
    } else {
      title = commit.title;
    }
%> - <% if(typeof commitHref === 'undefined' || commitHref === '') { %>[```<%=commit.sha1.slice(0,8)%>```]<% } else { %>[[```<%=commit.sha1.slice(0,8)%>```](<%=commitHref%><%=commit.sha1%>)]<%}%> __-__ <%=title%> (*<%= commit.authorName %>*)
<% })} %>
🙇 We'd like to thank all contributors for this new release! In particular,
<% Object.keys(all_commiters).forEach(function (key) { %> <%= key %>, <% }) %> 🙇
<%
for (var i of groupCommits) {
  if (i.list.length == 0) continue;
  if (i.name === 'breaking' && i.show) { %>
### 💥 Breaking changes
<% } else if (i.name === 'fix' && i.show) { %>
### 🐞 Bug fixes
<% } else if (i.name === 'feat' && i.show) { %>
### 🆕 New Features
<% } else if (i.name === 'perf' && i.show) { %>
### ⚡ Performance Improvements
<% } else if (i.name === 'refactor' && i.show) { %>
### 🧼 Code Refactoring
<% } else if (i.name === 'docs' && i.show) { %>
### 📗 Documentation
<% } else if (i.name === 'test' && i.show) { %>
### 🏁 Unit Test and CICD
<% } else if (i.name === 'other' && i.show) { %>
### 🍹 Other Improvements
<% }
  i.show && listCommits(i.list, i);
} %>
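The template above buckets each commit by its conventional-commit prefix (`feat`, `fix`, `perf`, `refactor`, `docs`, `test`/`ci`, everything else under "other"), with a 💥/`:boom:` marker in the title overriding the prefix and routing the commit to the breaking-changes section. For illustration only, here is a small Python sketch (hypothetical, not part of the repository) that mirrors the same title-parsing and grouping logic:

```python
import re

# Same shape as the EJS regex: "<type>(<scope>): <title>"
COMMIT_RE = re.compile(r'^(\w*)(\((.*)\))?: (.*)$')


def classify(title: str) -> str:
    """Map a commit title to the release-note bucket used by the template."""
    m = COMMIT_RE.match(title)
    ctype, subject = (m.group(1), m.group(4)) if m else (None, None)
    if subject and ('💥' in subject or ':boom:' in subject):
        return 'breaking'
    if ctype in ('fix', 'fixed'):
        return 'fix'
    if ctype in ('perf', 'performance'):
        return 'perf'
    if ctype in ('feat', 'feature'):
        return 'feat'
    if ctype == 'refactor':
        return 'refactor'
    if ctype in ('docs', 'doc'):
        return 'docs'
    if ctype in ('test', 'tests', 'ci'):
        return 'test'
    return 'other'


print(classify('fix(hnsw): bump hnswlib to v0.6.2'))  # -> fix
print(classify('feat: add cd workflow'))              # -> feat
```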
================================================
FILE: .github/requirements-test.txt
================================================
pytest
pytest-custom_exit_code
================================================
FILE: .github/workflows/cd.yml
================================================
name: CD

on:
  push:
    branches:
      - main

jobs:
  prep-testbed:
    if: |
      !startsWith(github.event.head_commit.message, 'chore') &&
      !startsWith(github.event.head_commit.message, 'build: hotfix') &&
      !endsWith(github.event.head_commit.message, 'reformatted by jina-dev-bot')
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - id: set-matrix
        run: |
          sudo apt-get install jq
          echo "::set-output name=matrix::$(bash scripts/get-all-test-paths.sh)"
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}

  build-wheels:
    needs: [prep-testbed]
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, windows-latest, macos-latest]
        cibw_arch: ["auto64"]
        python-version: [['3.7', "cp37-*"], ['3.8', "cp38-*"], ['3.9', "cp39-*"], ['3.10', "cp310-*"]]
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 100
      - name: Set up Python ${{ matrix.python-version[0] }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version[0] }}
      - name: Update version
        shell: bash
        run: |
          git fetch --depth=1 origin +refs/tags/*:refs/tags/*
          ./scripts/update-version.sh
      - name: Build sdist
        if: ${{ matrix.os == 'ubuntu-latest' && matrix.python-version[0] == '3.7' }}
        run: |
          pip install build
          python -m build --sdist
      - name: Build wheels
        uses: pypa/cibuildwheel@v2.10.2
        with:
          package-dir: ./
        env:
          CIBW_ENVIRONMENT: >
            STAN_BACKEND="${{ env.STAN_BACKEND }}"
          CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014
          CIBW_BUILD: ${{ matrix.python-version[1] }}
          CIBW_SKIP: "*musllinux*"
          CIBW_ARCHS: ${{ matrix.cibw_arch }}
          # CIBW_ARCH_MACOS: x86_64 arm64
          CIBW_BUILD_FRONTEND: build
      - uses: actions/upload-artifact@v3
        with:
          path: |
            ./wheelhouse/*.whl
            ./dist/*.tar.gz

  core-test:
    needs: [ prep-testbed, build-wheels ]
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ ubuntu-latest, windows-latest, macos-latest ]
        cibw_arch: [ "auto64" ]
        python-version: [ [ '3.7', "cp37-*" ] ]
        test-path: ${{fromJson(needs.prep-testbed.outputs.matrix)}}
    env:
      JINA_HIDE_SURVEY: "1"
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 100
      - name: Set up Python ${{ matrix.python-version[0] }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version[0] }}
      - name: Prepare environment
        run: |
          python -m pip install --upgrade pip
          pip install jina
          pip install --pre docarray
          pip install pytest pytest-html pytest-cov pytest-mock pytest-repeat pytest-custom-exit-code pytest-timeout pytest-reraise
      - uses: actions/download-artifact@v3
        with:
          name: artifact
      - name: Install annlite linux
        if: ${{ matrix.os == 'ubuntu-latest' }}
        run: |
          pip install wheelhouse/*${{ matrix.python-version[1] }}**linux*.whl
      - name: Install annlite macos
        if: ${{ matrix.os == 'macos-latest' }}
        run: |
          pip install wheelhouse/*${{ matrix.python-version[1] }}**macos**x86_64*.whl
      - name: Install annlite win
        if: ${{ matrix.os == 'windows-latest' }}
        run: |
          pip install --pre --find-links=wheelhouse/ annlite
      - name: Test unix
        id: test_unix
        if: ${{ matrix.os == 'ubuntu-latest' || matrix.os == 'macos-latest' }}
        run: |
          cd ..
          mv annlite/tests/ ./
          pytest --suppress-no-test-exit-code --cov=annlite --cov-report=xml \
            -v -s -m "not gpu" ${{ matrix.test-path }}
          echo "::set-output name=codecov_flag::annlite"
        timeout-minutes: 30
      - name: Test win
        id: test_win
        if: ${{ matrix.os == 'windows-latest' }}
        env:
          PYTHONIOENCODING: 'utf-8'
        run: |
          cd ..
          move annlite/tests/ ./
          cd tests/
          pytest -v -s -m "not gpu" -k "test"
        timeout-minutes: 30
      - name: Check codecov file
        id: check_files
        uses: andstor/file-existence-action@v1
        with:
          files: "coverage.xml"
      - name: Upload coverage from test to Codecov
        uses: codecov/codecov-action@v2
        if: steps.check_files.outputs.files_exists == 'true' && ${{ matrix.python-version[0] }} == '3.7' && ${{ matrix.os }} == 'ubuntu-latest'
        with:
          file: coverage.xml
          flags: ${{ steps.test.outputs.codecov_flag }}
          fail_ci_if_error: false
          token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos

  prerelease:
    needs: [core-test]
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 100
      - uses: actions/download-artifact@v3
        with:
          name: artifact
      - name: Pre-release (.devN)
        run: |
          git fetch --depth=1 origin +refs/tags/*:refs/tags/*
          pip install twine
          ./scripts/release.sh
        env:
          TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
          TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
          JINA_SLACK_WEBHOOK: ${{ secrets.JINA_SLACK_WEBHOOK }}
================================================
FILE: .github/workflows/ci.yml
================================================
name: CI

on:
  pull_request:

jobs:
  commit-lint:
    runs-on: ubuntu-latest
    steps:
      - name: find the prev warning if exist
        uses: peter-evans/find-comment@v1
        id: fc
        with:
          issue-number: ${{ github.event.pull_request.number }}
          comment-author: "github-actions[bot]"
          body-includes: "bad commit message"
      - name: Delete comment if exist
        if: ${{ steps.fc.outputs.comment-id != 0 }}
        uses: actions/github-script@v3
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
            github.issues.deleteComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              comment_id: ${{ steps.fc.outputs.comment-id }},
            })
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - run: 'echo "module.exports = {extends: [''@commitlint/config-conventional'']}" > commitlint.config.js'
      - uses: wagoid/commitlint-github-action@v1
        env:
          GITHUB_TOKEN: "${{ secrets.GITHUB_TOKEN }}"
      - name: if lint failed
        if: ${{ failure() }}
        uses: peter-evans/create-or-update-comment@v1
        with:
          issue-number: ${{ github.event.pull_request.number }}
          body: |
            Thanks for your contribution :heart:
            :broken_heart: Unfortunately, this PR has one or more **bad commit messages**, so it cannot be merged. To fix this problem, please refer to:
              - [Commit Message Guideline for the First Time Contributor](https://github.com/jina-ai/jina/issues/553)
              - [Contributing Guideline](https://github.com/jina-ai/jina/blob/master/CONTRIBUTING.md)
            Note, other CI tests will *not* *start* until the commit messages get fixed.
            This message will be deleted automatically when the commit messages get fixed.
          reaction-type: "eyes"

  #  lint-flake-8:
  #    runs-on: ubuntu-latest
  #    steps:
  #      - uses: actions/checkout@v2
  #      - name: Set up Python 3.7
  #        uses: actions/setup-python@v2
  #        with:
  #          python-version: 3.7
  #      - name: Lint with flake8
  #        run: |
  #          pip install flake8
  #          # stop the build if there are Python syntax errors or undefined names
  #          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics --exclude .git,__pycache__,docs/source/conf.py,old,build,dist,tests/,jina/resources/,bindings
  #          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
  #          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --exclude .git,__pycache__,docs/source/conf.py,old,build,dist,tests/,jina/resources/

  check-black:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Set up Python 3.7
        uses: actions/setup-python@v2
        with:
          python-version: 3.7
      - id: file_changes
        uses: Ana06/get-changed-files@v1.2
      - name: check black
        run: ./scripts/black.sh
        env:
          CHANGED_FILES: ${{ steps.file_changes.outputs.added_modified }}

  prep-testbed:
    runs-on: ubuntu-latest
    needs: [commit-lint, check-black]
    steps:
      - uses: actions/checkout@v2
      - id: set-matrix
        run: |
          sudo apt-get install jq
          echo "::set-output name=matrix::$(bash scripts/get-all-test-paths.sh)"
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}

  core-test:
    needs: [prep-testbed]
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, windows-latest]
        cpython-version: ["cp37-*"]
        python-version: [3.7]
        cibw_arch: ["auto64"]
        test-path: ${{fromJson(needs.prep-testbed.outputs.matrix)}}
    env:
      JINA_HIDE_SURVEY: "1"
    steps:
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v3
        with:
          python-version: ${{ matrix.python-version }}
      - uses: actions/checkout@v3
      - name: Prepare environment
        run: |
          python -m pip install --upgrade pip
          pip install jina
          pip install --pre docarray
          pip install pytest pytest-html pytest-cov pytest-mock pytest-repeat pytest-custom-exit-code pytest-timeout pytest-reraise
      - name: Build wheel
        uses: pypa/cibuildwheel@v2.10.2
        with:
          package-dir: ./
        env:
          CIBW_ENVIRONMENT: >
            STAN_BACKEND="${{ env.STAN_BACKEND }}"
          CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014
          CIBW_BUILD: ${{ matrix.cpython-version }}
          CIBW_SKIP: "*musllinux*"
          CIBW_ARCHS: ${{ matrix.cibw_arch }}
          CIBW_BUILD_FRONTEND: build
      - name: Install annlite unix
        if: ${{ matrix.os == 'ubuntu-latest' }}
        run: |
          pip install wheelhouse/*linux*.whl
      - name: Install annlite win
        if: ${{ matrix.os == 'windows-latest' }}
        run: |
          pip install --pre --find-links=wheelhouse/ annlite
      - name: Test unix
        id: test_unix
        if: ${{ matrix.os == 'ubuntu-latest' || matrix.os == 'macos-latest' }}
        run: |
          cd ..
          mv annlite/tests/ ./
          pytest --suppress-no-test-exit-code --cov=annlite --cov-report=xml \
            -v -s -m "not gpu" ${{ matrix.test-path }}
          echo "::set-output name=codecov_flag::annlite"
        timeout-minutes: 30
      - name: Test win
        id: test_win
        if: ${{ matrix.os == 'windows-latest' }}
        env:
          PYTHONIOENCODING: 'utf-8'
        run: |
          cd ..
          move annlite/tests/ ./
          cd tests/
          pytest -v -s -m "not gpu" -k "test"
          echo "::set-output name=codecov_flag::annlite"
        timeout-minutes: 30
      - name: Check codecov file
        id: check_files
        uses: andstor/file-existence-action@v1
        with:
          files: "coverage.xml"
      - name: Upload coverage from test to Codecov
        uses: codecov/codecov-action@v2
        if: steps.check_files.outputs.files_exists == 'true' && ${{ matrix.python-version }} == '3.7'
        with:
          file: coverage.xml
          flags: ${{ steps.test.outputs.codecov_flag }}
          fail_ci_if_error: false
          token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos

  # just for blocking the merge until all parallel core-test are successful
  success-all-test:
    needs: [core-test]
    if: always()
    runs-on: ubuntu-latest
    steps:
      - uses: technote-space/workflow-conclusion-action@v2
      - name: Check Failure
        if: env.WORKFLOW_CONCLUSION == 'failure'
        run: exit 1
      - name: Success
        if: ${{ success() }}
        run: echo "All Done"
================================================
FILE: .github/workflows/force-release.yml
================================================
name: Manual Release

on:
  workflow_dispatch:
    inputs:
      release_token:
        description: 'Your release token'
        required: true
      release_reason:
        description: 'Short reason for this manual release'
        required: true

jobs:
  token-check:
    runs-on: ubuntu-latest
    steps:
      - run: echo "success!"
        if: "${{ github.event.inputs.release_token }} == ${{ env.release_token }}"
        env:
          release_token: ${{ secrets.ANNLITE_RELEASE_TOKEN }}

  build-wheels:
    needs: [token-check]
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ ubuntu-latest, windows-latest, macos-latest ]
        cibw_arch: [ "auto64" ]
        python-version: [ [ '3.7', "cp37-*" ], [ '3.8', "cp38-*" ], [ '3.9', "cp39-*" ], [ '3.10', "cp310-*" ] ]
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 100
      - name: Set up Python ${{ matrix.python-version[0] }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version[0] }}
      - name: Build sdist
        if: ${{ matrix.os == 'ubuntu-latest' && matrix.python-version[0] == '3.7' }}
        run: |
          pip install build
          python -m build --sdist
      - name: Build wheels
        uses: pypa/cibuildwheel@v2.10.2
        with:
          package-dir: ./
        env:
          CIBW_ENVIRONMENT: >
            STAN_BACKEND="${{ env.STAN_BACKEND }}"
          CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014
          CIBW_BUILD: ${{ matrix.python-version[1] }}
          CIBW_SKIP: "*musllinux*"
          CIBW_ARCHS: ${{ matrix.cibw_arch }}
          # CIBW_ARCH_MACOS: x86_64 arm64
          CIBW_BUILD_FRONTEND: build
      - uses: actions/upload-artifact@v3
        with:
          path: |
            ./wheelhouse/*.whl
            ./dist/*.tar.gz

  regular-release:
    needs: build-wheels
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
        with:
          token: ${{ secrets.JINA_DEV_BOT }}
          fetch-depth: 100  # contribution history is limited to the last 100 commits
          # submodules: true
      - uses: actions/setup-python@v2
        with:
          python-version: 3.7
      - uses: actions/download-artifact@v3
        with:
          name: artifact
      - run: |
          git fetch --depth=1 origin +refs/tags/*:refs/tags/*
          npm install git-release-notes
          pip install twine wheel
          ./scripts/release.sh final "${{ github.event.inputs.release_reason }}" "${{github.actor}}"
        env:
          TWINE_USERNAME: ${{ secrets.TWINE_USERNAME }}
          TWINE_PASSWORD: ${{ secrets.TWINE_PASSWORD }}
          JINA_SLACK_WEBHOOK: ${{ secrets.JINA_SLACK_WEBHOOK }}
      - if: failure()
        run: echo "nothing to release"
      - name: bumping master version
        uses: ad-m/github-push-action@v0.6.0
        with:
          github_token: ${{ secrets.JINA_DEV_BOT }}
          tags: true
          branch: main
================================================
FILE: .github/workflows/tag.yml
================================================
name: Release CD

on:
  push:
    tags:
      - "v*"  # a push to a version tag triggers the build

#on:
#  push:
#    branches-ignore:
#      - '**'  # temporarily disable this action

jobs:
  create-release:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v2
        with:
          ref: 'main'
      - uses: actions/setup-python@v2
        with:
          python-version: 3.7
      - run: |
          python scripts/get-last-release-note.py
      - name: Create Release
        id: create_release
        uses: actions/create-release@v1
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}  # This token is provided by Actions, you do not need to create your own token
        with:
          tag_name: ${{ github.ref }}
          release_name: 💫 Patch ${{ github.ref }}
          body_path: 'tmp.md'
          draft: false
          prerelease: false
================================================
FILE: .gitignore
================================================
# Initially taken from Github's Python gitignore file
# local dev
.vscode
bindings/pq_bindings.cpp
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
docs/.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
.idea/
toy*.py
.DS_Store
post/
toy*.ipynb
data/
*.c
.nes_cache
toy*.yml
*.tmp
/junit/
/tests/junit/
/docs/chapters/proto/docs.md
# IntelliJ IDEA
*.iml
.idea
# Rust
/target
Cargo.lock
toy*.py
================================================
FILE: .pre-commit-config.yaml
================================================
repos:
  #- repo: https://github.com/terrencepreilly/darglint
  #  rev: v1.5.8
  #  hooks:
  #    - id: darglint
  #      files: annlite/
  #      exclude: docs/
  #      args:
  #        - --message-template={path}:{line} {msg_id} {msg}
  #        - -s=sphinx
  #        - -z=full
  #        - -v=2
  #- repo: https://github.com/pycqa/pydocstyle
  #  rev: 5.1.1  # pick a git hash / tag to point to
  #  hooks:
  #    - id: pydocstyle
  #      files: annlite/
  #      exclude: docs/
  #      args:
  #        - --select=D101,D102,D103
  - repo: https://github.com/timothycrosley/isort
    rev: 5.12.0
    hooks:
      - id: isort
        args: ["--profile", "black"]
  - repo: https://github.com/psf/black
    rev: 22.3.0
    hooks:
      - id: black
        types: [python]
        exclude: docs/
        args:
          - -S
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.0.1
    hooks:
      - id: trailing-whitespace
      - id: check-yaml
      - id: end-of-file-fixer
      - id: requirements-txt-fixer
      - id: double-quote-string-fixer
      - id: check-merge-conflict
      - id: fix-encoding-pragma
        args: ["--remove"]
      - id: mixed-line-ending
        args: ["--fix=lf"]
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: "v14.0.3"
    hooks:
      - id: clang-format
================================================
FILE: CHANGELOG.md
================================================
<a name=release-note-0-3-0></a>
## Release Note (`0.3.0`)
> Release time: 2022-03-04 11:49:06
🙇 We'd like to thank all contributors for this new release! In particular,
numb3r3, felix-wang, David Buchaca Prats, 🙇
### 🆕 New Features
- [[```e4afbe17```](https://github.com/jina-ai/docarray/commit/e4afbe17c724f48de19bfbe6d6c127e62a8bcd5a)] __-__ add cd workflow (*numb3r3*)
### 🐞 Bug fixes
- [[```2aeb5377```](https://github.com/jina-ai/docarray/commit/2aeb537779df6649fab143affa5c5decf8b2e364)] __-__ format (*numb3r3*)
- [[```27ab16c2```](https://github.com/jina-ai/docarray/commit/27ab16c2622606e7bf77af5286620f0cdec4abb0)] __-__ build dep (*numb3r3*)
- [[```7887ac95```](https://github.com/jina-ai/docarray/commit/7887ac95a7954c783264151cd26ae47319a47baa)] __-__ turn on upload pypi (#109) (*felix-wang*)
- [[```2e4739ac```](https://github.com/jina-ai/docarray/commit/2e4739ac2f19e78bc23889ee2a1baef79be65e5d)] __-__ version (*numb3r3*)
- [[```293a0dbf```](https://github.com/jina-ai/docarray/commit/293a0dbfe8b063c66ec4b8b710f44eb1c8ff4041)] __-__ release script (*numb3r3*)
- [[```e77c95d9```](https://github.com/jina-ai/docarray/commit/e77c95d97fe54bb409ce39931de86fb45fdc6f41)] __-__ cov (*numb3r3*)
- [[```287de379```](https://github.com/jina-ai/docarray/commit/287de379686fc4eb91afd99f32a85e3dcbc6bf27)] __-__ install in cd (*numb3r3*)
- [[```88b8e0b6```](https://github.com/jina-ai/docarray/commit/88b8e0b6c033f33d76e5ec3e7e95f3de14d7ed70)] __-__ pip install in ci (*numb3r3*)
- [[```348e0444```](https://github.com/jina-ai/docarray/commit/348e044463039c1a4ed60c43c1f83353378a86f7)] __-__ release (*numb3r3*)
- [[```beb35f29```](https://github.com/jina-ai/docarray/commit/beb35f29c1f695728e7c43b235abb97a71f797ca)] __-__ setup pytest (*numb3r3*)
- [[```e9ea4b89```](https://github.com/jina-ai/docarray/commit/e9ea4b8942d48034a7a471e2b91a6b542913f7c9)] __-__ workflow yml (*numb3r3*)
- [[```5f0e21ec```](https://github.com/jina-ai/docarray/commit/5f0e21ec85a09a1dabc618841eb9fcc4c56c34bb)] __-__ ci yml (*numb3r3*)
- [[```db58e283```](https://github.com/jina-ai/docarray/commit/db58e2835f758c94ec73fa3ccc795c286fdb4e86)] __-__ setup.py (*numb3r3*)
- [[```bdc1c7a2```](https://github.com/jina-ai/docarray/commit/bdc1c7a276086b702c807112d0603c245476944a)] __-__ __cicd__: add scripts (*numb3r3*)
### 🧼 Code Refactoring
- [[```9c693adb```](https://github.com/jina-ai/docarray/commit/9c693adb438742b633d04f554e442ceff9da923d)] __-__ remove pqlite namings (*David Buchaca Prats*)
- [[```793f4711```](https://github.com/jina-ai/docarray/commit/793f4711c42d3a717e9e60e3548c222f1ecc7de2)] __-__ rename project (*numb3r3*)
### 🍹 Other Improvements
- [[```f0c6d809```](https://github.com/jina-ai/docarray/commit/f0c6d809d47a6142f5ccf40c7cc7cf24045ad2e5)] __-__ bump version (*numb3r3*)
<a name=release-note-0-3-1></a>
## Release Note (`0.3.1`)
> Release time: 2022-03-04 16:29:52
🙇 We'd like to thank all contributors for this new release! In particular,
felix-wang, Jina Dev Bot, 🙇
### 🆕 New Features
- [[```44298038```](https://github.com/jina-ai/docarray/commit/44298038e4e79f3dd0c3532ba6ad5b64d1a35caf)] __-__ add tag release workflow (#110) (*felix-wang*)
### 🍹 Other Improvements
- [[```d15289f9```](https://github.com/jina-ai/docarray/commit/d15289f9912f158c7b9a7ec7d5f58188ecb19cf2)] __-__ __version__: the next version will be 0.3.1 (*Jina Dev Bot*)
<a name=release-note-0-3-2></a>
## Release Note (`0.3.2`)
> Release time: 2022-06-09 10:04:30
🙇 We'd like to thank all contributors for this new release! In particular,
David Buchaca Prats, Gustavo Ye, felix-wang, Han Xiao, numb3r3, Jina Dev Bot, 🙇
### 🆕 New Features
- [[```a7804baf```](https://github.com/jina-ai/docarray/commit/a7804bafc90064f929afff62192f19e44406b5d1)] __-__ add filter method (#121) (*David Buchaca Prats*)
### 🐞 Bug fixes
- [[```9448fcab```](https://github.com/jina-ai/docarray/commit/9448fcab060efe5b59108f3cac183aeb432a81d6)] __-__ only apply black with changed py files (#118) (*felix-wang*)
- [[```e4e706e3```](https://github.com/jina-ai/docarray/commit/e4e706e313ba5cbfb7083a5dea9e75b8d2813394)] __-__ upgrade traverse path (#113) (*felix-wang*)
- [[```a0fc7cac```](https://github.com/jina-ai/docarray/commit/a0fc7cac7722d108143f6590c3435c77376f76bd)] __-__ tag yml (*numb3r3*)
### 🍹 Other Improvements
- [[```2ce1ec22```](https://github.com/jina-ai/docarray/commit/2ce1ec2283b381f5153ea60141a6bb474bbf0f0c)] __-__ __cpp/h__: clang-format (#117) (*Gustavo Ye*)
- [[```a6168400```](https://github.com/jina-ai/docarray/commit/a61684000af4518aafa41d1d9dea47766bedf247)] __-__ update readme (*Han Xiao*)
- [[```aec33c75```](https://github.com/jina-ai/docarray/commit/aec33c75b8ce883e044a828fbae7142f71dbb05a)] __-__ __version__: the next version will be 0.3.2 (*Jina Dev Bot*)
<a name=release-note-0-3-3></a>
## Release Note (`0.3.3`)
> Release time: 2022-07-04 13:57:38
🙇 We'd like to thank all contributors for this new release! In particular,
felix-wang, Gustavo Ye, Jina Dev Bot, 🙇
### 🐞 Bug fixes
- [[```e240128f```](https://github.com/jina-ai/docarray/commit/e240128f403bde04bc21e1ac5ce3baa5507db687)] __-__ proto version bump (#126) (*felix-wang*)
- [[```f62809b5```](https://github.com/jina-ai/docarray/commit/f62809b5d90f4fca1762c6a1c13beee40e972212)] __-__ change input type to `data_t` (#123) (*Gustavo Ye*)
### 🍹 Other Improvements
- [[```f7f7b751```](https://github.com/jina-ai/docarray/commit/f7f7b75104a28f1860d3cdc94c87f526e742db51)] __-__ __version__: the next version will be 0.3.3 (*Jina Dev Bot*)
<a name=release-note-0-3-4></a>
## Release Note (`0.3.4`)
> Release time: 2022-07-20 07:12:22
🙇 We'd like to thank all contributors for this new release! In particular,
Jie Fu, Jina Dev Bot, 🙇
### 🆕 New Features
- [[```1a9de05c```](https://github.com/jina-ai/docarray/commit/1a9de05cdcbb861cd036b2ed9a3a66fa84e707f9)] __-__ copy executor.py to annlite folder (#129) (*Jie Fu*)
### 🍹 Other Improvements
- [[```5225e6cb```](https://github.com/jina-ai/docarray/commit/5225e6cbec7df3fbf4dbd7ab404d34bb7d899a29)] __-__ __version__: the next version will be 0.3.4 (*Jina Dev Bot*)
<a name=release-note-0-3-5></a>
## Release Note (`0.3.5`)
> Release time: 2022-07-29 08:17:02
🙇 We'd like to thank all contributors for this new release! In particular,
Jie Fu, Jina Dev Bot, 🙇
### 🆕 New Features
- [[```534eed81```](https://github.com/jina-ai/docarray/commit/534eed810e6b56238f0b71cc4123783236eac7fd)] __-__ support load and save hnsw (#133) (*Jie Fu*)
### 🍹 Other Improvements
- [[```1f76fb5d```](https://github.com/jina-ai/docarray/commit/1f76fb5dedc79d331dc655d6409718457c6c05d7)] __-__ Feat task queue (#131) (*Jie Fu*)
- [[```103b5bf3```](https://github.com/jina-ai/docarray/commit/103b5bf31b59ca041d7ca70b8e0442421bcdf279)] __-__ __version__: the next version will be 0.3.5 (*Jina Dev Bot*)
<a name=release-note-0-3-6></a>
## Release Note (`0.3.6`)
> Release time: 2022-08-23 10:45:36
🙇 We'd like to thank all contributors for this new release! In particular,
felix-wang, Jie Fu, Gustavo Ye, Jina Dev Bot, 🙇
### 🆕 New Features
- [[```690098be```](https://github.com/jina-ai/docarray/commit/690098be576fd5d6149e0502b9d2bf100b726f27)] __-__ add pq support for hnsw searching (#122) (*Gustavo Ye*)
- [[```ad5a5fe3```](https://github.com/jina-ai/docarray/commit/ad5a5fe39293f771b9945102b45f05bebfaf3ad6)] __-__ indexer dumploader (#137) (*felix-wang*)
- [[```29019e3d```](https://github.com/jina-ai/docarray/commit/29019e3da94d0a6025c91919a8978966f073b608)] __-__ integrate annlite with projector (#136) (*Jie Fu*)
- [[```14c95986```](https://github.com/jina-ai/docarray/commit/14c9598602741a5558c02a98bd123b34cddc32b8)] __-__ implement pca (#135) (*Jie Fu*)
### 🐞 Bug fixes
- [[```9f74de8f```](https://github.com/jina-ai/docarray/commit/9f74de8f34bc51ab740a05e9bfabf0be4ec77b87)] __-__ reload duplicate storage (#143) (*felix-wang*)
- [[```7010d778```](https://github.com/jina-ai/docarray/commit/7010d77869dfd2a44ebaeb4a697bbac3e01d6970)] __-__ fix update/delete (#140) (*Jie Fu*)
- [[```8a05887d```](https://github.com/jina-ai/docarray/commit/8a05887dc68d4219190de8767c98fc5a1740e3a2)] __-__ composite pca and pq (#139) (*felix-wang*)
### 🍹 Other Improvements
- [[```157a1a9c```](https://github.com/jina-ai/docarray/commit/157a1a9c189f73f5757d2ec60c4427e41cd130bd)] __-__ __version__: the next version will be 0.3.6 (*Jina Dev Bot*)
<a name=release-note-0-3-7></a>
## Release Note (`0.3.7`)
> Release time: 2022-08-26 04:48:47
🙇 We'd like to thank all contributors for this new release! In particular,
Jie Fu, Jina Dev Bot, 🙇
### 🐞 Bug fixes
- [[```01cf7a89```](https://github.com/jina-ai/docarray/commit/01cf7a8960f27abcbc414faf2d274e0d8da470db)] __-__ fix np.int64 issue (#146) (*Jie Fu*)
### 🍹 Other Improvements
- [[```b602acce```](https://github.com/jina-ai/docarray/commit/b602acce8f7f2ac085f9cbccae6fadeee1ebcb85)] __-__ __version__: the next version will be 0.3.7 (*Jina Dev Bot*)
<a name=release-note-0-3-8></a>
## Release Note (`0.3.8`)
> Release time: 2022-08-26 11:12:19
🙇 We'd like to thank all contributors for this new release! In particular,
felix-wang, Jina Dev Bot, 🙇
### 🐞 Bug fixes
- [[```a5eaa53e```](https://github.com/jina-ai/docarray/commit/a5eaa53e84aa374e061ca0bfddc7797fe369f8ec)] __-__ insert when not found at update (#151) (*felix-wang*)
- [[```18af2d4c```](https://github.com/jina-ai/docarray/commit/18af2d4c42ced2ca9d393456ba9722953edce383)] __-__ normalize training data for cosine metric (#150) (*felix-wang*)
### 🍹 Other Improvements
- [[```f0fb1c4b```](https://github.com/jina-ai/docarray/commit/f0fb1c4b37c89fea31fc24fb47b60181f0cd3218)] __-__ __version__: the next version will be 0.3.8 (*Jina Dev Bot*)
<a name=release-note-0-3-9></a>
## Release Note (`0.3.9`)
> Release time: 2022-08-31 03:46:53
🙇 We'd like to thank all contributors for this new release! In particular,
felix-wang, Gustavo Ye, Jina Dev Bot, 🙇
### 🐞 Bug fixes
- [[```27589456```](https://github.com/jina-ai/docarray/commit/27589456fe72c3e9a1fe7d6c8d198e0e71fd9a89)] __-__ reload indexer (#154) (*felix-wang*)
- [[```cdbe7256```](https://github.com/jina-ai/docarray/commit/cdbe725691cf5cc19db5df250d6d9d384bc94437)] __-__ __py__: roll back the original decorator (#142) (*Gustavo Ye*)
### 🍹 Other Improvements
- [[```f34062b2```](https://github.com/jina-ai/docarray/commit/f34062b2249a3fce8cb51dc9e8b8497d33a54a2f)] __-__ __version__: the next version will be 0.3.9 (*Jina Dev Bot*)
<a name=release-note-0-3-10></a>
## Release Note (`0.3.10`)
> Release time: 2022-09-06 09:40:07
🙇 We'd like to thank all contributors for this new release! In particular,
felix-wang, numb3r3, Jie Fu, Jina Dev Bot, 🙇
### 🐞 Bug fixes
- [[```bc7f5ae2```](https://github.com/jina-ai/docarray/commit/bc7f5ae2e8e83cef919052a9d8a3a954a1cd813f)] __-__ allow columns as dict format (#160) (*felix-wang*)
- [[```25c3ec3d```](https://github.com/jina-ai/docarray/commit/25c3ec3d616a9ece53c710d734ba71d0eb6fc19c)] __-__ thread safe index and search (#157) (*felix-wang*)
- [[```d235cb83```](https://github.com/jina-ai/docarray/commit/d235cb83a2d8399f1120b3ebf6e54392ecb37efa)] __-__ executor unittest (#156) (*felix-wang*)
### 🏁 Unit Test and CICD
- [[```e6543a28```](https://github.com/jina-ai/docarray/commit/e6543a28894aad5481d1a6a5bbd8f070ca979a90)] __-__ add docarray tests (#153) (*Jie Fu*)
### 🍹 Other Improvements
- [[```6a774648```](https://github.com/jina-ai/docarray/commit/6a774648d772ad98f42641c6094192bf4e7d1877)] __-__ update logo (*numb3r3*)
- [[```33fd66be```](https://github.com/jina-ai/docarray/commit/33fd66be1135ebe63f2a1e1366196c8712e2bc63)] __-__ update readme (*numb3r3*)
- [[```e1ad3a55```](https://github.com/jina-ai/docarray/commit/e1ad3a55d4732a115e6a925075b7b5ed768cd9f8)] __-__ update log (*numb3r3*)
- [[```c2630257```](https://github.com/jina-ai/docarray/commit/c26302572de4e755d64ffee71b017e798f790e9e)] __-__ update readme (#149) (*felix-wang*)
- [[```c1fd8cfe```](https://github.com/jina-ai/docarray/commit/c1fd8cfe746d94de2ebef843f6e13384f266773b)] __-__ __version__: the next version will be 0.3.10 (*Jina Dev Bot*)
<a name=release-note-0-3-11></a>
## Release Note (`0.3.11`)
> Release time: 2022-09-08 06:17:25
🙇 We'd like to thank all contributors for this new release! In particular,
felix-wang, Jina Dev Bot, 🙇
### 🐞 Bug fixes
- [[```0b435eb6```](https://github.com/jina-ai/docarray/commit/0b435eb6c5a737d370b389ba3c2395796febe95f)] __-__ bump pybind11 (#162) (*felix-wang*)
### 🍹 Other Improvements
- [[```9c43f02c```](https://github.com/jina-ai/docarray/commit/9c43f02c5ec9face32ee58858b8ab25d65edbf12)] __-__ __version__: the next version will be 0.3.11 (*Jina Dev Bot*)
<a name=release-note-0-3-12></a>
## Release Note (`0.3.12`)
> Release time: 2022-09-19 05:15:49
🙇 We'd like to thank all contributors for this new release! In particular,
felix-wang, YangXiuyu, numb3r3, Jina Dev Bot, 🙇
### 🆕 New Features
- [[```4eb61d3d```](https://github.com/jina-ai/docarray/commit/4eb61d3dea5a86d5212dafaaa0c1ecd87abc399a)] __-__ add limit and offset for filtering (#167) (*felix-wang*)
### 📗 Documentation
- [[```6ecfa833```](https://github.com/jina-ai/docarray/commit/6ecfa833e3d0f69a1a1b4992724753c89fa8d96f)] __-__ fix typo (#168) (*YangXiuyu*)
### 🍹 Other Improvements
- [[```6902a8b1```](https://github.com/jina-ai/docarray/commit/6902a8b15763e73fd1cd90827ccfdc758f2fce32)] __-__ add description about hnsw parameters (#169) (*felix-wang*)
- [[```aec2d605```](https://github.com/jina-ai/docarray/commit/aec2d605d43394638168282c78e13f7f2150200a)] __-__ update readme (*numb3r3*)
- [[```95541bad```](https://github.com/jina-ai/docarray/commit/95541bad119dd2bb03e9894f32f57578cd3d8a7a)] __-__ __version__: the next version will be 0.3.12 (*Jina Dev Bot*)
<a name=release-note-0-3-13></a>
## Release Note (`0.3.13`)
> Release time: 2022-09-26 02:04:59
🙇 We'd like to thank all contributors for this new release! In particular,
felix-wang, Jie Fu, numb3r3, Jina Dev Bot, 🙇
### 🆕 New Features
- [[```f98a8336```](https://github.com/jina-ai/docarray/commit/f98a83368e9fc81d82abb2be9c8a6569ee45e177)] __-__ change sqlite in_memory to false (#173) (*Jie Fu*)
### 🐞 Bug fixes
- [[```b5f8694f```](https://github.com/jina-ai/docarray/commit/b5f8694fb5c33af15cc9b4a140a0165bf3aa01f4)] __-__ __hnswlib__: cannot find sufficient data with filtering (#176) (*felix-wang*)
- [[```0aa07863```](https://github.com/jina-ai/docarray/commit/0aa078633d7b6b850a3ea0ffe651dbad06240cc0)] __-__ __ci__: unittest (#175) (*felix-wang*)
### 🧼 Code Refactoring
- [[```896c5006```](https://github.com/jina-ai/docarray/commit/896c5006a6cac5f6b144bcb452ec5b2169ca4c88)] __-__ use rockdb as the storage backend (#171) (*felix-wang*)
### 🍹 Other Improvements
- [[```66257b41```](https://github.com/jina-ai/docarray/commit/66257b4147836a8e8b7c17dd12537143509e78a2)] __-__ Revert "refactor: use rockdb as the storage backend (#171)" (*numb3r3*)
- [[```e56aae75```](https://github.com/jina-ai/docarray/commit/e56aae757e65e5f8294a15b6ba998e00bda97e1a)] __-__ __version__: the next version will be 0.3.13 (*Jina Dev Bot*)
<a name=release-note-0-4-0></a>
## Release Note (`0.4.0`)
> Release time: 2022-10-24 09:53:20
🙇 We'd like to thank all contributors for this new release! In particular,
numb3r3, Jie Fu, YangXiuyu, felix-wang, Ziniu Yu, Gustavo Ye, Jina Dev Bot, 🙇
### 🆕 New Features
- [[```1cd07f2c```](https://github.com/jina-ai/docarray/commit/1cd07f2c33e4f1f4e756baab4a6cd0a1452996cf)] __-__ hub as remote storage (#177) (*YangXiuyu*)
- [[```90db6006```](https://github.com/jina-ai/docarray/commit/90db60067205d8e64a63c0c7eb4e782ac04d1033)] __-__ add flag for close (#191) (*Jie Fu*)
- [[```3b782900```](https://github.com/jina-ai/docarray/commit/3b7829004edfb4f6cf4b77048a747a0d338c21b3)] __-__ cibuildwheels (#186) (*YangXiuyu*)
### 🐞 Bug fixes
- [[```8c807937```](https://github.com/jina-ai/docarray/commit/8c80793733e96680d2ca846371ebc3c684f1d5d7)] __-__ restore when initializing annlite (#201) (*Jie Fu*)
- [[```98f94d0f```](https://github.com/jina-ai/docarray/commit/98f94d0f4ff1bd0b1bc1b99d828d6fd21932dc2b)] __-__ set is_train=true after loading pca (#199) (*Jie Fu*)
- [[```0b9bb413```](https://github.com/jina-ai/docarray/commit/0b9bb413abe631ea07bd7358160060b16cafa827)] __-__ unittest (#198) (*YangXiuyu*)
- [[```0d1d5a2e```](https://github.com/jina-ai/docarray/commit/0d1d5a2e58eb043397f24a1d6efcee07c22e2cb1)] __-__ pre-release (*numb3r3*)
- [[```fdcf3818```](https://github.com/jina-ai/docarray/commit/fdcf3818f1b8a2b11df96f4c6c72e972bab91238)] __-__ cd release version (#197) (*YangXiuyu*)
- [[```f4bbc495```](https://github.com/jina-ai/docarray/commit/f4bbc49529a2ac4174bbb0cc7c88ab8666bd034c)] __-__ clean codes (*numb3r3*)
- [[```6bedafe4```](https://github.com/jina-ai/docarray/commit/6bedafe4c49ce8c4831b4cc01c1b8702924e4b8f)] __-__ test-paths (*numb3r3*)
- [[```beb79b23```](https://github.com/jina-ai/docarray/commit/beb79b23a1b1412bad402410a0b90393f0e535b6)] __-__ __cd__: combine build and test (*numb3r3*)
- [[```19652ed0```](https://github.com/jina-ai/docarray/commit/19652ed0b6ef0ffc9dc1c3d2dc2640075e4d6d13)] __-__ cd workflow (#196) (*YangXiuyu*)
- [[```4fe2b3be```](https://github.com/jina-ai/docarray/commit/4fe2b3be51ed0b571c6f0b9c9f20a68c18b7486c)] __-__ cd workflow (#195) (*YangXiuyu*)
- [[```4fdacde2```](https://github.com/jina-ai/docarray/commit/4fdacde2a22a5e6fb0edc15bb6f77bf4d8f25834)] __-__ cd workflow (#193) (*YangXiuyu*)
- [[```494fbfcb```](https://github.com/jina-ai/docarray/commit/494fbfcbb96257168542f56ec617a9dc9082c084)] __-__ cd release (#192) (*felix-wang*)
- [[```d2803ce0```](https://github.com/jina-ai/docarray/commit/d2803ce00e41a69470a3001e4677927b407c3282)] __-__ cd workflow (#190) (*felix-wang*)
- [[```fe9db3d9```](https://github.com/jina-ai/docarray/commit/fe9db3d9f7e04cbad96832409dd1d3159195c060)] __-__ build on apple silicon (#188) (*Ziniu Yu*)
- [[```bb402ae9```](https://github.com/jina-ai/docarray/commit/bb402ae9674a8410686b8dde3aea82f1b86fc10b)] __-__ __bindings__: build on windows (#183) (*YangXiuyu*)
- [[```efc18f80```](https://github.com/jina-ai/docarray/commit/efc18f80228ef2371c19fbb205cabb94afd47385)] __-__ update executor parameter (#180) (*YangXiuyu*)
- [[```f142089a```](https://github.com/jina-ai/docarray/commit/f142089a419bba3d7e23b2c34b16455ddbcb805d)] __-__ executor tests (#179) (*felix-wang*)
- [[```79e171d4```](https://github.com/jina-ai/docarray/commit/79e171d411f31dad7231bce1682ac78ff7b3e1b2)] __-__ __annliteindexer__: annlite executor integration (#170) (*YangXiuyu*)
### 🧼 Code Refactoring
- [[```e8c59907```](https://github.com/jina-ai/docarray/commit/e8c59907540bada5282a413b43d64f11596678d3)] __-__ use rocksdb as the docs storage engine (#178) (*felix-wang*)
### 🍹 Other Improvements
- [[```e6dfcd2a```](https://github.com/jina-ai/docarray/commit/e6dfcd2a7ff0fdd149dd606487f7aa0107775c63)] __-__ bump to v0.4.0 (*numb3r3*)
- [[```6ce03fd0```](https://github.com/jina-ai/docarray/commit/6ce03fd04e5bb63da3c3d2cf1c620ac77daf6c2a)] __-__ release (*numb3r3*)
- [[```02857ec4```](https://github.com/jina-ai/docarray/commit/02857ec4103e32139f695adac4f7a40c4e65c67a)] __-__ Add pq dist table support (#158) (*Gustavo Ye*)
- [[```ff54290b```](https://github.com/jina-ai/docarray/commit/ff54290bfb19d07b953aa00d5d52cb1e8f805d3a)] __-__ __version__: the next version will be 0.3.14 (*Jina Dev Bot*)
<a name=release-note-0-5-0></a>
## Release Note (`0.5.0`)
> Release time: 2022-10-26 07:12:58
🙇 We'd like to thank all contributors for this new release! In particular,
numb3r3, felix-wang, Jina Dev Bot, 🙇
### 🐞 Bug fixes
- [[```de5273ab```](https://github.com/jina-ai/docarray/commit/de5273ab7cf81ba26bb4307cd66f3fc50974b8ef)] __-__ undo wheels uploading (#203) (*felix-wang*)
### 🍹 Other Improvements
- [[```5faf1dfa```](https://github.com/jina-ai/docarray/commit/5faf1dfa3326213c3b9e20de0d8b7aca4dbdf6ba)] __-__ regular release (*numb3r3*)
- [[```95e51f26```](https://github.com/jina-ai/docarray/commit/95e51f269ac130b9d54b004734ff68afc07a9cf1)] __-__ __version__: the next version will be 0.4.1 (*Jina Dev Bot*)
<a name=release-note-0-5-1></a>
## Release Note (`0.5.1`)
> Release time: 2022-11-03 03:29:55
🙇 We'd like to thank all contributors for this new release! In particular,
numb3r3, Jie Fu, Jina Dev Bot, 🙇
### 🆕 New Features
- [[```dd08f6f2```](https://github.com/jina-ai/docarray/commit/dd08f6f28008aaf020cb972f45e154afaf99cb28)] __-__ add file splitter and merger to support big file transfer to hubble (#202) (*Jie Fu*)
### 🍹 Other Improvements
- [[```83e45001```](https://github.com/jina-ai/docarray/commit/83e4500180e98bb967ab2ec17ca202c1702fb411)] __-__ bump 0.5.1 (*numb3r3*)
- [[```557c624b```](https://github.com/jina-ai/docarray/commit/557c624b3419b8bf70386e66a75d1958ef427738)] __-__ fix force release (*numb3r3*)
- [[```77312214```](https://github.com/jina-ai/docarray/commit/773122144a9ec39663829c64090d6116585fb6e1)] __-__ __version__: the next version will be 0.5.1 (*Jina Dev Bot*)
<a name=release-note-0-5-2></a>
## Release Note (`0.5.2`)
> Release time: 2022-11-03 07:10:10
🙇 We'd like to thank all contributors for this new release! In particular,
felix-wang, Jina Dev Bot, 🙇
### 🐞 Bug fixes
- [[```3f886f10```](https://github.com/jina-ai/docarray/commit/3f886f10d17195913cea839f3e4ac8631426ad5c)] __-__ restore from latest snapshot (#204) (*felix-wang*)
### 🍹 Other Improvements
- [[```2462afa4```](https://github.com/jina-ai/docarray/commit/2462afa4451c66603cc1e25bf755c97a97b7eec1)] __-__ __version__: the next version will be 0.5.2 (*Jina Dev Bot*)
<a name=release-note-0-5-4></a>
## Release Note (`0.5.4`)
> Release time: 2022-11-04 04:27:28
🙇 We'd like to thank all contributors for this new release! In particular,
felix-wang, Jina Dev Bot, 🙇
### 🐞 Bug fixes
- [[```f2ce3a9a```](https://github.com/jina-ai/docarray/commit/f2ce3a9a4b35a71a492fbe63203114e0d7fa6b52)] __-__ bump rocksdict (#206) (*felix-wang*)
### 🍹 Other Improvements
- [[```c84635c9```](https://github.com/jina-ai/docarray/commit/c84635c9854b7b3c2b5718941979720b673ed7eb)] __-__ __version__: the next version will be 0.5.3 (*Jina Dev Bot*)
<a name=release-note-0-5-5></a>
## Release Note (`0.5.5`)
> Release time: 2022-12-22 03:27:38
🙇 We'd like to thank all contributors for this new release! In particular,
Jie Fu, Jina Dev Bot, 🙇
### 🐞 Bug fixes
- [[```61e0a57f```](https://github.com/jina-ai/docarray/commit/61e0a57f901b087fbe6cab16aefb1d46d5fd54a2)] __-__ fix offset factory (#208) (*Jie Fu*)
### 🍹 Other Improvements
- [[```f6fdc9d3```](https://github.com/jina-ai/docarray/commit/f6fdc9d34fd8e8de47e7ac68e6813956c2aeebfe)] __-__ __version__: the next version will be 0.5.5 (*Jina Dev Bot*)
<a name=release-note-0-5-6></a>
## Release Note (`0.5.6`)
> Release time: 2023-02-08 09:06:21
🙇 We'd like to thank all contributors for this new release! In particular,
Jie Fu, Jina Dev Bot, 🙇
### 🐞 Bug fixes
- [[```906ed7d4```](https://github.com/jina-ai/docarray/commit/906ed7d4159527e8b5f00cdc9106433925f04a43)] __-__ save and load offset2ids for list_like feature (#210) (*Jie Fu*)
### 🏁 Unit Test and CICD
- [[```a2519f0f```](https://github.com/jina-ai/docarray/commit/a2519f0f323d0c3cb2d68ad1cd3eeed4beb40ce6)] __-__ add test for offset2ids saving and loading (#212) (*Jie Fu*)
### 🍹 Other Improvements
- [[```68215807```](https://github.com/jina-ai/docarray/commit/68215807953eed366e1d6f48fbbbbc1abebf13ff)] __-__ __version__: the next version will be 0.5.6 (*Jina Dev Bot*)
<a name=release-note-0-5-7></a>
## Release Note (`0.5.7`)
> Release time: 2023-02-21 12:34:55
🙇 We'd like to thank all contributors for this new release! In particular,
Jie Fu, Jina Dev Bot, 🙇
### 🐞 Bug fixes
- [[```4a66d0c0```](https://github.com/jina-ai/docarray/commit/4a66d0c06e7497cb94cd18da505a2e309594b20c)] __-__ remove delete tags for cell_table and meta_table (#215) (*Jie Fu*)
### 🍹 Other Improvements
- [[```eee752fe```](https://github.com/jina-ai/docarray/commit/eee752fe091dc5b5b519bb2aa866cb7d5dc48822)] __-__ __version__: the next version will be 0.5.7 (*Jina Dev Bot*)
<a name=release-note-0-5-8></a>
## Release Note (`0.5.8`)
> Release time: 2023-03-24 11:13:35
🙇 We'd like to thank all contributors for this new release! In particular,
felix-wang, YangXiuyu, Jina Dev Bot, 🙇
### 🐞 Bug fixes
- [[```bfeb7e94```](https://github.com/jina-ai/docarray/commit/bfeb7e94c879db263dc4931b54b6b87dac09df81)] __-__ delete and update (#222) (*felix-wang*)
- [[```7783b30f```](https://github.com/jina-ai/docarray/commit/7783b30fd55a10bd2dd0c75dc0451f0fe1834927)] __-__ __hnsw__: bump hnswlib to v0.6.2 (#185) (*YangXiuyu*)
### 🍹 Other Improvements
- [[```4c145ddd```](https://github.com/jina-ai/docarray/commit/4c145ddd19abb4caec479941d1c0ffb03c4cfcf3)] __-__ __version__: the next version will be 0.5.8 (*Jina Dev Bot*)
<a name=release-note-0-5-9></a>
## Release Note (`0.5.9`)
> Release time: 2023-04-07 04:16:28
🙇 We'd like to thank all contributors for this new release! In particular,
felix-wang, Jina Dev Bot, 🙇
### 🐞 Bug fixes
- [[```ab807ff6```](https://github.com/jina-ai/docarray/commit/ab807ff69e533ec90e0b0f6c782749267324d177)] __-__ bump rocksdb >= 0.3.9 (#225) (*felix-wang*)
### 🍹 Other Improvements
- [[```3668c3fa```](https://github.com/jina-ai/docarray/commit/3668c3fa6f7a2d2af39549987637e11e385a75bb)] __-__ __version__: the next version will be 0.5.9 (*Jina Dev Bot*)
<a name=release-note-0-5-10></a>
## Release Note (`0.5.10`)
> Release time: 2023-04-19 03:12:50
🙇 We'd like to thank all contributors for this new release! In particular,
Jie Fu, Jina Dev Bot, 🙇
### 🍹 Other Improvements
- [[```d55d544e```](https://github.com/jina-ai/docarray/commit/d55d544e876653bb47e79e2dd00f859c3ef00ffa)] __-__ bumping docarray (#227) (*Jie Fu*)
- [[```86057c69```](https://github.com/jina-ai/docarray/commit/86057c69401e2b6d63822be2c3c3a0f63d4661a6)] __-__ __version__: the next version will be 0.5.10 (*Jina Dev Bot*)
================================================
FILE: LICENSE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: MANIFEST.in
================================================
include setup.py
include requirements.txt
include pyproject.toml
global-include *.pyx
recursive-include include/hnswlib/ *
================================================
FILE: Makefile
================================================
pypi: dist
twine upload dist/*
dist:
rm -rf dist/*
pip install build
python -m build --sdist
test:
	python -m unittest discover --start-directory tests --pattern "test_*.py"
clean:
rm -rf *.egg-info build dist tmp var tests/__pycache__ annlite/*.so
.PHONY: dist
================================================
FILE: README.md
================================================
<p align="center">
<br>
<br>
<br>
<img src="https://github.com/jina-ai/annlite/blob/main/.github/assets/logo.svg?raw=true" alt="AnnLite logo: A fast and efficient ANN library" width="200px">
<br>
<br>
<b>A fast embedded library for approximate nearest neighbor search</b>
</p>
<p align=center>
<a href="hhttps://github.com/jina-ai/annlite"><img alt="GitHub" src="https://img.shields.io/github/license/jina-ai/annlite?style=flat-square"></a>
<a href="https://pypi.org/project/annlite/"><img alt="PyPI" src="https://img.shields.io/pypi/v/annlite?label=Release&style=flat-square"></a>
<a href="https://codecov.io/gh/jina-ai/annlite"><img alt="Codecov branch" src="https://img.shields.io/codecov/c/github/jina-ai/annlite/main?logo=Codecov&logoColor=white&style=flat-square"></a>
<a href="https://slack.jina.ai"><img src="https://img.shields.io/badge/Slack-3.1k-blueviolet?logo=slack&logoColor=white&style=flat-square"></a>
</p>
<!-- start elevator-pitch -->
## What is AnnLite?
`AnnLite` is a *lightweight* and *embeddable* library for **fast** and **filterable** *approximate nearest neighbor search* (ANNS).
It lets you search for nearest neighbors in datasets of millions of points through a Pythonic API.
**Highlighted features:**
- 🐥 **Easy-to-use**: a simple Pythonic API that is intuitive to use and easy to take to production.
- 🐎 **Fast**: built on a highly optimized approximate nearest neighbor search algorithm (*HNSW*).
- 🔎 **Filterable**: the library allows you to search for nearest neighbors within a subset of the dataset.
- 🍱 **Integration**: Smooth integration with neural search ecosystem including [Jina](https://github.com/jina-ai/jina) and [DocArray](https://github.com/jina-ai/docarray),
so that users can easily expose search API with **gRPC** and/or **HTTP**.
The library is easy to install and is designed to be used from Python.
<!---
Read more on why should you use `AnnLite`: [here](), and compare to alternatives: [here]().
-->
## Installation
To use AnnLite, you first need to install it. The easiest way is with `pip`:
```bash
pip install -U annlite
```
or install from source:
```bash
python setup.py install
```
## Quick start
Before you start, you should have some experience with [DocArray](https://github.com/jina-ai/docarray).
`AnnLite` is designed to be used together with [DocArray](https://github.com/jina-ai/docarray), so it helps to know how to use `DocArray` first.
For example, you can create a `DocArray` with `1000` random vectors with `128` dimensions:
```python
from docarray import DocumentArray
import numpy as np
docs = DocumentArray.empty(1000)
docs.embeddings = np.random.random([1000, 128]).astype(np.float32)
```
### Index
Then you can create an `AnnLite` indexer to index the created `docs` and search for nearest neighbors:
```python
from annlite import AnnLite
ann = AnnLite(128, metric='cosine', data_path="/tmp/annlite_data")
ann.index(docs)
```
Note that this will create a directory `/tmp/annlite_data` to persist the documents indexed.
If this directory already exists, the index will be loaded from the directory.
If you want to build a new index from scratch, delete the directory first.
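For example, if you want to be sure you always start from an empty index, you can clear the data path before creating the indexer. Below is a minimal sketch using only the standard library; the path simply mirrors the example above:
```python
import shutil
from pathlib import Path
from annlite import AnnLite

data_path = Path('/tmp/annlite_data')
# remove any previously persisted index so a fresh one is built
if data_path.exists():
    shutil.rmtree(data_path)

ann = AnnLite(128, metric='cosine', data_path=str(data_path))
ann.index(docs)
```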
### Search
Then you can search for nearest neighbors for some query docs with `ann.search()`:
```python
query = DocumentArray.empty(5)
query.embeddings = np.random.random([5, 128]).astype(np.float32)
result = ann.search(query)
```
Then you can inspect the retrieved docs for each query doc via its `matches`:
```python
for q in query:
print(f'Query {q.id}')
for k, m in enumerate(q.matches):
print(f'{k}: {m.id} {m.scores["cosine"]}')
```
```bash
Query ddbae2073416527bad66ff186543eff8
0: 47dcf7f3fdbe3f0b8d73b87d2a1b266f {'value': 0.17575037}
1: 7f2cbb8a6c2a3ec7be024b750964f317 {'value': 0.17735684}
2: 2e7eed87f45a87d3c65c306256566abb {'value': 0.17917466}
Query dda90782f6514ebe4be4705054f74452
0: 6616eecba99bd10d9581d0d5092d59ce {'value': 0.14570713}
1: d4e3147fc430de1a57c9883615c252c6 {'value': 0.15338594}
2: 5c7b8b969d4381f405b8f07bc68f8148 {'value': 0.15743542}
...
```
Or shorten the loop into a one-liner using the element & attribute selector:
```python
print(query['@m', ('id', 'scores__cosine')])
```
### Query
You can get a specific document by its id:
```python
doc = ann.get_doc_by_id('<doc_id>')
```
And you can also get the documents with `limit` and `offset`, which is useful for pagination:
```python
docs = ann.get_docs(limit=10, offset=0)
```
Furthermore, you can also get the documents ordered by a specific column from the index:
```python
docs = ann.get_docs(limit=10, offset=0, order_by='x', ascending=True)
```
**Note**: the `order_by` column must be one of the `columns` in the index.
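For example, to page through documents ordered by a `price` column, the index has to be created with that column. The following is a sketch that reuses the `columns` parameter described in the filtering section below; the column name and data path are only illustrative:
```python
import random
import numpy as np
from docarray import Document, DocumentArray
from annlite import AnnLite

ann = AnnLite(128, columns=[('price', float)], data_path='/tmp/annlite_orderby_data')

docs = DocumentArray(
    Document(id=f'{i}', tags={'price': random.random()}) for i in range(100)
)
docs.embeddings = np.random.random([100, 128]).astype(np.float32)
ann.index(docs)

# the 10 cheapest documents first
cheapest = ann.get_docs(limit=10, offset=0, order_by='price', ascending=True)
```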
### Update
After you have indexed the `docs`, you can update the docs in the index by calling `ann.update()`:
```python
updated_docs = docs.sample(10)
updated_docs.embeddings = np.random.random([10, 128]).astype(np.float32)
ann.update(updated_docs)
```
### Delete
And finally, you can delete the docs from the index by calling `ann.delete()`:
```python
to_delete = docs.sample(10)
ann.delete(to_delete)
```
## Search with filters
To support search with filters, the `AnnLite` index must be created with the `columns` parameter, which lists the fields you want to filter by.
At query time, AnnLite filters the dataset according to the conditions you provide for those fields.
```python
import annlite
# the column schema: (name:str, dtype:type, create_index: bool)
ann = annlite.AnnLite(128, columns=[('price', float)], data_path="/tmp/annlite_data")
```
Then you can insert the docs, where each doc has a `price` field with a float value stored in its `tags`:
```python
import random
from docarray import Document

docs = DocumentArray(
    [
        Document(id=f'{i}', tags={'price': random.random()})
        for i in range(1000)
    ]
)
docs.embeddings = np.random.random([1000, 128]).astype(np.float32)
ann.index(docs)
```
Then you can search for nearest neighbors with filtering conditions as:
```python
query = DocumentArray.empty(5)
query.embeddings = np.random.random([5, 128]).astype(np.float32)
ann.search(query, filter={"price": {"$lte": 50}}, limit=10)
print(f'the result with filtering:')
for i, q in enumerate(query):
print(f'query [{i}]:')
for m in q.matches:
print(f'\t{m.id} {m.scores["euclidean"].value} (price={m.tags["price"]})')
```
The `filter` parameter is a dictionary of conditions: each key is a field name and each value is a dictionary of operators for that field.
The query language is the same as the [MongoDB Query Language](https://docs.mongodb.com/manual/reference/operator/query/).
We currently support a subset of those selectors:
- `$eq` - Equal to (number, string)
- `$ne` - Not equal to (number, string)
- `$gt` - Greater than (number)
- `$gte` - Greater than or equal to (number)
- `$lt` - Less than (number)
- `$lte` - Less than or equal to (number)
- `$in` - Included in an array
- `$nin` - Not included in an array
The search is then restricted to documents that satisfy the given conditions. The following are some example queries:
1. Nike shoes in white:
```python
{
"brand": {"$eq": "Nike"},
"category": {"$eq": "Shoes"},
"color": {"$eq": "White"}
}
```
We also support boolean operators `$or` and `$and`:
```python
{
"$and":
{
"brand": {"$eq": "Nike"},
"category": {"$eq": "Shoes"},
"color": {"$eq": "White"}
}
}
```
2. Nike shoes, or a price less than `$100`:
```python
{
"$or":
{
"brand": {"$eq": "Nike"},
"price": {"$lte": 100}
}
}
```
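These operators can be combined. For example, a filter for white Nike shoes priced at `$100` or less, following the `$and` form above (the field names are only illustrative and must be declared as `columns` when the index is created):
```python
nike_white_affordable = {
    "$and": {
        "brand": {"$eq": "Nike"},
        "color": {"$eq": "White"},
        "price": {"$lte": 100},
    }
}

result = ann.search(query, filter=nike_white_affordable, limit=10)
```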
## Dump and Load
By default, the HNSW index is kept in memory. You can dump it to `data_path` by calling `.dump()`:
```python
from annlite import AnnLite
ann = AnnLite(128, metric='cosine', data_path="/path/to/data_path")
ann.index(docs)
ann.dump()
```
You can restore the HNSW index from `data_path` if it exists:
```python
new_ann = AnnLite(128, metric='cosine', data_path="/path/to/data_path")
```
If you did not dump the HNSW index, it will be rebuilt from scratch, which can take a while.
## Supported distance metrics
AnnLite supports the following distance metrics:
| Distance | parameter | Equation |
|----------------------------------------------------------------------|----------------:|--------------------------------------------------------:|
| [Euclidean](https://en.wikipedia.org/wiki/Euclidean_distance) | `euclidean` | d = sqrt(sum((Ai-Bi)^2)) |
| [Inner product](https://en.wikipedia.org/wiki/Inner_product_space) | `inner_product` | d = 1.0 - sum(Ai\*Bi) |
| [Cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity) | `cosine` | d = 1.0 - sum(Ai\*Bi) / sqrt(sum(Ai\*Ai) * sum(Bi\*Bi)) |
Note that inner product is not an actual metric. An element can be closer to some other element than to itself.
That allows some speedup if you remove all elements that are not the closest to themselves from the index, e.g.,
`inner_product([1.0, 1.0], [1.0, 1.0]) < inner_product([1.0, 1.0], [2.0, 2.0])`
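As a quick sanity check of the formulas in the table, here is a small NumPy sketch (not AnnLite's internal implementation):
```python
import numpy as np

def euclidean(a, b):
    # d = sqrt(sum((Ai - Bi)^2))
    return np.sqrt(np.sum((a - b) ** 2))

def inner_product(a, b):
    # d = 1.0 - sum(Ai * Bi)
    return 1.0 - np.sum(a * b)

def cosine(a, b):
    # d = 1.0 - sum(Ai * Bi) / sqrt(sum(Ai * Ai) * sum(Bi * Bi))
    return 1.0 - np.sum(a * b) / np.sqrt(np.sum(a * a) * np.sum(b * b))

a = np.array([1.0, 1.0], dtype=np.float32)
b = np.array([2.0, 2.0], dtype=np.float32)
print(euclidean(a, b))      # ~1.414
print(inner_product(a, a))  # -1.0
print(inner_product(a, b))  # -3.0: `b` yields a smaller distance to `a` than `a` itself
print(cosine(a, b))         # 0.0: same direction
```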
## HNSW algorithm parameters
The HNSW algorithm has several parameters that can be tuned to improve the search performance.
### Search parameters
- `ef_search` - The size of the dynamic list for the nearest neighbors during search (default: `50`).
The larger the value, the more accurate the results, but the slower the search.
`ef_search` must be larger than the `limit` parameter in `search(..., limit)`; see the sketch after this list.
- `limit` - The maximum number of results to return (default: `10`).
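For instance, if you want to retrieve more results per query, raise `limit` together with `ef_search`. A sketch, assuming `ef_search` can be passed at construction time just like the construction parameters below:
```python
from annlite import AnnLite

# assumption: `ef_search` is accepted as a keyword argument, analogous to
# `ef_construction` and `max_connection` in the next section
ann = AnnLite(128, metric='cosine', ef_search=100, data_path='/tmp/annlite_data')

# `ef_search` (100) stays larger than `limit` (20), as recommended above
result = ann.search(query, limit=20)
```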
### Construction parameters
- `max_connection` - The number of bi-directional links created for every new element during construction (default: `16`).
The reasonable range is from `2` to `100`. Higher values work better for datasets with higher dimensionality and/or high recall requirements.
This parameter also affects memory consumption during construction, which is roughly `max_connection * 8-10` bytes per stored element.
As an example, for `n_dim=4` random vectors the optimal `max_connection` for search is somewhere around `6`,
while for high-dimensional datasets a higher `max_connection` (e.g. `M=48-64`) is required for optimal performance at high recall.
The range `max_connection=12-48` is fine for most use cases.
When `max_connection` is changed, the other parameters should be updated as well.
Nonetheless, `ef_search` and `ef_construction` can be roughly estimated by assuming that `max_connection * ef_construction` stays constant.
- `ef_construction`: The size of the dynamic list for the nearest neighbors during construction (default: `200`).
Higher values give better accuracy, but increase construction time and memory consumption.
At some point, increasing `ef_construction` does not give any more accuracy.
To set `ef_construction` to a reasonable value, one can measure the recall: if the recall is lower than 0.9, then increase `ef_construction` and re-run the search.
To set these parameters, you can pass them when creating the AnnLite index:
```python
from annlite import AnnLite
ann = AnnLite(128, columns=[('price', float)], data_path="/tmp/annlite_data", ef_construction=200, max_connection=16)
```
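To follow the recall advice for `ef_construction` above, you can compare the HNSW results against exact brute-force neighbors. The following is a rough sketch assuming the cosine metric and string ids as in the filtering example; `recall@10` here is just an illustrative measure:
```python
import numpy as np
from docarray import Document, DocumentArray
from annlite import AnnLite

n, dim, limit = 10_000, 128, 10
vectors = np.random.random([n, dim]).astype(np.float32)

docs = DocumentArray(Document(id=str(i)) for i in range(n))
docs.embeddings = vectors

ann = AnnLite(dim, metric='cosine', ef_construction=200, data_path='/tmp/annlite_recall')
ann.index(docs)

query = DocumentArray.empty(10)
query.embeddings = np.random.random([10, dim]).astype(np.float32)
ann.search(query, limit=limit)

# exact cosine distances as the ground truth
norm_v = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
norm_q = query.embeddings / np.linalg.norm(query.embeddings, axis=1, keepdims=True)
exact = np.argsort(1.0 - norm_q @ norm_v.T, axis=1)[:, :limit]

hits = 0
for q, truth in zip(query, exact):
    retrieved = {m.id for m in q.matches[:limit]}
    hits += len(retrieved & {str(i) for i in truth})
print(f'recall@{limit}: {hits / (len(query) * limit):.3f}')
```
If the measured recall stays below `0.9`, increase `ef_construction` (and possibly `max_connection`) and rebuild the index.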
## Benchmark
One can run `executor/benchmark.py` to get a quick performance overview.
|Stored data| Indexing time | Query size=1 | Query size=8 | Query size=64|
|---|---|---|---|---|
|10000 | 2.970 | 0.002 | 0.013 | 0.100|
|100000 | 76.474 | 0.011 | 0.078 | 0.649|
|500000 | 467.936 | 0.046 | 0.356 | 2.823|
|1000000 | 1025.506 | 0.091 | 0.695 | 5.778|
Results with filtering can be generated from `examples/benchmark_with_filtering.py`. This script should produce a table similar to:
| Stored data |% same filter| Indexing time | Query size=1 | Query size=8 | Query size=64|
|-----|-----|-----|-----|-----|-----|
| 10000 | 5 | 2.869 | 0.004 | 0.030 | 0.270 |
| 10000 | 15 | 2.869 | 0.004 | 0.035 | 0.294 |
| 10000 | 20 | 3.506 | 0.005 | 0.038 | 0.287 |
| 10000 | 30 | 3.506 | 0.005 | 0.044 | 0.356 |
| 10000 | 50 | 3.506 | 0.008 | 0.064 | 0.484 |
| 10000 | 80 | 2.869 | 0.013 | 0.098 | 0.910 |
| 100000 | 5 | 75.960 | 0.018 | 0.134 | 1.092 |
| 100000 | 15 | 75.960 | 0.026 | 0.211 | 1.736 |
| 100000 | 20 | 78.475 | 0.034 | 0.265 | 2.097 |
| 100000 | 30 | 78.475 | 0.044 | 0.357 | 2.887 |
| 100000 | 50 | 78.475 | 0.068 | 0.565 | 4.383 |
| 100000 | 80 | 75.960 | 0.111 | 0.878 | 6.815 |
| 500000 | 5 | 497.744 | 0.069 | 0.561 | 4.439 |
| 500000 | 15 | 497.744 | 0.134 | 1.064 | 8.469 |
| 500000 | 20 | 440.108 | 0.152 | 1.199 | 9.472 |
| 500000 | 30 | 440.108 | 0.212 | 1.650 | 13.267 |
| 500000 | 50 | 440.108 | 0.328 | 2.637 | 21.961 |
| 500000 | 80 | 497.744 | 0.580 | 4.602 | 36.986 |
| 1000000 | 5 | 1052.388 | 0.131 | 1.031 | 8.212 |
| 1000000 | 15 | 1052.388 | 0.263 | 2.191 | 16.643 |
| 1000000 | 20 | 980.598 | 0.351 | 2.659 | 21.193 |
| 1000000 | 30 | 980.598 | 0.461 | 3.713 | 29.794 |
| 1000000 | 50 | 980.598 | 0.732 | 5.975 | 47.356 |
| 1000000 | 80 | 1052.388 | 1.151 | 9.255 | 73.552 |
Note that:
- Query times are reported in seconds.
- `% same filter` indicates the share of the stored data that satisfies the filter.
- For example, if `% same filter = 10` and `Stored data = 1_000_000`, then `100_000` documents satisfy the filter.
## Next steps
If you already have experience with Jina and DocArray, you can start using `AnnLite` right away.
Otherwise, you can check out this advanced tutorial to learn how to use `AnnLite` in practice: [here]().
## 🙋 FAQ
**1. Why should I use `AnnLite`?**
`AnnLite` is easy to use and intuitive to set up in production. It is also very fast and memory efficient, making it a great choice for approximate nearest neighbor search.
**2. How do I use `AnnLite` with Jina?**
We have implemented an executor for `AnnLite` that can be used with Jina.
```python
from jina import Flow
with Flow().add(uses='jinahub://AnnLiteIndexer', uses_with={'n_dim': 128}) as f:
f.post('/index', inputs=docs)
```
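Once indexed, the same Flow can also be queried. A sketch, assuming the executor exposes a `/search` endpoint:
```python
with Flow().add(uses='jinahub://AnnLiteIndexer', uses_with={'n_dim': 128}) as f:
    f.post('/index', inputs=docs)
    f.post('/search', inputs=query)
```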
**3. Does `AnnLite` support search with filters?**
Yes, see the [Search with filters](#search-with-filters) section above.
## Documentation
You can find the documentation on [GitHub]() and [ReadTheDocs]().
## 🤝 Contribute and spread the word
We are also looking for contributors who want to help us improve: code, documentation, issues, feedback! Here is how you can get started:
- Have a look through GitHub issues labeled "Good first issue".
- Read our Contributor Covenant Code of Conduct
- Open an issue or submit your pull request!
## License
`AnnLite` is licensed under the [Apache License 2.0]().
================================================
FILE: annlite/__init__.py
================================================
__version__ = '0.5.11'
from .index import AnnLite
================================================
FILE: annlite/container.py
================================================
import warnings
from pathlib import Path
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
import numpy as np
from docarray import Document, DocumentArray
from loguru import logger
if TYPE_CHECKING:
from .core.codec.pq import PQCodec
from .core.codec.projector import ProjectorCodec
from .core.index.hnsw import HnswIndex
from .enums import Metric
from .storage.base import ExpandMode
from .storage.kv import DocStorage
from .storage.table import CellTable, MetaTable
VALID_FILTERABLE_DATA_TYPES = [int, str, float]
class CellContainer:
def __init__(
self,
n_dim: int,
metric: Metric = Metric.COSINE,
n_cells: int = 1,
projector_codec: Optional['ProjectorCodec'] = None,
pq_codec: Optional['PQCodec'] = None,
initial_size: Optional[int] = None,
expand_step_size: int = 50000,
expand_mode: 'ExpandMode' = ExpandMode.STEP,
filterable_attrs: Optional[Dict] = None,
serialize_config: Optional[Dict] = None,
data_path: 'Path' = Path('./data'),
**kwargs,
):
self.n_dim = n_dim
self.metric = metric
self.n_cells = n_cells
self.n_components = projector_codec.n_components if projector_codec else None
self.data_path = data_path
self.serialize_config = serialize_config
self._pq_codec = pq_codec
self._projector_codec = projector_codec
self._vec_indexes = [
HnswIndex(
dim=self.n_components or n_dim,
metric=metric,
initial_size=initial_size,
expand_step_size=expand_step_size,
expand_mode=expand_mode,
pq_codec=pq_codec,
**kwargs,
)
for _ in range(n_cells)
]
self._doc_stores = [
DocStorage(
data_path / f'cell_{_}',
serialize_config=serialize_config or {},
lock=True,
)
for _ in range(n_cells)
]
columns = []
if filterable_attrs:
for attr_name, attr_type in filterable_attrs.items():
if isinstance(attr_type, str):
attr_type = eval(attr_type)
if attr_type not in VALID_FILTERABLE_DATA_TYPES:
raise ValueError(
f'Invalid filterable attribute type `{attr_type}` for attribute `{attr_name}`. '
)
columns.append((attr_name, attr_type))
self._cell_tables = [
CellTable(f'table_{c}', columns=columns) for c in range(n_cells)
]
self._meta_table = MetaTable('metas', data_path=data_path, in_memory=True)
def ivf_search(
self,
x: 'np.ndarray',
cells: 'np.ndarray',
where_clause: str = '',
where_params: Tuple = (),
limit: int = 10,
):
dists = []
doc_idx = []
cell_ids = []
count = 0
for cell_id in cells:
cell_table = self.cell_table(cell_id)
cell_size = cell_table.count()
if cell_size == 0:
continue
indices = None
if where_clause:
indices = cell_table.query(
where_clause=where_clause, where_params=where_params
)
if len(indices) == 0:
continue
indices = np.array(indices, dtype=np.int64)
_dists, _doc_idx = self.vec_index(cell_id).search(
x, limit=min(limit, cell_size), indices=indices
)
if count >= limit and _dists[0] > dists[-1][-1]:
continue
dists.append(_dists)
doc_idx.append(_doc_idx)
cell_ids.extend([cell_id] * len(_dists))
count += len(_dists)
cell_ids = np.array(cell_ids, dtype=np.int64)
if len(dists) != 0:
dists = np.hstack(dists)
doc_idx = np.hstack(doc_idx)
indices = dists.argsort(axis=0)[:limit]
dists = dists[indices]
cell_ids = cell_ids[indices]
doc_idx = doc_idx[indices]
doc_ids = []
for cell_id, offset in zip(cell_ids, doc_idx):
doc_id = self.cell_table(cell_id).get_docid_by_offset(offset)
doc_ids.append(doc_id)
return dists, doc_ids, cell_ids
def filter_cells(
self,
cells: 'np.ndarray',
where_clause: str = '',
where_params: Tuple = (),
limit: int = -1,
offset: int = 0,
order_by: Optional[str] = None,
ascending: bool = True,
include_metadata: bool = False,
):
result = DocumentArray()
if len(cells) > 1 and offset > 0:
raise ValueError('Offset is not supported for multiple cells')
for cell_id in cells:
cell_table = self.cell_table(cell_id)
cell_size = cell_table.count()
if cell_size == 0:
continue
indices = cell_table.query(
where_clause=where_clause,
where_params=where_params,
order_by=order_by,
limit=limit,
offset=offset,
ascending=ascending,
)
if len(indices) == 0:
continue
for offset in indices:
doc_id = self.cell_table(cell_id).get_docid_by_offset(offset)
doc = Document(id=doc_id)
if include_metadata or (len(cells) > 1 and order_by):
doc = self.doc_store(cell_id).get([doc_id])[0]
result.append(doc)
if not order_by and len(result) >= limit > 0:
break
# reordering the results from multiple cells
if order_by and len(cells) > 1:
result = sorted(
result, key=lambda d: d.tags.get(order_by), reverse=not ascending
)
if limit > 0:
result = result[:limit]
result = DocumentArray(result)
return result
def search_cells(
self,
query: 'np.ndarray',
cells: 'np.ndarray',
where_clause: str = '',
where_params: Tuple = (),
limit: int = 10,
include_metadata: bool = False,
):
if self._projector_codec:
query = self._projector_codec.encode(query)
topk_dists, topk_docs = [], []
for x, cell_idx in zip(query, cells):
# x.shape = (self.n_dim,)
dists, doc_ids, cells = self.ivf_search(
x,
cells=cell_idx,
where_clause=where_clause,
where_params=where_params,
limit=limit,
)
topk_dists.append(dists)
match_docs = DocumentArray()
for dist, doc_id, cell_id in zip(dists, doc_ids, cells):
doc = Document(id=doc_id)
if include_metadata:
doc = self.doc_store(cell_id).get([doc_id])[0]
doc.scores[self.metric.name.lower()].value = dist
match_docs.append(doc)
topk_docs.append(match_docs)
return topk_dists, topk_docs
def _search_cells(
self,
query: 'np.ndarray',
cells: 'np.ndarray',
where_clause: str = '',
where_params: Tuple = (),
limit: int = 10,
):
if self._projector_codec:
query = self._projector_codec.encode(query)
topk_dists, topk_ids = [], []
for x, cell_idx in zip(query, cells):
dists, ids, cells = self.ivf_search(
x,
cells=cell_idx,
where_clause=where_clause,
where_params=where_params,
limit=limit,
)
topk_dists.append(dists)
topk_ids.append(ids)
return topk_dists, [np.array(ids, dtype=int) for ids in topk_ids]
def insert(
self,
data: 'np.ndarray',
cells: 'np.ndarray',
docs: 'DocumentArray',
only_index: bool = False,
):
assert len(docs) == len(data)
if self._projector_codec:
data = self._projector_codec.encode(data)
unique_cells, unique_cell_counts = np.unique(cells, return_counts=True)
if len(unique_cells) == 1:
cell_id = unique_cells[0]
offsets = self.cell_table(cell_id).insert(docs)
offsets = np.array(offsets, dtype=np.int64)
self.vec_index(cell_id).add_with_ids(data, offsets)
self._meta_table.bulk_add_address([d.id for d in docs], cells, offsets)
if not only_index:
self.doc_store(cell_id).insert(docs)
else:
for cell_id, cell_count in zip(unique_cells, unique_cell_counts):
# TODO: Jina should allow boolean filtering in docarray to avoid this
# and simply use cells == cell_index
indices = np.where(cells == cell_id)[0]
cell_docs = docs[indices.tolist()]
cell_offsets = self.cell_table(cell_id).insert(cell_docs)
cell_offsets = np.array(cell_offsets, dtype=np.int64)
cell_data = data[indices, :]
self.vec_index(cell_id).add_with_ids(cell_data, cell_offsets)
self._meta_table.bulk_add_address(
[d.id for d in cell_docs], [cell_id] * cell_count, cell_offsets
)
if not only_index:
self.doc_store(cell_id).insert(cell_docs)
logger.debug(f'{len(docs)} new docs added')
def _add_vecs(self, data: 'np.ndarray', cells: 'np.ndarray', offsets: 'np.ndarray'):
assert data.shape[0] == cells.shape[0]
assert data.shape[1] == self.n_dim
unique_cells, _ = np.unique(cells, return_counts=True)
for cell_id in unique_cells:
indices = cells == cell_id
x = data[indices, :]
ids = offsets[indices]
self.vec_index(cell_id).add_with_ids(x, ids)
def update(
self,
data: 'np.ndarray',
cells: 'np.ndarray',
docs: 'DocumentArray',
insert_if_not_found: bool = True,
raise_errors_on_not_found: bool = False,
):
update_success = 0
new_data = []
new_cells = []
new_docs = []
for (
x,
doc,
cell_id,
) in zip(data, docs, cells):
_cell_id, _offset = self._meta_table.get_address(doc.id)
if cell_id == _cell_id:
self.vec_index(cell_id).add_with_ids(x.reshape(1, -1), [_offset])
self.doc_store(cell_id).update([doc])
self.meta_table.add_address(doc.id, cell_id, _offset)
update_success += 1
elif _cell_id is None:
if raise_errors_on_not_found and not insert_if_not_found:
raise Exception(
                        f'The document (id={doc.id}) cannot be updated as '
                        f'it is not found in the index'
)
elif not (raise_errors_on_not_found or insert_if_not_found):
warnings.warn(
f'The document (id={doc.id}) cannot be updated as '
f'it is not found in the index',
RuntimeWarning,
)
elif insert_if_not_found:
new_data.append(x)
new_cells.append(cell_id)
new_docs.append(doc)
update_success += 1
else:
continue
else:
# DELETE and INSERT
self.vec_index(_cell_id).delete(_offset)
self.cell_table(_cell_id).delete_by_offset(_offset)
self.doc_store(_cell_id).delete([doc.id])
new_data.append(x)
new_cells.append(cell_id)
new_docs.append(doc)
update_success += 1
if len(new_data) > 0:
new_data = np.stack(new_data)
new_cells = np.array(new_cells, dtype=np.int64)
self.insert(new_data, new_cells, new_docs)
logger.debug(
f'total items for updating: {len(docs)}, ' f'success: {update_success}'
)
def delete(
self,
ids: List[str],
raise_errors_on_not_found: bool = False,
):
delete_success = 0
for doc_id in ids:
cell_id, offset = self._meta_table.get_address(doc_id)
if cell_id is not None:
self.vec_index(cell_id).delete([offset])
self.cell_table(cell_id).delete_by_offset(offset)
self.doc_store(cell_id).delete([doc_id])
self.meta_table.delete_address(doc_id)
delete_success += 1
else:
if raise_errors_on_not_found:
raise Exception(
                        f'The document (id={doc_id}) cannot be deleted as '
                        f'it is not found in the index'
)
else:
continue
logger.debug(
            f'total items for deleting: {len(ids)}, ' f'success: {delete_success}'
)
def _rebuild_database(self):
"""rebuild doc_store and meta_table after annlite download databse from hubble"""
self._doc_stores = [
DocStorage(
self.data_path / f'cell_{_}',
serialize_config=self.serialize_config or {},
lock=True,
)
for _ in range(self.n_cells)
]
# self._meta_table = MetaTable('metas', data_path=self.data_path, in_memory=False)
def _get_doc_by_id(self, doc_id: str):
cell_id = 0
if self.n_cells > 1:
cell_id, _ = self._meta_table.get_address(doc_id)
da = self.doc_store(cell_id).get([doc_id])
return da[0] if len(da) > 0 else None
def documents_generator(self, cell_id: int, batch_size: int = 1000):
for docs in self.doc_store(cell_id).batched_iterator(batch_size=batch_size):
yield docs
@property
def cell_tables(self):
return self._cell_tables
@property
def cell_indexes(self):
return self._vec_indexes
def cell_table(self, cell_id: int):
return self._cell_tables[cell_id]
def doc_store(self, cell_id: int):
return self._doc_stores[cell_id]
def vec_index(self, cell_id: int):
return self._vec_indexes[cell_id]
@property
def meta_table(self):
return self._meta_table
@property
def total_docs(self):
return sum([store.size for store in self._doc_stores])
@property
def index_size(self):
return sum([table.size for table in self._cell_tables])
================================================
FILE: annlite/core/__init__.py
================================================
from .codec import PQCodec, ProjectorCodec, VQCodec
================================================
FILE: annlite/core/codec/__init__.py
================================================
from .pq import PQCodec
from .projector import ProjectorCodec
from .vq import VQCodec
================================================
FILE: annlite/core/codec/base.py
================================================
import pickle
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING
if TYPE_CHECKING:
from pathlib import Path
class BaseCodec(ABC):
def __init__(self, require_train: bool = True):
self.require_train = require_train
self._is_trained = False if require_train else True
@abstractmethod
def fit(self, *args, **kwargs):
pass
@abstractmethod
def encode(self):
pass
@abstractmethod
def decode(self):
pass
def dump(self, target_path: 'Path'):
pickle.dump(self, target_path.open('wb'), protocol=4)
@staticmethod
def load(from_path: 'Path'):
return pickle.load(from_path.open('rb'))
@property
def is_trained(self):
return self._is_trained
def _check_trained(self):
assert self.is_trained is True, f'{self.__class__.__name__} requires training'
================================================
FILE: annlite/core/codec/pq.py
================================================
from argparse import ArgumentError
import numpy as np
from scipy.cluster.vq import vq
from annlite import pq_bind
from ...enums import Metric
from ...math import l2_normalize
from ...profile import time_profile
from .base import BaseCodec
# from pqlite.pq_bind import precompute_adc_table, dist_pqcodes_to_codebooks
class PQCodec(BaseCodec):
"""Implementation of Product Quantization (PQ) [Jegou11]_.
For the indexing phase of database vectors,
a `D`-dim input vector is divided into `M` `D`/`M`-dim sub-vectors.
Each sub-vector is quantized into a small integer via `Ks` codewords.
For the querying phase, given a new `D`-dim query vector, the distance between the query
and the database PQ-codes are efficiently approximated via Asymmetric Distance.
All vectors must be np.ndarray with np.float32
.. [Jegou11] H. Jegou et al., "Product Quantization for Nearest Neighbor Search", IEEE TPAMI 2011
    :param dim: the dimensionality of input vectors
    :param n_subvectors: the number of sub-spaces
    :param n_clusters: the number of codewords for each subspace
        (typically 256, so that each sub-vector is quantized
        into 8 bits = 1 byte = uint8)
:param n_init: Number of times K-Means is trained with different centroid seeds. Best result of
the `n_init` consecutive runs is selected.
"""
def __init__(
self,
dim: int,
n_subvectors: int = 8,
n_clusters: int = 256,
metric: Metric = Metric.EUCLIDEAN,
n_init: int = 4,
):
super(PQCodec, self).__init__(require_train=True)
self.dim = dim
self.n_subvectors = n_subvectors
self.n_clusters = n_clusters
assert (
dim % n_subvectors == 0
        ), 'input dimension must be divisible by the number of sub-spaces'
self.d_subvector = dim // n_subvectors
self.code_dtype = (
np.uint8
if n_clusters <= 2**8
else (np.uint16 if n_clusters <= 2**16 else np.uint32)
)
# assert (
# metric == Metric.EUCLIDEAN
# ), f'The distance metric `{metric.name}` is not supported yet!'
self.metric = metric
self.normalize_input = False
if self.metric == Metric.COSINE:
self.normalize_input = True
self._codebooks = np.zeros(
(self.n_subvectors, self.n_clusters, self.d_subvector), dtype=np.float32
)
self.kmeans = []
self.n_init = n_init
def __hash__(self):
return hash(
(
self.__class__.__name__,
self.dim,
self.n_subvectors,
self.n_clusters,
self.metric,
self.code_dtype,
)
)
def fit(self, x: 'np.ndarray', iter: int = 100):
"""Train the K-Means for each cartesian product
:param x: Training vectors with shape=(N, D)
:param iter: Number of iterations in Kmeans
"""
from sklearn.cluster import KMeans
assert x.dtype == np.float32
assert x.ndim == 2
if self.normalize_input:
x = l2_normalize(x)
# [m][ks][ds]: m-th subspace, ks-the codeword, ds-th dim
self._codebooks = np.zeros(
(self.n_subvectors, self.n_clusters, self.d_subvector), dtype=np.float32
)
for m in range(self.n_subvectors):
kmeans = KMeans(
n_clusters=self.n_clusters, max_iter=iter, n_init=self.n_init
)
self.kmeans.append(kmeans)
self.kmeans[m].fit(x[:, m * self.d_subvector : (m + 1) * self.d_subvector])
self._codebooks[m] = self.kmeans[m].cluster_centers_
self._is_trained = True
def partial_fit(self, x: 'np.ndarray'):
"""Given a batch of training vectors, update the internal MiniBatchKMeans.
This method is specially designed to be used when data does not fit in memory.
:param x: Training vectors with shape=(N, D)
"""
assert x.ndim == 2
if self.normalize_input:
x = l2_normalize(x)
if len(self.kmeans) > 0:
for m in range(self.n_subvectors):
self.kmeans[m].partial_fit(
x[:, m * self.d_subvector : (m + 1) * self.d_subvector]
)
else:
from sklearn.cluster import MiniBatchKMeans
for m in range(self.n_subvectors):
self.kmeans.append(MiniBatchKMeans(n_clusters=self.n_clusters))
for m in range(self.n_subvectors):
self.kmeans[m].partial_fit(
x[:, m * self.d_subvector : (m + 1) * self.d_subvector]
)
def build_codebook(self):
"""Constructs sub-codebooks from the current parameters of the models in `self.kmeans`
        This step is not necessary if a full KMeans was trained by calling `.fit`.
"""
self._codebooks = np.zeros(
(self.n_subvectors, self.n_clusters, self.d_subvector), dtype=np.float32
)
for m in range(self.n_subvectors):
self._codebooks[m] = self.kmeans[m].cluster_centers_
self._is_trained = True
def encode(self, x: 'np.ndarray'):
"""Encode input vectors into PQ-codes.
:param x: Input vectors with shape=(N, D) and dtype=np.float32.
:return: np.ndarray: PQ codes with shape=(N, M) and dtype=self.code_dtype
"""
assert x.dtype == np.float32
assert x.ndim == 2
N, D = x.shape
assert (
D == self.d_subvector * self.n_subvectors
), 'input dimension must be Ds * M'
# codes[n][m] : code of n-th vec, m-th subspace
codes = np.empty((N, self.n_subvectors), dtype=self.code_dtype)
for m in range(self.n_subvectors):
sub_vecs = x[:, m * self.d_subvector : (m + 1) * self.d_subvector]
codes[:, m], _ = vq(sub_vecs, self.codebooks[m])
return codes
def decode(self, codes: 'np.ndarray'):
"""Given PQ-codes, reconstruct original D-dimensional vectors
approximately by fetching the codewords.
        :param codes: PQ-codes with shape=(N, M) and dtype=self.code_dtype.
Each row is a PQ-code
:return: Reconstructed vectors with shape=(N, D) and dtype=np.float32
"""
assert codes.ndim == 2
N, M = codes.shape
assert M == self.n_subvectors
assert codes.dtype == self.code_dtype
vecs = np.empty((N, self.d_subvector * self.n_subvectors), dtype=np.float32)
for m in range(self.n_subvectors):
vecs[:, m * self.d_subvector : (m + 1) * self.d_subvector] = self.codebooks[
m
][codes[:, m], :]
return vecs
def precompute_adc(self, query: object) -> object:
"""Compute a distance table for a query vector.
The distances are computed by comparing each sub-vector of the query
to the codewords for each sub-subspace.
`dtable[m][ks]` contains the squared Euclidean distance between
the `m`-th sub-vector of the query and the `ks`-th codeword
for the `m`-th sub-space (`self.codewords[m][ks]`).
:param query: Input vector with shape=(D, ) and dtype=np.float32
:return: Distance table. which contains dtable with shape=(M, Ks)
and dtype=np.float32
"""
assert query.dtype == np.float32
assert query.ndim == 1, 'input must be a single vector'
# dtable[m] : distance between m-th subvec and m-th codewords (m-th subspace)
# dtable[m][ks] : distance between m-th subvec and ks-th codeword of m-th codewords
# Warning: the following line produces `ValueError: buffer source array is read-only`
# if no `const` is used in the cython implementation using a memoryview
dtable = pq_bind.precompute_adc_table(
query, self.d_subvector, self.n_clusters, self.codebooks
)
return DistanceTable(dtable)
@property
def codebooks(self):
return self._codebooks
# trained pq interface ----------------
def get_codebook(self) -> 'np.ndarray':
"""Return the codebook parameters.
Expect a 3-dimensional matrix is returned,
with shape (`n_subvectors`, `n_clusters`, `d_subvector`) and dtype float32
"""
return np.ascontiguousarray(self.codebooks, dtype='float32')
def get_subspace_splitting(self):
"""Return subspace splitting setting
:return: tuple of (`n_subvectors`, `n_clusters`, `d_subvector`)
"""
return (self.n_subvectors, self.n_clusters, self.d_subvector)
# def get_dist_mat(self, x: np.ndarray):
# """Return the distance tables in form of matrix for multiple queries
# :param query: shape('N', 'D'),
# :return: ndarray with shape('N', `n_subvectors`, `n_clusters`)
# .. note::
# _description_
# """
# assert x.dtype == np.float32
# assert x.ndim == 2
# N, D = x.shape
# assert (
# D == self.d_subvector * self.n_subvectors
# ), 'input dimension must be Ds * M'
# if self.normalize_input:
# x = l2_normalize(x)
# x = x.reshape(
# N,
# self.n_subvectors,
# 1,
# self.d_subvector,
# )
# if self.metric == Metric.EUCLIDEAN:
# # (1, n_subvectors, n_clusters, d_subvector)
# codebook = self.codebooks[np.newaxis, ...]
# # broadcast to (N, n_subvectors, n_clusters, d_subvector)
# dist_vector = (x - codebook) ** 2
# # reduce to (N, n_subvectors, n_clusters)
# dist_mat = np.sum(dist_vector, axis=3)
# elif self.metric in [Metric.INNER_PRODUCT, Metric.COSINE]:
# # (1, n_subvectors, n_clusters, d_subvector)
# codebook = self.codebooks[np.newaxis, ...]
# # broadcast to (N, n_subvectors, n_clusters, d_subvector)
# dist_vector = x * codebook
# # reduce to (N, n_subvectors, n_clusters)
# dist_mat = 1 / self.n_clusters - np.sum(dist_vector, axis=3)
# else:
# raise ArgumentError(f'Unable support metrics {self.metric}')
# return np.ascontiguousarray(dist_mat, dtype='float32')
def get_dist_mat(self, x: np.ndarray):
"""Return the distance tables in form of matrix for multiple queries
:param query: shape('N', 'D'),
:return: ndarray with shape('N', `n_subvectors`, `n_clusters`)
.. note::
_description_
"""
assert x.dtype == np.float32
assert x.ndim == 2
N, D = x.shape
assert (
D == self.d_subvector * self.n_subvectors
), 'input dimension must be Ds * M'
if self.normalize_input:
x = l2_normalize(x)
if self.metric == Metric.EUCLIDEAN:
dist_mat = pq_bind.batch_precompute_adc_table(
x, self.d_subvector, self.n_clusters, self.codebooks
)
elif self.metric in [Metric.INNER_PRODUCT, Metric.COSINE]:
dist_mat = 1 / self.n_clusters - np.array(
pq_bind.batch_precompute_adc_table_ip(
x, self.d_subvector, self.n_clusters, self.codebooks
),
dtype='float32',
)
else:
raise ArgumentError(f'Unable support metrics {self.metric}')
return np.ascontiguousarray(dist_mat, dtype='float32')
# -------------------------------------
class DistanceTable(object):
"""Distance table from query to codeworkds.
Given a query vector, a PQ/OPQ instance compute this DistanceTable class
using :func:`PQ.dtable` or :func:`OPQ.dtable`.
The Asymmetric Distance from query to each database codes can be computed
by :func:`DistanceTable.adist`.
Args:
dtable (np.ndarray): Distance table with shape=(M, Ks) and dtype=np.float32
computed by :func:`PQ.dtable` or :func:`OPQ.dtable`
Attributes:
dtable (np.ndarray): Distance table with shape=(M, Ks) and dtype=np.float32.
Note that dtable[m][ks] contains the squared Euclidean distance between
(1) m-th sub-vector of query and (2) ks-th codeword for m-th subspace.
"""
def __init__(self, dtable: 'np.ndarray'):
assert dtable.ndim == 2
self.dtable = dtable
def adist(self, codes):
"""Given PQ-codes, compute Asymmetric Distances between the query (self.dtable)
and the PQ-codes.
Args:
codes (np.ndarray): PQ codes with shape=(N, M) and
dtype=pq.code_dtype where pq is a pq instance that creates the codes
Returns:
np.ndarray: Asymmetric Distances with shape=(N, ) and dtype=np.float32
"""
assert codes.ndim == 2
dists = pq_bind.dist_pqcodes_to_codebooks(self.dtable, codes)
# The above line is equivalent to the followings:
# dists = np.zeros((N, )).astype(np.float32)
# for n in range(N):
# for m in range(M):
# dists[n] += self.dtable[m][codes[n][m]]
return dists
================================================
FILE: annlite/core/codec/projector.py
================================================
from typing import Optional
import numpy as np
from .base import BaseCodec
class ProjectorCodec(BaseCodec):
"""Implementation of Projector.
:param n_components: number of components to keep.
:param whiten: when True (False by default) the components_ vectors are multiplied
by the square root of n_samples and then divided by the singular
values to ensure uncorrelated outputs with unit component-wise variances.
:param svd_solver:
If auto: The solver is selected by a default policy based on X.shape and
n_components: if the input data is larger than 500x500 and the number of
components to extract is lower than 80% of the smallest dimension of the
data, then the more efficient ‘randomized’ method is enabled. Otherwise
the exact full SVD is computed and optionally truncated afterwards.
If full: run exact full SVD calling the standard LAPACK solver via scipy.
linalg.svd and select the components by postprocessing.
If arpack: run SVD truncated to n_components calling ARPACK solver via
scipy.sparse.linalg.svds. It requires strictly 0 < n_components < min(X.shape).
"""
def __init__(
self,
dim: int,
n_components: int = 128,
whiten: Optional[bool] = False,
svd_solver: Optional[str] = 'auto',
):
super(ProjectorCodec, self).__init__(require_train=True)
self.dim = dim
self.n_components = n_components
assert self.dim >= self.n_components, (
f'the dimension after projector should be less than original dimension, got '
f'original dimension: {self.dim} and projector dimension: {self.n_components}'
)
self.whiten = whiten
self.svd_solver = svd_solver
self.pca = None
def __hash__(self):
return hash(
(
self.__class__.__name__,
self.dim,
self.n_components,
self.whiten,
self.svd_solver,
)
)
def fit(self, x: 'np.ndarray'):
"""Train projector model
:param x: Training vectors with shape=(N, D)
"""
assert x.ndim == 2
        assert (
            x.shape[1] == self.dim
        ), 'dimension of input data must be equal to "dim"'
assert (
x.shape[0] > self.n_components
), 'number of input data must be larger than or equal to n_components'
if self.pca is None:
from sklearn.decomposition import PCA
self.pca = PCA(
n_components=self.n_components,
whiten=self.whiten,
svd_solver=self.svd_solver,
)
self.pca.fit(x)
self._is_trained = True
def partial_fit(self, x: 'np.ndarray'):
"""Given a batch of training vectors, update the internal projector.
This method is specially designed to be used when data does not fit in memory.
:param x: Training vectors with shape=(N, D)
"""
assert x.ndim == 2
assert x.shape[1] == self.dim, 'dimension of input data must be equal to "dim"'
assert (
x.shape[0] > self.n_components
), 'number of input data must be larger than or equal to n_components'
if self.pca is None:
from sklearn.decomposition import IncrementalPCA
self.pca = IncrementalPCA(
n_components=self.n_components,
whiten=self.whiten,
)
self.pca.partial_fit(x)
self._is_trained = True
def encode(self, x: 'np.ndarray'):
"""Encode input vectors using projector.
:param x: Input vectors with shape=(N, D)
:return: np.ndarray: transformed vectors using projector.
"""
assert x.ndim == 2
assert x.shape[1] == self.dim, 'dimension of input data must be equal to "dim"'
return self.pca.transform(x)
def decode(self, x: 'np.ndarray'):
"""Given transformed vectors, reconstruct original D-dimensional vectors
approximately.
:param x: vectors with shape=(N, self.n_components).
:return: Reconstructed vectors with shape=(N, D)
"""
assert x.ndim == 2
assert x.shape[1] == self.n_components
return self.pca.inverse_transform(x)
@property
def components(self):
"""Principal axes in feature space, representing the directions of maximum
variance in the data.
"""
self._check_trained()
return self.pca.components_
@property
def explained_variance_ratio(self):
"""Percentage of variance explained by each of the selected components."""
self._check_trained()
return self.pca.explained_variance_ratio_
@property
def mean(self):
"""Per-feature empirical mean."""
self._check_trained()
return self.pca.mean_
@property
def var(self):
"""Per-feature empirical variance"""
self._check_trained()
return self.pca.var_
================================================
FILE: annlite/core/codec/vq.py
================================================
import numpy as np
from scipy.cluster.vq import vq
from ...enums import Metric
from .base import BaseCodec
class VQCodec(BaseCodec):
def __init__(
self,
n_clusters: int,
metric: Metric = Metric.EUCLIDEAN,
iter: int = 100,
n_init: int = 4,
*args,
**kwargs
):
super(VQCodec, self).__init__(require_train=True)
self.n_clusters = n_clusters
# assert (
# metric == Metric.EUCLIDEAN
# ), f'The distance metric `{metric.name}` is not supported yet!'
self.metric = metric
self._codebook = None
self.iter = iter
self.kmeans = None
self.n_init = n_init
def __hash__(self):
return hash((self.__class__.__name__, self.n_clusters, self.metric))
def fit(self, x: 'np.ndarray'):
"""Given training vectors, run k-means for each sub-space and create
codewords for each sub-space.
:param x: Training vectors with shape=(N, D) and dtype=np.float32.
:param iter: The number of iteration for k-means
"""
from sklearn.cluster import KMeans
assert x.dtype == np.float32
assert x.ndim == 2
self.kmeans = KMeans(self.n_clusters, max_iter=self.iter, n_init=self.n_init)
self.kmeans.fit(x)
self._codebook = self.kmeans.cluster_centers_
self._is_trained = True
def partial_fit(self, x: 'np.ndarray'):
"""Given a batch of training vectors, update the internal MiniBatchKMeans.
This method is specially designed to be used when data does not fit in memory.
:param x: Training vectors with shape=(N, D)
"""
assert x.ndim == 2
if self.kmeans:
self.kmeans.partial_fit(x)
else:
from sklearn.cluster import MiniBatchKMeans
self.kmeans = MiniBatchKMeans(
n_clusters=self.n_clusters, max_iter=self.iter
)
self.kmeans.partial_fit(x)
def build_codebook(self):
"""Constructs a codebook from the current MiniBatchKmeans
        This step is not necessary if a full KMeans was trained by calling `.fit`.
"""
self._codebook = self.kmeans.cluster_centers_
self._is_trained = True
def encode(self, x: 'np.ndarray'):
"""Encodes each row of the input array `x` it's closest cluster id."""
self._check_trained()
assert x.dtype == np.float32
assert x.ndim == 2
codes, _ = vq(x, self.codebook)
return codes
def decode(self, x: 'np.ndarray'):
return None
@property
def codebook(self):
self._check_trained()
return self._codebook
================================================
FILE: annlite/core/index/__init__.py
================================================
================================================
FILE: annlite/core/index/base.py
================================================
import abc
from typing import List, Optional, Union
import numpy as np
from ...enums import ExpandMode, Metric
from ...helper import str2dtype
class BaseIndex(abc.ABC):
def __init__(
self,
dim: int,
dtype: Union[np.dtype, str] = np.float32,
metric: Metric = Metric.COSINE,
initial_size: Optional[int] = None,
expand_step_size: int = 10240,
expand_mode: ExpandMode = ExpandMode.STEP,
*args,
**kwargs
):
assert expand_step_size > 0
self.initial_size = initial_size or expand_step_size
self.expand_step_size = expand_step_size
self.expand_mode = expand_mode
self.dim = dim
self.dtype = str2dtype(dtype) if isinstance(dtype, str) else dtype
self.metric = metric
self._size = 0
self._capacity = self.initial_size
@property
def capacity(self) -> int:
return self._capacity
@property
def size(self):
return self._size
@abc.abstractmethod
def add_with_ids(self, x: np.ndarray, ids: List[int], **kwargs):
...
@abc.abstractmethod
def delete(self, ids: List[int]):
...
@abc.abstractmethod
def update_with_ids(self, x: np.ndarray, ids: List[int], **kwargs):
...
def reset(self, capacity: Optional[int] = None):
self._size = 0
self._capacity = capacity or self.initial_size
================================================
FILE: annlite/core/index/flat_index.py
================================================
from typing import List, Optional
import numpy as np
from loguru import logger
from ...math import cdist, top_k
from .base import BaseIndex
class FlatIndex(BaseIndex):
def __init__(self, *args, **kwargs):
super(FlatIndex, self).__init__(*args, **kwargs)
self._data = np.zeros((self.initial_size, self.dim), dtype=self.dtype)
def search(
self, x: np.ndarray, limit: int = 10, indices: Optional[np.ndarray] = None
):
_dim = x.shape[-1]
assert (
_dim == self.dim
), f'the query embedding dimension does not match with index dimension: {_dim} vs {self.dim}'
x = x.reshape((-1, self.dim))
data = self._data[: self.size]
data_ids = np.arange(self.size)
if indices is not None:
data = self._data[indices]
data_ids = data_ids[indices]
dists = cdist(x, data, metric=self.metric.name.lower())
dists, idx = top_k(dists, limit, descending=False)
# TODO: change the shape of return
dists = dists[0]
data_ids = data_ids[idx[0]]
return dists, data_ids
def add_with_ids(self, x: np.ndarray, ids: List[int]):
for idx in ids:
if idx >= self._capacity:
self._expand_capacity()
start = self._size
end = start + len(x)
self._data[ids, :] = x
self._size = end
def _expand_capacity(self):
new_block = np.zeros((self.expand_step_size, self.dim), dtype=self.dtype)
self._data = np.concatenate((self._data, new_block), axis=0)
self._capacity += self.expand_step_size
logger.debug(
f'total storage capacity is expanded by {self.expand_step_size}',
)
def reset(self, capacity: Optional[int] = None):
super().reset(capacity=capacity)
self._data = np.zeros((self.capacity, self.dim), dtype=self.dtype)
def delete(self, ids: List[int]):
raise RuntimeError(
f'the deletion operation is not allowed for {self.__class__.__name__}!'
)
def update_with_ids(self, x: np.ndarray, ids: List[int], **kwargs):
self._data[ids, :] = x
================================================
FILE: annlite/core/index/hnsw/__init__.py
================================================
from .index import HnswIndex
================================================
FILE: annlite/core/index/hnsw/index.py
================================================
import math
import os.path
from functools import wraps
from pathlib import Path
from typing import TYPE_CHECKING, List, Optional, Union
import numpy as np
from loguru import logger
from annlite.hnsw_bind import Index
from ....enums import Metric
from ....math import l2_normalize
from ..base import BaseIndex
if TYPE_CHECKING:
from ...codec.base import BaseCodec
def pre_process(f):
@wraps(f)
def pre_processed(self: 'HnswIndex', x: np.ndarray, *args, **kwargs):
if x.ndim == 1:
x = x.reshape((1, -1))
if x.dtype != self.dtype:
x = x.astype(self.dtype)
if self.normalization_enable:
x = l2_normalize(x)
if self.pq_enable:
if not self.pq_codec.is_trained:
raise RuntimeError(
'Please train the PQ before using HNSW quantization backend'
)
elif not self._set_backend_pq:
self._index.loadPQ(self.pq_codec)
self._set_backend_pq = True
kwargs['pre_process_dtables'] = self.pq_codec.get_dist_mat(x)
x = self.pq_codec.encode(x)
assert kwargs['pre_process_dtables'].dtype == 'float32'
assert kwargs['pre_process_dtables'].flags['C_CONTIGUOUS']
return f(self, x, *args, **kwargs)
else:
return f(self, x, *args, **kwargs)
return pre_processed
class HnswIndex(BaseIndex):
def __init__(
self,
dim: int,
dtype: np.dtype = np.float32,
metric: Metric = Metric.COSINE,
ef_construction: int = 200,
ef_search: int = 50,
max_connection: int = 16,
pq_codec: Optional['BaseCodec'] = None,
index_file: Optional[Union[str, Path]] = None,
**kwargs,
):
"""
:param dim: The dimensionality of vectors to index
:param index_file: A file-like object or a string containing a file name.
:param metric: Distance metric type, can be 'euclidean', 'inner_product', or 'cosine'
:param ef_construction: the size of the dynamic list for the nearest neighbors (used during the building).
:param ef_search: the size of the dynamic list for the nearest neighbors (used during the search).
:param max_connection: The number of bi-directional links created for every new element during construction.
Reasonable range for M is 2-100.
"""
super().__init__(dim, dtype=dtype, metric=metric, **kwargs)
self.ef_construction = ef_construction
self.ef_search = ef_search
self.max_connection = max_connection
self.pq_codec = pq_codec
self._set_backend_pq = False
self.index_file = index_file
self._init_hnsw_index()
def _init_hnsw_index(self):
self._index = Index(space=self.space_name, dim=self.dim)
if self.index_file:
if os.path.exists(self.index_file):
logger.info(
f'indexer will be loaded from {self.index_file}',
)
self.load(self.index_file)
else:
raise FileNotFoundError(
f'index path: {self.index_file} does not exist',
)
else:
if self.pq_codec is not None and self.pq_codec.is_trained:
self._index.init_index(
max_elements=self.capacity,
ef_construction=self.ef_construction,
M=self.max_connection,
pq_codec=self.pq_codec,
)
self._set_backend_pq = True
else:
self._index.init_index(
max_elements=self.capacity,
ef_construction=self.ef_construction,
M=self.max_connection,
pq_codec=None,
)
self._set_backend_pq = False
self._index.set_ef(self.ef_search)
def load(self, index_file: Union[str, Path]):
self._index.load_index(str(index_file))
if self.pq_codec:
self._index.loadPQ(self.pq_codec)
def dump(self, index_file: Union[str, Path]):
self._index.save_index(str(index_file))
@pre_process
def add_with_ids(
self,
x: 'np.ndarray',
ids: List[int],
# kwargs maybe used by pre_process
pre_process_dtables=None,
):
max_id = max(ids) + 1
if max_id > self.capacity:
expand_steps = math.ceil(max_id / self.expand_step_size)
self._expand_capacity(expand_steps * self.expand_step_size)
self._index.add_items(x, ids=ids, dtables=pre_process_dtables)
@pre_process
def search(
self,
query: 'np.ndarray',
limit: int = 10,
indices: Optional['np.ndarray'] = None,
# kwargs maybe used by pre_process
pre_process_dtables=None,
):
ef_search = max(self.ef_search, limit)
self._index.set_ef(ef_search)
if indices is not None:
# TODO: add a smart strategy to speed up this case (bruteforce search would be better)
if len(indices) < limit:
limit = len(indices)
ids, dists = self._index.knn_query_with_filter(
query, filters=indices, k=limit, dtables=pre_process_dtables
)
else:
ids, dists = self._index.knn_query(
query, k=limit, dtables=pre_process_dtables
)
# convert squared l2 into euclidean distance
if self.metric == Metric.EUCLIDEAN:
dists = np.sqrt(dists)
return dists[0], ids[0]
def delete(self, ids: List[int]):
for i in ids:
self._index.mark_deleted(i)
def update_with_ids(self, x: 'np.ndarray', ids: List[int], **kwargs):
raise RuntimeError(
f'the update operation is not allowed for {self.__class__.__name__}!'
)
def _expand_capacity(self, new_capacity: int):
self._capacity = new_capacity
self._index.resize_index(new_capacity)
logger.debug(
            f'HNSW index capacity is expanded to {new_capacity}',
)
def reset(self, capacity: Optional[int] = None):
super().reset(capacity=capacity)
self._init_hnsw_index()
@property
def size(self):
return self._index.element_count
@property
def space_name(self):
if self.metric == Metric.EUCLIDEAN:
return 'l2'
elif self.metric == Metric.INNER_PRODUCT:
return 'ip'
return 'cosine'
@property
def pq_enable(self):
return self.pq_codec is not None
@property
def normalization_enable(self):
return self.metric == Metric.COSINE
================================================
FILE: annlite/core/index/pq_index.py
================================================
from typing import List, Optional
import numpy as np
from ...math import top_k
from ..codec.pq import PQCodec
from .flat_index import FlatIndex
# TODO: deprecated this index
class PQIndex(FlatIndex): # pragma: no cover
def __init__(
self,
dim: int,
pq_codec: PQCodec,
**kwargs,
):
assert pq_codec is not None
self._dense_dim = dim
super(PQIndex, self).__init__(
pq_codec.n_subvectors, dtype=pq_codec.code_dtype, **kwargs
)
self._pq_codec = pq_codec
def add_with_ids(self, x: np.ndarray, ids: List[int]):
x = self._pq_codec.encode(x)
super(PQIndex, self).add_with_ids(x, ids)
def search(
self, x: np.ndarray, limit: int = 10, indices: Optional[np.ndarray] = None
):
_dim = x.shape[-1]
assert (
_dim == self._pq_codec.dim
), f'the query embedding dimension does not match with index dimension: {_dim} vs {self.dim}'
precomputed = self._pq_codec.precompute_adc(x)
codes = self._data
data_idx = np.arange(self._capacity)
if indices is not None:
codes = self._data[indices]
data_idx = data_idx[indices]
dists = precomputed.adist(codes) # (10000, )
dists = np.expand_dims(dists, axis=0)
dists, ids = top_k(dists, limit, descending=False)
# TODO: change the shape of return
ids = ids[0]
if indices is not None:
ids = data_idx[ids]
return dists[0], ids
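# ----------------------------------------------------------------------
# Hedged sketch (not part of the original file) of the asymmetric distance
# computation (ADC) used by PQIndex.search above: database vectors are stored
# as PQ codes, the query stays in float space, a per-subvector distance table
# is precomputed for the query, and distances are then looked up per code.
# Constructor arguments follow the PQCodec call in annlite/index.py; the
# query/code shapes follow the usage above and are assumptions here.
if __name__ == '__main__':
    from annlite.enums import Metric

    _train = np.random.random((2048, 64)).astype('float32')
    _pq = PQCodec(dim=64, n_subvectors=8, n_clusters=256, metric=Metric.EUCLIDEAN)
    _pq.fit(_train)
    _codes = _pq.encode(_train)            # (2048, 8) codes, one byte per subvector
    _query = np.random.random(64).astype('float32')
    _dtable = _pq.precompute_adc(_query)   # distance table: query vs. codebooks
    _dists = _dtable.adist(_codes)         # approximate distances, shape (2048,)
    print(np.argsort(_dists)[:10])         # ids of the 10 closest stored codes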
================================================
FILE: annlite/enums.py
================================================
from enum import IntEnum
class BetterEnum(IntEnum):
"""The base class of Enum."""
def __str__(self):
return self.name
@classmethod
def from_string(cls, s: str):
"""
Parse the enum from a string.
:param s: string representation of the enum value
:return: enum value
"""
try:
return cls[s.upper()]
except KeyError:
raise ValueError(
f'{s.upper()} is not a valid enum for {cls!r}, must be one of {list(cls)}'
)
class Metric(BetterEnum):
EUCLIDEAN = 1
INNER_PRODUCT = 2
COSINE = 3
class ExpandMode(BetterEnum):
STEP = 1
DOUBLE = 2
ADAPTIVE = 3
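# ----------------------------------------------------------------------
# Illustrative example (not part of the original file): parsing is
# case-insensitive and unknown names raise a ValueError.
if __name__ == '__main__':
    assert Metric.from_string('cosine') is Metric.COSINE
    assert str(Metric.EUCLIDEAN) == 'EUCLIDEAN'
    try:
        ExpandMode.from_string('triple')
    except ValueError as e:
        print(e)  # TRIPLE is not a valid enum for <enum 'ExpandMode'>, ...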
================================================
FILE: annlite/executor.py
================================================
import threading
import time
import traceback
import warnings
from threading import Thread
from typing import Dict, List, Optional, Tuple, Union
from docarray import Document, DocumentArray
from jina import Executor, requests
from jina.logging.logger import JinaLogger
INDEX_BATCH_SIZE = 1024
class AnnLiteIndexer(Executor):
"""A simple indexer that wraps the AnnLite indexer and adds a simple interface for indexing and searching.
:param n_dim: Dimensionality of vectors to index
:param metric: Distance metric type. Can be 'euclidean', 'inner_product', or 'cosine'
:param limit: Number of results to get for each query document in search
:param n_components: Number of components to use for dimensionality reduction
:param match_args: the arguments to `DocumentArray`'s match function
:param data_path: the data path of the AnnLiteIndexer; not supported when shards > 1 (use `workspace` instead).
:param ef_construction: The construction time/accuracy trade-off
:param ef_search: The query time accuracy/speed trade-off
:param max_connection: The maximum number of outgoing connections in the
graph (the "M" parameter)
:param include_metadata: If True, return the document metadata in response
:param index_access_paths: Default traversal paths on docs
(used for indexing, delete and update), e.g. '@r', '@c', '@r,c'
:param search_access_paths: Default traversal paths on docs
(used for search), e.g. '@r', '@c', '@r,c'
:param columns: A list or dict of column names to index.
:param dim: Deprecated, use n_dim instead
"""
def __init__(
self,
n_dim: int = 0,
metric: str = 'cosine',
limit: int = 10,
n_components: Optional[int] = None,
match_args: Optional[Dict] = None,
data_path: Optional[str] = None,
ef_construction: Optional[int] = None,
ef_search: Optional[int] = None,
max_connection: Optional[int] = None,
include_metadata: bool = True,
index_access_paths: str = '@r',
search_access_paths: str = '@r',
columns: Optional[Union[List[Tuple[str, str]], Dict[str, str]]] = None,
list_like: Optional[bool] = False,
dim: int = None,
*args,
**kwargs,
):
super().__init__(*args, **kwargs)
self.logger = JinaLogger(self.__class__.__name__)
n_dim = n_dim or dim
if not n_dim:
raise ValueError('Please specify the dimension of the vectors to index!')
self.n_components = n_components
self.metric = metric
self.match_args = match_args or {}
self.include_metadata = include_metadata
if limit:
self.match_args.update({'limit': limit})
self.index_access_paths = index_access_paths
if 'index_traversal_paths' in kwargs:
warnings.warn(
f'`index_traversal_paths` is deprecated. Use `index_access_paths` instead.'
)
self.index_access_paths = kwargs['index_traversal_paths']
self.search_access_paths = search_access_paths
if 'search_traversal_paths' in kwargs:
warnings.warn(
f'`search_traversal_paths` is deprecated. Use `search_access_paths` instead.'
)
self.search_access_paths = kwargs['search_traversal_paths']
self._data_buffer = DocumentArray()
self._index_batch_size = INDEX_BATCH_SIZE
self._max_length_queue = 2 * self._index_batch_size
self._index_lock = threading.Lock()
self.logger = JinaLogger(getattr(self.metas, 'name', self.__class__.__name__))
if getattr(self.runtime_args, 'shards', 1) > 1 and data_path:
raise ValueError(
'`data_path` is not supported when shards > 1, please use `workspace` instead'
)
config = {
'n_dim': n_dim,
'n_components': n_components,
'metric': metric,
'ef_construction': ef_construction,
'ef_search': ef_search,
'max_connection': max_connection,
'data_path': data_path or self.workspace or './workspace',
'columns': columns,
'list_like': list_like,
}
self._index = DocumentArray(storage='annlite', config=config)
# start indexing thread in background to group indexing requests
# together and perform batch indexing at once
self._start_index_loop()
@requests(on='/index')
def index(
self, docs: Optional[DocumentArray] = None, parameters: dict = {}, **kwargs
):
"""Index new documents
:param docs: the Documents to index
:param parameters: dictionary with options for indexing
Keys accepted:
- 'access_paths': traversal paths on docs, e.g. '@r', '@c', '@r,c'
"""
if not docs:
return
access_paths = parameters.get('access_paths', self.index_access_paths)
flat_docs = docs[access_paths]
if len(flat_docs) == 0:
return
while len(self._data_buffer) >= self._max_length_queue:
time.sleep(0.001)
with self._index_lock:
self._data_buffer.extend(flat_docs)
def _start_index_loop(self):
"""Start the indexing loop in background.
This loop is responsible for batch indexing the documents in the buffer.
"""
def _index_loop():
try:
while True:
# if the buffer is None, break the loop
if self._data_buffer is None:
break
# if the buffer is empty, wait for new documents to be added
if len(self._data_buffer) == 0:
time.sleep(0.1) # sleep for 100ms
continue
# acquire the lock to prevent threading issues
with self._index_lock:
batch_docs = self._data_buffer.pop(
range(
self._index_batch_size
if len(self._data_buffer) > self._index_batch_size
else len(self._data_buffer)
)
)
self._index.extend(batch_docs)
self.logger.debug(f'indexing {len(batch_docs)} docs done...')
except Exception as e:
self.logger.error(traceback.format_exc())
raise e
self._index_thread = Thread(target=_index_loop, daemon=False)
self._index_thread.start()
@requests(on='/update')
def update(
self, docs: Optional[DocumentArray] = None, parameters: dict = {}, **kwargs
):
"""Update existing documents
:param docs: the Documents to update
:param parameters: dictionary with options for updating
Keys accepted:
- 'access_paths': traversal paths on docs, e.g. '@r', '@c', '@r,c'
- 'raise_errors_on_not_found': if True, raise an error if a document is not found. Default is False.
"""
if not docs:
return
access_paths = parameters.get('access_paths', self.index_access_paths)
raise_errors_on_not_found = parameters.get('raise_errors_on_not_found', False)
flat_docs = docs[access_paths]
if len(flat_docs) == 0:
return
with self._index_lock:
if len(self._data_buffer) > 0:
raise RuntimeError(
f'Cannot update documents while the pending documents in the buffer are not indexed yet. '
'Please wait for the pending documents to be indexed.'
)
for doc in flat_docs:
try:
self._index[doc.id] = doc
except IndexError:
if raise_errors_on_not_found:
raise Exception(
f'The document (id={doc.id}) cannot be updated as '
f'it is not found in the index'
)
else:
self.logger.warning(
f'cannot update doc {doc.id} as it does not exist in storage'
)
@requests(on='/delete')
def delete(self, parameters: dict = {}, **kwargs):
"""Delete existing documents
Delete entries from the index by id
:param parameters: parameters to the request
"""
delete_ids = parameters.get('ids', [])
if len(delete_ids) == 0:
return
with self._index_lock:
if len(self._data_buffer) > 0:
raise RuntimeError(
f'Cannot delete documents while the pending documents in the buffer are not indexed yet. '
'Please wait for the pending documents to be indexed.'
)
del self._index[delete_ids]
@requests(on='/search')
def search(
self, docs: Optional[DocumentArray] = None, parameters: dict = {}, **kwargs
):
"""Perform a vector similarity search and retrieve Document matches
Search can be performed with candidate filtering. Each filter is a triplet (column, operator, value).
More than one filter can be applied during search; therefore, the filtering conditions are specified as a list of triplets.
Each triplet contains:
- column: Column used to filter.
- operator: Binary operation between two values. Some supported operators include `['>','<','=','<=','>=']`.
- value: value used to compare a candidate.
:param docs: the Documents to search with
:param parameters: dictionary for parameters for the search operation
Keys accepted:
- 'access_paths' (str): traversal paths on docs, e.g. '@r', '@c', '@r,c'
- 'filter' (dict): the filtering conditions on document tags
- 'limit' (int): number of matches to get per Document
"""
if not docs:
return
access_paths = parameters.get('access_paths', self.search_access_paths)
flat_docs = docs[access_paths]
match_args = (
{**self.match_args, **parameters}
if parameters is not None
else self.match_args
)
with self._index_lock:
# if len(self._data_buffer) > 0:
# raise RuntimeError(
# f'Cannot search documents while the pending documents in the buffer are not indexed yet. '
# 'Please wait for the pending documents to be indexed.'
# )
flat_docs.match(self._index, **match_args)
@requests(on='/backup')
def backup(self, parameters: Optional[Dict] = {}, **kwargs):
"""
Backup data to local or remote.
Uses the API of <class 'annlite.index.AnnLite'>.
Keys accepted:
- 'target_name' (str): the name to back the indexer up as
"""
target_name = parameters.get('target_name', None)
token = parameters.get('token', None)
if target_name:
target_name = f'{target_name}_{self.runtime_args.shard_id}'
with self._index_lock:
if len(self._data_buffer) > 0:
raise RuntimeError(
f'Cannot backup documents while the pending documents in the buffer are not indexed yet. '
'Please wait for the pending documents to be indexed.'
)
self._index._annlite.backup(target_name, token)
if self._index._list_like:
self._index._save_offset2ids()
@requests(on='/restore')
def restore(self, parameters: Optional[Dict] = {}, **kwargs):
"""
Restore data from local or remote.
Uses the API of <class 'annlite.index.AnnLite'>.
Keys accepted:
- 'source_name' (str): the name of the backup to restore from
"""
source_name = parameters.get('source_name', None)
token = parameters.get('token', None)
if source_name:
source_name = f'{source_name}_{self.runtime_args.shard_id}'
self._index._annlite.restore(source_name, token)
if self._index._list_like:
self._index._load_offset2ids()
@requests(on='/filter')
def filter(self, parameters: Dict, **kwargs):
"""
Query documents from the indexer by the filter `query` object in parameters. The `query` object must follow the
specifications in the `find` method of `DocumentArray` using annlite: https://docarray.jina.ai/fundamentals/documentarray/find/#filter-with-query-operators
:param parameters: Dictionary to define the `filter` that you want to use.
"""
return self._index.find(parameters.get('filter', None))
@requests(on='/fill_embedding')
def fill_embedding(self, docs: DocumentArray, **kwargs):
"""
retrieve embedding of Documents by id
:param docs: DocumentArray to search with
"""
for doc in docs:
doc.embedding = self._index[doc.id].embedding
@requests(on='/status')
def status(self, **kwargs) -> DocumentArray:
"""Return the document containing status information about the indexer.
The status will contain information on the total number of indexed and deleted
documents, and on the number of (searchable) documents currently in the index.
"""
status = Document(
tags={
'appending_size': len(self._data_buffer),
'total_docs': len(self._index),
'index_size': len(self._index),
}
)
return DocumentArray([status])
def flush(self):
"""Flush all the data in the buffer to the index"""
while len(self._data_buffer) > 0:
time.sleep(0.1)
@requests(on='/clear')
def clear(self, **kwargs):
"""Clear the index of all entries."""
self.flush()
with self._index_lock:
self._data_buffer = None
self._index_thread.join()
self._data_buffer = DocumentArray()
self._index.clear()
self._start_index_loop()
def close(self, **kwargs):
"""Close the index."""
super().close()
self.flush()
# wait for the index thread to finish
with self._index_lock:
self._data_buffer = None
self._index_thread.join()
# WARNING: the commented code below hangs the close in pytest `pytest tests/test_*.py`
# But don't know why. It works fine in `pytest tests/test_executor.py` and normal python execution
del self._index
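# ----------------------------------------------------------------------
# Hedged usage sketch (not part of the original file): serving the executor
# in a Jina Flow. It assumes Jina 3's Flow API; the endpoint names and the
# `filter`/`limit`/`columns` parameters follow the request handlers and the
# constructor defined above.
if __name__ == '__main__':
    import numpy as np
    from jina import Flow

    f = Flow().add(
        uses=AnnLiteIndexer,
        uses_with={'n_dim': 64, 'metric': 'cosine', 'columns': {'price': 'float'}},
    )
    with f:
        docs = DocumentArray(
            Document(embedding=np.random.random(64), tags={'price': float(i)})
            for i in range(100)
        )
        f.post(on='/index', inputs=docs)
        time.sleep(1)  # give the background indexing loop a moment to drain the buffer
        results = f.post(
            on='/search',
            inputs=DocumentArray([Document(embedding=np.random.random(64))]),
            parameters={'filter': {'price': {'$lte': 50}}, 'limit': 5},
        )
        print(len(results[0].matches))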
================================================
FILE: annlite/filter.py
================================================
from typing import Dict
LOGICAL_OPERATORS = {'$and': 'AND', '$or': 'OR'}
COMPARISON_OPERATORS = {
'$lt': '<',
'$gt': '>',
'$lte': '<=',
'$gte': '>=',
'$eq': '=',
'$neq': '!=',
}
MEMBERSHIP_OPERATORS = {'$in': 'IN', '$nin': 'NOT IN'}
def _sql_parsing(data, default_logic: str = 'AND'):
"""
:param data: the filter query as a JSON-like object (dict, list, or str for nested calls).
:param default_logic: the logical operator used to join sibling conditions ('AND' or 'OR').
:return: a tuple of the where clause (str) built from data and its bound parameters (tuple).
"""
where_clause = ''
parameters = []
if isinstance(data, dict):
for i, (key, value) in enumerate(data.items()):
if key in LOGICAL_OPERATORS:
clause, params = _sql_parsing(
value, default_logic=LOGICAL_OPERATORS[key]
)
if i == 0:
where_clause += clause
else:
where_clause += f' {LOGICAL_OPERATORS[key]} {clause}'
parameters.extend(params)
elif key.startswith('$'):
raise ValueError(
f'The operator {key} is not supported yet, please double check the given filters!'
)
else:
if i > 0:
where_clause += f' {default_logic} '
items = list(value.items())
if len(items) == 0:
raise ValueError(f'The query expression is illegal: {data}')
elif len(items) > 1:
clause_list, params_list = [], []
for op, val in items:
_clause, _params = _sql_parsing({key: {op: val}})
clause_list.append(_clause)
params_list.extend(_params)
where_clause += f' AND '.join(clause_list)
parameters.extend(params_list)
else:
op, val = items[0]
if op in LOGICAL_OPERATORS:
clause, params = _sql_parsing(
val, default_logic=LOGICAL_OPERATORS[op]
)
where_clause += clause
parameters.extend(params)
elif op in COMPARISON_OPERATORS:
parameters.append(val)
where_clause += f'({key} {COMPARISON_OPERATORS[op]} ?)'
elif op in MEMBERSHIP_OPERATORS:
parameters.extend(val)
where_clause += f'({key} {MEMBERSHIP_OPERATORS[op]}({", ".join(["?"]*len(val))}))'
else:
raise ValueError(
f'The operator {op} is not supported yet, please double check the given filters!'
)
elif isinstance(data, list):
clause_list, params_list = [], []
for d in data:
_clause, _params = _sql_parsing(d)
clause_list.append(_clause)
params_list.extend(_params)
where_clause += '(' + f' {default_logic} '.join(clause_list) + ')'
parameters.extend(params_list)
elif isinstance(data, str):
return data, parameters
else:
raise ValueError(f'The query expression is illegal: {data}')
return where_clause, tuple(parameters)
class Filter(object):
"""A class to parse query language to SQL where clause."""
def __init__(self, tree_data: Dict = {}):
self.tree_data = tree_data
def parse_where_clause(self):
return _sql_parsing(self.tree_data or {})
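# ----------------------------------------------------------------------
# Illustrative examples (not part of the original file): how the filter query
# language above maps onto a parameterised SQL WHERE clause. The expected
# outputs shown in the comments follow from the parsing rules defined above.
if __name__ == '__main__':
    # a single comparison operator
    print(Filter({'price': {'$lte': 50}}).parse_where_clause())
    # -> ('(price <= ?)', (50,))
    # a logical AND combined with a membership operator
    print(
        Filter(
            {'$and': [{'price': {'$gte': 10}}, {'category': {'$in': ['shoes', 'bags']}}]}
        ).parse_where_clause()
    )
    # -> ('((price >= ?) AND (category IN(?, ?)))', (10, 'shoes', 'bags'))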
================================================
FILE: annlite/helper.py
================================================
import sys
import numpy as np
from loguru import logger
def setup_logging(debug: bool):
"""
Set up the log formatter for AnnLite.
"""
log_level = 'INFO'
if debug:
log_level = 'DEBUG'
logger.remove()
logger.add(
sys.stdout,
colorize=True,
level=log_level,
)
def str2dtype(dtype_str: str):
if dtype_str in ['double', 'float64']:
dtype = np.float64
elif dtype_str in ['half', 'float16']:
dtype = np.float16
elif dtype_str in ['float', 'float32']:
dtype = np.float32
elif dtype_str in ['bfloat16']:
# NOTE: NumPy has no native bfloat16 dtype, so this branch will raise an
# AttributeError; a third-party dtype (e.g. ml_dtypes.bfloat16) would be needed.
dtype = np.bfloat16
elif dtype_str in ['long', 'int64']:
dtype = np.int64
elif dtype_str in ['int', 'int32']:
dtype = np.int32
elif dtype_str in ['int16']:
dtype = np.int16
elif dtype_str in ['int8']:
dtype = np.int8
elif dtype_str in ['uint8']:
dtype = np.uint8
elif dtype_str in ['bool']:
dtype = np.bool_
else:
raise TypeError(f'Unrecognized dtype string: {dtype_str}')
return dtype
================================================
FILE: annlite/hubble_tools.py
================================================
import os
import platform
import shutil
import time
from pathlib import Path
from typing import Optional, Union
from filesplit.merge import Merge
from filesplit.split import Split
from loguru import logger
ignored_extn = ['.DS_Store']
def get_size(input: Path) -> float:
import os
return os.stat(str(input)).st_size / (1024 * 1024)
def make_archive(input: Path, output_name: str) -> Path:
"""
This function creates a zip archive (named ``<output_name>.zip``) of the input
path, placed in the same folder as the input.
"""
output_path = shutil.make_archive(
os.path.join(str(input.parent), output_name),
'zip',
str(input.parent),
str(input.name),
)
return Path(output_path)
class Uploader:
def __init__(self, size_limit=1024, client=None):
"""
This class creates a filesplit object to split a file into small pieces and
upload them onto hubble.
:param size_limit: the max size (in MB) of each split file.
:param client: the hubble client used for uploading.
"""
self.size_limit = size_limit
self.client = client
def upload_file(
self, input: Path, target_name: str, type: str, cell_id: Union[int, str]
):
logger.info(f'Start to upload single file: {input} to hubble ...')
size = get_size(input)
if size > self.size_limit:
split_list = self._split_file(input)
self.upload_directory(split_list, target_name, type, cell_id, merge=False)
shutil.rmtree(split_list)
else:
if self._check_exists(target_name, type, input.name):
return
self._upload_hubble(input, target_name, type, input.name, cell_id)
def upload_directory(
self,
input: Path,
target_name: str,
type: str,
cell_id: Union[int, str],
merge: bool = True,
):
def _upload():
if self._check_exists(target_name, type, str(idx) + '.zip'):
return
Path.mkdir(input.parent / str(idx))
for f in split_list:
shutil.copy(f, input.parent / str(idx))
output_path = make_archive(input.parent / str(idx), str(idx) + '.zip')
self._upload_hubble(
output_path, target_name, type, str(idx) + '.zip', cell_id
)
Path(output_path).unlink()
shutil.rmtree(input.parent / str(idx))
logger.info(f'Start to upload directory: {input} to hubble ...')
if merge:
size_list = list(
zip(list(input.iterdir()), [get_size(f) for f in list(input.iterdir())])
)
sorted_size_list = sorted(size_list, key=lambda x: x[1])
split_list = []
total_size = 0
idx = 0
for file_name, file_size in sorted_size_list:
# skip files with ignored extensions (a `continue` inside a nested loop
# would not skip the file in this outer loop)
if any(extn in str(file_name) for extn in ignored_extn):
continue
if total_size + file_size > self.size_limit:
if len(split_list) == 0:
raise Exception(
f'The smallest file ({file_size}MB) is bigger '
f'than size_limit. Please set a larger value '
f'for size_limit, which is currently {self.size_limit}MB.'
)
_upload()
idx += 1
total_size = 0
split_list = [file_name]
else:
split_list.append(file_name)
total_size += file_size
if len(split_list) > 0:
_upload()
else:
for idx, file_name in enumerate(list(input.glob('*'))):
if self._check_exists(target_name, type, str(file_name.name)):
continue
self._upload_hubble(
file_name, target_name, type, str(file_name.name), cell_id
)
def archive_and_upload(
self,
target_name: str,
type: str,
file_name: str,
cell_id: Union[int, str],
root_path: Path,
upload_folder: str,
):
if self._check_exists(target_name, type, file_name):
return
upload_file = shutil.make_archive(
os.path.join(str(root_path), f'{target_name}_{type}'),
'zip',
str(root_path),
upload_folder,
)
logger.info(
f'Start to upload: {upload_file} to hubble. '
f'[target_name: {target_name}, '
f'type: {type}, '
f'file_name: {file_name}, '
f'cell_id: {cell_id}].'
)
self.client.upload_artifact(
f=upload_file,
metadata={
'name': target_name,
'type': type,
'file_name': file_name,
'cell': cell_id,
},
)
Path(upload_file).unlink()
def _check_exists(self, target_name: str, type: str, file_name: str) -> bool:
art_list = self.client.list_artifacts(
filter={
'metaData.name': target_name,
'metaData.type': f'{type}',
'metaData.file_name': f'{file_name}',
}
)
if len(art_list['data']) != 0:
logger.info(
f'[target_name: {target_name}, type: {type}, file_name: {file_name}] '
f'already exists on hubble, will skip it ...'
)
return True
else:
return False
def _split_file(self, input: Path) -> Path:
output_dir = input / f'{input}_split'
if output_dir.exists():
logger.info(
f'Origin file: {str(input)} has already been split to: {output_dir}, will skip ...'
)
return output_dir
Path.mkdir(output_dir)
Split(str(input), str(output_dir)).bysize(size=self.size_limit * 1024 * 1024)
num_files = len(list(output_dir.glob('*')))
logger.info(
f'Origin file: {str(input)} has been split '
f'into {num_files} parts. Output file: {output_dir}'
)
return output_dir
def _upload_hubble(
self,
upload_file: Path,
target_name: str,
type: str,
file_name: str,
cell_id: Union[str, int],
):
logger.info(
f'Start to upload: {upload_file} to hubble. '
f'[target_name: {target_name}, '
f'type: {type}, '
f'file_name: {file_name}, '
f'cell_id: {cell_id}].'
)
start_time = time.time()
failed_times = 0
while True:
try:
self.client.upload_artifact(
f=str(upload_file),
metadata={
'name': target_name,
'type': type,
'file_name': file_name,
'cell': cell_id,
},
show_progress=True,
)
break
except Exception as e:
logger.info(e)
failed_times += 1
if failed_times == 3:
logger.info(
f'Tried 3 times to upload {upload_file}, type is: {type}, will exit ...'
)
return
else:
continue
logger.info(
f'Takes {time.time() - start_time} seconds to upload {upload_file}.'
)
class Merger:
def __init__(self, restore_path, client):
"""
This class creates an object to download and merge the split files from hubble.
:param restore_path: tmp directory for downloading and merging files.
:param client: hubble client used for merging files.
"""
self.restore_path = restore_path
self.restore_path.mkdir(parents=True)
self.client = client
def merge_file(self, inputdir: Path, outputdir: Path, outputfilename: Path):
Merge(
inputdir=str(inputdir),
outputdir=str(outputdir),
outputfilename=str(outputfilename),
).merge()
def get_artifact_ids(self, art_list, type: str, cell_id: Optional[int] = None):
ids = [
[
art['_id'],
art['metaData']['type'],
art['metaData']['file_name'],
art['metaData']['cell'],
]
for art in art_list['data']
if type == art['metaData']['type']
]
if cell_id:
ids = [item for item in ids if int(item[3]) == cell_id]
ids = [[item[0], item[1], item[2]] for item in ids]
return ids
def download(self, ids, download_folder):
Path.mkdir(self.restore_path / download_folder)
for art_id, type, file_name in ids:
self.client.download_artifact(
id=art_id,
f=str(self.restore_path / download_folder / file_name),
show_progress=True,
)
================================================
FILE: annlite/index.py
================================================
import hashlib
import logging
import os
import platform
import warnings
from pathlib import Path
from typing import TYPE_CHECKING, Dict, List, Optional, Union
import numpy as np
from docarray.math.ndarray import to_numpy_array
from loguru import logger
if TYPE_CHECKING:
from docarray import DocumentArray
from .container import CellContainer
from .core import PQCodec, ProjectorCodec, VQCodec
from .enums import Metric
from .filter import Filter
from .helper import setup_logging
from .math import cdist, top_k
MAX_TRAINING_DATA_SIZE = 10240
class AnnLite(CellContainer):
""":class:`AnnLite` is an approximate nearest neighbor search library.
To create a :class:`AnnLite` object, simply:
.. highlight:: python
.. code-block:: python
ann = AnnLite(256, metric='cosine')
:param n_dim: dimensionality of input vectors. There are two constraints on n_dim:
(1) it needs to be divisible by n_subvectors; (2) it needs to be a multiple of 4.
:param metric: distance metric type, can be 'euclidean', 'inner_product', or 'cosine'.
:param n_subvectors: number of sub-quantizers, essentially this is the byte size of
each quantized vector, default is None.
:param n_cells: number of coarse quantizer clusters, default is 1.
:param n_probe: number of cells to search for each query, default is 16.
:param n_components: number of components to keep.
:param initial_size: initial capacity assigned to each voronoi cell of coarse quantizer.
``n_cells * initial_size`` is the number of vectors that can be stored initially.
If any cell has reached its capacity, that cell will be automatically expanded.
If you need to add vectors frequently, a larger value for initial_size is recommended.
:param columns: the columns to be indexed for fast filtering, default is None.
:param filterable_attrs: a dict of attributes to be indexed for fast filtering, default is None.
The key is the attribute name, and the value is the attribute type. And it only works when ``columns`` is None.
:param data_path: path to the directory where the data is stored.
:param create_if_missing: if False, do not create the directory path if it is missing.
:param read_only: if True, the index is not writable.
:param verbose: if True, will print the debug logging info.
.. note::
Remember that the shape of any tensor that contains data points has to be `[n_data, dim]`.
"""
def __init__(
self,
n_dim: int,
metric: Union[str, Metric] = 'cosine',
n_cells: int = 1,
n_subvectors: Optional[int] = None,
n_clusters: Optional[int] = 256,
n_probe: int = 16,
n_components: Optional[int] = None,
initial_size: Optional[int] = None,
expand_step_size: int = 10240,
columns: Optional[Union[Dict, List]] = None,
filterable_attrs: Optional[Dict] = None,
data_path: Union[Path, str] = Path('./data'),
create_if_missing: bool = True,
read_only: bool = False,
verbose: bool = False,
**kwargs,
):
setup_logging(verbose)
if 'dim' in kwargs:
warnings.warn(
'The argument `dim` will be deprecated, please use `n_dim` instead.'
)
n_dim = kwargs['dim']
if n_subvectors:
assert (
n_dim % n_subvectors == 0
), '"n_dim" needs to be divisible by "n_subvectors"'
self.n_dim = n_dim
self.n_components = n_components
self.n_subvectors = n_subvectors
self.n_clusters = n_clusters
self.n_probe = max(n_probe, n_cells)
self.n_cells = n_cells
self.size_limit = 2048
if isinstance(metric, str):
metric = Metric.from_string(metric)
self.metric = metric
self._use_smart_probing = True
self.read_only = read_only
data_path = Path(data_path)
if create_if_missing:
data_path.mkdir(parents=True, exist_ok=True)
self.data_path = data_path
self._projector_codec = None
if self._projector_codec_path.exists():
logger.info(
f'Load pre-trained projector codec (n_components={self.n_components}) from {self.model_path}'
)
self._projector_codec = ProjectorCodec.load(self._projector_codec_path)
elif n_components:
logger.info(
f'Initialize Projector codec (n_components={self.n_components})'
)
self._projector_codec = ProjectorCodec(
n_dim, n_components=self.n_components
)
self._vq_codec = None
if self._vq_codec_path.exists():
logger.info(
f'Load trained VQ codec (K={self.n_cells}) from {self.model_path}'
)
self._vq_codec = VQCodec.load(self._vq_codec_path)
elif n_cells > 1:
logger.info(f'Initialize VQ codec (K={self.n_cells})')
self._vq_codec = VQCodec(self.n_cells, metric=self.metric)
self._pq_codec = None
if self._pq_codec_path.exists():
logger.info(
f'Load trained PQ codec (n_subvectors={self.n_subvectors}) from {self.model_path}'
)
self._pq_codec = PQCodec.load(self._pq_codec_path)
elif n_subvectors:
logger.info(f'Initialize PQ codec (n_subvectors={self.n_subvectors})')
self._pq_codec = PQCodec(
dim=n_dim
if not self._projector_codec
else self._projector_codec.n_components,
n_subvectors=self.n_subvectors,
n_clusters=self.n_clusters,
metric=self.metric,
)
if columns is not None:
if filterable_attrs:
logger.warning('`filterable_attrs` will be overwritten by `columns`.')
filterable_attrs = {}
for n, t in columns.items() if isinstance(columns, dict) else columns:
filterable_attrs[n] = t
super(AnnLite, self).__init__(
n_dim,
metric=metric,
projector_codec=self._projector_codec,
pq_codec=self._pq_codec,
n_cells=n_cells,
initial_size=initial_size,
expand_step_size=expand_step_size,
filterable_attrs=filterable_attrs,
data_path=data_path,
**kwargs,
)
if not self.is_trained and self.total_docs > 0:
# train the index from scratch based on the data in the data_path
logger.info(f'Train the index by reading data from {self.data_path}')
total_size = 0
# TODO: add a progress bar
for docs in self.documents_generator(0, batch_size=1024):
x = to_numpy_array(docs.embeddings)
total_size += x.shape[0]
self.partial_train(x, auto_save=True, force_train=True)
if total_size >= MAX_TRAINING_DATA_SIZE:
break
logger.info(f'Total training data size: {total_size}')
if self.total_docs > 0:
self.restore()
def _sanity_check(self, x: 'np.ndarray'):
assert x.ndim == 2, 'inputs must be a 2D array'
assert (
x.shape[1] == self.n_dim
), f'inputs must have the same dimension as the index , got {x.shape[1]}, expected {self.n_dim}'
return x.shape
def train(self, x: 'np.ndarray', auto_save: bool = True, force_train: bool = False):
"""Train the index with the given data.
:param x: the ndarray data for training.
:param auto_save: if False, will not dump the trained model to ``model_path``.
:param force_train: if True, enforce to retrain the model, and overwrite the model if ``auto_save=True``.
"""
n_data, _ = self._sanity_check(x)
if self.is_trained and not force_train:
logger.warning(
'The indexer has been trained or is not trainable. Please use ``force_train=True`` to retrain.'
)
return
if self._projector_codec:
logger.info(
f'Start training Projector codec (n_components={self.n_components}) with {n_data} data...'
)
self._projector_codec.fit(x)
if self._vq_codec:
logger.info(
f'Start training VQ codec (K={self.n_cells}) with {n_data} data...'
)
self._vq_codec.fit(x)
if self._pq_codec:
logger.info(
f'Start training PQ codec (n_subvectors={self.n_subvectors}) with {n_data} data...'
)
self._pq_codec.fit(x)
logger.info(f'The annlite is successfully trained!')
if auto_save:
self.dump_model()
def partial_train(
self, x: np.ndarray, auto_save: bool = True, force_train: bool = False
):
"""Partially train the index with the given data.
:param x: the ndarray data for training.
:param auto_save: if False, will not dump the trained model to ``model_path``.
:param force_train: if True, enforce to retrain the model, and overwrite the model if ``auto_save=True``.
"""
n_data, _ = self._sanity_check(x)
if self.is_trained and not force_train:
logger.warning(
'The annlite has been trained or is not trainable. Please use ``force_train=True`` to retrain.'
)
return
if self._projector_codec:
logger.info(
f'Partial training Projector codec (n_components={self.n_components}) with {n_data} data...'
)
self._projector_codec.partial_fit(x)
if self._vq_codec:
logger.info(
f'Partial training VQ codec (K={self.n_cells}) with {n_data} data...'
)
self._vq_codec.partial_fit(x)
if self._pq_codec:
logger.info(
f'Partial training PQ codec (n_subvectors={self.n_subvectors}) with {n_data} data...'
)
self._pq_codec.partial_fit(x)
if auto_save:
self.dump_model()
def index(self, docs: 'DocumentArray', **kwargs):
"""Add the documents to the index.
:param docs: the document array to be indexed.
"""
if self.read_only:
logger.error('The indexer is readonly, cannot add new documents')
return
if not self.is_trained:
raise RuntimeError(f'The indexer is not trained, cannot add new documents')
x = to_numpy_array(docs.embeddings)
n_data, _ = self._sanity_check(x)
assigned_cells = (
self._vq_codec.encode(x)
if self._vq_codec
else np.zeros(n_data, dtype=np.int64)
)
return super(AnnLite, self).insert(x, assigned_cells, docs)
def update(
self,
docs: 'DocumentArray',
raise_errors_on_not_found: bool = False,
insert_if_not_found: bool = True,
**kwargs,
):
"""Update the documents in the index.
:param insert_if_not_found: whether to insert the document when its id is not found in the index.
:param raise_errors_on_not_found: whether to raise an exception when an id is not found.
:param docs: the document array to be updated.
"""
if self.read_only:
logger.error('The indexer is readonly, cannot update documents')
return
if not self.is_trained:
raise RuntimeError('The indexer is not trained, cannot update documents')
x = to_numpy_array(docs.embeddings)
n_data, _ = self._sanity_check(x)
assigned_cells = (
self._vq_codec.encode(x)
if self._vq_codec
else np.zeros(n_data, dtype=np.int64)
)
return super(AnnLite, self).update(
x,
assigned_cells,
docs,
raise_errors_on_not_found=raise_errors_on_not_found,
insert_if_not_found=insert_if_not_found,
)
def search(
self,
docs: 'DocumentArray',
filter: Optional[dict] = None,
limit: int = 10,
include_metadata: bool = True,
**kwargs,
):
"""Search the index, and attach matches to the query Documents in `docs`
:param docs: the document array to be searched.
:param filter: the filter to be applied to the search.
:param limit: the number of results to get for each query document in search
:param include_metadata: whether to return document metadata in response.
"""
if not self.is_trained:
raise RuntimeError('The indexer is not trained, cannot perform search')
query_np = to_numpy_array(docs.embeddings)
match_dists, match_docs = self.search_by_vectors(
query_np, filter=filter, limit=limit, include_metadata=include_metadata
)
for doc, matches in zip(docs, match_docs):
doc.matches = matches
def search_by_vectors(
self,
query_np: 'np.ndarray',
filter: Optional[dict] = None,
limit: int = 10,
include_metadata: bool = True,
):
"""Search the index by vectors, and return the matches.
:param query_np: the query vectors.
:param filter: the filter to be applied to the search.
:param limit: the number of results to get for each query document in search
:param include_metadata: whether to return document metadata in response.
"""
cells = self._cell_selection(query_np, limit)
where_clause, where_params = Filter(filter or {}).parse_where_clause()
match_dists, match_docs = self.search_cells(
query=query_np,
cells=cells,
where_clause=where_clause,
where_params=where_params,
limit=limit,
include_metadata=include_metadata,
)
return match_dists, match_docs
def filter(
self,
filter: Dict,
limit: int = 10,
offset: int = 0,
order_by: Optional[str] = None,
ascending: bool = True,
include_metadata: bool = True,
):
"""Find the documents by the filter.
:param filter: the filter to be applied to the search.
:param limit: the number of results.
:param offset: the offset of the results.
:param order_by: the field to order the results.
:param ascending: whether to order the results in ascending order.
:param include_metadata: whether to return document metadata in response.
"""
cells = [x for x in range(self.n_cells)]
where_clause, where_params = Filter(filter or {}).parse_where_clause()
match_docs = self.filter_cells(
cells=cells,
where_clause=where_clause,
where_params=where_params,
limit=limit,
offset=offset,
order_by=order_by,
ascending=ascending,
include_metadata=include_metadata,
)
if limit > 0:
return match_docs[:limit]
return match_docs
def get_doc_by_id(self, doc_id: str):
"""Get the document by id.
:param doc_id: the document id.
"""
return self._get_doc_by_id(doc_id)
def get_docs(
self,
filter: Optional[dict] = None,
limit: int = 10,
offset: int = 0,
order_by: Optional[str] = None,
ascending: bool = True,
):
"""Get the documents.
:param filter: the filter to be applied to the search.
:param limit: the number of results.
:param offset: the offset of the results.
:param order_by: the field to order the results.
:param ascending: whether to order the results in ascending order. It only works when `order_by` is specified.
"""
return self.filter(
filter=filter,
limit=limit,
offset=offset,
order_by=order_by,
ascending=ascending,
include_metadata=True,
)
def _cell_selection(self, query_np, limit):
n_data, _ = self._sanity_check(query_np)
if self._vq_codec:
dists = cdist(
query_np, self._vq_codec.codebook, metric=self.metric.name.lower()
)
dists, cells = top_k(dists, k=self.n_probe)
else:
cells = np.zeros((n_data, 1), dtype=np.int64)
# if self.use_smart_probing and self.n_probe > 1:
# p = -topk_sims.abs().sqrt()
# p = torch.softmax(p / self.smart_probing_temperature, dim=-1)
#
# # p_norm = p.norm(dim=-1)
# # sqrt_d = self.n_probe ** 0.5
# # score = 1 - (p_norm * sqrt_d - 1) / (sqrt_d - 1) - 1e-6
# # n_probe_list = torch.ceil(score * (self.n_probe) ).long()
#
# max_n_probe = torch.tensor(self.n_probe, device=self.device)
# normalized_entropy = - torch.sum(p * torch.log2(p) / torch.log2(max_n_probe), dim=-1)
# n_probe_list = torch.ceil(normalized_entropy * max_n_probe).long()
# else:
# n_probe_list = None
return cells
def search_numpy(
self,
query_np: 'np.ndarray',
filter: Dict = {},
limit: int = 10,
**kwargs,
):
"""Search the index and return distances to the query and ids of the closest documents.
:param query_np: matrix containing query vectors as rows
:param filter: the filtering conditions
:param limit: the number of results to get for each query document in search
"""
if not self.is_trained:
raise RuntimeError('The indexer is not trained, cannot perform search')
dists, doc_ids = self._search_numpy(query_np, filter, limit)
return dists, doc_ids
def _search_numpy(self, query_np: 'np.ndarray', filter: Dict = {}, limit: int = 10):
"""Search approximate nearest vectors in different cells, returns distances and ids
:param query_np: matrix containing query vectors as rows
:param filter: the filtering conditions
:param limit: the number of results to get for each query document in search
"""
cells = self._cell_selection(query_np, limit)
where_clause, where_params = Filter(filter).parse_where_clause()
dists, ids = self._search_cells(
query=query_np,
cells=cells,
where_clause=where_clause,
where_params=where_params,
limit=limit,
)
return dists, ids
def delete(
self,
docs: Union['DocumentArray', List[str]],
raise_errors_on_not_found: bool = False,
):
"""Delete entries from the index by id
:param raise_errors_on_not_found: whether to raise exception when id not found.
:param docs: the documents to delete
"""
super().delete(
docs if isinstance(docs, list) else docs[:, 'id'], raise_errors_on_not_found
)
def clear(self):
"""Clear the whole database"""
for cell_id in range(self.n_cells):
self.vec_index(cell_id).reset()
self.cell_table(cell_id).clear()
self.doc_store(cell_id).clear()
self.meta_table.clear()
def close(self):
for cell_id in range(self.n_cells):
self.doc_store(cell_id).close()
def encode(self, x: 'np.ndarray'):
n_data, _ = self._sanity_check(x)
if self._projector_codec:
x = self._projector_codec.encode(x)
if self._pq_codec:
x = self._pq_codec.encode(x)
return x
def decode(self, x: 'np.ndarray'):
assert len(x.shape) == 2
assert x.shape[1] == self.n_subvectors
if self._pq_codec:
x = self._pq_codec.decode(x)
if self._projector_codec:
x = self._projector_codec.decode(x)
return x
@property
def params_hash(self):
model_metas = (
f'n_dim: {self.n_dim} '
f'metric: {self.metric} '
f'n_cells: {self.n_cells} '
f'n_components: {self.n_components} '
f'n_subvectors: {self.n_subvectors}'
)
return hashlib.md5(f'{model_metas}'.encode()).hexdigest()
@property
def model_path(self):
return self.data_path / f'parameters-{self.params_hash}'
@property
def _vq_codec_path(self):
return self.model_path / f'vq_codec.params'
@property
def _pq_codec_path(self):
return self.model_path / f'pq_codec.params'
@property
def _projector_codec_path(self):
return self.model_path / f'projector_codec.params'
@property
def index_hash(self):
latest_commit = self.meta_table.get_latest_commit()
date_time = latest_commit[-1] if latest_commit else None
if date_time:
if platform.system() == 'Windows':
return date_time.isoformat('#', 'hours')
return date_time.isoformat('#', 'seconds')
else:
import datetime
return (
datetime.datetime.utcnow().isoformat('#', 'hours')
if platform.system() == 'Windows'
else datetime.datetime.utcnow().isoformat('#', 'seconds')
)
@property
def index_path(self):
if self.index_hash:
return (
self.data_path
/ f'snapshot-{self.params_hash}'
/ f'{self.index_hash}-SNAPSHOT'
)
return None
@property
def snapshot_path(self):
paths = list(
(self.data_path / f'snapshot-{self.params_hash}').glob(f'*-SNAPSHOT')
)
if paths:
paths = sorted(paths, key=lambda x: x.name)
return paths[-1]
return None
@property
def remote_store_client(self):
try:
import hubble
os.environ['JINA_AUTH_TOKEN'] = self.token
client = hubble.Client(max_retries=None, jsonify=True)
client.get_user_info()
return client
except Exception as ex:
logger.error('Not logged in to hubble yet.')
raise ex
def backup(self, target_name: Optional[str] = None, token: Optional[str] = None):
# the file lock needs to be released before backing up to remote.
# This is only needed on Windows, since the file lock must be released
# before the rocksdb files can be accessed.
if not target_name:
logger.info('dump to local ...')
self.dump()
else:
if token is None:
logger.error('backing up to remote requires a token')
logger.info(f'dump to remote: {target_name}')
self.close()
self._backup_index_to_remote(target_name, token)
def restore(self, source_name: Optional[str] = None, token: Optional[str] = None):
# the file lock needs to be released before restoring from remote
if not source_name:
if self.total_docs > 0:
logger.info(f'restore Annlite from local')
self._rebuild_index_from_local()
else:
if token is None:
logger.error('restoring from remote requires a token')
logger.info(f'restore Annlite from artifact: {source_name}')
self.close()
self._rebuild_index_from_remote(source_name, token)
def dump_model(self):
logger.info(f'Save the parameters to {self.model_path}')
self.model_path.mkdir(parents=True, exist_ok=True)
if self._projector_codec:
self._projector_codec.dump(self._projector_codec_path)
if self._vq_codec:
self._vq_codec.dump(self._vq_codec_path)
if self._pq_codec:
self._pq_codec.dump(self._pq_codec_path)
def dump_index(self):
import shutil
logger.info(f'Save the indexer to {self.index_path}')
try:
if Path.exists(self.index_path):
logger.info(
f'Index path {self.index_path} already exists, will be '
f'overwritten'
)
shutil.rmtree(self.index_path)
self.index_path.mkdir(parents=True)
for cell_id in range(self.n_cells):
self.vec_index(cell_id).dump(self.index_path / f'cell_{cell_id}.hnsw')
self.cell_table(cell_id).dump(self.index_path / f'cell_{cell_id}.db')
self.meta_table.dump(self.index_path / f'meta.db')
except Exception as ex:
logger.error(f'Failed to dump the indexer, {ex!r}')
if self.index_path:
shutil.rmtree(self.index_path)
def dump(self):
self.dump_model()
self.dump_index()
def _backup_index_to_remote(self, target_name: str, token: str):
self.dump()
from .hubble_tools import Uploader
self.token = token
client = self.remote_store_client
uploader = Uploader(size_limit=self.size_limit, client=client)
for cell_id in range(self.n_cells):
# upload database
uploader.upload_directory(
Path(self.data_path) / f'cell_{cell_id}',
target_name=target_name,
type='database',
cell_id=cell_id,
)
# upload hnsw file
uploader.upload_file(
Path(self.index_path) / f'cell_{cell_id}.hnsw',
target_name=target_name,
type='hnsw',
cell_id=cell_id,
)
# upload cell_table
uploader.upload_file(
Path(self.index_path) / f'cell_{cell_id}.db',
target_name=target_name,
type='cell_table',
cell_id=cell_id,
)
# upload meta table
uploader.upload_file(
Path(self.index_path) / 'meta.db',
target_name=target_name,
type='meta_table',
cell_id=0,
)
# upload training model
uploader.archive_and_upload(
target_name,
'model',
'model.zip',
'all',
self.model_path.parent,
str(self.model_path.name),
)
def _rebuild_index_from_local(self):
if self.snapshot_path:
logger.info(f'Load the indexer from snapshot {self.snapshot_path}')
for cell_id in range(self.n_cells):
self.vec_index(cell_id).load(
self.snapshot_path / f'cell_{cell_id}.hnsw'
)
self.cell_table(cell_id).load(self.snapshot_path / f'cell_{cell_id}.db')
self.meta_table.load(self.snapshot_path / f'meta.db')
else:
logger.info(f'Rebuild the indexer from scratch')
for cell_id in range(self.n_cells):
cell_size = self.doc_store(cell_id).size
if cell_size == 0:
continue # skip empty cell
logger.debug(
f'Rebuild the index of cell-{cell_id} ({cell_size} docs)...'
)
for docs in self.documents_generator(cell_id, batch_size=10240):
x = to_numpy_array(docs.embeddings)
assigned_cells = np.ones(len(docs), dtype=np.int64) * cell_id
super().insert(x, assigned_cells, docs, only_index=True)
logger.debug(f'Rebuild the index of cell-{cell_id} done')
if self.model_path:
logger.info(f'Load the model from {self.model_path}')
self._reload_models()
def _rebuild_index_from_remote(self, source_name: str, token: str):
import shutil
from .hubble_tools import Merger
self.token = token
client = self.remote_store_client
art_list = client.list_artifacts(
filter={'metaData.name': source_name}, pageSize=100
)
if len(art_list['data']) == 0:
logger.info(f'The indexer `{source_name}` was not found.')
else:
logger.info(f'Load the indexer `{source_name}` from remote store')
restore_path = self.data_path / 'restore'
merger = Merger(restore_path=restore_path, client=client)
for cell_id in range(self.n_cells):
# download hnsw files and merge and load
logger.info(f'Load the hnsw `{source_name}` from remote store')
hnsw_ids = merger.get_artifact_ids(
art_list, type='hnsw', cell_id=cell_id
)
merger.download(ids=hnsw_ids, download_folder=f'hnsw_{cell_id}')
if len(hnsw_ids) > 1:
merger.merge_file(
inputdir=restore_path / f'hnsw_{cell_id}',
outputdir=restore_path / f'hnsw_{cell_id}',
outputfilename=Path(f'cell_{cell_id}.hnsw'),
)
self.vec_index(cell_id).load(
restore_path / f'hnsw_{cell_id}' / f'cell_{cell_id}.hnsw'
)
shutil.rmtree(restore_path / f'hnsw_{cell_id}')
# download cell_table files and merge and load
logger.info(f'Load the cell_table `{source_name}` from remote store')
cell_table_ids = merger.get_artifact_ids(
art_list, type='cell_table', cell_id=cell_id
)
merger.download(
ids=cell_table_ids, download_folder=f'cell_table_{cell_id}'
)
if len(cell_table_ids) > 1:
merger.merge_file(
inputdir=restore_path / f'cell_table_{cell_id}',
outputdir=restore_path / f'cell_table_{cell_id}',
outputfilename=Path(f'cell_{cell_id}.db'),
)
self.cell_table(cell_id).load(
restore_path / f'cell_table_{cell_id}' / f'cell_{cell_id}.db'
)
shutil.rmtree(restore_path / f'cell_table_{cell_id}')
# download database files and rebuild
logger.info(f'Load the database `{source_name}` from remote store')
database_ids = merger.get_artifact_ids(
art_list, type='database', cell_id=cell_id
)
merger.download(ids=database_ids, download_folder='database')
for zip_file in list((restore_path / 'database').iterdir()):
# default has only one cell
shutil.unpack_archive(zip_file, self.data_path / f'cell_{cell_id}')
for f in list(
(
self.data_path
/ f'cell_{cell_id}'
/ zip_file.name.split('.zip')[0]
).iterdir()
):
origin_database_path = (
self.data_path / f'cell_{cell_id}' / f.name
)
if origin_database_path.exists():
origin_database_path.unlink()
f.rename(self.data_path / f'cell_{cell_id}' / f.name)
shutil.rmtree(
self.data_path
/ f'cell_{cell_id}'
/ zip_file.name.split('.zip')[0]
)
Path(zip_file).unlink()
self._rebuild_database()
# download meta_table files
logger.info(f'Load the meta_table `{source_name}` from remote store')
meta_table_ids = merger.get_artifact_ids(
art_list, type='meta_table', cell_id=0
)
merger.download(ids=meta_table_ids, download_folder='meta_table')
if len(meta_table_ids) > 1:
merger.merge_file(
inputdir=restore_path / 'meta_table',
outputdir=restore_path / 'meta_table',
outputfilename=Path('meta.db'),
)
self._meta_table.load(restore_path / 'meta_table' / 'meta.db')
shutil.rmtree(restore_path / 'meta_table')
# download model files
logger.info(f'Load the model `{source_name}` from remote store')
file_name = str(self.model_path.parent / f'{source_name}_model.zip')
model_id = [
art['_id']
for art in art_list['data']
if 'model' in art['metaData']['type']
]
assert len(model_id) == 1
client.download_artifact(
id=model_id[0],
f=file_name,
show_progress=True,
)
shutil.unpack_archive(file_name, self.model_path.parent)
self._reload_models()
Path(file_name).unlink()
shutil.rmtree(restore_path)
@property
def is_trained(self):
if self._projector_codec and (not self._projector_codec.is_trained):
return False
if self._vq_codec and (not self._vq_codec.is_trained):
return False
if self._pq_codec and (not self._pq_codec.is_trained):
return False
return True
def _reload_models(self):
if self._projector_codec_path.exists():
self._projector_codec = ProjectorCodec.load(self._projector_codec_path)
if self._vq_codec_path.exists():
self._vq_codec = VQCodec.load(self._vq_codec_path)
if self._pq_codec_path.exists():
self._pq_codec = PQCodec.load(self._pq_codec_path)
@property
def use_smart_probing(self):
return self._use_smart_probing
@use_smart_probing.setter
def use_smart_probing(self, value):
assert type(value) is bool
self._use_smart_probing = value
@property
def stat(self):
"""Get information on status of the indexer."""
return {
'total_docs': self.total_docs,
'index_size': self.index_size,
'n_cells': self.n_cells,
'n_dim': self.n_dim,
'n_components': self.n_components,
'metric': self.metric.name,
'is_trained': self.is_trained,
}
# @property
# def smart_probing_temperature(self):
# return self._smart_probing_temperature
#
# @smart_probing_temperature.setter
# def smart_probing_temperature(self, value):
# assert value > 0
# assert self.use_smart_probing, 'set use_smart_probing to True first'
# self._smart_probing_temperature = value
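# ----------------------------------------------------------------------
# Hedged usage sketch (not part of the original file): indexing and searching
# a DocumentArray with AnnLite, including a filter on an indexed column.
# NOTE: whether column types are declared as Python types (float) or as
# strings ('float') depends on the table schema; `float` is an assumption here.
if __name__ == '__main__':
    from docarray import Document, DocumentArray

    ann = AnnLite(64, metric='cosine', columns={'price': float}, data_path='./workspace')
    docs = DocumentArray(
        Document(
            embedding=np.random.random(64).astype('float32'),
            tags={'price': float(i)},
        )
        for i in range(1000)
    )
    ann.index(docs)
    queries = DocumentArray([Document(embedding=np.random.random(64).astype('float32'))])
    ann.search(queries, filter={'price': {'$lte': 100}}, limit=5)
    for match in queries[0].matches:
        print(match.id, match.tags.get('price'))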
================================================
FILE: annlite/math.py
================================================
from typing import Tuple
import numpy as np
def l2_normalize(x: 'np.ndarray', eps: float = np.finfo(np.float32).eps):
"""Scale input vectors individually to unit norm.
:param x: The data to normalize
:param eps: a small jitter to avoid divide by zero
:return: Normalized input X
"""
norms = np.einsum('ij,ij->i', x, x)
np.sqrt(norms, norms)
constant_mask = norms < 10 * eps
norms[constant_mask] = 1.0
return x / norms[:, np.newaxis]
def cosine(
x_mat: 'np.ndarray', y_mat: 'np.ndarray', eps: float = np.finfo(np.float32).eps
) -> 'np.ndarray':
"""Cosine distance between each row in x_mat and each row in y_mat.
:param x_mat: np.ndarray with ndim=2
:param y_mat: np.ndarray with ndim=2
:param eps: a small jitter to avoid divide by zero
:return: np.ndarray with ndim=2
"""
return 1 - np.clip(
(np.dot(x_mat, y_mat.T) + eps)
/ (
np.outer(np.linalg.norm(x_mat, axis=1), np.linalg.norm(y_mat, axis=1)) + eps
),
-1,
1,
)
def sqeuclidean(x_mat: 'np.ndarray', y_mat: 'np.ndarray') -> 'np.ndarray':
"""Squared Euclidean distance between each row in x_mat and each row in y_mat.
:param x_mat: np.ndarray with ndim=2
:param y_mat: np.ndarray with ndim=2
:return: np.ndarray with ndim=2
"""
return (
np.sum(y_mat**2, axis=1)
+ np.sum(x_mat**2, axis=1)[:, np.newaxis]
- 2 * np.dot(x_mat, y_mat.T)
)
def euclidean(x_mat: 'np.ndarray', y_mat: 'np.ndarray') -> 'np.ndarray':
"""Euclidean distance between each row in x_mat and each row in y_mat.
:param x_mat: scipy.sparse like array with ndim=2
:param y_mat: scipy.sparse like array with ndim=2
:return: np.ndarray with ndim=2
"""
return np.sqrt(sqeuclidean(x_mat, y_mat))
def pdist(
x_mat: 'np.ndarray',
metric: str,
) -> 'np.ndarray':
"""Computes Pairwise distances between observations in n-dimensional space.
:param x_mat: Union['np.ndarray','scipy.sparse.csr_matrix', 'scipy.sparse.coo_matrix'] of ndim 2
:param metric: string describing the metric type
:return: np.ndarray of ndim 2
"""
return cdist(x_mat, x_mat, metric)
def cdist(x_mat: 'np.ndarray', y_mat: 'np.ndarray', metric: str) -> 'np.ndarray':
"""Computes the pairwise distance between each row of X and each row on Y according to `metric`.
- Let `n_x = x_mat.shape[0]`
- Let `n_y = y_mat.shape[0]`
- Returns a matrix `dist` of shape `(n_x, n_y)` with `dist[i,j] = metric(x_mat[i], y_mat[j])`.
:param x_mat: numpy or scipy array of ndim 2
:param y_mat: numpy or scipy array of ndim 2
:param metric: string describing the metric type
:return: np.ndarray of ndim 2
"""
dists = {'cosine': cosine, 'sqeuclidean': sqeuclidean, 'euclidean': euclidean}[
metric
](x_mat, y_mat)
return dists
def top_k(
values: 'np.ndarray', k: int, descending: bool = False
) -> Tuple['np.ndarray', 'np.ndarray']:
"""Finds values and indices of the k largest entries for the last dimension.
:param values: array of distances
:param k: number of values to retrieve
:param descending: find top k biggest values
:return: indices and distances
"""
if descending:
values = -values
if k >= values.shape[1]:
idx = values.argsort(axis=1)[:, :k]
values = np.take_along_axis(values, idx, axis=1)
else:
idx_ps = values.argpartition(kth=k, axis=1)[:, :k]
values = np.take_along_axis(values, idx_ps, axis=1)
idx_fs = values.argsort(axis=1)
idx = np.take_along_axis(idx_ps, idx_fs, axis=1)
values = np.take_along_axis(values, idx_fs, axis=1)
if descending:
values = -values
return values, idx
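# ----------------------------------------------------------------------
# Illustrative example (not part of the original file): top_k on a small
# distance matrix. With descending=False it returns, per row, the k smallest
# values together with their column indices.
if __name__ == '__main__':
    dists = np.array([[0.3, 0.1, 0.2], [0.9, 0.8, 0.7]])
    values, idx = top_k(dists, k=2)
    # values -> [[0.1, 0.2], [0.7, 0.8]]
    # idx    -> [[1, 2],     [2, 1]]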
================================================
FILE: annlite/profile.py
================================================
import cProfile
import pstats
import random
from functools import wraps
random.seed(20)
try:
import builtins
line_profile = builtins.profile
except AttributeError:
# No line profiler, provide a pass-through version
def profile(func):
return func
line_profile = profile
def time_profile(
output_file=None, sort_by='cumulative', lines_to_print=None, strip_dirs=False
):
"""A
SYMBOL INDEX (487 symbols across 58 files)
FILE: annlite/container.py
class CellContainer (line 22) | class CellContainer:
method __init__ (line 23) | def __init__(
method ivf_search (line 88) | def ivf_search(
method filter_cells (line 146) | def filter_cells(
method search_cells (line 201) | def search_cells(
method _search_cells (line 237) | def _search_cells(
method insert (line 262) | def insert(
method _add_vecs (line 310) | def _add_vecs(self, data: 'np.ndarray', cells: 'np.ndarray', offsets: ...
method update (line 323) | def update(
method delete (line 388) | def delete(
method _rebuild_database (line 416) | def _rebuild_database(self):
method _get_doc_by_id (line 429) | def _get_doc_by_id(self, doc_id: str):
method documents_generator (line 437) | def documents_generator(self, cell_id: int, batch_size: int = 1000):
method cell_tables (line 442) | def cell_tables(self):
method cell_indexes (line 446) | def cell_indexes(self):
method cell_table (line 449) | def cell_table(self, cell_id: int):
method doc_store (line 452) | def doc_store(self, cell_id: int):
method vec_index (line 455) | def vec_index(self, cell_id: int):
method meta_table (line 459) | def meta_table(self):
method total_docs (line 463) | def total_docs(self):
method index_size (line 467) | def index_size(self):
FILE: annlite/core/codec/base.py
class BaseCodec (line 9) | class BaseCodec(ABC):
method __init__ (line 10) | def __init__(self, require_train: bool = True):
method fit (line 15) | def fit(self, *args, **kwargs):
method encode (line 19) | def encode(self):
method decode (line 23) | def decode(self):
method dump (line 26) | def dump(self, target_path: 'Path'):
method load (line 30) | def load(from_path: 'Path'):
method is_trained (line 34) | def is_trained(self):
method _check_trained (line 37) | def _check_trained(self):
FILE: annlite/core/codec/pq.py
class PQCodec (line 16) | class PQCodec(BaseCodec):
method __init__ (line 38) | def __init__(
method __hash__ (line 77) | def __hash__(self):
method fit (line 89) | def fit(self, x: 'np.ndarray', iter: int = 100):
method partial_fit (line 117) | def partial_fit(self, x: 'np.ndarray'):
method build_codebook (line 144) | def build_codebook(self):
method encode (line 158) | def encode(self, x: 'np.ndarray'):
method decode (line 179) | def decode(self, codes: 'np.ndarray'):
method precompute_adc (line 200) | def precompute_adc(self, query: object) -> object:
method codebooks (line 227) | def codebooks(self):
method get_codebook (line 231) | def get_codebook(self) -> 'np.ndarray':
method get_subspace_splitting (line 239) | def get_subspace_splitting(self):
method get_dist_mat (line 293) | def get_dist_mat(self, x: np.ndarray):
class DistanceTable (line 330) | class DistanceTable(object):
method __init__ (line 345) | def __init__(self, dtable: 'np.ndarray'):
method adist (line 350) | def adist(self, codes):
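The PQCodec symbols above outline a train/encode/decode workflow with an asymmetric distance table for querying. A minimal sketch follows; the constructor arguments (dimensionality first, n_subvectors, n_clusters) are assumptions not shown in the listing, while fit/encode/decode/precompute_adc and DistanceTable.adist are the names listed above.

# Hedged sketch: train a product-quantization codec and round-trip a few vectors.
# Constructor keyword names are assumptions; method names come from the symbol index.
import numpy as np

from annlite.core.codec.pq import PQCodec

x_train = np.random.random((2048, 128)).astype(np.float32)

pq = PQCodec(128, n_subvectors=8, n_clusters=256)  # assumed keyword names
pq.fit(x_train)                                    # per-subspace k-means (iter defaults to 100)

codes = pq.encode(x_train[:10])                    # compact codes, one entry per subvector
recon = pq.decode(codes)                           # approximate reconstruction
print(codes.shape, recon.shape)

dtable = pq.precompute_adc(x_train[0])             # asymmetric distance table for one query
print(dtable.adist(codes))                         # approximate query-to-code distances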
FILE: annlite/core/codec/projector.py
class ProjectorCodec (line 8) | class ProjectorCodec(BaseCodec):
method __init__ (line 29) | def __init__(
method __hash__ (line 49) | def __hash__(self):
method fit (line 60) | def fit(self, x: 'np.ndarray'):
method partial_fit (line 85) | def partial_fit(self, x: 'np.ndarray'):
method encode (line 109) | def encode(self, x: 'np.ndarray'):
method decode (line 120) | def decode(self, x: 'np.ndarray'):
method components (line 133) | def components(self):
method explained_variance_ratio (line 141) | def explained_variance_ratio(self):
method mean (line 147) | def mean(self):
method var (line 153) | def var(self):
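ProjectorCodec exposes the same fit/encode/decode surface but, judging from its components and explained_variance_ratio accessors, acts as a PCA-style dimensionality reducer. A minimal sketch, assuming the constructor takes the input dimensionality and an n_components keyword (names not confirmed above):

# Hedged sketch: project 128-d vectors into a lower-dimensional space and back.
# Constructor argument names are assumptions; fit/encode/decode are listed above.
import numpy as np

from annlite.core.codec.projector import ProjectorCodec

x = np.random.random((1000, 128)).astype(np.float32)

projector = ProjectorCodec(128, n_components=64)   # assumed argument names
projector.fit(x)

z = projector.encode(x[:5])                        # projected vectors
x_hat = projector.decode(z)                        # approximate reconstruction
print(z.shape, x_hat.shape)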
FILE: annlite/core/codec/vq.py
class VQCodec (line 8) | class VQCodec(BaseCodec):
method __init__ (line 9) | def __init__(
method __hash__ (line 30) | def __hash__(self):
method fit (line 33) | def fit(self, x: 'np.ndarray'):
method partial_fit (line 51) | def partial_fit(self, x: 'np.ndarray'):
method build_codebook (line 68) | def build_codebook(self):
method encode (line 75) | def encode(self, x: 'np.ndarray'):
method decode (line 84) | def decode(self, x: 'np.ndarray'):
method codebook (line 88) | def codebook(self):
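VQCodec is the coarse quantizer counterpart. A minimal sketch, assuming the number of clusters is the first constructor argument (not shown above); fit and encode are the listed method names.

# Hedged sketch: coarse vector quantization of the kind used for cell assignment.
import numpy as np

from annlite.core.codec.vq import VQCodec

x = np.random.random((5000, 128)).astype(np.float32)

vq = VQCodec(64)           # assumed: number of clusters as the first argument
vq.fit(x)

cells = vq.encode(x[:10])  # one cluster id per input vector
print(cells)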
FILE: annlite/core/index/base.py
class BaseIndex (line 10) | class BaseIndex(abc.ABC):
method __init__ (line 11) | def __init__(
method capacity (line 36) | def capacity(self) -> int:
method size (line 40) | def size(self):
method add_with_ids (line 44) | def add_with_ids(self, x: np.ndarray, ids: List[int], **kwargs):
method delete (line 48) | def delete(self, ids: List[int]):
method update_with_ids (line 52) | def update_with_ids(self, x: np.ndarray, ids: List[int], **kwargs):
method reset (line 55) | def reset(self, capacity: Optional[int] = None):
FILE: annlite/core/index/flat_index.py
class FlatIndex (line 10) | class FlatIndex(BaseIndex):
method __init__ (line 11) | def __init__(self, *args, **kwargs):
method search (line 15) | def search(
method add_with_ids (line 41) | def add_with_ids(self, x: np.ndarray, ids: List[int]):
method _expand_capacity (line 52) | def _expand_capacity(self):
method reset (line 61) | def reset(self, capacity: Optional[int] = None):
method delete (line 65) | def delete(self, ids: List[int]):
method update_with_ids (line 70) | def update_with_ids(self, x: np.ndarray, ids: List[int], **kwargs):
FILE: annlite/core/index/hnsw/index.py
function pre_process (line 20) | def pre_process(f):
class HnswIndex (line 51) | class HnswIndex(BaseIndex):
method __init__ (line 52) | def __init__(
method _init_hnsw_index (line 84) | def _init_hnsw_index(self):
method load (line 116) | def load(self, index_file: Union[str, Path]):
method dump (line 121) | def dump(self, index_file: Union[str, Path]):
method add_with_ids (line 125) | def add_with_ids(
method search (line 140) | def search(
method delete (line 169) | def delete(self, ids: List[int]):
method update_with_ids (line 173) | def update_with_ids(self, x: 'np.ndarray', ids: List[int], **kwargs):
method _expand_capacity (line 178) | def _expand_capacity(self, new_capacity: int):
method reset (line 185) | def reset(self, capacity: Optional[int] = None):
method size (line 190) | def size(self):
method space_name (line 194) | def space_name(self):
method pq_enable (line 202) | def pq_enable(self):
method normalization_enable (line 206) | def normalization_enable(self):
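The HnswIndex methods above support building, querying, and persisting a standalone HNSW graph. A minimal sketch, assuming the dimensionality is the first constructor argument, that search accepts a single query vector with a limit keyword, and that it returns (distances, ids); none of those details are confirmed by the listing.

# Hedged sketch: build a standalone HNSW index, query it, and persist it.
import numpy as np

from annlite.core.index.hnsw import HnswIndex

dim = 64
x = np.random.random((1000, dim)).astype(np.float32)

index = HnswIndex(dim)                      # assumed: dimensionality as the first argument
index.add_with_ids(x, list(range(len(x))))

dists, ids = index.search(x[0], limit=10)   # assumed argument and return conventions
print(ids)

index.dump('/tmp/hnsw_demo.bin')            # persist to disk
index.load('/tmp/hnsw_demo.bin')            # and restore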
FILE: annlite/core/index/pq_index.py
class PQIndex (line 11) | class PQIndex(FlatIndex): # pragma: no cover
method __init__ (line 12) | def __init__(
method add_with_ids (line 25) | def add_with_ids(self, x: np.ndarray, ids: List[int]):
method search (line 29) | def search(
FILE: annlite/enums.py
class BetterEnum (line 4) | class BetterEnum(IntEnum):
method __str__ (line 7) | def __str__(self):
method from_string (line 11) | def from_string(cls, s: str):
class Metric (line 25) | class Metric(BetterEnum):
class ExpandMode (line 31) | class ExpandMode(BetterEnum):
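BetterEnum adds string parsing on top of IntEnum, so metrics can be constructed by name. A short sketch; only the EUCLIDEAN member is confirmed (by tests/test_enums.py further below), and the exact case handling of from_string is an assumption.

# Hedged sketch: build a Metric member from its string name.
from annlite.enums import Metric

m = Metric.from_string('EUCLIDEAN')
assert m == Metric.EUCLIDEAN and m.name == 'EUCLIDEAN'
print(int(m), str(m))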
FILE: annlite/executor.py
class AnnLiteIndexer (line 15) | class AnnLiteIndexer(Executor):
method __init__ (line 37) | def __init__(
method index (line 116) | def index(
method _start_index_loop (line 141) | def _start_index_loop(self):
method update (line 178) | def update(
method delete (line 220) | def delete(self, parameters: dict = {}, **kwargs):
method search (line 241) | def search(
method backup (line 283) | def backup(self, parameters: Optional[Dict] = {}, **kwargs):
method restore (line 307) | def restore(self, parameters: Optional[Dict] = {}, **kwargs):
method filter (line 321) | def filter(self, parameters: Dict, **kwargs):
method fill_embedding (line 331) | def fill_embedding(self, docs: DocumentArray, **kwargs):
method status (line 341) | def status(self, **kwargs) -> DocumentArray:
method flush (line 357) | def flush(self):
method clear (line 363) | def clear(self, **kwargs):
method close (line 376) | def close(self, **kwargs):
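AnnLiteIndexer above is a Jina Executor, so it is typically driven through a Flow rather than called directly. A hedged sketch follows; the 'dim' key in uses_with and the '/index' and '/search' endpoint paths are assumptions, since only the Python method names appear in the listing.

# Hedged sketch: serve AnnLiteIndexer inside a Jina Flow and call its endpoints.
import numpy as np
from docarray import Document, DocumentArray
from jina import Flow

from annlite.executor import AnnLiteIndexer

docs = DocumentArray(
    Document(id=str(i), embedding=np.random.random(64).astype(np.float32)) for i in range(100)
)

flow = Flow().add(uses=AnnLiteIndexer, uses_with={'dim': 64})  # assumed config key

with flow:
    flow.post(on='/index', inputs=docs)                        # assumed endpoint
    results = flow.post(on='/search', inputs=docs[:3])         # assumed endpoint
    for doc in results:
        print(doc.id, [m.id for m in doc.matches])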
FILE: annlite/filter.py
function _sql_parsing (line 17) | def _sql_parsing(data, default_logic: str = 'AND'):
class Filter (line 93) | class Filter(object):
method __init__ (line 96) | def __init__(self, tree_data: Dict = {}):
method parse_where_clause (line 99) | def parse_where_clause(self):
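The Filter class appears to translate a MongoDB-style query dict into a SQL WHERE clause via parse_where_clause. A hedged sketch; the (clause, parameters) return value is an assumption, while the $and/$lt operators follow the LOGICAL_OPERATORS and COMPARISON_OPERATORS tables visible in the filter.py preview below.

# Hedged sketch: translate a MongoDB-style filter dict into a SQL WHERE clause.
from annlite.filter import Filter

conditions = {
    '$and': [
        {'price': {'$lt': 50.0}},
        {'category': {'$eq': 'shoes'}},
    ]
}

where_clause, params = Filter(conditions).parse_where_clause()  # assumed return shape
print(where_clause, params)   # roughly: "(price < ? AND category = ?)", (50.0, 'shoes')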
FILE: annlite/helper.py
function setup_logging (line 7) | def setup_logging(debug: bool):
function str2dtype (line 24) | def str2dtype(dtype_str: str):
FILE: annlite/hubble_tools.py
function get_size (line 15) | def get_size(input: Path) -> float:
function make_archive (line 21) | def make_archive(input: Path, output_name: str) -> Path:
class Uploader (line 35) | class Uploader:
method __init__ (line 36) | def __init__(self, size_limit=1024, client=None):
method upload_file (line 46) | def upload_file(
method upload_directory (line 60) | def upload_directory(
method archive_and_upload (line 121) | def archive_and_upload(
method _check_exists (line 158) | def _check_exists(self, target_name: str, type: str, file_name: str) -...
method _split_file (line 175) | def _split_file(self, input: Path) -> Path:
method _upload_hubble (line 191) | def _upload_hubble(
class Merger (line 240) | class Merger:
method __init__ (line 241) | def __init__(self, restore_path, client):
method merge_file (line 251) | def merge_file(self, inputdir: Path, outputdir: Path, outputfilename: ...
method get_artifact_ids (line 258) | def get_artifact_ids(self, art_list, type: str, cell_id: Optional[int]...
method download (line 274) | def download(self, ids, download_folder):
FILE: annlite/index.py
class AnnLite (line 26) | class AnnLite(CellContainer):
method __init__ (line 59) | def __init__(
method _sanity_check (line 189) | def _sanity_check(self, x: 'np.ndarray'):
method train (line 197) | def train(self, x: 'np.ndarray', auto_save: bool = True, force_train: ...
method partial_train (line 235) | def partial_train(
method index (line 274) | def index(self, docs: 'DocumentArray', **kwargs):
method update (line 297) | def update(
method search (line 334) | def search(
method search_by_vectors (line 361) | def search_by_vectors(
method filter (line 389) | def filter(
method get_doc_by_id (line 424) | def get_doc_by_id(self, doc_id: str):
method get_docs (line 432) | def get_docs(
method _cell_selection (line 458) | def _cell_selection(self, query_np, limit):
method search_numpy (line 485) | def search_numpy(
method _search_numpy (line 505) | def _search_numpy(self, query_np: 'np.ndarray', filter: Dict = {}, lim...
method delete (line 524) | def delete(
method clear (line 539) | def clear(self):
method close (line 547) | def close(self):
method encode (line 551) | def encode(self, x: 'np.ndarray'):
method decode (line 562) | def decode(self, x: 'np.ndarray'):
method params_hash (line 575) | def params_hash(self):
method model_path (line 586) | def model_path(self):
method _vq_codec_path (line 590) | def _vq_codec_path(self):
method _pq_codec_path (line 594) | def _pq_codec_path(self):
method _projector_codec_path (line 598) | def _projector_codec_path(self):
method index_hash (line 602) | def index_hash(self):
method index_path (line 619) | def index_path(self):
method snapshot_path (line 629) | def snapshot_path(self):
method remote_store_client (line 640) | def remote_store_client(self):
method backup (line 652) | def backup(self, target_name: Optional[str] = None, token: Optional[st...
method restore (line 666) | def restore(self, source_name: Optional[str] = None, token: Optional[s...
method dump_model (line 679) | def dump_model(self):
method dump_index (line 689) | def dump_index(self):
method dump (line 712) | def dump(self):
method _backup_index_to_remote (line 716) | def _backup_index_to_remote(self, target_name: str, token: str):
method _rebuild_index_from_local (line 769) | def _rebuild_index_from_local(self):
method _rebuild_index_from_remote (line 799) | def _rebuild_index_from_remote(self, source_name: str, token: str):
method is_trained (line 926) | def is_trained(self):
method _reload_models (line 935) | def _reload_models(self):
method use_smart_probing (line 944) | def use_smart_probing(self):
method use_smart_probing (line 948) | def use_smart_probing(self, value):
method stat (line 953) | def stat(self):
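Taken together, the AnnLite methods above cover the typical index, filtered search, and cleanup lifecycle. A hedged end-to-end sketch; the constructor arguments (dimensionality first, columns, data_path) and the filter/limit keywords of search are assumptions beyond the method names shown.

# Hedged sketch: index a DocumentArray, run a filtered vector search, then clean up.
import numpy as np
from docarray import Document, DocumentArray

from annlite import AnnLite

n, dim = 1000, 128
docs = DocumentArray(
    Document(id=str(i), embedding=np.random.random(dim).astype(np.float32), tags={'price': float(i)})
    for i in range(n)
)

ann = AnnLite(dim, columns=[('price', float)], data_path='/tmp/annlite_demo')  # assumed kwargs
ann.index(docs)

queries = DocumentArray(Document(embedding=np.random.random(dim).astype(np.float32)) for _ in range(3))
ann.search(queries, filter={'price': {'$lt': 50.0}}, limit=5)                  # assumed kwargs
for q in queries:
    print([m.id for m in q.matches])

ann.delete(docs[:10])   # assumed to accept the documents to remove
ann.close()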
FILE: annlite/math.py
function l2_normalize (line 6) | def l2_normalize(x: 'np.ndarray', eps: float = np.finfo(np.float32).eps):
function cosine (line 21) | def cosine(
function sqeuclidean (line 41) | def sqeuclidean(x_mat: 'np.ndarray', y_mat: 'np.ndarray') -> 'np.ndarray':
function euclidean (line 54) | def euclidean(x_mat: 'np.ndarray', y_mat: 'np.ndarray') -> 'np.ndarray':
function pdist (line 64) | def pdist(
function cdist (line 77) | def cdist(x_mat: 'np.ndarray', y_mat: 'np.ndarray', metric: str) -> 'np....
function top_k (line 94) | def top_k(
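The distance helpers above compose into a brute-force nearest-neighbour search. A hedged sketch, assuming cdist accepts a metric name such as 'cosine' and that top_k returns (distances, indices) in that order:

# Hedged sketch: brute-force nearest neighbours with annlite's math helpers.
import numpy as np

from annlite.math import cdist, l2_normalize, top_k

queries = np.random.random((5, 128)).astype(np.float32)
database = np.random.random((1000, 128)).astype(np.float32)

dists = cdist(l2_normalize(queries), l2_normalize(database), metric='cosine')
top_dists, top_idx = top_k(dists, 10)    # assumed (distances, indices) return order

print(top_dists.shape, top_idx.shape)    # expected: (5, 10) each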
FILE: annlite/profile.py
function profile (line 14) | def profile(func):
function time_profile (line 20) | def time_profile(
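time_profile, whose truncated signature opens this section, looks like a cProfile-based decorator factory. A hedged sketch of how such a decorator is typically applied; the report destination and format are assumptions.

# Hedged sketch: apply the time_profile decorator factory to a function.
from annlite.profile import time_profile


@time_profile(sort_by='cumulative', lines_to_print=20, strip_dirs=True)
def slow_sum(n: int) -> int:
    return sum(i * i for i in range(n))


slow_sum(1_000_000)   # prints (or writes to output_file) profiling stats for this call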
FILE: annlite/storage/base.py
class Storage (line 10) | class Storage(abc.ABC):
method __init__ (line 11) | def __init__(
method capacity (line 28) | def capacity(self) -> int:
method size (line 33) | def size(self):
method clean (line 37) | def clean(self):
method add (line 41) | def add(
method delete (line 51) | def delete(self, ids: List[str]):
method update (line 55) | def update(
FILE: annlite/storage/kv.py
class DocStorage (line 10) | class DocStorage:
method __init__ (line 13) | def __init__(
method _init_db (line 27) | def _init_db(self, create_if_missing: bool = True, **kwargs):
method insert (line 53) | def insert(self, docs: 'DocumentArray'):
method update (line 64) | def update(self, docs: 'DocumentArray'):
method delete (line 76) | def delete(self, doc_ids: List[str]):
method get (line 85) | def get(self, doc_ids: Union[str, list]) -> DocumentArray:
method clear (line 96) | def clear(self):
method close (line 108) | def close(self):
method __len__ (line 123) | def __len__(self):
method stat (line 127) | def stat(self):
method size (line 131) | def size(self):
method last_transaction_id (line 135) | def last_transaction_id(self):
method batched_iterator (line 138) | def batched_iterator(self, batch_size: int = 1, **kwargs) -> 'Document...
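DocStorage is the key-value layer that persists the raw documents. A hedged sketch; the path-as-first-argument constructor is an assumption (suggested by the tests/test_store.py preview below), while insert/get/delete/batched_iterator/close are the method names listed above.

# Hedged sketch: persist and read back documents with the rocksdict-backed DocStorage.
import numpy as np
from docarray import Document, DocumentArray

from annlite.storage.kv import DocStorage

storage = DocStorage('/tmp/annlite_kv_demo')   # assumed: storage directory as first argument
docs = DocumentArray(Document(id=str(i), embedding=np.random.random(8)) for i in range(100))

storage.insert(docs)
print(len(storage), [d.id for d in storage.get(['0', '1'])])

for batch in storage.batched_iterator(batch_size=32):
    print(len(batch))

storage.delete(['0'])
storage.close()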
FILE: annlite/storage/table.py
function _converting (line 57) | def _converting(value: Any) -> str:
function time_now (line 67) | def time_now():
function _get_table_names (line 71) | def _get_table_names(
class Table (line 84) | class Table:
method __init__ (line 85) | def __init__(
method execute (line 107) | def execute(self, sql: str, commit: bool = True):
method execute_many (line 112) | def execute_many(self, sql: str, parameters: List[Tuple], commit: bool...
method commit (line 117) | def commit(self):
method create_table (line 120) | def create_table(self):
method drop_table (line 123) | def drop_table(self):
method clear (line 127) | def clear(self):
method load (line 132) | def load(self, data_file: Union[str, Path]):
method dump (line 137) | def dump(self, data_file: Union[str, Path]):
method close (line 142) | def close(self):
method name (line 146) | def name(self):
method schema (line 150) | def schema(self):
class CellTable (line 160) | class CellTable(Table):
method __init__ (line 161) | def __init__(
method columns (line 181) | def columns(self) -> List[str]:
method existed (line 184) | def existed(self):
method add_column (line 187) | def add_column(self, name: str, dtype: str, create_index: bool = True):
method create_index (line 192) | def create_index(self, column: str, commit: bool = True):
method create_table (line 200) | def create_table(self):
method insert (line 213) | def insert(
method query (line 259) | def query(
method delete (line 320) | def delete(self, doc_ids: List[str]):
method get_docid_by_offset (line 329) | def get_docid_by_offset(self, offset: int):
method delete_by_offset (line 336) | def delete_by_offset(self, offset: int):
method exist (line 345) | def exist(self, doc_id: str):
method count (line 349) | def count(self, where_clause: str = '', where_params: Tuple = ()):
method size (line 375) | def size(self):
class MetaTable (line 379) | class MetaTable(Table):
method __init__ (line 380) | def __init__(
method create_table (line 394) | def create_table(self):
method iter_addresses (line 409) | def iter_addresses(
method get_latest_commit (line 418) | def get_latest_commit(self):
method get_address (line 425) | def get_address(self, doc_id: str):
method delete_address (line 431) | def delete_address(self, doc_id: str, commit: bool = True):
method add_address (line 437) | def add_address(self, doc_id: str, cell_id: int, offset: int, commit: ...
method bulk_add_address (line 446) | def bulk_add_address(
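CellTable and MetaTable above are the SQLite side of the store: a filterable per-cell table plus a meta table mapping doc ids to (cell_id, offset) addresses. A hedged sketch; the constructor arguments, the dtype string, and the insert payload are assumptions, while add_column/create_table/insert/count/exist/add_address/get_address are the listed names.

# Hedged sketch: create a filterable cell table and record document addresses.
from docarray import Document, DocumentArray

from annlite.storage.table import CellTable, MetaTable

cell = CellTable('cell_0')                        # assumed: table name as the first argument
cell.add_column('price', 'FLOAT', create_index=True)  # dtype string is an assumption
cell.create_table()

docs = DocumentArray(Document(id=str(i), tags={'price': float(i)}) for i in range(10))
cell.insert(docs)                                 # assumed to accept a DocumentArray
print(cell.count(), cell.exist('3'))

meta = MetaTable()                                # assumed default constructor
meta.add_address('3', cell_id=0, offset=3)
print(meta.get_address('3'))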
FILE: annlite/utils.py
function clean_workspace (line 8) | def clean_workspace():
function docs_with_tags (line 16) | def docs_with_tags(N, D, probs, categories):
function _precision (line 40) | def _precision(predicted, relevant, eval_at):
function _recall (line 52) | def _recall(predicted, relevant, eval_at):
function evaluate (line 63) | def evaluate(predicts, relevants, top_k):
FILE: benchmarks/filtering_bench.py
function docs_with_tags (line 22) | def docs_with_tags(N, D, probs, categories):
FILE: benchmarks/hnsw_bench.py
function _precision (line 16) | def _precision(predicted, relevant, eval_at):
function _recall (line 28) | def _recall(predicted, relevant, eval_at):
function evaluate (line 39) | def evaluate(predicts, relevants, eval_at):
function get_documents (line 67) | def get_documents(nr=10, index_start=0, embeddings=None):
FILE: bindings/hnsw_bindings.cpp
function ParallelFor (line 25) | inline void ParallelFor(size_t start, size_t end, size_t numThreads,
function assert_true (line 79) | inline void assert_true(bool expr, const std::string &msg) {
class Index (line 85) | class Index {
method Index (line 87) | Index(const std::string &space_name, const int dim)
method init_new_index (line 144) | void init_new_index(const size_t maxElements, const size_t M,
method loadPQ (line 165) | void loadPQ(const py::object &pq_codec) {
method set_ef (line 180) | void set_ef(size_t ef) {
method set_num_threads (line 186) | void set_num_threads(int num_threads) {
method saveIndex (line 190) | void saveIndex(const std::string &path_to_index) {
method loadIndex (line 194) | void loadIndex(const std::string &path_to_index, size_t max_elements) {
method normalize_vector (line 206) | void normalize_vector(float *data, float *norm_array) {
method addRows_ (line 216) | void addRows_(int dim, int num_threads, const py::object &ids_,
method addItems (line 286) | void addItems(const py::object &input, py::object ids_ = py::none(),
method knnQuery_return_numpy_ (line 303) | py::object knnQuery_return_numpy_(size_t k, int num_threads,
method knnQuery_return_numpy (line 376) | py::object knnQuery_return_numpy(const py::object &input, size_t k = 1,
method knnQuery_with_filter_ (line 393) | py::object knnQuery_with_filter_(size_t k, int num_threads,
method knnQuery_with_filter (line 496) | py::object knnQuery_with_filter(py::object input,
method getDataReturnList (line 518) | std::vector<std::vector<data_t>>
method getIdsList (line 539) | std::vector<hnswlib::labeltype> getIdsList() {
method getAnnData (line 549) | py::dict getAnnData() const { /* WARNING: Index::getAnnData is not thr...
method getIndexParams (line 674) | py::dict getIndexParams() const { /* WARNING: Index::getAnnData is not
method setAnnData (line 734) | void setAnnData(const py::dict d) { /* WARNING: Index::setAnnData is not
method markDeleted (line 843) | void markDeleted(size_t label) { appr_alg->markDelete(label); }
method resizeIndex (line 845) | void resizeIndex(size_t new_size) { appr_alg->resizeIndex(new_size); }
method getMaxElements (line 847) | size_t getMaxElements() const { return appr_alg->max_elements_; }
method getCurrentCount (line 849) | size_t getCurrentCount() const { return appr_alg->cur_element_count; }
method _loadPQ (line 851) | void _loadPQ(const py::object &pq_abstract) {
function PYBIND11_PLUGIN (line 931) | PYBIND11_PLUGIN(hnsw_bind) {
FILE: examples/annlite_vs_simpleindexer.py
function _precision (line 26) | def _precision(predicted, relevant, eval_at):
function _recall (line 38) | def _recall(predicted, relevant, eval_at):
function evaluate (line 49) | def evaluate(predicts, relevants, eval_at):
function create_data (line 60) | def create_data(n_examples, D):
function create_data_online (line 69) | def create_data_online(n_examples, D, batch_size):
function create_test_data (line 87) | def create_test_data(D, Nq):
FILE: examples/filter_example.py
function clean_workspace (line 24) | def clean_workspace():
function docs_with_tags (line 32) | def docs_with_tags(N, D, probs, categories):
FILE: examples/pq_benchmark.py
function get_documents (line 32) | def get_documents(nr=10, index_start=0, embeddings=None):
FILE: examples/utils.py
function clean_workspace (line 8) | def clean_workspace():
function docs_with_tags (line 16) | def docs_with_tags(N, D, probs, categories):
function _precision (line 40) | def _precision(predicted, relevant, eval_at):
function _recall (line 52) | def _recall(predicted, relevant, eval_at):
function evaluate (line 63) | def evaluate(predicts, relevants, top_k):
FILE: executor/executor.py
class AnnLiteIndexer (line 15) | class AnnLiteIndexer(Executor):
method __init__ (line 37) | def __init__(
method index (line 114) | def index(
method _start_index_loop (line 139) | def _start_index_loop(self):
method update (line 176) | def update(
method delete (line 218) | def delete(self, parameters: dict = {}, **kwargs):
method search (line 239) | def search(
method backup (line 281) | def backup(self, parameters: Optional[Dict] = {}, **kwargs):
method restore (line 303) | def restore(self, parameters: Optional[Dict] = {}, **kwargs):
method filter (line 315) | def filter(self, parameters: Dict, **kwargs):
method fill_embedding (line 325) | def fill_embedding(self, docs: DocumentArray, **kwargs):
method status (line 335) | def status(self, **kwargs) -> DocumentArray:
method flush (line 351) | def flush(self):
method clear (line 357) | def clear(self, **kwargs):
method close (line 370) | def close(self, **kwargs):
FILE: include/hnswlib/bruteforce.h
function namespace (line 7) | namespace hnswlib {
FILE: include/hnswlib/fusefilter.h
function binary_fuse_murmur64 (line 23) | static inline uint64_t binary_fuse_murmur64(uint64_t h) {
function binary_fuse_mix_split (line 31) | static inline uint64_t binary_fuse_mix_split(uint64_t key, uint64_t seed) {
function binary_fuse_rotl64 (line 34) | static inline uint64_t binary_fuse_rotl64(uint64_t n, unsigned int c) {
function binary_fuse_reduce (line 37) | static inline uint32_t binary_fuse_reduce(uint32_t hash, uint32_t n) {
function binary_fuse8_fingerprint (line 41) | static inline uint64_t binary_fuse8_fingerprint(uint64_t hash) {
function binary_fuse_rng_splitmix64 (line 50) | static inline uint64_t binary_fuse_rng_splitmix64(uint64_t *seed) {
type binary_fuse8_t (line 57) | typedef struct binary_fuse8_s {
function binary_fuse_mulhi (line 69) | static inline uint64_t binary_fuse_mulhi(uint64_t a, uint64_t b) { retur...
function binary_fuse_mulhi (line 71) | static inline uint64_t binary_fuse_mulhi(uint64_t a, uint64_t b) {
type binary_hashes_t (line 76) | typedef struct binary_hashes_s {
function binary_hashes_t (line 82) | static inline binary_hashes_t binary_fuse8_hash_batch(uint64_t hash,
function binary_fuse8_hash (line 93) | static inline uint32_t binary_fuse8_hash(int index, uint64_t hash,
function binary_fuse8_contain (line 105) | static inline bool binary_fuse8_contain(uint64_t key,
function binary_fuse_calculate_segment_length (line 115) | static inline uint32_t binary_fuse_calculate_segment_length(uint32_t arity,
function binary_fuse8_max (line 128) | static inline double binary_fuse8_max(double a, double b) {
function binary_fuse_calculate_size_factor (line 135) | static inline double binary_fuse_calculate_size_factor(uint32_t arity,
function binary_fuse8_allocate (line 149) | static inline bool binary_fuse8_allocate(uint32_t size,
function binary_fuse8_size_in_bytes (line 178) | static inline size_t binary_fuse8_size_in_bytes(const binary_fuse8_t *fi...
function binary_fuse8_free (line 183) | static inline void binary_fuse8_free(binary_fuse8_t *filter) {
function binary_fuse_mod3 (line 194) | static inline uint8_t binary_fuse_mod3(uint8_t x) {
function binary_fuse8_populate (line 210) | bool binary_fuse8_populate(const uint64_t *keys, uint32_t size,
type SegmentLengthMask (line 388) | typedef struct binary_fuse16_s {
function binary_fuse16_fingerprint (line 438) | static inline uint64_t binary_fuse16_fingerprint(uint64_t hash) {
function binary_hashes_t (line 442) | static inline binary_hashes_t binary_fuse16_hash_batch(uint64_t hash,
function binary_fuse16_hash (line 453) | static inline uint32_t binary_fuse16_hash(int index, uint64_t hash,
function binary_fuse16_contain (line 465) | static inline bool binary_fuse16_contain(uint64_t key,
function binary_fuse16_allocate (line 478) | static inline bool binary_fuse16_allocate(uint32_t size,
function binary_fuse16_size_in_bytes (line 507) | static inline size_t
function binary_fuse16_free (line 513) | static inline void binary_fuse16_free(binary_fuse16_t *filter) {
function binary_fuse16_populate (line 535) | inline bool binary_fuse16_populate(const uint64_t *keys, uint32_t size,
FILE: include/hnswlib/hnswalg.h
function namespace (line 12) | namespace hnswlib {
FILE: include/hnswlib/hnswlib.h
function cpuid (line 19) | void cpu_x86::cpuid(int32_t out[4], int32_t eax, int32_t ecx) {
function __int64 (line 22) | __int64 xgetbv(unsigned int x) {
function cpuid (line 29) | void cpuid(int32_t cpuInfo[4], int32_t eax, int32_t ecx) {
function xgetbv (line 32) | uint64_t xgetbv(unsigned int index) {
function AVXCapable (line 54) | bool AVXCapable() {
function AVX512Capable (line 81) | bool AVX512Capable() {
function namespace (line 116) | namespace hnswlib {
FILE: include/hnswlib/space_ip.h
function namespace (line 4) | namespace hnswlib {
FILE: include/hnswlib/space_l2.h
function namespace (line 4) | namespace hnswlib {
function L2SqrI4x (line 260) | static int
function L2SqrI (line 287) | static int L2SqrI(const void* __restrict pVect1, const void* __restrict ...
function class (line 302) | class L2SpaceI : public SpaceInterface<int> {
FILE: include/hnswlib/space_pq.h
function namespace (line 6) | namespace hnswlib {
FILE: include/hnswlib/visited_list_pool.h
function namespace (line 7) | namespace hnswlib {
function class (line 38) | class VisitedListPool {
function VisitedList (line 50) | VisitedList *getFreeVisitedList() {
function releaseVisitedList (line 65) | void releaseVisitedList(VisitedList *vl) {
FILE: setup.py
function has_flag (line 93) | def has_flag(compiler, flagname):
function cpp_flag (line 108) | def cpp_flag(compiler):
class BuildExt (line 122) | class BuildExt(build_ext):
method build_extensions (line 146) | def build_extensions(self):
FILE: tests/conftest.py
function docs (line 9) | def docs():
function update_docs (line 23) | def update_docs():
function tmpfile (line 32) | def tmpfile(tmpdir):
function test_disable_telemetry (line 38) | def test_disable_telemetry(monkeypatch):
FILE: tests/docarray/test_add.py
function test_add (line 5) | def test_add(docs):
function test_add_conflict_id (line 19) | def test_add_conflict_id(docs, update_docs):
FILE: tests/docarray/test_del.py
function test_delete_success (line 6) | def test_delete_success(deleted_elmnts):
function test_delete_not_found (line 41) | def test_delete_not_found(expected_failed_deleted_elmnts):
FILE: tests/docarray/test_find.py
function test_find (line 5) | def test_find():
FILE: tests/docarray/test_get.py
function test_success_get_bulk_data (line 7) | def test_success_get_bulk_data(nrof_docs):
function test_error_get_bulk_data_id_not_exist (line 26) | def test_error_get_bulk_data_id_not_exist():
FILE: tests/docarray/test_save_load.py
function test_save_load (line 6) | def test_save_load(tmpfile):
FILE: tests/executor/test_executor.py
function gen_docs (line 22) | def gen_docs(num):
function docs_with_tags (line 31) | def docs_with_tags(N):
function delete_artifact (line 51) | def delete_artifact(tmpname):
function test_index (line 58) | def test_index(tmpfile):
function test_update (line 72) | def test_update(tmpfile):
function test_search (line 96) | def test_search(tmpfile):
function test_search_with_filtering (line 125) | def test_search_with_filtering(tmpfile, columns):
function test_delete (line 146) | def test_delete(tmpfile):
function test_status (line 172) | def test_status(tmpfile):
function test_clear (line 189) | def test_clear(tmpfile):
function test_remote_storage (line 206) | def test_remote_storage(tmpfile):
function test_local_storage (line 238) | def test_local_storage(tmpfile):
function test_remote_storage_with_shards (line 268) | def test_remote_storage_with_shards(tmpfile):
function test_local_storage_with_shards (line 326) | def test_local_storage_with_shards(tmpfile):
function test_local_storage_with_delete (line 377) | def test_local_storage_with_delete(tmpfile):
function test_local_storage_delete_update (line 408) | def test_local_storage_delete_update(tmpfile):
FILE: tests/test_codec.py
function build_data (line 17) | def build_data():
function build_pq_codec (line 23) | def build_pq_codec(build_data):
function minibatch_generator (line 30) | def minibatch_generator(Xtr, batch_size):
function build_pq_codec_online (line 47) | def build_pq_codec_online(build_data):
function test_partial_and_total_fit_same_codebook_shape (line 65) | def test_partial_and_total_fit_same_codebook_shape(
FILE: tests/test_crud.py
function annlite_with_data (line 15) | def annlite_with_data(tmpfile):
function test_get (line 33) | def test_get(annlite_with_data, filter, limit):
function test_update_legal (line 47) | def test_update_legal(annlite_with_data):
function test_update_illegal (line 66) | def test_update_illegal(annlite_with_data):
function test_delete_legal (line 90) | def test_delete_legal(annlite_with_data):
function test_delete_illegal (line 99) | def test_delete_illegal(annlite_with_data):
FILE: tests/test_dump.py
function index_data (line 14) | def index_data():
function test_dump_load (line 23) | def test_dump_load(tmpfile, index_data):
FILE: tests/test_enums.py
function test_metric (line 6) | def test_metric():
FILE: tests/test_filter.py
function test_empty_filter (line 12) | def test_empty_filter():
function test_simple_filter (line 19) | def test_simple_filter():
function test_logic_operator (line 26) | def test_logic_operator():
function test_membership_operator (line 43) | def test_membership_operator():
function test_cases (line 55) | def test_cases():
function test_error_filter (line 104) | def test_error_filter():
function test_filter_with_columns (line 113) | def test_filter_with_columns(tmpfile, columns):
function test_filter_with_dict (line 138) | def test_filter_with_dict(tmpfile, filterable_attrs):
function test_filter_with_limit_offset (line 171) | def test_filter_with_limit_offset(tmpfile, limit, offset, order_by, asce...
function test_filter_with_wrong_columns (line 215) | def test_filter_with_wrong_columns(tmpfile, limit):
FILE: tests/test_hnsw_load_save.py
function build_data (line 13) | def build_data():
function build_hnsw (line 19) | def build_hnsw(build_data):
function test_save_and_load (line 26) | def test_save_and_load(tmpdir, build_hnsw):
function test_loading_from_wrong_path (line 35) | def test_loading_from_wrong_path(tmpfile):
FILE: tests/test_index.py
function annlite_index (line 38) | def annlite_index(tmpfile):
function annlite_with_data (line 48) | def annlite_with_data(tmpfile):
function heterogenenous_da (line 67) | def heterogenenous_da(tmpfile):
function annlite_with_heterogeneous_tags (line 94) | def annlite_with_heterogeneous_tags(tmpfile, heterogenenous_da):
function test_index (line 101) | def test_index(annlite_index):
function test_dtype (line 111) | def test_dtype(annlite_index, dtype):
function test_delete (line 118) | def test_delete(annlite_with_data):
function test_update (line 122) | def test_update(annlite_with_data):
function test_query (line 128) | def test_query(annlite_with_data):
function test_index_query_with_filtering_sorted_results (line 141) | def test_index_query_with_filtering_sorted_results(annlite_with_data):
function test_query_search_filter_float_type (line 155) | def test_query_search_filter_float_type(annlite_with_heterogeneous_tags,...
function test_query_search_numpy_filter_float_type (line 175) | def test_query_search_numpy_filter_float_type(
function test_search_filter_str (line 199) | def test_search_filter_str(annlite_with_heterogeneous_tags, operator):
function test_search_numpy_filter_str (line 218) | def test_search_numpy_filter_str(
function test_search_numpy_membership_filter (line 241) | def test_search_numpy_membership_filter(
function delete_artifact (line 277) | def delete_artifact(tmpname):
function test_local_backup_restore (line 284) | def test_local_backup_restore(tmpdir):
function test_remote_backup_restore (line 303) | def test_remote_backup_restore(tmpdir):
FILE: tests/test_pq_bind.py
function build_data (line 18) | def build_data():
function build_pq_codec (line 24) | def build_pq_codec(build_data):
function test_pq_adc_table_shape (line 31) | def test_pq_adc_table_shape(build_pq_codec):
function test_pq_adc_table_computation (line 36) | def test_pq_adc_table_computation(build_data):
function test_pq_adc_table_computation_interface (line 62) | def test_pq_adc_table_computation_interface(build_pq_codec, build_data):
FILE: tests/test_pq_index.py
function random_docs (line 20) | def random_docs():
function test_pq_index_dist_mat (line 30) | def test_pq_index_dist_mat(random_docs):
function test_hnsw_pq_load_empty (line 52) | def test_hnsw_pq_load_empty(tmpfile, random_docs):
function test_hnsw_pq_load (line 68) | def test_hnsw_pq_load(tmpfile, random_docs):
function test_hnsw_pq_search_multi_clusters (line 81) | def test_hnsw_pq_search_multi_clusters(tmpdir, n_clusters, random_docs):
FILE: tests/test_projector.py
function build_data (line 13) | def build_data():
function build_projector (line 19) | def build_projector(build_data):
function test_encode_decode (line 38) | def test_encode_decode(build_data, build_projector):
function test_save_and_load (line 51) | def test_save_and_load(tmpdir, build_data, build_projector):
FILE: tests/test_projector_index.py
function build_data (line 14) | def build_data():
function build_projector_annlite (line 20) | def build_projector_annlite(tmpfile):
function projector_annlite_with_data (line 26) | def projector_annlite_with_data(build_data, build_projector_annlite):
function test_delete (line 37) | def test_delete(projector_annlite_with_data):
function test_update (line 42) | def test_update(projector_annlite_with_data):
function test_query (line 49) | def test_query(projector_annlite_with_data):
FILE: tests/test_store.py
function test_get (line 6) | def test_get(tmpfile, docs):
function test_update (line 19) | def test_update(tmpfile, docs, update_docs):
function test_delete (line 29) | def test_delete(tmpfile, docs):
function test_clear (line 37) | def test_clear(tmpfile, docs):
function test_batched_iterator (line 46) | def test_batched_iterator(tmpfile, docs):
function test_searalize (line 54) | def test_searalize(tmpfile, protocol, docs):
FILE: tests/test_table.py
function dummy_cell_table (line 8) | def dummy_cell_table():
function sample_docs (line 19) | def sample_docs():
function table_with_data (line 33) | def table_with_data(dummy_cell_table, sample_docs):
function test_create_cell_table (line 38) | def test_create_cell_table():
function test_schema (line 45) | def test_schema(dummy_cell_table):
function test_query (line 50) | def test_query(table_with_data):
function test_get_docid_by_offset (line 59) | def test_get_docid_by_offset(table_with_data):
function test_exist (line 67) | def test_exist(table_with_data):
function test_delete (line 71) | def test_delete(table_with_data):
function test_count (line 80) | def test_count(table_with_data):
function test_create_meta_table (line 109) | def test_create_meta_table(tmpdir):
================================================
CONDENSED PREVIEW (97 files — path, character count, and a content snippet each; full structured content is 1,223K chars)
================================================
[
{
"path": ".gitattributes",
"chars": 149,
"preview": "notebooks/* linguist-vendored\ninclude/* linguist-vendored\nbindings/* linguist-vendored\n*.h linguist-detectable=false\n*.c"
},
{
"path": ".github/release-template.ejs",
"chars": 4608,
"preview": "<% var groupCommits = [\n{\n name: 'breaking',\n show: true,\n list: []\n}, {\n name: 'feat',\n show: true,\n "
},
{
"path": ".github/requirements-test.txt",
"chars": 31,
"preview": "pytest\npytest-custom_exit_code\n"
},
{
"path": ".github/workflows/cd.yml",
"chars": 5541,
"preview": "name: CD\n\non:\n push:\n branches:\n - main\n\njobs:\n prep-testbed:\n if: |\n !startsWith(github.event.head_co"
},
{
"path": ".github/workflows/ci.yml",
"chars": 6843,
"preview": "name: CI\n\non:\n pull_request:\n\njobs:\n commit-lint:\n runs-on: ubuntu-latest\n steps:\n - name: find the prev wa"
},
{
"path": ".github/workflows/force-release.yml",
"chars": 3013,
"preview": "name: Manual Release\n\non:\n workflow_dispatch:\n inputs:\n release_token:\n description: 'Your release token"
},
{
"path": ".github/workflows/tag.yml",
"chars": 910,
"preview": "name: Release CD\n\non:\n push:\n tags:\n - \"v*\" # push to version tags trigger the build\n\n#on:\n# push:\n# branc"
},
{
"path": ".gitignore",
"chars": 1600,
"preview": "# Initially taken from Github's Python gitignore file\n# local dev\n.vscode\nbindings/pq_bindings.cpp\n# Byte-compiled / opt"
},
{
"path": ".pre-commit-config.yaml",
"chars": 1310,
"preview": "repos:\n #- repo: https://github.com/terrencepreilly/darglint\n # rev: v1.5.8\n # hooks:\n # - id: darglint\n # fi"
},
{
"path": "CHANGELOG.md",
"chars": 26318,
"preview": "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n<a name=release-note-0-3-0></a>\n## Release Note (`0.3.0`)\n\n> Release time: 2022-03-04 11:49:0"
},
{
"path": "LICENSE",
"chars": 11357,
"preview": " Apache License\n Version 2.0, January 2004\n "
},
{
"path": "MANIFEST.in",
"chars": 123,
"preview": "include setup.py\ninclude requirements.txt\ninclude pyproject.toml\nglobal-include *.pyx\nrecursive-include include/hnswlib/"
},
{
"path": "Makefile",
"chars": 272,
"preview": "pypi: dist\n\ttwine upload dist/*\n\ndist:\n\trm -rf dist/*\n\tpip install build\n\tpython -m build --sdist\n\ntest:\n\tpython -m unit"
},
{
"path": "README.md",
"chars": 14959,
"preview": "<p align=\"center\">\n<br>\n<br>\n<br>\n<img src=\"https://github.com/jina-ai/annlite/blob/main/.github/assets/logo.svg?raw=tru"
},
{
"path": "annlite/__init__.py",
"chars": 51,
"preview": "__version__ = '0.5.11'\n\nfrom .index import AnnLite\n"
},
{
"path": "annlite/container.py",
"chars": 15114,
"preview": "import warnings\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Dict, List, Optional, Tuple\n\nimport numpy as "
},
{
"path": "annlite/core/__init__.py",
"chars": 52,
"preview": "from .codec import PQCodec, ProjectorCodec, VQCodec\n"
},
{
"path": "annlite/core/codec/__init__.py",
"chars": 86,
"preview": "from .pq import PQCodec\nfrom .projector import ProjectorCodec\nfrom .vq import VQCodec\n"
},
{
"path": "annlite/core/codec/base.py",
"chars": 888,
"preview": "import pickle\nfrom abc import ABC, abstractmethod\nfrom typing import TYPE_CHECKING\n\nif TYPE_CHECKING:\n from pathlib i"
},
{
"path": "annlite/core/codec/pq.py",
"chars": 13397,
"preview": "from argparse import ArgumentError\n\nimport numpy as np\nfrom scipy.cluster.vq import vq\n\nfrom annlite import pq_bind\n\nfro"
},
{
"path": "annlite/core/codec/projector.py",
"chars": 5172,
"preview": "from typing import Optional\n\nimport numpy as np\n\nfrom .base import BaseCodec\n\n\nclass ProjectorCodec(BaseCodec):\n \"\"\"I"
},
{
"path": "annlite/core/codec/vq.py",
"chars": 2717,
"preview": "import numpy as np\nfrom scipy.cluster.vq import vq\n\nfrom ...enums import Metric\nfrom .base import BaseCodec\n\n\nclass VQCo"
},
{
"path": "annlite/core/index/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "annlite/core/index/base.py",
"chars": 1426,
"preview": "import abc\nfrom typing import List, Optional, Union\n\nimport numpy as np\n\nfrom ...enums import ExpandMode, Metric\nfrom .."
},
{
"path": "annlite/core/index/flat_index.py",
"chars": 2178,
"preview": "from typing import List, Optional\n\nimport numpy as np\nfrom loguru import logger\n\nfrom ...math import cdist, top_k\nfrom ."
},
{
"path": "annlite/core/index/hnsw/__init__.py",
"chars": 29,
"preview": "from .index import HnswIndex\n"
},
{
"path": "annlite/core/index/hnsw/index.py",
"chars": 6835,
"preview": "import math\nimport os.path\nfrom functools import wraps\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, List, "
},
{
"path": "annlite/core/index/pq_index.py",
"chars": 1548,
"preview": "from typing import List, Optional\n\nimport numpy as np\n\nfrom ...math import top_k\nfrom ..codec.pq import PQCodec\nfrom .fl"
},
{
"path": "annlite/enums.py",
"chars": 704,
"preview": "from enum import IntEnum\n\n\nclass BetterEnum(IntEnum):\n \"\"\"The base class of Enum.\"\"\"\n\n def __str__(self):\n "
},
{
"path": "annlite/executor.py",
"chars": 14801,
"preview": "import threading\nimport time\nimport traceback\nimport warnings\nfrom threading import Thread\nfrom typing import Dict, List"
},
{
"path": "annlite/filter.py",
"chars": 3557,
"preview": "from typing import Dict\n\nLOGICAL_OPERATORS = {'$and': 'AND', '$or': 'OR'}\n\nCOMPARISON_OPERATORS = {\n '$lt': '<',\n "
},
{
"path": "annlite/helper.py",
"chars": 1086,
"preview": "import sys\n\nimport numpy as np\nfrom loguru import logger\n\n\ndef setup_logging(debug: bool):\n \"\"\"\n Setup the log for"
},
{
"path": "annlite/hubble_tools.py",
"chars": 9249,
"preview": "import os\nimport platform\nimport shutil\nimport time\nfrom pathlib import Path\nfrom typing import Optional, Union\n\nfrom fi"
},
{
"path": "annlite/index.py",
"chars": 35233,
"preview": "import hashlib\nimport logging\nimport os\nimport platform\nimport warnings\nfrom pathlib import Path\nfrom typing import TYPE"
},
{
"path": "annlite/math.py",
"chars": 3796,
"preview": "from typing import Tuple\n\nimport numpy as np\n\n\ndef l2_normalize(x: 'np.ndarray', eps: float = np.finfo(np.float32).eps):"
},
{
"path": "annlite/profile.py",
"chars": 2371,
"preview": "import cProfile\nimport pstats\nimport random\nfrom functools import wraps\n\nrandom.seed(20)\n\ntry:\n import builtins\n\n "
},
{
"path": "annlite/storage/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "annlite/storage/base.py",
"chars": 1300,
"preview": "import abc\nfrom typing import TYPE_CHECKING, List, Optional\n\nif TYPE_CHECKING:\n import numpy as np\n\nfrom ..enums impo"
},
{
"path": "annlite/storage/kv.py",
"chars": 4702,
"preview": "import time\nimport warnings\nfrom pathlib import Path\nfrom typing import Dict, List, Union\n\nfrom docarray import Document"
},
{
"path": "annlite/storage/table.py",
"chars": 14184,
"preview": "import datetime\nimport sqlite3\nimport threading\nfrom pathlib import Path\nfrom typing import TYPE_CHECKING, Any, List, Op"
},
{
"path": "annlite/utils.py",
"chars": 1908,
"preview": "import os\nimport shutil\n\nimport numpy as np\nfrom docarray import Document, DocumentArray\n\n\ndef clean_workspace():\n if"
},
{
"path": "benchmarks/filtering_bench.py",
"chars": 2789,
"preview": "import os\nimport shutil\nimport tempfile\n\nimport numpy as np\nfrom jina import Document, DocumentArray\nfrom jina.logging.p"
},
{
"path": "benchmarks/hnsw_bench.py",
"chars": 3684,
"preview": "import tempfile\nimport time\nfrom datetime import date\n\nimport numpy as np\nimport pandas as pd\nfrom docarray import Docum"
},
{
"path": "bindings/hnsw_bindings.cpp",
"chars": 38735,
"preview": "#include \"hnswlib.h\"\n#include <Python.h>\n#include <assert.h>\n#include <atomic>\n#include <iostream>\n#include <memory>\n#in"
},
{
"path": "bindings/pq_bindings.pyx",
"chars": 9965,
"preview": "# distutils: language = c++\n\nimport numpy as np\n\ncimport cython\nfrom libc.stdint cimport (\n int8_t,\n int16_t,\n "
},
{
"path": "examples/annlite_vs_simpleindexer.py",
"chars": 6011,
"preview": "import os\nimport shutil\nimport tempfile\nimport time\n\nimport numpy as np\nimport pandas as pd\nfrom jina import Document, D"
},
{
"path": "examples/filter_example.py",
"chars": 2651,
"preview": "import os\nimport random\nimport shutil\n\nimport numpy as np\nfrom jina import Document, DocumentArray\nfrom jina.logging.pro"
},
{
"path": "examples/hnsw_example.py",
"chars": 1570,
"preview": "import random\nimport tempfile\n\nimport numpy as np\nfrom docarray import Document, DocumentArray\n\nfrom annlite import AnnL"
},
{
"path": "examples/pq_benchmark.py",
"chars": 2577,
"preview": "import time\nfrom datetime import date\n\nimport numpy as np\nimport pandas as pd\nfrom docarray import Document, DocumentArr"
},
{
"path": "examples/pqlinearscann_benchmark_with_filtering.py",
"chars": 3694,
"preview": "import numpy as np\nfrom docarray import Document, DocumentArray\nfrom docarray.math.distance import cdist\nfrom docarray.m"
},
{
"path": "examples/utils.py",
"chars": 1908,
"preview": "import os\nimport shutil\n\nimport numpy as np\nfrom docarray import Document, DocumentArray\n\n\ndef clean_workspace():\n if"
},
{
"path": "executor/Dockerfile",
"chars": 291,
"preview": "FROM jinaai/jina:3-py38-perf\n\nRUN apt-get update && apt-get install --no-install-recommends -y gcc g++ git \\\n && rm -"
},
{
"path": "executor/README.md",
"chars": 3847,
"preview": "# AnnLiteIndexer\n\n`AnnLiteIndexer` uses the [AnnLite](https://github.com/jina-ai/annlite) class for indexing Jina `Docum"
},
{
"path": "executor/benchmark.py",
"chars": 1953,
"preview": "import tempfile\nimport time\n\nimport numpy as np\nfrom jina import DocumentArray\nfrom jina.logging.profile import TimeCont"
},
{
"path": "executor/config.yml",
"chars": 250,
"preview": "jtype: AnnLiteIndexer\npy_modules:\n - ./executor.py\nmetas:\n name: AnnLiteIndexer_v3\n description: A similarity search "
},
{
"path": "executor/executor.py",
"chars": 14566,
"preview": "import threading\nimport time\nimport traceback\nimport warnings\nfrom threading import Thread\nfrom typing import Dict, List"
},
{
"path": "executor/requirements.txt",
"chars": 25,
"preview": "annlite\ncertifi\ndocarray\n"
},
{
"path": "include/hnswlib/bruteforce.h",
"chars": 5519,
"preview": "#pragma once\n#include <algorithm>\n#include <fstream>\n#include <mutex>\n#include <unordered_map>\n\nnamespace hnswlib {\n "
},
{
"path": "include/hnswlib/fusefilter.h",
"chars": 25159,
"preview": "#ifndef BINARYFUSEFILTER_H\n#define BINARYFUSEFILTER_H\n#include <math.h>\n#include <stdbool.h>\n#include <stddef.h>\n#includ"
},
{
"path": "include/hnswlib/hnswalg.h",
"chars": 60442,
"preview": "#pragma once\n\n#include \"hnswlib.h\"\n#include \"visited_list_pool.h\"\n#include <assert.h>\n#include <atomic>\n#include <list>\n"
},
{
"path": "include/hnswlib/hnswlib.h",
"chars": 5372,
"preview": "#pragma once\n#ifndef NO_MANUAL_VECTORIZATION\n#ifdef __SSE__\n#define USE_SSE\n#ifdef __AVX__\n#define USE_AVX\n#ifdef __AVX5"
},
{
"path": "include/hnswlib/space_ip.h",
"chars": 12613,
"preview": "#pragma once\n#include \"hnswlib.h\"\n\nnamespace hnswlib {\n\n static float\n InnerProduct(const void *pVect1, const void"
},
{
"path": "include/hnswlib/space_l2.h",
"chars": 10162,
"preview": "#pragma once\n#include \"hnswlib.h\"\n\nnamespace hnswlib {\n\n static float\n L2Sqr(const void *pVect1v, const void *pVec"
},
{
"path": "include/hnswlib/space_pq.h",
"chars": 2203,
"preview": "#pragma once\n#include \"hnswlib.h\"\n#include <math.h>\n#include <memory>\n#include <stdint.h>\nnamespace hnswlib {\n\ntypedef s"
},
{
"path": "include/hnswlib/visited_list_pool.h",
"chars": 1995,
"preview": "#pragma once\n\n#include <mutex>\n#include <string.h>\n#include <deque>\n\nnamespace hnswlib {\n typedef unsigned short int "
},
{
"path": "notebooks/fashion_product_search.ipynb",
"chars": 685141,
"preview": "{\n \"nbformat\": 4,\n \"nbformat_minor\": 0,\n \"metadata\": {\n \"colab\": {\n \"name\": \"fashion_product_search.ipynb\",\n "
},
{
"path": "pyproject.toml",
"chars": 165,
"preview": "[build-system]\nrequires = [\n \"setuptools>=42\",\n \"wheel\",\n \"cython\",\n \"numpy>=1.10.0\",\n \"pybind11>=2.9.0\","
},
{
"path": "requirements.txt",
"chars": 126,
"preview": "click\ncython\ndocarray>=0.13.16,<0.30.0\ndocarray[common]>=0.13.16,<0.30.0\nfilesplit\nloguru\nnumpy\nrocksdict>=0.3.9\nscikit-"
},
{
"path": "scripts/black.sh",
"chars": 396,
"preview": "#!/bin/bash\npip install black==22.3.0\narrVar=()\necho we ignore non-*.py files\nexcluded_files=(\n docs/conf.py\n)\nfor cha"
},
{
"path": "scripts/get-all-test-paths.sh",
"chars": 285,
"preview": "#!/usr/bin/env bash\n\nset -ex\n\nBATCH_SIZE=5\n\ndeclare -a array1=( \"tests/test_*.py\" )\ndeclare -a array2=( \"tests/docarray/"
},
{
"path": "scripts/get-last-release-note.py",
"chars": 292,
"preview": "## under jina root dir\n# python scripts/get-last-release-note.py\n## result in root/tmp.md\n\nwith open('CHANGELOG.md') as "
},
{
"path": "scripts/release.sh",
"chars": 3504,
"preview": "#!/usr/bin/env bash\n\n# Requirements\n# brew install hub\n# npm install -g git-release-notes\n# pip install twine wheel\n\nset"
},
{
"path": "scripts/update-version.sh",
"chars": 2561,
"preview": "#!/usr/bin/env bash\n\n# Requirements\n# brew install hub\n# npm install -g git-release-notes\n# pip install twine wheel\n\nset"
},
{
"path": "setup.py",
"chars": 6825,
"preview": "import os\nimport platform\nimport sys\nfrom distutils.sysconfig import get_python_inc\n\nimport numpy as np\nimport pybind11\n"
},
{
"path": "tests/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "tests/conftest.py",
"chars": 1052,
"preview": "import tempfile\n\nimport numpy as np\nimport pytest\nfrom docarray import Document, DocumentArray\n\n\n@pytest.fixture(scope='"
},
{
"path": "tests/docarray/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "tests/docarray/test_add.py",
"chars": 649,
"preview": "import pytest\nfrom docarray import DocumentArray\n\n\ndef test_add(docs):\n annlite_doc = DocumentArray(\n storage="
},
{
"path": "tests/docarray/test_del.py",
"chars": 1763,
"preview": "import pytest\nfrom docarray import Document, DocumentArray\n\n\n@pytest.mark.parametrize('deleted_elmnts', [[0, 1], ['r0', "
},
{
"path": "tests/docarray/test_find.py",
"chars": 555,
"preview": "import numpy as np\nfrom docarray import Document, DocumentArray\n\n\ndef test_find():\n nrof_docs = 1000\n num_candidat"
},
{
"path": "tests/docarray/test_get.py",
"chars": 1040,
"preview": "import numpy as np\nimport pytest\nfrom docarray import Document, DocumentArray\n\n\n@pytest.mark.parametrize('nrof_docs', [1"
},
{
"path": "tests/docarray/test_save_load.py",
"chars": 663,
"preview": "import numpy as np\nimport pytest\nfrom docarray import Document, DocumentArray\n\n\ndef test_save_load(tmpfile):\n N = 100"
},
{
"path": "tests/executor/__init__.py",
"chars": 0,
"preview": ""
},
{
"path": "tests/executor/test_executor.py",
"chars": 10818,
"preview": "import os\nimport time\nimport uuid\nfrom unittest.mock import patch\n\nimport hubble\nimport numpy as np\nimport pytest\nfrom j"
},
{
"path": "tests/test_codec.py",
"chars": 1637,
"preview": "import numpy as np\nimport pytest\n\nfrom annlite.core.codec.pq import PQCodec\n\nn_examples = 1000\nn_features = 128\nn_querie"
},
{
"path": "tests/test_crud.py",
"chars": 2772,
"preview": "import random\n\nimport numpy as np\nimport pytest\nfrom docarray import Document, DocumentArray\n\nfrom annlite import AnnLit"
},
{
"path": "tests/test_dump.py",
"chars": 1097,
"preview": "import numpy as np\nimport pytest\nfrom docarray import Document, DocumentArray\n\nfrom annlite import AnnLite\n\nnp.random.se"
},
{
"path": "tests/test_enums.py",
"chars": 152,
"preview": "import pytest\n\nfrom annlite.enums import Metric\n\n\ndef test_metric():\n m = Metric.EUCLIDEAN\n\n assert m.name == 'EUC"
},
{
"path": "tests/test_filter.py",
"chars": 6980,
"preview": "import random\nimport tempfile\n\nimport numpy as np\nimport pytest\nfrom docarray import Document, DocumentArray\n\nfrom annli"
},
{
"path": "tests/test_hnsw_load_save.py",
"chars": 896,
"preview": "import os\n\nimport numpy as np\nimport pytest\n\nfrom annlite.core.index.hnsw import HnswIndex\n\nn_examples = 100\nn_features "
},
{
"path": "tests/test_index.py",
"chars": 9856,
"preview": "import operator\nimport os\nimport random\nimport uuid\nfrom unittest.mock import patch\n\nimport hubble\nimport numpy as np\nim"
},
{
"path": "tests/test_pq_bind.py",
"chars": 2189,
"preview": "import numpy as np\nimport pytest\n\nfrom annlite.core.codec.pq import PQCodec\nfrom annlite.pq_bind import dist_pqcodes_to_"
},
{
"path": "tests/test_pq_index.py",
"chars": 5110,
"preview": "from time import time\n\nimport numpy as np\nimport pytest\nfrom docarray import Document, DocumentArray\nfrom loguru import "
},
{
"path": "tests/test_projector.py",
"chars": 1847,
"preview": "import numpy as np\nimport pytest\n\nfrom annlite.core.codec.projector import ProjectorCodec\n\nn_examples = 1000\nn_features "
},
{
"path": "tests/test_projector_index.py",
"chars": 1584,
"preview": "import numpy as np\nimport pytest\nfrom docarray import Document, DocumentArray\n\nfrom annlite.index import AnnLite\n\nn_exam"
},
{
"path": "tests/test_store.py",
"chars": 1455,
"preview": "import pytest\n\nfrom annlite.storage.kv import DocStorage\n\n\ndef test_get(tmpfile, docs):\n storage = DocStorage(tmpfile"
},
{
"path": "tests/test_table.py",
"chars": 3639,
"preview": "import pytest\nfrom docarray import Document, DocumentArray\n\nfrom annlite.storage.table import CellTable, MetaTable\n\n\n@py"
}
]
================================================
ABOUT THIS EXTRACTION
================================================
This document contains the source code of the jina-ai/annlite GitHub repository, extracted and formatted as plain text for AI agents and large language models (LLMs). The extraction covers 97 files (1.1 MB, roughly 564.4k tokens) and a symbol index with 487 extracted functions, classes, methods, constants, and types. Extracted by GitExtract (built by Nikandr Surkov).