Repository: asg017/sqlite-vec
Branch: main
Commit: e7ae41b76192
Files: 219
Total size: 1.0 MB
Directory structure:
gitextract_v4s81yo1/
├── .github/
│ └── workflows/
│ ├── fuzz.yaml
│ ├── release.yaml
│ ├── site.yaml
│ └── test.yaml
├── .gitignore
├── ARCHITECTURE.md
├── LICENSE-APACHE
├── LICENSE-MIT
├── Makefile
├── README.md
├── SECURITY.md
├── TODO
├── VERSION
├── benchmarks/
│ ├── README.md
│ ├── exhaustive-memory/
│ │ ├── .gitignore
│ │ ├── Makefile
│ │ ├── README.md
│ │ ├── bench.py
│ │ ├── gist.suite
│ │ ├── requirements.txt
│ │ └── sift.suite
│ ├── micro/
│ │ ├── .gitignore
│ │ ├── Cargo.toml
│ │ ├── benches/
│ │ │ └── my_benchmark.rs
│ │ ├── build.rs
│ │ └── src/
│ │ └── lib.rs
│ ├── profiling/
│ │ ├── build-from-npy.sql
│ │ └── query-k.sql
│ └── self-params/
│ ├── build.py
│ ├── knn.py
│ └── test.py
├── bindings/
│ ├── go/
│ │ └── ncruces/
│ │ └── go-sqlite3.patch
│ ├── python/
│ │ └── extra_init.py
│ └── rust/
│ ├── .gitignore
│ ├── Cargo.toml.tmpl
│ ├── Makefile
│ ├── build.rs
│ └── src/
│ └── lib.rs
├── examples/
│ ├── nbc-headlines/
│ │ ├── .gitignore
│ │ ├── 1_scrape.ipynb
│ │ ├── 2_build.ipynb
│ │ ├── 3_search.ipynb
│ │ ├── Makefile
│ │ └── README.md
│ ├── python-recipes/
│ │ └── openai-sample.py
│ ├── simple-bun/
│ │ ├── .gitignore
│ │ ├── demo.ts
│ │ └── package.json
│ ├── simple-c/
│ │ ├── .gitignore
│ │ ├── Makefile
│ │ └── demo.c
│ ├── simple-deno/
│ │ └── demo.ts
│ ├── simple-go-cgo/
│ │ ├── .gitignore
│ │ ├── Makefile
│ │ ├── demo.go
│ │ ├── go.mod
│ │ └── go.sum
│ ├── simple-go-ncruces/
│ │ ├── .gitignore
│ │ ├── Makefile
│ │ ├── demo.go
│ │ ├── go.mod
│ │ └── go.sum
│ ├── simple-node/
│ │ ├── .gitignore
│ │ ├── demo.mjs
│ │ └── package.json
│ ├── simple-node2/
│ │ ├── .gitignore
│ │ ├── demo.mjs
│ │ ├── package.json
│ │ └── tmp.mjs
│ ├── simple-python/
│ │ ├── .gitignore
│ │ ├── demo.py
│ │ └── requirements.txt
│ ├── simple-ruby/
│ │ ├── .gitignore
│ │ ├── Gemfile
│ │ └── demo.rb
│ ├── simple-rust/
│ │ ├── .gitignore
│ │ ├── Cargo.toml
│ │ └── demo.rs
│ ├── simple-sqlite/
│ │ └── demo.sql
│ ├── simple-wasm/
│ │ └── index.html
│ ├── sqlite3-cli/
│ │ ├── README.md
│ │ └── core_init.c
│ └── wasm/
│ ├── README.md
│ └── wasm.c
├── reference.yaml
├── scripts/
│ ├── progress.ts
│ ├── publish-release.sh
│ └── vendor.sh
├── site/
│ ├── .gitignore
│ ├── .vitepress/
│ │ ├── config.mts
│ │ └── theme/
│ │ ├── HeroImg.vue
│ │ ├── Sponsors.vue
│ │ ├── index.ts
│ │ └── style.css
│ ├── api-reference.md
│ ├── build-ref.mjs
│ ├── compiling.md
│ ├── features/
│ │ ├── knn.md
│ │ └── vec0.md
│ ├── getting-started/
│ │ ├── installation.md
│ │ └── introduction.md
│ ├── guides/
│ │ ├── arithmetic.md
│ │ ├── binary-quant.md
│ │ ├── classifiers.md
│ │ ├── hybrid-search.md
│ │ ├── matryoshka.md
│ │ ├── performance.md
│ │ ├── rag.md
│ │ ├── scalar-quant.md
│ │ └── semantic-search.md
│ ├── index.md
│ ├── package.json
│ ├── project.data.ts
│ ├── public/
│ │ └── fonts/
│ │ └── ZillaSlab-SemiBold.otf
│ ├── sqlite.tmlanguage.json
│ ├── using/
│ │ ├── android-ios.md
│ │ ├── c.md
│ │ ├── datasette.md
│ │ ├── go.md
│ │ ├── js.md
│ │ ├── python.md
│ │ ├── rqlite.md
│ │ ├── ruby.md
│ │ ├── rust.md
│ │ ├── sqlite-utils.md
│ │ └── wasm.md
│ └── versioning.md
├── sqlite-dist.toml
├── sqlite-vec.c
├── sqlite-vec.h.tmpl
├── test.sql
├── tests/
│ ├── .gitignore
│ ├── .python-version
│ ├── Cargo.toml
│ ├── __snapshots__/
│ │ ├── test-auxiliary.ambr
│ │ ├── test-general.ambr
│ │ ├── test-insert-delete.ambr
│ │ ├── test-knn-distance-constraints.ambr
│ │ ├── test-metadata.ambr
│ │ └── test-partition-keys.ambr
│ ├── afbd/
│ │ ├── .gitignore
│ │ ├── .python-version
│ │ ├── Makefile
│ │ ├── README.md
│ │ └── test-afbd.py
│ ├── build.rs
│ ├── conftest.py
│ ├── correctness/
│ │ ├── build.py
│ │ └── test-correctness.py
│ ├── fuzz/
│ │ ├── .gitignore
│ │ ├── Makefile
│ │ ├── README.md
│ │ ├── TODO.md
│ │ ├── corpus/
│ │ │ ├── exec/
│ │ │ │ ├── select1
│ │ │ │ └── vec_version
│ │ │ ├── json/
│ │ │ │ ├── empty
│ │ │ │ ├── valid_2d
│ │ │ │ └── valid_4d
│ │ │ ├── shadow-corrupt/
│ │ │ │ ├── target0
│ │ │ │ ├── target1
│ │ │ │ ├── target2
│ │ │ │ ├── target3
│ │ │ │ ├── target4
│ │ │ │ └── target5
│ │ │ ├── vec-mismatch/
│ │ │ │ ├── dim_mismatch_4d_2d
│ │ │ │ ├── json2d_invalid_blob
│ │ │ │ ├── json4d_invalid_blob
│ │ │ │ ├── json_1d_blob_5byte
│ │ │ │ ├── json_2d_blob_3byte
│ │ │ │ ├── json_valid_blob_invalid
│ │ │ │ ├── json_valid_empty
│ │ │ │ ├── single_f32_bad_text
│ │ │ │ ├── single_normalize_json
│ │ │ │ ├── type_mismatch_f32_bit
│ │ │ │ └── type_mismatch_f32_int8
│ │ │ ├── vec0-create/
│ │ │ │ ├── normal1
│ │ │ │ └── normal2
│ │ │ └── vec0-operations/
│ │ │ ├── ins_del_ins
│ │ │ └── insert5
│ │ ├── exec.c
│ │ ├── exec.dict
│ │ ├── json.c
│ │ ├── metadata-columns.c
│ │ ├── numpy.c
│ │ ├── numpy.dict
│ │ ├── scalar-functions.c
│ │ ├── scalar-functions.dict
│ │ ├── shadow-corrupt.c
│ │ ├── targets/
│ │ │ └── .gitignore
│ │ ├── vec-each.c
│ │ ├── vec-mismatch.c
│ │ ├── vec0-create-full.c
│ │ ├── vec0-create.c
│ │ ├── vec0-create.dict
│ │ ├── vec0-delete-completeness.c
│ │ └── vec0-operations.c
│ ├── fuzz.py
│ ├── helpers.py
│ ├── leak-fixtures/
│ │ ├── each.sql
│ │ ├── knn.sql
│ │ └── vec0-create.sql
│ ├── minimum/
│ │ ├── .gitignore
│ │ ├── Makefile
│ │ └── demo.c
│ ├── pyproject.toml
│ ├── skip.test-correctness.py
│ ├── sqlite-vec-internal.h
│ ├── test-auxiliary.py
│ ├── test-general.py
│ ├── test-insert-delete.py
│ ├── test-knn-distance-constraints.py
│ ├── test-loadable.py
│ ├── test-metadata.py
│ ├── test-partition-keys.py
│ ├── test-unit.c
│ ├── test-wasm.mjs
│ ├── unittest.rs
│ └── utils.py
└── tmp-static.py
================================================
FILE CONTENTS
================================================
================================================
FILE: .github/workflows/fuzz.yaml
================================================
name: "Fuzz"
on:
push:
branches: [main]
schedule:
# Nightly at 2am UTC for longer fuzzing sessions
- cron: "0 2 * * *"
workflow_dispatch:
inputs:
duration:
description: "Fuzz duration per target (seconds)"
default: "60"
permissions:
contents: read
jobs:
fuzz-linux:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
- name: Install LLVM 18
run: |
wget -qO- https://apt.llvm.org/llvm.sh | sudo bash -s -- 18
echo "FUZZ_CC=clang-18" >> $GITHUB_ENV
- run: ./scripts/vendor.sh
- name: Generate sqlite-vec.h
run: make sqlite-vec.h
- name: Build fuzz targets
run: make -C tests/fuzz all FUZZ_CC=$FUZZ_CC FUZZ_LDFLAGS=
- name: Run fuzz targets
run: |
DURATION=${{ github.event.inputs.duration || '60' }}
EXIT_CODE=0
for target in tests/fuzz/targets/*; do
[ -f "$target" ] && [ -x "$target" ] || continue
name=$(basename "$target")
echo "::group::Fuzzing $name ($DURATION seconds)"
corpus="tests/fuzz/corpus/$name"
mkdir -p "$corpus"
dict="tests/fuzz/${name//_/-}.dict"
dict_flag=""
[ -f "$dict" ] && dict_flag="-dict=$dict"
if ! ASAN_OPTIONS=detect_leaks=1 "$target" $dict_flag \
-max_total_time="$DURATION" "$corpus" 2>&1; then
echo "::error::Fuzz target $name found a crash!"
EXIT_CODE=1
fi
echo "::endgroup::"
done
exit $EXIT_CODE
- name: Upload crash artifacts
if: failure()
uses: actions/upload-artifact@v4
with:
name: fuzz-crashes-linux
path: |
crash-*
leak-*
timeout-*
fuzz-macos:
runs-on: macos-14
# Best-effort: Homebrew LLVM 18 runtime dylibs pull in
# __ZnwmSt19__type_descriptor_t (typed allocation ABI) which
# macOS 14's system libc++ doesn't provide, causing dyld to abort.
# Xcode clang doesn't ship libclang_rt.fuzzer_osx.a (no libFuzzer).
# TODO: fix macOS fuzzing (pin older compiler-rt, or static runtime).
continue-on-error: true
steps:
- uses: actions/checkout@v4
- name: Install LLVM 18
run: brew install llvm@18
- run: ./scripts/vendor.sh
- name: Generate sqlite-vec.h
run: make sqlite-vec.h
- name: Build fuzz targets
run: |
LLVM=/opt/homebrew/opt/llvm@18
make -C tests/fuzz all \
FUZZ_CC=$LLVM/bin/clang \
FUZZ_LDFLAGS="-Wl,-ld_classic"
- name: Run fuzz targets
run: |
DURATION=${{ github.event.inputs.duration || '60' }}
EXIT_CODE=0
for target in tests/fuzz/targets/*; do
[ -f "$target" ] && [ -x "$target" ] || continue
name=$(basename "$target")
echo "::group::Fuzzing $name ($DURATION seconds)"
corpus="tests/fuzz/corpus/$name"
mkdir -p "$corpus"
dict="tests/fuzz/${name//_/-}.dict"
dict_flag=""
[ -f "$dict" ] && dict_flag="-dict=$dict"
if ! "$target" $dict_flag \
-max_total_time="$DURATION" "$corpus" 2>&1; then
echo "::error::Fuzz target $name found a crash!"
EXIT_CODE=1
fi
echo "::endgroup::"
done
exit $EXIT_CODE
- name: Upload crash artifacts
if: failure()
uses: actions/upload-artifact@v4
with:
name: fuzz-crashes-macos
path: |
crash-*
leak-*
timeout-*
fuzz-windows:
# Best-effort: libFuzzer works on Windows via LLVM but ASAN/UBSAN
# support is less reliable. Leak detection is not available.
runs-on: windows-2022
continue-on-error: true
steps:
- uses: actions/checkout@v4
- name: Install LLVM
run: choco install llvm -y
- run: bash ./scripts/vendor.sh
shell: bash
- name: Generate sqlite-vec.h
shell: bash
run: make sqlite-vec.h
- name: Build fuzz targets
shell: bash
run: |
export PATH="/c/Program Files/LLVM/bin:$PATH"
cd tests/fuzz
mkdir -p targets
for src in *.c; do
name="${src%.c}"
target_name="${name//-/_}"
echo "Building $target_name from $src"
clang -fsanitize=address,fuzzer \
-I ../../ -I ../../vendor -DSQLITE_CORE -g \
../../vendor/sqlite3.c ../../sqlite-vec.c \
"$src" -o "targets/${target_name}.exe" || {
echo "Warning: failed to build $target_name (best-effort)"
}
done
- name: Run fuzz targets
shell: bash
run: |
export PATH="/c/Program Files/LLVM/bin:$PATH"
DURATION=${{ github.event.inputs.duration || '60' }}
for target in tests/fuzz/targets/*.exe; do
[ -f "$target" ] || continue
name=$(basename "$target" .exe)
echo "=== Fuzzing $name ($DURATION seconds) ==="
corpus="tests/fuzz/corpus/$name"
mkdir -p "$corpus"
"$target" -max_total_time="$DURATION" "$corpus" 2>&1 || {
echo "Warning: $name found an issue or failed"
}
done
- name: Upload crash artifacts
if: failure()
uses: actions/upload-artifact@v4
with:
name: fuzz-crashes-windows
path: |
tests/fuzz/crash-*
tests/fuzz/leak-*
================================================
FILE: .github/workflows/release.yaml
================================================
name: "Release"
on:
release:
types: [published]
permissions:
contents: read
jobs:
build-linux-x86_64-extension:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
- run: ./scripts/vendor.sh
- run: make loadable static
- uses: actions/upload-artifact@v4
with:
name: sqlite-vec-linux-x86_64-extension
path: dist/*
build-macos-x86_64-extension:
runs-on: macos-15-intel
steps:
- uses: actions/checkout@v4
- run: ./scripts/vendor.sh
- run: make loadable static
- uses: actions/upload-artifact@v4
with:
name: sqlite-vec-macos-x86_64-extension
path: dist/*
build-macos-aarch64-extension:
runs-on: macos-14
steps:
- uses: actions/checkout@v4
- run: ./scripts/vendor.sh
- run: make loadable static
- uses: actions/upload-artifact@v4
with:
name: sqlite-vec-macos-aarch64-extension
path: dist/*
build-windows-x86_64-extension:
runs-on: windows-2022
permissions:
contents: write
steps:
- uses: actions/checkout@v4
- uses: ilammy/msvc-dev-cmd@v1
- uses: actions/setup-python@v5
with:
python-version: "3.12"
- run: ./scripts/vendor.sh
shell: bash
- run: make sqlite-vec.h
- run: mkdir dist
- run: cl.exe /fPIC -shared /W4 /Ivendor/ /O2 /LD sqlite-vec.c -o dist/vec0.dll
- uses: actions/upload-artifact@v4
with:
name: sqlite-vec-windows-x86_64-extension
path: dist/*
build-linux-aarch64-extension:
runs-on: ubuntu-22.04-arm
steps:
- uses: actions/checkout@v4
- run: ./scripts/vendor.sh
- run: make loadable static
- uses: actions/upload-artifact@v4
with:
name: sqlite-vec-linux-aarch64-extension
path: dist/*
build-cosmopolitan:
runs-on: macos-latest
permissions:
contents: write
steps:
- uses: actions/checkout@v4
- run: |
mkdir $HOME/cosmo
curl -L -o cosmocc-$COSMO_VERSION.zip https://github.com/jart/cosmopolitan/releases/download/$COSMO_VERSION/cosmocc-$COSMO_VERSION.zip
unzip cosmocc-$COSMO_VERSION.zip -d $HOME/cosmo
env:
COSMO_VERSION: "3.5.4"
- run: ./scripts/vendor.sh
- run: make cli CC=$HOME/cosmo/bin/cosmocc AR=$HOME/cosmo/bin/cosmoar OMIT_SIMD=1
- run: tar -czvf sqlite-vec-$(cat VERSION)-cli-cosmopolitan.tar.gz dist/sqlite3
- run: gh release upload ${{ github.ref_name }} sqlite-vec-$(cat VERSION)-cli-cosmopolitan.tar.gz
env:
GH_TOKEN: ${{ github.token }}
- uses: actions/upload-artifact@v4
with:
name: sqlite-vec-cosmopolitan
path: dist/*
build-wasm32-emscripten:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: mymindstorm/setup-emsdk@v12
with:
version: "latest"
- run: ./scripts/vendor.sh
- run: make sqlite-vec.h
- run: make wasm
- uses: actions/upload-artifact@v4
with:
name: sqlite-vec-wasm32-emscripten
path: dist/.wasm/*
include-hidden-files: true
build-android-extensions:
runs-on: ubuntu-latest
strategy:
matrix:
platforms:
[
{ name: android-aarch64, cc: aarch64-linux-android21-clang },
{ name: android-i686, cc: i686-linux-android21-clang },
{ name: android-x86_64, cc: x86_64-linux-android21-clang },
{ name: android-armv7a, cc: armv7a-linux-androideabi21-clang },
]
steps:
- uses: actions/checkout@v4
- run: ./scripts/vendor.sh
- uses: actions/setup-java@v4
with:
java-version: "17"
distribution: "temurin"
- uses: android-actions/setup-android@v3
- run: |
sdkmanager --install "ndk;27.0.12077973"
echo "ANDROID_NDK_HOME=$ANDROID_SDK_ROOT/ndk/27.0.12077973" >> $GITHUB_ENV
- run: |
ls $ANDROID_NDK_HOME/toolchains/llvm/prebuilt/
ls $ANDROID_NDK_HOME/toolchains/llvm/prebuilt/linux-x86_64/bin
- run: |
export PATH=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/linux-x86_64/bin:$PATH
make CC=${{ matrix.platforms.cc }} \
AR=llvm-ar \
RANLIB=llvm-ranlib \
STRIP=llvm-strip loadable
- uses: actions/upload-artifact@v4
with:
name: sqlite-vec-${{ matrix.platforms.name }}-extension
path: dist/*.so
build-ios-extensions:
runs-on: ${{ matrix.platforms.runner }}
strategy:
fail-fast: false
matrix:
platforms: [
{
name: ios-aarch64,
target: arm64-apple-ios,
sdk: iphoneos,
runner: macos-14,
},
#{
# name: ios-x86_64,
# target: x86_64-apple-ios,
# sdk: iphoneos,
# runner: macos-12,
#},
{
name: iossimulator-aarch64,
target: arm64-apple-ios-simulator,
sdk: iphonesimulator,
runner: macos-14,
},
{
name: iossimulator-x86_64,
target: x86_64-apple-ios-simulator,
sdk: iphonesimulator,
runner: macos-14,
},
]
steps:
- uses: actions/checkout@v4
- uses: maxim-lobanov/setup-xcode@v1
with:
xcode-version: latest-stable
- run: ./scripts/vendor.sh
- run: make CFLAGS="-target ${{ matrix.platforms.target }} -isysroot $(xcrun -sdk ${{ matrix.platforms.sdk }} --show-sdk-path) -fembed-bitcode -DNDEBUG=1" loadable static
- uses: actions/upload-artifact@v4
with:
name: sqlite-vec-${{ matrix.platforms.name }}-extension
path: dist/*
dist:
runs-on: ubuntu-latest
needs:
[
build-linux-x86_64-extension,
build-linux-aarch64-extension,
build-macos-x86_64-extension,
build-macos-aarch64-extension,
build-windows-x86_64-extension,
build-wasm32-emscripten,
build-android-extensions,
build-ios-extensions,
]
environment:
name: release
permissions:
contents: write
id-token: write
steps:
- uses: actions/checkout@v4
- uses: actions/download-artifact@v4
with:
name: sqlite-vec-linux-x86_64-extension
path: dist/linux-x86_64
- uses: actions/download-artifact@v4
with:
name: sqlite-vec-linux-aarch64-extension
path: dist/linux-aarch64
- uses: actions/download-artifact@v4
with:
name: sqlite-vec-macos-x86_64-extension
path: dist/macos-x86_64
- uses: actions/download-artifact@v4
with:
name: sqlite-vec-macos-aarch64-extension
path: dist/macos-aarch64
- uses: actions/download-artifact@v4
with:
name: sqlite-vec-windows-x86_64-extension
path: dist/windows-x86_64
- uses: actions/download-artifact@v4
with:
name: sqlite-vec-wasm32-emscripten
path: dist/wasm32-emscripten
- uses: actions/download-artifact@v4
with:
name: sqlite-vec-android-aarch64-extension
path: dist/android-aarch64
- uses: actions/download-artifact@v4
with:
name: sqlite-vec-android-i686-extension
path: dist/android-i686
- uses: actions/download-artifact@v4
with:
name: sqlite-vec-android-x86_64-extension
path: dist/android-x86_64
- uses: actions/download-artifact@v4
with:
name: sqlite-vec-android-armv7a-extension
path: dist/android-armv7a
- uses: actions/download-artifact@v4
with:
name: sqlite-vec-ios-aarch64-extension
path: dist/ios-aarch64
- uses: actions/download-artifact@v4
with:
name: sqlite-vec-iossimulator-aarch64-extension
path: dist/iossimulator-aarch64
- uses: actions/download-artifact@v4
with:
name: sqlite-vec-iossimulator-x86_64-extension
path: dist/iossimulator-x86_64
- run: make sqlite-vec.h
- uses: asg017/setup-sqlite-dist@fadb0183a6ec70c3f1942de7d232b087ff2bacd1
- run: sqlite-dist build --set-version $(cat VERSION)
- run: |
gh release upload ${{ github.ref_name }} \
.sqlite-dist/amalgamation/* \
.sqlite-dist/github_releases/* \
.sqlite-dist/spm/* \
.sqlite-dist/sqlpkg/* \
.sqlite-dist/checksums.txt \
.sqlite-dist/sqlite-dist-manifest.json \
.sqlite-dist/install.sh
env:
GH_TOKEN: ${{ github.token }}
- name: Install node
uses: actions/setup-node@v4
with:
node-version: "24"
registry-url: "https://registry.npmjs.org"
- run: |
VERSION=$(cat VERSION)
if echo "$VERSION" | grep -q "alpha"; then
TAG=alpha
elif echo "$VERSION" | grep -q "beta"; then
TAG=beta
else
TAG=latest
fi
npm publish --provenance --access public --tag $TAG .sqlite-dist/npm/sqlite-vec-darwin-arm64.tar.gz
npm publish --provenance --access public --tag $TAG .sqlite-dist/npm/sqlite-vec-darwin-x64.tar.gz
npm publish --provenance --access public --tag $TAG .sqlite-dist/npm/sqlite-vec-linux-x64.tar.gz
npm publish --provenance --access public --tag $TAG .sqlite-dist/npm/sqlite-vec-linux-arm64.tar.gz
npm publish --provenance --access public --tag $TAG .sqlite-dist/npm/sqlite-vec-windows-x64.tar.gz
npm publish --provenance --access public --tag $TAG .sqlite-dist/npm/sqlite-vec-wasm-demo.tar.gz
npm publish --provenance --access public --tag $TAG .sqlite-dist/npm/sqlite-vec.tar.gz
env:
NODE_AUTH_TOKEN: ""
- uses: ruby/setup-ruby@v1
with:
ruby-version: 3.2
- run: |
for file in .sqlite-dist/gem/*; do
gem push "$file"
done
env:
GEM_HOST_API_KEY: ${{ secrets.GEM_HOST_API_KEY }}
- uses: actions/setup-python@v5
with:
python-version: "3.12"
- run: pip install twine
- run: |
twine upload .sqlite-dist/pip/*
twine upload .sqlite-dist/datasette/*
twine upload .sqlite-dist/sqlite_utils/*
env:
TWINE_USERNAME: __token__
TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
upload-crate:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions-rs/toolchain@v1
with:
toolchain: stable
- run: ./scripts/vendor.sh
- run: make sqlite-vec.h
- run: make deps
working-directory: ./bindings/rust
- run: cargo publish --no-verify
working-directory: ./bindings/rust
env:
CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
build-ncruces-go:
if: false
runs-on: ubuntu-latest
permissions:
contents: write
steps:
- uses: actions/checkout@v4
- run: make sqlite-vec.h
- uses: actions/checkout@v4
with:
repository: ncruces/go-sqlite3
path: go-sqlite3
- run: git apply ../bindings/go/ncruces/go-sqlite3.patch
working-directory: go-sqlite3/
- run: |
mkdir -p tools/
[ -d "tools/wasi-sdk" ] || curl -#L "$WASI_SDK" | tar xzC tools &
[ -d "tools/binaryen" ] || curl -#L "$BINARYEN" | tar xzC tools &
wait
mv "tools/wasi-sdk"* "tools/wasi-sdk"
mv "tools/binaryen"* "tools/binaryen"
sqlite3/download.sh
embed/build.sh
env:
WASI_SDK: "https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-23/wasi-sdk-23.0-x86_64-linux.tar.gz"
BINARYEN: "https://github.com/WebAssembly/binaryen/releases/download/version_118/binaryen-version_118-x86_64-linux.tar.gz"
working-directory: go-sqlite3/
- uses: actions/checkout@v4
with:
repository: asg017/sqlite-vec-go-bindings
path: sqlite-vec-go-bindings
token: ${{secrets.NCRUCES_BINDINGS_REPO_PAT}}
- run: |
cp go-sqlite3/embed/sqlite3.wasm sqlite-vec-go-bindings/ncruces/sqlite3.wasm
cp sqlite-vec.c sqlite-vec-go-bindings/cgo/sqlite-vec.c
cp sqlite-vec.h sqlite-vec-go-bindings/cgo/sqlite-vec.h
- run: |
git config user.name "Alex Garcia"
git config user.email "alexsebastian.garcia@gmail.com"
git add .
git commit --allow-empty -m "AUTOMATED ${{ github.ref_name }}" || exit 0
git tag "${{ github.ref_name }}"
git push origin main "${{ github.ref_name }}"
working-directory: sqlite-vec-go-bindings
env:
GITHUB_TOKEN: ${{secrets.NCRUCES_BINDINGS_REPO_PAT}}
================================================
FILE: .github/workflows/site.yaml
================================================
name: Deploy Site
on:
workflow_dispatch: {}
push:
branches:
- main
paths:
- "site/**"
- ".github/**"
- "VERSION"
- "reference.yaml"
jobs:
deploy:
runs-on: ubuntu-latest
permissions:
pages: write
id-token: write
environment:
name: github-pages
url: ${{ steps.deployment.outputs.page_url }}
steps:
- uses: actions/checkout@v4
- uses: actions/setup-node@v4
with:
cache: npm
cache-dependency-path: site/package-lock.json
- run: npm ci
working-directory: site/
- run: make site-build
- uses: actions/configure-pages@v2
- uses: actions/upload-pages-artifact@v4
with:
path: site/.vitepress/dist
- id: deployment
uses: actions/deploy-pages@v4
================================================
FILE: .github/workflows/test.yaml
================================================
name: "Test"
on:
push:
branches:
- main
permissions:
contents: read
jobs:
build-linux-x86_64-extension:
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
- uses: astral-sh/setup-uv@v7
- run: ./scripts/vendor.sh
- run: make loadable static
- run: uv sync --directory tests
- run: make test-loadable
- uses: actions/upload-artifact@v4
with:
name: sqlite-vec-linux-x86_64-extension
path: dist/*
build-macos-x86_64-extension:
runs-on: macos-15-intel
steps:
- uses: actions/checkout@v4
- uses: astral-sh/setup-uv@v7
with:
enable-cache: true
- run: uv python install 3.12
- run: ./scripts/vendor.sh
- run: make loadable static
- run: uv sync --directory tests
- run: make test-loadable
- uses: actions/upload-artifact@v4
with:
name: sqlite-vec-macos-x86_64-extension
path: dist/*
build-macos-aarch64-extension:
runs-on: macos-14
steps:
- uses: actions/checkout@v4
- uses: astral-sh/setup-uv@v7
with:
enable-cache: true
- run: ./scripts/vendor.sh
- run: make loadable static
- run: uv sync --directory tests
- run: make test-loadable
- uses: actions/upload-artifact@v4
with:
name: sqlite-vec-macos-aarch64-extension
path: dist/*
build-windows-x86_64-extension:
runs-on: windows-2022
steps:
- uses: actions/checkout@v4
- uses: ilammy/msvc-dev-cmd@v1
- uses: astral-sh/setup-uv@v7
with:
enable-cache: true
- run: ./scripts/vendor.sh
shell: bash
- run: make sqlite-vec.h
- run: mkdir dist
- run: cl.exe /fPIC -shared /W4 /Ivendor/ /O2 /LD sqlite-vec.c -o dist/vec0.dll
- run: uv sync --directory tests
- run: make test-loadable
shell: bash
- uses: actions/upload-artifact@v4
with:
name: sqlite-vec-windows-x86_64-extension
path: dist/*
build-linux-aarch64-extension:
runs-on: ubuntu-22.04-arm
steps:
- uses: actions/checkout@v4
- uses: astral-sh/setup-uv@v7
- run: ./scripts/vendor.sh
- run: make loadable static
- run: uv sync --directory tests
- run: make test-loadable
- uses: actions/upload-artifact@v4
with:
name: sqlite-vec-linux-aarch64-extension
path: dist/*
build-android-extensions:
runs-on: ubuntu-latest
strategy:
matrix:
platforms:
[
{ name: android-aarch64, cc: aarch64-linux-android21-clang },
{ name: android-i686, cc: i686-linux-android21-clang },
{ name: android-x86_64, cc: x86_64-linux-android21-clang },
{ name: android-armv7a, cc: armv7a-linux-androideabi21-clang },
]
steps:
- uses: actions/checkout@v4
- run: ./scripts/vendor.sh
- uses: actions/setup-java@v4
with:
java-version: "17"
distribution: "temurin"
- uses: android-actions/setup-android@v3
- run: |
sdkmanager --install "ndk;27.0.12077973"
echo "ANDROID_NDK_HOME=$ANDROID_SDK_ROOT/ndk/27.0.12077973" >> $GITHUB_ENV
- run: |
ls $ANDROID_NDK_HOME/toolchains/llvm/prebuilt/
ls $ANDROID_NDK_HOME/toolchains/llvm/prebuilt/linux-x86_64/bin
- run: |
export PATH=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/linux-x86_64/bin:$PATH
make CC=${{ matrix.platforms.cc }} \
AR=llvm-ar \
RANLIB=llvm-ranlib \
STRIP=llvm-strip loadable
- uses: actions/upload-artifact@v4
with:
name: sqlite-vec-android-${{ matrix.platforms.name }}-extension
path: dist/*.so
build-ios-extensions:
runs-on: ${{ matrix.platforms.runner }}
strategy:
fail-fast: false
matrix:
platforms: [
{
name: ios-arm64,
target: arm64-apple-ios,
sdk: iphoneos,
runner: macos-14,
},
#{
# name: ios-x86_64,
# target: x86_64-apple-ios,
# sdk: iphoneos,
# runner: macos-12,
#},
{
name: ios-simulator-arm64,
target: arm64-apple-ios-simulator,
sdk: iphonesimulator,
runner: macos-14,
},
{
name: ios-simulator-x86_64,
target: x86_64-apple-ios-simulator,
sdk: iphonesimulator,
runner: macos-14,
},
]
steps:
- uses: actions/checkout@v4
- uses: maxim-lobanov/setup-xcode@v1
with:
xcode-version: latest-stable
- run: ./scripts/vendor.sh
- run: make CFLAGS="-target ${{ matrix.platforms.target }} -isysroot $(xcrun -sdk ${{ matrix.platforms.sdk }} --show-sdk-path) -fembed-bitcode -DNDEBUG=1" loadable static
- uses: actions/upload-artifact@v4
with:
name: sqlite-vec-${{ matrix.platforms.name }}-extension
path: dist/*
build-wasm32-emscripten:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: mymindstorm/setup-emsdk@v12
with:
version: "latest"
- run: ./scripts/vendor.sh
- run: make sqlite-vec.h
- run: make wasm
- run: ls; ls dist; ls dist/.wasm
- uses: actions/upload-artifact@v4
with:
name: sqlite-vec-wasm32-emscripten
path: dist/.wasm/*
include-hidden-files: true
build-pyodide:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: mymindstorm/setup-emsdk@v12
with:
version: "latest"
- run: ./scripts/vendor.sh
- run: make sqlite-vec.h
- run: |
emcc -c ./sqlite-vec.c -o ./sqlite-vec.o -I ./vendor -fPIC -g3 \
-DSTDC_HEADERS=1 -DHAVE_SYS_TYPES_H=1 -DHAVE_SYS_STAT_H=1 -DHAVE_STDLIB_H=1 -DHAVE_STRING_H=1 -DHAVE_MEMORY_H=1 -DHAVE_STRINGS_H=1 -DHAVE_INTTYPES_H=1 -DHAVE_STDINT_H=1 -DHAVE_UNISTD_H=1 -DHAVE_FDATASYNC=1 -DHAVE_USLEEP=1 -DHAVE_LOCALTIME_R=1 -DHAVE_GMTIME_R=1 -DHAVE_DECL_STRERROR_R=1 -DHAVE_STRERROR_R=1 -DHAVE_POSIX_FALLOCATE=1 -DSQLITE_ENABLE_MATH_FUNCTIONS=1 -DSQLITE_ENABLE_FTS4=1 -DSQLITE_ENABLE_FTS5=1 -DSQLITE_ENABLE_RTREE=1 -DSQLITE_ENABLE_GEOPOLY=1 -DSQLITE_OMIT_POPEN=1 -DSQLITE_THREADSAFE=0
emcc ./sqlite-vec.o -o vec0.so -s SIDE_MODULE=1 -g3 -s WASM_BIGINT=1
- run: ls
- uses: actions/upload-artifact@v4
with:
name: sqlite-vec-pyodide
path: vec0.so
build-ncruces-go:
if: false
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- run: make sqlite-vec.h
- uses: actions/checkout@v4
with:
repository: ncruces/go-sqlite3
path: go-sqlite3
- run: git apply ../bindings/go/ncruces/go-sqlite3.patch
working-directory: go-sqlite3/
- run: |
mkdir -p tools/
[ -d "tools/wasi-sdk" ] || curl -#L "$WASI_SDK" | tar xzC tools &
[ -d "tools/binaryen" ] || curl -#L "$BINARYEN" | tar xzC tools &
wait
mv "tools/wasi-sdk"* "tools/wasi-sdk"
mv "tools/binaryen"* "tools/binaryen"
sqlite3/download.sh
embed/build.sh
env:
WASI_SDK: "https://github.com/WebAssembly/wasi-sdk/releases/download/wasi-sdk-23/wasi-sdk-23.0-x86_64-linux.tar.gz"
BINARYEN: "https://github.com/WebAssembly/binaryen/releases/download/version_118/binaryen-version_118-x86_64-linux.tar.gz"
working-directory: go-sqlite3/
- uses: actions/upload-artifact@v4
with:
name: sqlite-vec-ncruces-go
path: go-sqlite3/embed/sqlite3.wasm
build-cosmopolitan:
runs-on: macos-latest
steps:
- uses: actions/checkout@v4
- run: |
mkdir $HOME/cosmo
curl -L -o cosmocc-$COSMO_VERSION.zip https://github.com/jart/cosmopolitan/releases/download/$COSMO_VERSION/cosmocc-$COSMO_VERSION.zip
unzip cosmocc-$COSMO_VERSION.zip -d $HOME/cosmo
env:
COSMO_VERSION: "3.5.4"
- run: ./scripts/vendor.sh
- run: make cli CC=$HOME/cosmo/bin/cosmocc AR=$HOME/cosmo/bin/cosmoar OMIT_SIMD=1
- uses: actions/upload-artifact@v4
with:
name: sqlite-vec-cosmopolitan
path: dist/*
test-minimum:
runs-on: macos-latest
steps:
- uses: actions/checkout@v4
- run: make sqlite-vec.h
- run: make test
working-directory: tests/minimum
================================================
FILE: .gitignore
================================================
/target
.vscode
sift/
*.tar.gz
*.db
*.npy
*.bin
*.out
venv/
vendor/
dist/
*.pyc
*.db-journal
alexandria/
openai/
examples/supabase-dbpedia
examples/ann-filtering
examples/dbpedia-openai
examples/imdb
examples/sotu
sqlite-vec.h
tmp/
poetry.lock
*.jsonl
memstat.c
memstat.*
================================================
FILE: ARCHITECTURE.md
================================================
# `sqlite-vec` Architecture
Internal documentation for how `sqlite-vec` works under the hood. It is not meant
for users of the `sqlite-vec` project; they should consult
[the official `sqlite-vec` documentation](https://alexgarcia.xyz/sqlite-vec) for
how-to guides. Rather, this is for people interested in how `sqlite-vec` works,
along with some guidelines for future contributors.
Very much a WIP.
## `vec0`
### Shadow Tables
#### `xyz_chunks`
- `chunk_id INTEGER`
- `size INTEGER`
- `validity BLOB`
- `rowids BLOB`
#### `xyz_rowids`
- `rowid INTEGER`
- `id`
- `chunk_id INTEGER`
- `chunk_offset INTEGER`
#### `xyz_vector_chunksNN`
- `rowid INTEGER`
- `vector BLOB`
#### `xyz_auxiliary`
- `rowid INTEGER`
- `valueNN [type]`
#### `xyz_metadatachunksNN`
- `rowid INTEGER`
- `data BLOB`
#### `xyz_metadatatextNN`
- `rowid INTEGER`
- `data TEXT`
### idxStr
The `vec0` idxStr is a string composed of a single "header" character and 0 or
more "blocks" of 4 characters each.
The "header" character denotes the type of query plan, as determined by the
`enum vec0_query_plan` values. The current possible values are:
| Name | Value | Description |
| -------------------------- | ----- | ---------------------------------------------------------------------- |
| `VEC0_QUERY_PLAN_FULLSCAN` | `'1'` | Perform a full-scan on all rows |
| `VEC0_QUERY_PLAN_POINT` | `'2'` | Perform a single-lookup point query for the provided rowid |
| `VEC0_QUERY_PLAN_KNN` | `'3'` | Perform a KNN-style query on the provided query vector and parameters. |
Each 4-character "block" is associated with a corresponding value in `argv[]`.
For example, the 1st block, at byte offsets `1-4` (inclusive), is associated
with `argv[1]`. The 2nd block, at byte offsets `5-8` (inclusive), is
associated with `argv[2]`, and so on. Each block describes what kind of value or
filter the given `argv[i]` value is.
#### `VEC0_IDXSTR_KIND_KNN_MATCH` (`'{'`)
`argv[i]` is the query vector of the KNN query.
The remaining 3 characters of the block are `_` fillers.
#### `VEC0_IDXSTR_KIND_KNN_K` (`'}'`)
`argv[i]` is the limit/k value of the KNN query.
The remaining 3 characters of the block are `_` fillers.
#### `VEC0_IDXSTR_KIND_KNN_ROWID_IN` (`'['`)
`argv[i]` is the optional `rowid in (...)` value, and must be handled with
[`sqlite3_vtab_in_first()` / `sqlite3_vtab_in_next()`](https://www.sqlite.org/c3ref/vtab_in_first.html).
The remaining 3 characters of the block are `_` fillers.
#### `VEC0_IDXSTR_KIND_KNN_PARTITON_CONSTRAINT` (`']'`)
`argv[i]` is a "constraint" on a specific partition key.
The second character of the block denotes which partition key to filter on,
using `A` to denote the first partition key column, `B` for the second, etc. It
is encoded with `'A' + partition_idx` and can be decoded with `c - 'A'`.
The third character of the block denotes which operator is used in the
constraint. It will be one of the values of `enum vec0_partition_operator`, as
only a subset of operations are supported on partition keys.
The fourth character of the block is a `_` filler.
#### `VEC0_IDXSTR_KIND_POINT_ID` (`'!'`)
`argv[i]` is the value of the rowid or id to match against for the point query.
The remaining 3 characters of the block are `_` fillers.
#### `VEC0_IDXSTR_KIND_METADATA_CONSTRAINT` (`'&'`)
`argv[i]` is the value of the `WHERE` constraint for a metadata column in a KNN
query.
The second character of the block denotes which metadata column the constraint
belongs to, using `A` to denote the first metadata column, `B` for the
second, etc. It is encoded with `'A' + metadata_idx` and can be decoded with
`c - 'A'`.
The third character of the block is the constraint operator. It will be one of
`enum vec0_metadata_operator`, as only a subset of operators are supported on
metadata column KNN filters.
The fourth character of the block is a `_` filler.
================================================
FILE: LICENSE-APACHE
================================================
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2024 Alex Garcia
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
================================================
FILE: LICENSE-MIT
================================================
MIT License
Copyright (c) 2024 Alex Garcia
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
================================================
FILE: Makefile
================================================
COMMIT=$(shell git rev-parse HEAD)
VERSION=$(shell cat VERSION)
DATE=$(shell date +'%FT%TZ%z')
INSTALL_LIB_DIR = /usr/local/lib
INSTALL_INCLUDE_DIR = /usr/local/include
INSTALL_BIN_DIR = /usr/local/bin
ifndef CC
CC=gcc
endif
ifndef AR
AR=ar
endif
ifeq ($(shell uname -s),Darwin)
CONFIG_DARWIN=y
else ifeq ($(OS),Windows_NT)
CONFIG_WINDOWS=y
else
CONFIG_LINUX=y
endif
ifdef CONFIG_DARWIN
LOADABLE_EXTENSION=dylib
endif
ifdef CONFIG_LINUX
LOADABLE_EXTENSION=so
CFLAGS += -lm
endif
ifdef CONFIG_WINDOWS
LOADABLE_EXTENSION=dll
endif
ifndef OMIT_SIMD
ifeq ($(shell uname -sm),Darwin x86_64)
CFLAGS += -mavx -DSQLITE_VEC_ENABLE_AVX
endif
ifeq ($(shell uname -sm),Darwin arm64)
CFLAGS += -mcpu=apple-m1 -DSQLITE_VEC_ENABLE_NEON
endif
endif
ifdef USE_BREW_SQLITE
SQLITE_INCLUDE_PATH=-I/opt/homebrew/opt/sqlite/include
SQLITE_LIB_PATH=-L/opt/homebrew/opt/sqlite/lib
CFLAGS += $(SQLITE_INCLUDE_PATH) $(SQLITE_LIB_PATH)
endif
ifdef IS_MACOS_ARM
RENAME_WHEELS_ARGS=--is-macos-arm
else
RENAME_WHEELS_ARGS=
endif
prefix=dist
$(prefix):
mkdir -p $(prefix)
TARGET_LOADABLE=$(prefix)/vec0.$(LOADABLE_EXTENSION)
TARGET_STATIC=$(prefix)/libsqlite_vec0.a
TARGET_STATIC_H=$(prefix)/sqlite-vec.h
TARGET_CLI=$(prefix)/sqlite3
loadable: $(TARGET_LOADABLE)
static: $(TARGET_STATIC)
cli: $(TARGET_CLI)
all: loadable static cli
OBJS_DIR=$(prefix)/.objs
LIBS_DIR=$(prefix)/.libs
BUILD_DIR=$(prefix)/.build
$(OBJS_DIR): $(prefix)
mkdir -p $@
$(LIBS_DIR): $(prefix)
mkdir -p $@
$(BUILD_DIR): $(prefix)
mkdir -p $@
$(TARGET_LOADABLE): sqlite-vec.c sqlite-vec.h $(prefix)
$(CC) \
-fPIC -shared \
-Wall -Wextra \
-Ivendor/ \
-O3 \
$(CFLAGS) \
$< -o $@
$(TARGET_STATIC): sqlite-vec.c sqlite-vec.h $(prefix) $(OBJS_DIR)
$(CC) -Ivendor/ $(CFLAGS) -DSQLITE_CORE -DSQLITE_VEC_STATIC \
-O3 -c $< -o $(OBJS_DIR)/vec.o
$(AR) rcs $@ $(OBJS_DIR)/vec.o
$(TARGET_STATIC_H): sqlite-vec.h $(prefix)
cp $< $@
$(OBJS_DIR)/sqlite3.o: vendor/sqlite3.c $(OBJS_DIR)
$(CC) -c -g3 -O3 -DSQLITE_EXTRA_INIT=core_init -DSQLITE_CORE -DSQLITE_ENABLE_STMT_SCANSTATUS -DSQLITE_ENABLE_BYTECODE_VTAB -DSQLITE_ENABLE_EXPLAIN_COMMENTS -I./vendor $< -o $@
$(LIBS_DIR)/sqlite3.a: $(OBJS_DIR)/sqlite3.o $(LIBS_DIR)
$(AR) rcs $@ $<
$(BUILD_DIR)/shell-new.c: vendor/shell.c $(BUILD_DIR)
sed 's/\/\*extra-version-info\*\//EXTRA_TODO/g' $< > $@
$(OBJS_DIR)/shell.o: $(BUILD_DIR)/shell-new.c $(OBJS_DIR)
$(CC) -c -g3 -O3 \
-I./vendor \
-DSQLITE_ENABLE_STMT_SCANSTATUS -DSQLITE_ENABLE_BYTECODE_VTAB -DSQLITE_ENABLE_EXPLAIN_COMMENTS \
-DEXTRA_TODO="\"CUSTOMBUILD:sqlite-vec\n\"" \
$< -o $@
$(LIBS_DIR)/shell.a: $(OBJS_DIR)/shell.o $(LIBS_DIR)
$(AR) rcs $@ $<
$(OBJS_DIR)/sqlite-vec.o: sqlite-vec.c $(OBJS_DIR)
$(CC) -c -g3 -Ivendor/ -I./ $(CFLAGS) $< -o $@
$(LIBS_DIR)/sqlite-vec.a: $(OBJS_DIR)/sqlite-vec.o $(LIBS_DIR)
$(AR) rcs $@ $<
$(TARGET_CLI): sqlite-vec.h $(LIBS_DIR)/sqlite-vec.a $(LIBS_DIR)/shell.a $(LIBS_DIR)/sqlite3.a examples/sqlite3-cli/core_init.c $(prefix)
$(CC) -g3 \
-Ivendor/ -I./ \
-DSQLITE_CORE \
-DSQLITE_VEC_STATIC \
-DSQLITE_THREADSAFE=0 -DSQLITE_ENABLE_FTS4 \
-DSQLITE_ENABLE_STMT_SCANSTATUS -DSQLITE_ENABLE_BYTECODE_VTAB -DSQLITE_ENABLE_EXPLAIN_COMMENTS \
-DSQLITE_EXTRA_INIT=core_init \
$(CFLAGS) \
-ldl -lm \
examples/sqlite3-cli/core_init.c $(LIBS_DIR)/shell.a $(LIBS_DIR)/sqlite3.a $(LIBS_DIR)/sqlite-vec.a -o $@
sqlite-vec.h: sqlite-vec.h.tmpl VERSION
VERSION=$(shell cat VERSION) \
DATE=$(shell date -r VERSION +'%FT%TZ%z') \
SOURCE=$(shell git log -n 1 --pretty=format:%H -- VERSION) \
VERSION_MAJOR=$$(echo $$VERSION | cut -d. -f1) \
VERSION_MINOR=$$(echo $$VERSION | cut -d. -f2) \
VERSION_PATCH=$$(echo $$VERSION | cut -d. -f3 | cut -d- -f1) \
envsubst < $< > $@
clean:
rm -rf dist
FORMAT_FILES=sqlite-vec.h sqlite-vec.c
format: $(FORMAT_FILES)
clang-format -i $(FORMAT_FILES)
black tests/test-loadable.py
lint: SHELL:=/bin/bash
lint:
diff -u <(cat $(FORMAT_FILES)) <(clang-format $(FORMAT_FILES))
progress:
deno run --allow-read=sqlite-vec.c scripts/progress.ts
evidence-of:
@echo "EVIDENCE-OF: V$(shell printf "%05d" $$((RANDOM % 100000)))_$(shell printf "%05d" $$((RANDOM % 100000)))"
test:
sqlite3 :memory: '.read test.sql'
.PHONY: version loadable static test clean gh-release evidence-of install uninstall
publish-release:
./scripts/publish-release.sh
# -k test_vec0_update
test-loadable: loadable
uv run --managed-python --project tests pytest -vv -s -x . tests/test-*.py
test-loadable-snapshot-update: loadable
uv run --managed-python --project tests pytest -vv tests/test-loadable.py --snapshot-update
test-loadable-watch:
watchexec --exts c,py,Makefile --clear -- make test-loadable
test-unit:
$(CC) -DSQLITE_CORE -DSQLITE_VEC_TEST tests/test-unit.c sqlite-vec.c vendor/sqlite3.c -I./ -Ivendor -o $(prefix)/test-unit && $(prefix)/test-unit
fuzz-build:
$(MAKE) -C tests/fuzz all
# Run every built fuzz target for 30 seconds each.
# The dictionary file name is the target name with `_` replaced by `-`. This is
# done with `tr` instead of the bash-only `${name//_/-}` expansion because
# recipes run under /bin/sh by default, where that expansion is a syntax error.
fuzz-quick: fuzz-build
	@echo "Running all fuzz targets for 30 seconds each..."
	@for target in tests/fuzz/targets/*; do \
		[ -f "$$target" ] && [ -x "$$target" ] || continue; \
		name=$$(basename $$target); \
		echo "=== Fuzzing $$name ==="; \
		corpus="tests/fuzz/corpus/$$name"; \
		mkdir -p "$$corpus"; \
		dict="tests/fuzz/$$(printf '%s' "$$name" | tr '_' '-').dict"; \
		dict_flag=""; \
		[ -f "$$dict" ] && dict_flag="-dict=$$dict"; \
		"$$target" $$dict_flag \
			-max_total_time=30 "$$corpus" 2>&1 || true; \
	done
# Run every built fuzz target for 5 minutes each.
# The dictionary file name is the target name with `_` replaced by `-`. This is
# done with `tr` instead of the bash-only `${name//_/-}` expansion because
# recipes run under /bin/sh by default, where that expansion is a syntax error.
fuzz-long: fuzz-build
	@echo "Running all fuzz targets for 5 minutes each..."
	@for target in tests/fuzz/targets/*; do \
		[ -f "$$target" ] && [ -x "$$target" ] || continue; \
		name=$$(basename $$target); \
		echo "=== Fuzzing $$name ==="; \
		corpus="tests/fuzz/corpus/$$name"; \
		mkdir -p "$$corpus"; \
		dict="tests/fuzz/$$(printf '%s' "$$name" | tr '_' '-').dict"; \
		dict_flag=""; \
		[ -f "$$dict" ] && dict_flag="-dict=$$dict"; \
		"$$target" $$dict_flag \
			-max_total_time=300 "$$corpus" 2>&1 || true; \
	done
site-dev:
npm --prefix site run dev
site-build:
npm --prefix site run build
install:
install -d $(INSTALL_LIB_DIR)
install -d $(INSTALL_INCLUDE_DIR)
install -m 644 sqlite-vec.h $(INSTALL_INCLUDE_DIR)
@if [ -f $(TARGET_LOADABLE) ]; then \
install -m 644 $(TARGET_LOADABLE) $(INSTALL_LIB_DIR); \
fi
@if [ -f $(TARGET_STATIC) ]; then \
install -m 644 $(TARGET_STATIC) $(INSTALL_LIB_DIR); \
fi
@if [ -f $(TARGET_CLI) ]; then \
sudo install -m 755 $(TARGET_CLI) $(INSTALL_BIN_DIR); \
fi
ldconfig
# Remove the files that `make install` places on the system.
# The CLI binary is installed into INSTALL_BIN_DIR (not INSTALL_LIB_DIR), so it
# has to be removed from there.
uninstall:
	rm -f $(INSTALL_LIB_DIR)/$(notdir $(TARGET_LOADABLE))
	rm -f $(INSTALL_LIB_DIR)/$(notdir $(TARGET_STATIC))
	rm -f $(INSTALL_BIN_DIR)/$(notdir $(TARGET_CLI))
	rm -f $(INSTALL_INCLUDE_DIR)/sqlite-vec.h
	ldconfig
# ███████████████████████████████ WASM SECTION ███████████████████████████████
WASM_DIR=$(prefix)/.wasm
$(WASM_DIR): $(prefix)
mkdir -p $@
SQLITE_WASM_VERSION=3450300
SQLITE_WASM_YEAR=2024
SQLITE_WASM_SRCZIP=$(BUILD_DIR)/sqlite-src.zip
SQLITE_WASM_COMPILED_SQLITE3C=$(BUILD_DIR)/sqlite-src-$(SQLITE_WASM_VERSION)/sqlite3.c
SQLITE_WASM_COMPILED_MJS=$(BUILD_DIR)/sqlite-src-$(SQLITE_WASM_VERSION)/ext/wasm/jswasm/sqlite3.mjs
SQLITE_WASM_COMPILED_WASM=$(BUILD_DIR)/sqlite-src-$(SQLITE_WASM_VERSION)/ext/wasm/jswasm/sqlite3.wasm
TARGET_WASM_LIB=$(WASM_DIR)/libsqlite_vec.wasm.a
TARGET_WASM_MJS=$(WASM_DIR)/sqlite3.mjs
TARGET_WASM_WASM=$(WASM_DIR)/sqlite3.wasm
TARGET_WASM=$(TARGET_WASM_MJS) $(TARGET_WASM_WASM)
$(SQLITE_WASM_SRCZIP): $(BUILD_DIR)
curl -o $@ https://www.sqlite.org/$(SQLITE_WASM_YEAR)/sqlite-src-$(SQLITE_WASM_VERSION).zip
touch $@
$(SQLITE_WASM_COMPILED_SQLITE3C): $(SQLITE_WASM_SRCZIP) $(BUILD_DIR)
rm -rf $(BUILD_DIR)/sqlite-src-$(SQLITE_WASM_VERSION)/ || true
unzip -q -o $< -d $(BUILD_DIR)
(cd $(BUILD_DIR)/sqlite-src-$(SQLITE_WASM_VERSION)/ && ./configure --enable-all && make sqlite3.c)
touch $@
$(TARGET_WASM_LIB): examples/wasm/wasm.c sqlite-vec.c $(BUILD_DIR) $(WASM_DIR)
emcc -O3 -I./ -Ivendor -DSQLITE_CORE -c examples/wasm/wasm.c -o $(BUILD_DIR)/wasm.wasm.o
emcc -O3 -I./ -Ivendor -DSQLITE_CORE -c sqlite-vec.c -o $(BUILD_DIR)/sqlite-vec.wasm.o
emar rcs $@ $(BUILD_DIR)/wasm.wasm.o $(BUILD_DIR)/sqlite-vec.wasm.o
$(SQLITE_WASM_COMPILED_MJS) $(SQLITE_WASM_COMPILED_WASM): $(SQLITE_WASM_COMPILED_SQLITE3C) $(TARGET_WASM_LIB)
(cd $(BUILD_DIR)/sqlite-src-$(SQLITE_WASM_VERSION)/ext/wasm && \
make sqlite3_wasm_extra_init.c=../../../../.wasm/libsqlite_vec.wasm.a jswasm/sqlite3.mjs jswasm/sqlite3.wasm \
)
$(TARGET_WASM_MJS): $(SQLITE_WASM_COMPILED_MJS)
cp $< $@
$(TARGET_WASM_WASM): $(SQLITE_WASM_COMPILED_WASM)
cp $< $@
wasm: $(TARGET_WASM)
# ███████████████████████████████ END WASM ███████████████████████████████
================================================
FILE: README.md
================================================
# `sqlite-vec`
[](https://discord.gg/Ve7WeCJFXk)
An extremely small, "fast enough" vector search SQLite extension that runs
anywhere! A successor to [`sqlite-vss`](https://github.com/asg017/sqlite-vss)
> [!IMPORTANT]
> _`sqlite-vec` is pre-v1, so expect breaking changes!_
- Store and query float, int8, and binary vectors in `vec0` virtual tables
- Written in pure C, no dependencies, runs anywhere SQLite runs
(Linux/MacOS/Windows, in the browser with WASM, Raspberry Pis, etc.)
- Store non-vector data in metadata, auxiliary, or partition key columns
sqlite-vec is a
Mozilla Builders project ,
with additional sponsorship from
Fly.io ,
Turso ,
SQLite Cloud , and
Shinkai .
See the Sponsors section for more details.
## Installing
See [Installing `sqlite-vec`](https://alexgarcia.xyz/sqlite-vec/installation.html)
for more details.
| Language | Install | More Info | |
| -------------- | ---------------------------------------------------- | ------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| Python | `pip install sqlite-vec` | [`sqlite-vec` with Python](https://alexgarcia.xyz/sqlite-vec/python.html) | [](https://pypi.org/project/sqlite-vec/) |
| Node.js | `npm install sqlite-vec` | [`sqlite-vec` with Node.js](https://alexgarcia.xyz/sqlite-vec/js.html) | [](https://www.npmjs.com/package/sqlite-vec) |
| Ruby | `gem install sqlite-vec` | [`sqlite-vec` with Ruby](https://alexgarcia.xyz/sqlite-vec/ruby.html) |  |
| Go | `go get -u github.com/asg017/sqlite-vec/bindings/go` | [`sqlite-vec` with Go](https://alexgarcia.xyz/sqlite-vec/go.html) | [](https://pkg.go.dev/github.com/asg017/asg017/sqlite-vec-go-bindings/cgo) |
| Rust | `cargo add sqlite-vec` | [`sqlite-vec` with Rust](https://alexgarcia.xyz/sqlite-vec/rust.html) | [](https://crates.io/crates/sqlite-vec) |
| Datasette | `datasette install datasette-sqlite-vec` | [`sqlite-vec` with Datasette](https://alexgarcia.xyz/sqlite-vec/datasette.html) | [](https://datasette.io/plugins/datasette-sqlite-vec) |
| rqlite | `rqlited -extensions-path=sqlite-vec.tar.gz` | [`sqlite-vec` with rqlite](https://alexgarcia.xyz/sqlite-vec/rqlite.html) | [](https://rqlite.io/docs/guides/extensions/) |
| `sqlite-utils` | `sqlite-utils install sqlite-utils-sqlite-vec` | [`sqlite-vec` with sqlite-utils](https://alexgarcia.xyz/sqlite-vec/sqlite-utils.html) | [](https://datasette.io/plugins/datasette-sqlite-vec) |
| Github Release | | |  |
## Sample usage
```sql
.load ./vec0
create virtual table vec_examples using vec0(
sample_embedding float[8]
);
-- vectors can be provided as JSON or in a compact binary format
insert into vec_examples(rowid, sample_embedding)
values
(1, '[-0.200, 0.250, 0.341, -0.211, 0.645, 0.935, -0.316, -0.924]'),
(2, '[0.443, -0.501, 0.355, -0.771, 0.707, -0.708, -0.185, 0.362]'),
(3, '[0.716, -0.927, 0.134, 0.052, -0.669, 0.793, -0.634, -0.162]'),
(4, '[-0.710, 0.330, 0.656, 0.041, -0.990, 0.726, 0.385, -0.958]');
-- KNN style query
select
rowid,
distance
from vec_examples
where sample_embedding match '[0.890, 0.544, 0.825, 0.961, 0.358, 0.0196, 0.521, 0.175]'
order by distance
limit 2;
/*
┌───────┬──────────────────┐
│ rowid │ distance │
├───────┼──────────────────┤
│ 2 │ 2.38687372207642 │
│ 1 │ 2.38978505134583 │
└───────┴──────────────────┘
*/
```
## Sponsors
Development of `sqlite-vec` is supported by multiple generous sponsors! Mozilla
is the main sponsor through the new Builders project.
`sqlite-vec` is also sponsored by the following companies:
As well as multiple individual supporters on
[Github sponsors](https://github.com/sponsors/asg017/)!
If your company is interested in sponsoring `sqlite-vec` development, send me an
email to get more info: https://alexgarcia.xyz
## See Also
- [**`sqlite-ecosystem`**](https://github.com/asg017/sqlite-ecosystem), Maybe
more 3rd party SQLite extensions I've developed
- [**`sqlite-rembed`**](https://github.com/asg017/sqlite-rembed), Generate text
embeddings from remote APIs like OpenAI/Nomic/Ollama, meant for testing and
SQL scripts
- [**`sqlite-lembed`**](https://github.com/asg017/sqlite-lembed), Generate text
embeddings locally from embedding models in the `.gguf` format
================================================
FILE: SECURITY.md
================================================
Please report any security vulnerabilities to alexsebastian.garcia@gmail.com . Avoid using public Github issues whenever possible. I will get back to you as quickly as possible.
================================================
FILE: TODO
================================================
- [ ] add `xyz_info` shadow table with version etc.
- later
- [ ] partition: UPDATE support
- [ ] skip invalid validity entries in knn filter?
- [ ] nulls in metadata
- [ ] partition `x in (...)` handling
- [ ] blobs/date/datetime
- [ ] uuid/ulid perf
- [ ] Aux columns: `NOT NULL` constraint
- [ ] Metadata columns: `NOT NULL` constraint
- [ ] Partition key: `NOT NULL` constraint
- [ ] dictionary encoding?
- [ ] properly sqlite3_vtab_nochange / sqlite3_value_nochange handling
- [ ] perf
- [ ] aux: cache INSERT
- [ ] aux: LEFT JOIN on `_rowids` queries to avoid N lookup queries
================================================
FILE: VERSION
================================================
0.1.8-alpha.1
================================================
FILE: benchmarks/README.md
================================================
================================================
FILE: benchmarks/exhaustive-memory/.gitignore
================================================
data/
================================================
FILE: benchmarks/exhaustive-memory/Makefile
================================================
data/:
mkdir -p $@
data/sift: data/
curl -o data/sift.tar.gz ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz
tar -xvzf data/sift.tar.gz -C data/
rm data/sift.tar.gz
data/gist: data/
curl -o data/gist.tar.gz ftp://ftp.irisa.fr/local/texmex/corpus/gist.tar.gz
tar -xvzf data/gist.tar.gz -C data/
rm data/gist.tar.gz
================================================
FILE: benchmarks/exhaustive-memory/README.md
================================================
# `sqlite-vec` In-memory benchmark comparisons
This repo contains benchmarks that compare KNN queries of `sqlite-vec` to other in-process vector search tools using **brute force linear scans only**. These include:
- [Faiss IndexFlatL2](https://faiss.ai/)
- [usearch with `exact=True`](https://github.com/unum-cloud/usearch)
- [libsql vector search with `vector_distance_cos`](https://turso.tech/vector)
- [numpy](https://numpy.org/), using [this approach](https://github.com/EthanRosenthal/nn-vs-ann)
- [duckdb with `list_cosine_similarity`](https://duckdb.org/docs/sql/functions/nested.html#list_cosine_similaritylist1-list2)
- [`sentence_transformers.util.semantic_search`](https://sbert.net/docs/package_reference/util.html#sentence_transformers.util.semantic_search)
- [hnswlib BFIndex](https://github.com/nmslib/hnswlib/blob/c1b9b79af3d10c6ee7b5d0afa1ce851ae975254c/TESTING_RECALL.md?plain=1#L8)
Again **ONLY BRUTE FORCE LINEAR SCANS ARE TESTED**. This benchmark does **not** test approximate nearest neighbors (ANN) implementations. This benchmark is extremely narrow: it only tests KNN searches using brute force.
A few other caveats:
- Only brute-force linear scans, no ANN
- Only CPU is used. The only tool that does offer GPU is Faiss anyway.
- Only in-memory datasets are used. Many of these tools do support serializing and reading from disk (including `sqlite-vec`) and possibly `mmap`'ing, but this only tests in-memory datasets. Mostly because of numpy
- Queries are made one after the other, **not batched.** Some tools offer APIs to query multiple inputs at the same time, but this benchmark runs queries sequentially. This was done to emulate "server request"-style queries, where multiple users send queries at different times, making batching more difficult. To note, `sqlite-vec` does **not** support batched queries yet.
These tests are run in Python. Vectors are provided as an in-memory numpy array, and each test converts that numpy array into whatever makes sense for the given tool. For example, `sqlite-vec` tests will read those vectors into a SQLite table. DuckDB will read them into an Array array then create a DuckDB table from that.
================================================
FILE: benchmarks/exhaustive-memory/bench.py
================================================
import numpy as np
import numpy.typing as npt
import time
import sqlite3
import pandas as pd
from dataclasses import dataclass
from rich.console import Console
from rich.table import Table
from typing import List, Optional
@dataclass
class BenchResult:
    """Outcome of benchmarking one tool: a label, a build time, and per-query times."""

    # Display label identifying the benchmarked tool/configuration.
    tool: str
    # Time spent building/loading the index or table before querying.
    # NOTE(review): the bench_* functions visible in this file pass raw
    # time.time() deltas (i.e. seconds) despite the `_ms` suffix — confirm units.
    build_time_ms: float
    # One elapsed time per executed query, in execution order.
    # NOTE(review): same unit caveat as build_time_ms.
    query_times_ms: List[float]
def duration(seconds: float):
    """Format a duration given in seconds as whole milliseconds, e.g. ``"1500ms"``.

    The millisecond value is truncated toward zero, not rounded.
    """
    whole_ms = int(seconds * 1000)
    return str(whole_ms) + "ms"
def cosine_similarity(
    vec: npt.NDArray[np.float32], mat: npt.NDArray[np.float32], do_norm: bool = True
) -> npt.NDArray[np.float32]:
    """Score ``vec`` against every row of ``mat``.

    Computes the dot product of ``vec`` with each row of ``mat``. When
    ``do_norm`` is true the products are divided (in place) by the product of
    the norm of ``vec`` and the per-row norms of ``mat``, giving cosine
    similarity; otherwise the raw dot products are returned.
    """
    scores = np.matmul(vec, mat.T)
    if not do_norm:
        return scores

    vec_norm = np.linalg.norm(vec)
    row_norms = np.linalg.norm(mat, axis=1)
    scores /= vec_norm * row_norms
    return scores
def topk(
    vec: npt.NDArray[np.float32],
    mat: npt.NDArray[np.float32],
    k: int = 5,
    do_norm: bool = True,
) -> tuple[npt.NDArray[np.int32], npt.NDArray[np.float32]]:
    """Return the ``k`` rows of ``mat`` most similar to ``vec``.

    Args:
        vec: Query vector.
        mat: Matrix whose rows are the candidate vectors.
        k: Number of results to return. If ``k`` is at least the number of
            rows, every row is returned.
        do_norm: Forwarded to ``cosine_similarity``.

    Returns:
        A ``(indices, similarities)`` pair, both ordered by decreasing
        similarity, where ``similarities[i]`` is the score of row
        ``indices[i]``.
    """
    sim = cosine_similarity(vec, mat, do_norm=do_norm)
    n = sim.shape[0]
    if k >= n:
        # argpartition requires kth < n; with so few rows just rank them all.
        candidates = np.arange(n)
    else:
        # Rather than sorting all similarities and taking the top K, it's
        # faster to argpartition and then just sort the top K.
        # The difference is O(N logN) vs O(N + k logk)
        candidates = np.argpartition(-sim, kth=k)[:k]
    order = np.argsort(-sim[candidates])
    best = candidates[order]
    # Index `sim` with the real row indices; `order` only holds positions
    # within `candidates` and would select unrelated scores.
    return best, sim[best]
def ivecs_read(fname):
    """Load a file of int32 records into a 2-D array.

    The file is read as a flat int32 array. Its first value ``d`` is taken as
    the record dimension, the data is reshaped into rows of ``d + 1`` values,
    and the leading column of every row is dropped. A copy is returned.
    """
    raw = np.fromfile(fname, dtype="int32")
    dim = raw[0]
    records = raw.reshape(-1, dim + 1)
    return records[:, 1:].copy()
def fvecs_read(fname, sample):
    """Read a vector file via ``ivecs_read`` and reinterpret it as float32.

    Only the first ``sample`` rows are returned.
    """
    as_int32 = ivecs_read(fname)
    as_float32 = as_int32.view("float32")
    return as_float32[:sample]
def bench_hnsw(base, query):
    """Build an hnswlib inner-product index over ``base`` and run each query.

    Prints the build time, the total query loop time and the mean per-query
    time, then returns the list of ``knn_query`` results (k=5), one per query.

    Args:
        base: 2-D array of vectors to index, one vector per row.
        query: Iterable of query vectors.
    """
    import hnswlib

    t0 = time.time()
    # Take the dimensionality from the data instead of hard-coding 128 so
    # datasets with other vector sizes can be indexed.
    p = hnswlib.Index(
        space="ip", dim=base.shape[1]
    )  # possible options are l2, cosine or ip

    # NOTE: Use default settings from the README.
    print("buildings hnsw")
    p.init_index(max_elements=base.shape[0], ef_construction=200, M=16)
    ids = np.arange(base.shape[0])
    p.add_items(base, ids)
    p.set_ef(50)

    print("build time", time.time() - t0)

    results = []
    times = []
    t = time.time()
    for q in query:
        t0 = time.time()
        result = p.knn_query(q, k=5)
        results.append(result)
        times.append(time.time() - t0)
    print(time.time() - t)
    print("hnsw avg", np.mean(times))
    return results
def bench_hnsw_bf(base, query, k) -> BenchResult:
    """Benchmark hnswlib's brute-force ``BFIndex`` (L2 space).

    Times building the index over ``base`` and then each ``knn_query`` call
    individually, returning the timings as a ``BenchResult`` labelled
    ``"hnswlib-bf"``.
    """
    import hnswlib

    print("hnswlib-bf")
    dimensions = base.shape[1]

    build_started = time.time()
    index = hnswlib.BFIndex(space="l2", dim=dimensions)
    index.init_index(max_elements=base.shape[0])
    labels = np.arange(base.shape[0])
    index.add_items(base, labels)
    build_time = time.time() - build_started

    answers = []
    query_times = []
    for q in query:
        query_started = time.time()
        answers.append(index.knn_query(q, k=k))
        query_times.append(time.time() - query_started)

    return BenchResult("hnswlib-bf", build_time, query_times)
def bench_numpy(base, query, k) -> BenchResult:
    """Time brute-force top-k search done in NumPy through ``topk``.

    There is no build step, so the build time is reported as 0.
    """
    print("numpy...")
    elapsed = []
    for q in query:
        started = time.time()
        topk(q, base, k=k)
        elapsed.append(time.time() - started)
    return BenchResult("numpy", 0, elapsed)
def bench_sqlite_vec(base, query, page_size, chunk_size, k) -> BenchResult:
    """Time KNN queries against a sqlite-vec ``vec0`` virtual table.

    Args:
        base: 2-D array of vectors; every row is inserted as raw bytes.
        query: Iterable of query vectors.
        page_size: Value used for ``PRAGMA page_size``.
        chunk_size: ``chunk_size`` option of the ``vec0`` table.
        k: Neighbours requested per query; every query is asserted to return
            exactly this many rows.

    Returns:
        A ``BenchResult`` whose label includes the page and chunk sizes.
    """
    n_dims = base.shape[1]
    print(f"sqlite-vec {page_size} {chunk_size}...")

    db = sqlite3.connect(":memory:")
    db.execute(f"PRAGMA page_size = {page_size}")
    db.enable_load_extension(True)
    db.load_extension("../../dist/vec0")
    create_sql = f"""
          create virtual table vec_sift1m using vec0(
              chunk_size={chunk_size},
              vector float[{n_dims}]
          )
        """
    db.execute(create_sql)

    # Serialising the rows is part of the measured build time.
    build_start = time.time()
    with db:
        db.executemany(
            "insert into vec_sift1m(vector) values (?)",
            [[row.tobytes()] for row in base],
        )
    build_time = time.time() - build_start

    knn_sql = """
            select
              rowid,
              distance
            from vec_sift1m
            where vector match ?
              and k = ?
            order by distance
            """
    times = []
    for q in query:
        started = time.time()
        rows = db.execute(knn_sql, [q.tobytes(), k]).fetchall()
        assert len(rows) == k
        times.append(time.time() - started)
    return BenchResult(f"sqlite-vec vec0 ({page_size}|{chunk_size})", build_time, times)
def bench_sqlite_vec_scalar(base, query, page_size, k) -> BenchResult:
    """Time a full-table scan ordered by the ``vec_distance_l2`` scalar function.

    Vectors are stored as raw bytes in an ordinary table, and each query
    computes the distance to every row and keeps the ``k`` closest.

    Args:
        base: 2-D array of vectors to insert.
        query: Iterable of query vectors.
        page_size: Value used for ``PRAGMA page_size``.
        k: Number of rows requested via ``limit``; asserted on every query.
    """
    print(f"sqlite-vec-scalar...")

    db = sqlite3.connect(":memory:")
    db.enable_load_extension(True)
    db.load_extension("../../dist/vec0")
    db.execute(f"PRAGMA page_size={page_size}")
    db.execute(f"create table sift1m(vector);")

    build_start = time.time()
    with db:
        db.executemany(
            "insert into sift1m(vector) values (?)",
            [[row.tobytes()] for row in base],
        )
    build_time = time.time() - build_start

    scan_sql = """
            select
              rowid,
              vec_distance_l2(?, vector) as distance
            from sift1m
            order by distance
            limit ?
            """
    times = []
    for q in query:
        started = time.time()
        rows = db.execute(scan_sql, [q.tobytes(), k]).fetchall()
        assert len(rows) == k
        times.append(time.time() - started)
    return BenchResult(f"sqlite-vec-scalar ({page_size})", build_time, times)
def bench_libsql(base, query, page_size, k) -> BenchResult:
    """Time a full scan ordered by libsql's ``vector_distance_cos``.

    The function asserts up front that a SQL function named ``vector`` is
    registered on the connection, i.e. that the ``sqlite3`` module in use
    provides libsql's vector functions.

    Args:
        base: 2-D array of vectors to insert as raw bytes.
        query: Iterable of query vectors.
        page_size: Value used for ``PRAGMA page_size``.
        k: Number of rows requested via ``limit``.
    """
    print(f"libsql ...")
    n_dims = base.shape[1]

    db = sqlite3.connect(":memory:")
    db.enable_load_extension(True)
    has_vector = db.execute(
        "select 'vector' in (select name from pragma_function_list)"
    ).fetchone()[0]
    assert has_vector == 1
    db.execute(f"PRAGMA page_size={page_size}")
    db.execute(f"create table vectors(vector f32_blob({n_dims}));")
    # TODO: only does DiskANN?
    # db.execute("CREATE INDEX vectors_idx ON vectors (libsql_vector_idx(vector, 'metric=cosine'))")

    build_start = time.time()
    with db:
        db.executemany(
            "insert into vectors(vector) values (?)",
            [[row.tobytes()] for row in base],
        )
    build_time = time.time() - build_start

    scan_sql = """
            select
              rowid,
              vector_distance_cos(?, vector) as distance
            FROM vectors
            order by 2
            limit ?
            """
    times = []
    for q in query:
        started = time.time()
        db.execute(scan_sql, [q.tobytes(), k]).fetchall()
        times.append(time.time() - started)
    return BenchResult(f"libsql ({page_size})", build_time, times)
def register_np(db, array, name):
ptr = array.__array_interface__["data"][0]
nvectors, dimensions = array.__array_interface__["shape"]
element_type = array.__array_interface__["typestr"]
assert element_type == " BenchResult:
print(f"sqlite-vec static...")
db = sqlite3.connect(":memory:")
db.enable_load_extension(True)
db.load_extension("../../dist/vec0")
t = time.time()
register_np(db, base, "base")
build_time = time.time() - t
times = []
results = []
for (
idx,
q,
) in enumerate(query):
t0 = time.time()
result = db.execute(
"""
select
rowid
from base
where vector match ?
and k = ?
order by distance
""",
[q.tobytes(), k],
).fetchall()
assert len(result) == k
times.append(time.time() - t0)
return BenchResult(f"sqlite-vec static", build_time, times)
def bench_faiss(base, query, k) -> BenchResult:
    """Time exact L2 search with a faiss ``IndexFlatL2``.

    Args:
        base: 2-D array of vectors added to the index.
        query: Iterable of query vectors, searched one at a time.
        k: Neighbours requested per query.
    """
    import faiss

    n_dims = base.shape[1]
    print("faiss...")

    build_start = time.time()
    index = faiss.IndexFlatL2(n_dims)
    index.add(base)
    build_time = time.time() - build_start

    times = []
    for q in query:
        started = time.time()
        # faiss searches batches, so wrap the single query in a 1-row array.
        index.search(x=np.array([q]), k=k)
        times.append(time.time() - started)
    return BenchResult("faiss", build_time, times)
def bench_lancedb(base, query, k) -> BenchResult:
    """Time vector search against a LanceDB table.

    The database is opened at the relative path ``"a"`` and a table named
    ``"t"`` is created from a DataFrame with a single ``vector`` column.
    Only the ``create_table`` call counts towards the build time.
    """
    import lancedb

    print('lancedb...')
    db = lancedb.connect("a")
    # One record per base row, each holding a 1-D array.
    records = [{"vector": row.reshape(1, -1)[0]} for row in base]
    df = pd.DataFrame(data=records, columns=["vector"])

    build_start = time.time()
    db.create_table("t", data=df)
    build_time = time.time() - build_start

    table = db.open_table("t")
    times = []
    for q in query:
        started = time.time()
        table.search(q).limit(k).to_arrow()
        times.append(time.time() - started)
    return BenchResult("lancedb", build_time, times)
def bench_duckdb(base, query, k) -> BenchResult:
    """Time a DuckDB full scan ordered by ``array_cosine_similarity``.

    Args:
        base: 2-D array of vectors loaded into table ``t`` through a PyArrow
            table.
        query: Iterable of query vectors.
        k: Number of rows requested via ``LIMIT``.
    """
    import duckdb
    import pyarrow as pa

    print("duckdb...")
    n_dims = base.shape[1]
    db = duckdb.connect(":memory:")
    db.execute(f"CREATE TABLE t(vector float[{n_dims}])")

    build_start = time.time()
    # The INSERT below names `pa_base` inside the SQL text; DuckDB looks that
    # name up among the Python variables, so this local must keep its name.
    pa_base = pa.Table.from_arrays([pa.array(list(base))], names=['vector'])
    db.execute(f"INSERT INTO t(vector) SELECT vector::float[{n_dims}] FROM pa_base")
    build_time = time.time() - build_start

    scan_sql = f"""
              SELECT
                rowid,
                array_cosine_similarity(vector, ?::float[{n_dims}])
              FROM t
              ORDER BY 2 DESC
              LIMIT ?
            """
    times = []
    for q in query:
        started = time.time()
        db.execute(scan_sql, [q, k]).fetchall()
        times.append(time.time() - started)
    return BenchResult("duckdb", build_time, times)
def bench_sentence_transformers(base, query, k) -> BenchResult:
    """Time ``sentence_transformers.util.semantic_search`` over ``base``.

    Nothing is built ahead of time, so the reported build time is just the
    gap between two consecutive clock reads.
    """
    from sentence_transformers.util import semantic_search

    print("sentence-transformers")
    build_start = time.time()
    build_time = time.time() - build_start

    times = []
    for q in query:
        started = time.time()
        semantic_search(q, base, top_k=k)
        times.append(time.time() - started)
    return BenchResult("sentence-transformers", build_time, times)
def bench_chroma(base, query, k):
    """Time queries against an in-process (ephemeral) Chroma collection.

    Vectors are added in the batches produced by ``create_batches``, with
    ids ``"0"``, ``"1"``, ... Preparing the ids and converting ``base`` to
    lists is included in the build time.

    Returns:
        A ``BenchResult`` labelled ``"chroma"``.
    """
    import chromadb
    from chromadb.utils.batch_utils import create_batches

    client = chromadb.EphemeralClient()
    collection = client.create_collection(name="my_collection")

    build_start = time.time()
    batches = create_batches(
        api=client,
        ids=[str(i) for i in range(len(base))],
        embeddings=base.tolist(),
    )
    for batch in batches:
        collection.add(*batch)
    build_time = time.time() - build_start

    times = []
    for q in query:
        started = time.time()
        collection.query(query_embeddings=[q.tolist()], n_results=k)
        times.append(time.time() - started)
    return BenchResult("chroma", build_time, times)
def bench_usearch_npy(base, query, k) -> BenchResult:
    """Time usearch's exact search run directly on the NumPy array.

    No index is built (build time is reported as 0); every query calls
    ``usearch.index.search`` with ``MetricKind.L2sq`` and ``exact=True``.
    """
    from usearch.index import search, MetricKind

    times = []
    for q in query:
        started = time.time()
        search(base, q, k, MetricKind.L2sq, exact=True)
        times.append(time.time() - started)
    return BenchResult("usearch numpy exact=True", 0, times)
def bench_usearch_special(base, query, k) -> BenchResult:
    """Time exact search through a usearch ``Index`` built over ``base``.

    Args:
        base: 2-D array of vectors; added under keys ``0..len(base)-1``.
        query: Iterable of query vectors.
        k: Neighbours requested per query. This used to be ignored, so the
            library's default result count was benchmarked instead.

    Returns:
        A ``BenchResult`` for the index-based usearch run.
    """
    from usearch.index import Index

    index = Index(ndim=base.shape[1])

    build_start = time.time()
    index.add(np.arange(len(base)), base)
    build_time = time.time() - build_start

    times = []
    for q in query:
        started = time.time()
        index.search(q, k, exact=True)
        times.append(time.time() - started)
    return BenchResult("usuearch index", build_time, times)
def suite(name, base, query, k, benchmarks):
    """Run the requested benchmarks and print a table sorted by mean query time.

    Args:
        name: Suite name, used in the log line and the table title.
        base: 2-D array of base vectors.
        query: Query vectors.
        k: Neighbours requested per query.
        benchmarks: Benchmark specs, run in order. Either a plain name
            (``faiss``, ``vec-static``, ``usearch``, ``hnswlib``, ``numpy``,
            ``duckdb``, ``sentence-transformers``, ``chroma``) or a dotted,
            parameterised one: ``vec-scalar.<page_size>``,
            ``libsql.<page_size>``, ``vec-vec0.<page_size>.<chunk_size>``.

    Raises:
        Exception: On the first spec that matches none of the above.
    """
    print(f"Starting benchmark suite: {name} {base.shape}, k={k}")

    # Specs matched by exact name; all share the (base, query, k=k) signature.
    plain_runners = {
        "faiss": bench_faiss,
        "vec-static": bench_sqlite_vec_static,
        "usearch": bench_usearch_npy,
        "hnswlib": bench_hnsw_bf,
        "numpy": bench_numpy,
        "duckdb": bench_duckdb,
        "sentence-transformers": bench_sentence_transformers,
        "chroma": bench_chroma,
    }

    results = []
    for spec in benchmarks:
        runner = plain_runners.get(spec)
        if runner is not None:
            outcome = runner(base, query, k=k)
        elif spec.startswith("vec-scalar"):
            # The page size stays a string here; it is only interpolated into SQL.
            _, page_size = spec.split('.')
            outcome = bench_sqlite_vec_scalar(base, query, page_size, k=k)
        elif spec.startswith("libsql"):
            _, page_size = spec.split('.')
            outcome = bench_libsql(base, query, page_size, k=k)
        elif spec.startswith("vec-vec0"):
            _, page_size, chunk_size = spec.split('.')
            outcome = bench_sqlite_vec(base, query, int(page_size), int(chunk_size), k=k)
        else:
            raise Exception(f"unknown benchmark {spec}")
        results.append(outcome)

    table = Table(
        title=f"{name}: {base.shape[0]:,} {base.shape[1]}-dimension vectors, k={k}"
    )
    table.add_column("Tool")
    table.add_column("Build Time (ms)", justify="right")
    table.add_column("Query time (ms)", justify="right")

    # Fastest mean query time first.
    ranked = sorted(results, key=lambda res: np.mean(res.query_times_ms))
    for res in ranked:
        build_cell = duration(res.build_time_ms)
        query_cell = duration(np.mean(res.query_times_ms))
        table.add_row(res.tool, build_cell, query_cell)

    Console().print(table)
import argparse
def parse_args():
    """Parse the benchmark script's command-line options from ``sys.argv``.

    Returns:
        The ``argparse.Namespace`` with ``name``, ``input``, ``k`` (all
        required), ``query``, ``sample`` (default -1), ``qsample`` and ``x``
        (comma-separated list of runs).
    """
    parser = argparse.ArgumentParser(description="Benchmark processing script.")

    option_specs = [
        # Required arguments
        (("-n", "--name"), {"required": True, "help": "Name of the benchmark."}),
        (("-i", "--input"), {"required": True, "help": "Path to input file (.npy)."}),
        (("-k",), {"type": int, "required": True, "help": "Parameter k to use in benchmark."}),
        # Optional arguments
        (("-q", "--query"), {"required": False, "help": "Path to query file (.npy)."}),
        (
            ("--sample",),
            {
                "type": int,
                "required": False,
                "help": "Number of entries in base to use. Defaults all",
                "default": -1,
            },
        ),
        (
            ("--qsample",),
            {
                "type": int,
                "required": False,
                "help": "Number of queries to use. Defaults all",
            },
        ),
        (
            ("-x",),
            {
                "help": "type of runs to make",
                "default": "faiss,vec-scalar.4096,vec-static,vec-vec0.4096.16,usearch,duckdb,hnswlib,numpy",
            },
        ),
    ]
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)

    return parser.parse_args()
from pathlib import Path
def cli_read_input(input, sample):
    """Load a matrix of vectors from an ``.fvecs`` or ``.npy`` file.

    Args:
        input: Path of the file; the loader is chosen from its suffix.
        sample: Maximum number of leading vectors to return. ``None`` or a
            negative value means "all".

    Returns:
        The loaded vectors, truncated to ``sample`` rows when requested.

    Raises:
        Exception: If the suffix is neither ``.fvecs`` nor ``.npy``.
    """
    input_path = Path(input)
    # Normalise "all" to None so that slicing never drops the last row.
    limit = sample if sample is not None and sample >= 0 else None
    if input_path.suffix == ".fvecs":
        return fvecs_read(input_path, limit)
    if input_path.suffix == ".npy":
        # np.load parses the .npy header and keeps the stored shape;
        # np.fromfile would read the header bytes as vector data.
        return np.load(input_path)[:limit]
    raise Exception("unknown filetype", input)
def cli_read_query(query, base):
    """Return the query vectors for a benchmark run.

    Args:
        query: Path to a query file, or ``None`` to sample queries from
            ``base``.
        base: 2-D array of base vectors, used only when ``query`` is ``None``.

    Returns:
        The full contents of the query file, or up to 100 distinct rows of
        ``base`` picked at random (fewer when ``base`` has fewer rows).
    """
    if query is None:
        # Sampling without replacement cannot ask for more rows than exist.
        n_queries = min(100, base.shape[0])
        picked = np.random.choice(base.shape[0], n_queries, replace=False)
        return base[picked, :]
    # None means "read everything"; -1 would slice off the last vector.
    return cli_read_input(query, None)
@dataclass
class Config:
    """Settings read from a ``.suite`` file by ``parse_config_file``."""

    # Value of the ``@name=`` line.
    name: str
    # Value of the ``@input=`` line (path to the base vectors).
    input: str
    # Value of the ``@k=`` line.
    k: int
    # Value of the ``@queries=`` line, or None when absent.
    queries: Optional[str]
    # Value of the ``@qsample=`` line, or None when absent.
    qsample: Optional[int]
    # Every non-directive line, in file order.
    tests: List[str]
    # Value of the ``@sample=`` line, or None when absent.
    sample: Optional[int]


def parse_config_file(path: str) -> Config:
    """Parse a ``.suite`` benchmark configuration file.

    Blank lines and lines starting with ``#`` are ignored. Lines of the form
    ``@key=value`` set one of ``name``, ``input``, ``k``, ``queries``,
    ``qsample`` or ``sample`` (a later line overrides an earlier one). Every
    other line is collected as a test name.

    Args:
        path: Path of the suite file.

    Returns:
        The parsed ``Config``. ``qsample`` and ``sample`` are ``None`` when
        the file does not set them.

    Raises:
        Exception: On an ``@`` line that is not one of the known settings.
        ValueError: If the required ``@k=`` line is missing.
    """
    settings = dict.fromkeys(("name", "input", "k", "queries", "qsample", "sample"))
    tests = []
    # Use a context manager so the file is closed even if a bad line raises.
    with open(path, 'r') as config_file:
        for raw_line in config_file:
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue
            if not line.startswith('@'):
                tests.append(line)
                continue
            key, separator, value = line[1:].partition('=')
            if not separator or key not in settings:
                raise Exception(f"unknown config line '{line}'")
            settings[key] = value

    if settings["k"] is None:
        raise ValueError(f"missing required '@k=' line in {path}")

    def optional_int(text):
        # Settings that were never given stay None instead of crashing int().
        return int(text) if text is not None else None

    return Config(
        settings["name"],
        settings["input"],
        int(settings["k"]),
        settings["queries"],
        optional_int(settings["qsample"]),
        tests,
        optional_int(settings["sample"]),
    )
from sys import argv
if __name__ == "__main__":
    # Usage: the first command-line argument is the path of a .suite file.
    config = parse_config_file(argv[1])
    print(config)

    base = cli_read_input(config.input, config.sample)
    all_queries = cli_read_query(config.queries, base)
    # Keep only the first `qsample` queries.
    queries = all_queries[: config.qsample]

    suite(config.name, base, queries, config.k, config.tests)
================================================
FILE: benchmarks/exhaustive-memory/gist.suite
================================================
@name=gist
@input=data/gist/gist_base.fvecs
@queries=data/gist/gist_query.fvecs
@sample=500000
@qsample=20
@k=20
faiss
usearch
vec-static
#duckdb
#vec-vec0.8192.1024
#vec-vec0.8192.2048
#vec-scalar.8192
#numpy
================================================
FILE: benchmarks/exhaustive-memory/requirements.txt
================================================
annotated-types==0.7.0
anyio==4.4.0
asgiref==3.8.1
attrs==23.2.0
backoff==2.2.1
bcrypt==4.2.0
build==1.2.1
cachetools==5.4.0
certifi==2024.7.4
charset-normalizer==3.3.2
chroma-hnswlib==0.7.6
chromadb==0.5.5
click==8.1.7
coloredlogs==15.0.1
decorator==5.1.1
deprecated==1.2.14
deprecation==2.1.0
dnspython==2.6.1
duckdb==1.0.0
email-validator==2.2.0
faiss-cpu==1.8.0.post1
fastapi==0.111.1
fastapi-cli==0.0.4
filelock==3.15.4
flatbuffers==24.3.25
fsspec==2024.6.1
google-auth==2.32.0
googleapis-common-protos==1.63.2
grpcio==1.65.1
h11==0.14.0
hnswlib==0.8.0
httpcore==1.0.5
httptools==0.6.1
httpx==0.27.0
huggingface-hub==0.24.1
humanfriendly==10.0
idna==3.7
importlib-metadata==8.0.0
importlib-resources==6.4.0
jinja2==3.1.4
joblib==1.4.2
kubernetes==30.1.0
lancedb==0.10.2
markdown-it-py==3.0.0
markupsafe==2.1.5
mdurl==0.1.2
mmh3==4.1.0
monotonic==1.6
mpmath==1.3.0
networkx==3.3
numpy==1.26.4
oauthlib==3.2.2
onnxruntime==1.18.1
opentelemetry-api==1.26.0
opentelemetry-exporter-otlp-proto-common==1.26.0
opentelemetry-exporter-otlp-proto-grpc==1.26.0
opentelemetry-instrumentation==0.47b0
opentelemetry-instrumentation-asgi==0.47b0
opentelemetry-instrumentation-fastapi==0.47b0
opentelemetry-proto==1.26.0
opentelemetry-sdk==1.26.0
opentelemetry-semantic-conventions==0.47b0
opentelemetry-util-http==0.47b0
orjson==3.10.6
overrides==7.7.0
packaging==24.1
pandas==2.2.2
pillow==10.4.0
posthog==3.5.0
protobuf==4.25.4
py==1.11.0
pyarrow==15.0.0
pyasn1==0.6.0
pyasn1-modules==0.4.0
pydantic==2.8.2
pydantic-core==2.20.1
pygments==2.18.0
pylance==0.14.1
pypika==0.48.9
pyproject-hooks==1.1.0
python-dateutil==2.9.0.post0
python-dotenv==1.0.1
python-multipart==0.0.9
pytz==2024.1
pyyaml==6.0.1
ratelimiter==1.2.0.post0
regex==2024.5.15
requests==2.32.3
requests-oauthlib==2.0.0
retry==0.9.2
rich==13.7.1
rsa==4.9
safetensors==0.4.3
scikit-learn==1.5.1
scipy==1.14.0
sentence-transformers==3.0.1
setuptools==71.1.0
shellingham==1.5.4
six==1.16.0
sniffio==1.3.1
starlette==0.37.2
sympy==1.13.1
tenacity==8.5.0
threadpoolctl==3.5.0
tokenizers==0.19.1
torch==2.3.1
tqdm==4.66.4
transformers==4.43.1
typer==0.12.3
typing-extensions==4.12.2
tzdata==2024.1
urllib3==2.2.2
usearch==2.12.0
uvicorn==0.30.3
uvloop==0.19.0
watchfiles==0.22.0
websocket-client==1.8.0
websockets==12.0
wrapt==1.16.0
zipp==3.19.2
================================================
FILE: benchmarks/exhaustive-memory/sift.suite
================================================
@name=sift1m
@input=data/sift/sift_base.fvecs
@queries=data/sift/sift_query.fvecs
@qsample=100
@k=20
faiss
usearch
duckdb
vec-static
vec-vec0.8192.1024
vec-vec0.8192.2048
vec-scalar.8192
numpy
# #libsql.4096
# #libsql.8192
# faiss
# vec-scalar.4096
# vec-static
# vec-vec0.4096.16
# vec-vec0.8192.1024
# vec-vec0.4096.2048
# usearch
# duckdb
# hnswlib
# numpy
# chroma
================================================
FILE: benchmarks/micro/.gitignore
================================================
target/
================================================
FILE: benchmarks/micro/Cargo.toml
================================================
[package]
name = "micro"
version = "0.1.0"
edition = "2021"
[dependencies]
rusqlite = {version="0.31.0", features=["bundled"]}
[dev-dependencies]
criterion = "0.3"
rand = "0.8.5"
zerocopy = "0.7.34"
[build-dependencies]
cc = "1.0.99"
[[bench]]
name = "my_benchmark"
harness = false
================================================
FILE: benchmarks/micro/benches/my_benchmark.rs
================================================
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use micro::init_vec;
use rand::Rng;
use rusqlite::Connection;
use zerocopy::AsBytes;
// Builds a vector of `n` values, each drawn from the thread-local RNG.
// NOTE(review): the element type of the returned `Vec` is not visible in this
// copy of the file (generic parameters appear to have been stripped); confirm
// against the original source.
fn random_vector(n: usize) -> Vec {
let mut rng = rand::thread_rng();
(0..n).map(|_| rng.gen()).collect()
}
// Creates an in-memory database holding `n` random `d`-dimensional vectors in
// a `vec0` virtual table named `vec_base` (column `a`).
//
// `page_size` is applied through `PRAGMA page_size` and then read back and
// asserted, so a value that did not take effect fails loudly. All rows are
// inserted inside a single transaction. Every step unwraps its result, so any
// SQLite error panics.
fn setup_base(page_size: usize, d: usize, n: i32) -> Connection {
// Generate all vectors up front, before the database is touched.
let base: Vec> = (0..n).map(|_| random_vector(d)).collect();
let mut db = Connection::open_in_memory().unwrap();
db.pragma_update(
Some(rusqlite::DatabaseName::Main),
"page_size",
page_size, //,
//|row| Ok(assert!(row.get::(0).unwrap() == page_size)),
)
.unwrap();
// Read the pragma back to verify the requested page size is in effect.
assert_eq!(
db.pragma_query_value(Some(rusqlite::DatabaseName::Main), "page_size", |v| {
Ok(v.get::(0).unwrap())
})
.unwrap(),
page_size,
);
db.execute(
format!("create virtual table vec_base using vec0(a float[{d}])").as_str(),
[],
)
.unwrap();
// One transaction for the whole load; each vector is bound as its raw bytes.
let tx = db.transaction().unwrap();
for item in &base {
tx.execute("insert into vec_base(a) values (?)", [item.as_bytes()])
.unwrap();
}
tx.commit().unwrap();
db
}
// Criterion entry point: for each page size in `page_sizes`, builds a fresh
// database with `setup_base` and benchmarks a point lookup by rowid.
//
// `init_vec()` is called once first so the extension is registered before any
// connection is opened. The KNN benchmark further down is commented out.
pub fn criterion_benchmark(c: &mut Criterion) {
init_vec();
let n = 1_000_000;
let d = 1536;
// `k` only appears in the benchmark label while the KNN bench is disabled.
let k = 10;
// This binding is shadowed by the loop variable below and never read.
let page_size = 8192;
let page_sizes = [4096, 8192, 16384, 32768];
for page_size in page_sizes {
let db = setup_base(page_size, d, n);
let mut stmt = db
.prepare("select rowid, a from vec_base where rowid = ?")
.unwrap();
c.bench_function(
format!("point page_size={page_size} n={n} dimension={d} k={k}").as_str(),
|b| {
let mut rng = rand::thread_rng();
// The rowid is picked once here, outside `b.iter`, so every timed
// iteration looks up the same row.
// NOTE(review): the range starts at 0; if rowids are assigned from 1,
// drawing 0 would make `query_row(...).unwrap()` panic - confirm.
let query: i64 = rng.gen_range(0..n.into());
b.iter(|| {
let result: (i64, Vec) = stmt
.query_row(rusqlite::params![query], |r| {
Ok((r.get(0).unwrap(), r.get(1).unwrap()))
})
.unwrap();
assert_eq!(result.0, query);
});
},
);
/*
c.bench_function(
format!("KNN page_size={page_size} n={n} dimension={d} k={k}").as_str(),
|b| {
let query: Vec = random_vector(d);
let db = setup_base(page_size, d, n);
let mut stmt = db.prepare(
"select rowid, distance from vec_base where a match ? order by distance limit ?",
)
.unwrap();
b.iter(|| {
let result: Vec<(i64, f64)> = stmt
.query_map(rusqlite::params![query.as_bytes(), k], |r| {
Ok((r.get(0).unwrap(), r.get(1).unwrap()))
})
.unwrap()
.collect::, _>>()
.unwrap();
assert_eq!(result.len(), 10);
});
stmt.finalize().unwrap()
},
); */
}
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
================================================
FILE: benchmarks/micro/build.rs
================================================
// Build script: compiles the repository's `sqlite-vec.c` with the `cc` crate
// into a library named `sqlite_vec0`, which `src/lib.rs` links against.
fn main() {
cc::Build::new()
.file("../../sqlite-vec.c")
.compile("sqlite_vec0");
}
================================================
FILE: benchmarks/micro/src/lib.rs
================================================
use rusqlite::ffi::sqlite3_auto_extension;
// Entry point exported by the `sqlite_vec0` library produced in build.rs.
#[link(name = "sqlite_vec0")]
extern "C" {
pub fn sqlite3_vec_init();
}
// Registers `sqlite3_vec_init` with `sqlite3_auto_extension`.
// The function pointer is transmuted to the callback type that
// `sqlite3_auto_extension` expects, hence the `unsafe` block.
pub fn init_vec() {
unsafe {
sqlite3_auto_extension(Some(std::mem::transmute(sqlite3_vec_init as *const ())));
}
}
================================================
FILE: benchmarks/profiling/build-from-npy.sql
================================================
-- Profiling script: builds a vec0 table of 1536-dimensional float vectors
-- from a .npy file, with the sqlite3 shell's statement timer enabled.
.timer on
-- Page size under test; the alternatives tried are kept below, commented out.
pragma page_size = 32768;
--pragma page_size = 16384;
--pragma page_size = 16384;
--pragma page_size = 4096;
create virtual table vec_items using vec0(
embedding float[1536]
);
-- 65s (limit 1e5), ~615MB on disk
-- Inserts the first 1e5 rows yielded by vec_npy_each over the .npy file.
insert into vec_items
select
rowid,
vector
from vec_npy_each(vec_npy_file('examples/dbpedia-openai/data/vectors.npy'))
limit 1e5;
================================================
FILE: benchmarks/profiling/query-k.sql
================================================
-- Profiling script: runs the same KNN query five times with the shell timer
-- on. The query vector is the embedding stored at rowid 100, and the number
-- of neighbours comes from the :k parameter, which the caller must bind.
.timer on
-- Run 1
select rowid, distance
from vec_items
where embedding match (select embedding from vec_items where rowid = 100)
and k = :k
order by distance;
-- Run 2
select rowid, distance
from vec_items
where embedding match (select embedding from vec_items where rowid = 100)
and k = :k
order by distance;
-- Run 3
select rowid, distance
from vec_items
where embedding match (select embedding from vec_items where rowid = 100)
and k = :k
order by distance;
-- Run 4
select rowid, distance
from vec_items
where embedding match (select embedding from vec_items where rowid = 100)
and k = :k
order by distance;
-- Run 5
select rowid, distance
from vec_items
where embedding match (select embedding from vec_items where rowid = 100)
and k = :k
order by distance;
================================================
FILE: benchmarks/self-params/build.py
================================================
import sqlite3
import time
def connect(path):
    """Open the SQLite database at ``path`` with the vec0 extension loaded.

    The extension is loaded twice: once through its default entry point and
    once through the ``sqlite3_vec_fs_read_init`` entry point. Extension
    loading is switched off again before the connection is returned.
    """
    extension = "../dist/vec0"
    conn = sqlite3.connect(path)
    conn.enable_load_extension(True)
    conn.load_extension(extension)
    conn.execute("select load_extension('../dist/vec0', 'sqlite3_vec_fs_read_init')")
    conn.enable_load_extension(False)
    return conn
# Parameter grid: one database file is built per (page_size, chunk_size, type).
page_sizes = [16384, 32768]  # 4096 and 8192 are currently disabled
chunk_sizes = [128, 256, 1024, 2048]
types = ["f32", "int8", "bit"]
SRC = "../examples/dbpedia-openai/data/vectors.npy"

for page_size in page_sizes:
    for chunk_size in chunk_sizes:
        for t in types:
            print(f"{t} page_size={page_size}, chunk_size={chunk_size}")
            t0 = time.time()
            db = connect(f"dbs/test.{page_size}.{chunk_size}.{t}.db")
            db.execute(f"pragma page_size = {page_size}")

            # SQL expression that turns the source float vector into the
            # column's element type; f32 is stored unchanged.
            if t == "bit":
                func = "vec_quantize_binary(vector)"
            elif t == "int8":
                func = "vec_quantize_i8(vector, 'unit')"
            else:
                func = "vector"

            create_sql = f"""
                  create virtual table vec_items using vec0(
                    embedding {t}[1536],
                    chunk_size={chunk_size}
                  )
                """
            insert_sql = f"""
                  insert into vec_items
                    select rowid, {func}
                    from vec_npy_each(vec_npy_file(?))
                    limit 100000
                """
            # Table creation and bulk load run in one transaction.
            with db:
                db.execute(create_sql)
                db.execute(insert_sql, [SRC])

            elapsed = time.time() - t0
            print(elapsed)
"""
# for 100_000
page_size=4096, chunk_size=256
3.5894200801849365
page_size=4096, chunk_size=1024
60.70046401023865
page_size=4096, chunk_size=2048
201.04426288604736
page_size=8192, chunk_size=256
7.034514904022217
page_size=8192, chunk_size=1024
9.983598947525024
page_size=8192, chunk_size=2048
12.318921089172363
page_size=16384, chunk_size=256
4.97080397605896
page_size=16384, chunk_size=1024
6.051195859909058
page_size=16384, chunk_size=2048
8.492683172225952
page_size=32768, chunk_size=256
5.906642198562622
page_size=32768, chunk_size=1024
5.876632213592529
page_size=32768, chunk_size=2048
5.420510292053223
"""
================================================
FILE: benchmarks/self-params/knn.py
================================================
import sqlite3
import time
from random import randrange
from statistics import mean
def connect(path):
    """Print ``path`` and open it as a SQLite database with vec0 loaded.

    The extension is loaded through its default entry point and again through
    ``sqlite3_vec_fs_read_init``; extension loading is then disabled.
    """
    print(path)
    extension = "../dist/vec0"
    conn = sqlite3.connect(path)
    conn.enable_load_extension(True)
    conn.load_extension(extension)
    conn.execute("select load_extension('../dist/vec0', 'sqlite3_vec_fs_read_init')")
    conn.enable_load_extension(False)
    return conn
# Parameter grid; each combination names one database file under dbs/.
page_sizes = [16384, 32768]  # 4096 and 8192 are currently disabled
chunk_sizes = [128, 256, 1024, 2048]
# Same element types as the build script, visited in reverse order.
types = ["bit", "int8", "f32"]

# SQL expression producing a query vector of the table's element type from
# the stored embedding; any type not listed uses the embedding as-is.
query_exprs = {
    "int8": "vec_quantize_i8(embedding, 'unit')",
    "bit": "vec_quantize_binary(embedding)",
}

for t in types:
    for page_size in page_sizes:
        for chunk_size in chunk_sizes:
            print(f"page_size={page_size}, chunk_size={chunk_size}")
            func = query_exprs.get(t, "embedding")
            times = []
            trials = 20
            db = connect(f"dbs/test.{page_size}.{chunk_size}.{t}.db")
            knn_sql = f"""
                      select rowid
                      from vec_items
                      where embedding match (select {func} from vec_items where rowid = ?)
                        and k = 10
                      order by distance
                    """
            for trial in range(trials):
                t0 = time.time()
                # A random rowid selects the stored vector used as the query.
                results = db.execute(knn_sql, [randrange(100000)]).fetchall()
                times.append(time.time() - t0)
            print(mean(times))
"""
page_size=4096, chunk_size=256
0.2635102152824402
page_size=4096, chunk_size=1024
0.2609449863433838
page_size=4096, chunk_size=2048
0.275589919090271
page_size=8192, chunk_size=256
0.18621582984924318
page_size=8192, chunk_size=1024
0.20939643383026124
page_size=8192, chunk_size=2048
0.22376316785812378
page_size=16384, chunk_size=256
0.16012665033340454
page_size=16384, chunk_size=1024
0.18346318006515502
page_size=16384, chunk_size=2048
0.18224761486053467
page_size=32768, chunk_size=256
0.14202518463134767
page_size=32768, chunk_size=1024
0.15340715646743774
page_size=32768, chunk_size=2048
0.18018823862075806
"""
================================================
FILE: benchmarks/self-params/test.py
================================================
import sqlite3
import time
def connect(path):
    """Open the SQLite database at ``path`` with the vec0 extension loaded.

    Loads the extension via its default entry point and again via
    ``sqlite3_vec_fs_read_init``, then turns extension loading back off.
    """
    extension = "../dist/vec0"
    conn = sqlite3.connect(path)
    conn.enable_load_extension(True)
    conn.load_extension(extension)
    conn.execute("select load_extension('../dist/vec0', 'sqlite3_vec_fs_read_init')")
    conn.enable_load_extension(False)
    return conn
# Sanity check: for every database in the grid, print its page size and the
# number of rows in vec_items_rowids.
page_sizes = [4096, 8192, 16384, 32768]
chunk_sizes = [256, 1024, 2048]

for page_size in page_sizes:
    for chunk_size in chunk_sizes:
        print(f"page_size={page_size}, chunk_size={chunk_size}")
        t0 = time.time()
        db = connect(f"dbs/test.{page_size}.{chunk_size}.db")
        for statement in ("pragma page_size", "select count(*) from vec_items_rowids"):
            print(db.execute(statement).fetchone()[0])
================================================
FILE: bindings/go/ncruces/go-sqlite3.patch
================================================
diff --git a/embed/build.sh b/embed/build.sh
index ed2aaec..4cc0b0e 100755
--- a/embed/build.sh
+++ b/embed/build.sh
@@ -23,6 +23,7 @@ trap 'rm -f sqlite3.tmp' EXIT
-Wl,--initial-memory=327680 \
-D_HAVE_SQLITE_CONFIG_H \
-DSQLITE_CUSTOM_INCLUDE=sqlite_opt.h \
+ -DSQLITE_VEC_OMIT_FS=1 \
$(awk '{print "-Wl,--export="$0}' exports.txt)
"$BINARYEN/wasm-ctor-eval" -g -c _initialize sqlite3.wasm -o sqlite3.tmp
diff --git a/sqlite3/main.c b/sqlite3/main.c
index c732937..7c9002a 100644
--- a/sqlite3/main.c
+++ b/sqlite3/main.c
@@ -19,6 +19,7 @@
#include "time.c"
#include "vfs.c"
#include "vtab.c"
+#include "../../sqlite-vec.c"
__attribute__((constructor)) void init() {
sqlite3_initialize();
@@ -30,4 +31,5 @@ __attribute__((constructor)) void init() {
sqlite3_auto_extension((void (*)(void))sqlite3_spellfix_init);
sqlite3_auto_extension((void (*)(void))sqlite3_uint_init);
sqlite3_auto_extension((void (*)(void))sqlite3_time_init);
+ sqlite3_auto_extension((void (*)(void))sqlite3_vec_init);
}
\ No newline at end of file
================================================
FILE: bindings/python/extra_init.py
================================================
from typing import List
from struct import pack
from sqlite3 import Connection
def serialize_float32(vector: List[float]) -> bytes:
    """Pack a list of floats into the raw-bytes form sqlite-vec expects.

    Every element is written as a 4-byte float in native byte order.
    """
    layout = f"{len(vector)}f"
    return pack(layout, *vector)
def serialize_int8(vector: List[int]) -> bytes:
    """Pack a list of integers into the raw-bytes form sqlite-vec expects.

    Every element is written as one signed byte, so values must lie in
    ``-128..127``; anything else makes ``struct.pack`` raise.
    """
    layout = f"{len(vector)}b"
    return pack(layout, *vector)
try:
import numpy.typing as npt
def register_numpy(db: Connection, name: str, array: npt.NDArray):
"""ayoo"""
ptr = array.__array_interface__["data"][0]
nvectors, dimensions = array.__array_interface__["shape"]
element_type = array.__array_interface__["typestr"]
assert element_type == ""]
description = "FFI bindings to the sqlite-vec SQLite extension"
homepage = "https://github.com/asg017/sqlite-vec"
repository = "https://github.com/asg017/sqlite-vec"
keywords = ["sqlite", "sqlite-extension"]
license = "MIT/Apache-2.0"
[dependencies]
[build-dependencies]
cc = "1.0"
[dev-dependencies]
rusqlite = "0.31.0"
================================================
FILE: bindings/rust/Makefile
================================================
# Version string read from the repository-level VERSION file.
VERSION=$(shell cat ../../VERSION)
# Aggregate target: produces every generated or copied file listed here.
deps: Cargo.toml sqlite-vec.c sqlite-vec.h sqlite3ext.h sqlite3.h
# Render Cargo.toml from its template, with VERSION exported for envsubst.
Cargo.toml: ../../VERSION Cargo.toml.tmpl
VERSION=$(VERSION) envsubst < Cargo.toml.tmpl > $@
# Copy the extension sources from the repository root into this directory.
sqlite-vec.c: ../../sqlite-vec.c
cp $< $@
sqlite-vec.h: ../../sqlite-vec.h
cp $< $@
# Copy the SQLite headers from ../../vendor.
sqlite3ext.h: ../../vendor/sqlite3ext.h
cp $< $@
sqlite3.h: ../../vendor/sqlite3.h
cp $< $@
.PHONY: deps
================================================
FILE: bindings/rust/build.rs
================================================
// Build script: compiles the local copy of `sqlite-vec.c` with `SQLITE_CORE`
// defined and produces a library named `sqlite_vec0`.
fn main() {
cc::Build::new().file("sqlite-vec.c").define("SQLITE_CORE", None).compile("sqlite_vec0");
}
================================================
FILE: bindings/rust/src/lib.rs
================================================
#[link(name = "sqlite_vec0")]
extern "C" {
pub fn sqlite3_vec_init();
}
#[cfg(test)]
mod tests {
use super::*;
use rusqlite::{ffi::sqlite3_auto_extension, Connection};
// Smoke test: registers `sqlite3_vec_init` as an auto extension, opens an
// in-memory rusqlite connection and checks that `vec_version()` returns a
// string starting with "v".
#[test]
fn test_rusqlite_auto_extension() {
// The function pointer is transmuted to the callback type that
// `sqlite3_auto_extension` expects.
unsafe {
sqlite3_auto_extension(Some(std::mem::transmute(sqlite3_vec_init as *const ())));
}
let conn = Connection::open_in_memory().unwrap();
let result: String = conn
.query_row("select vec_version()", [], |x| x.get(0))
.unwrap();
assert!(result.starts_with("v"));
}
}
================================================
FILE: examples/nbc-headlines/.gitignore
================================================
*.dylib
*.so
*.dll
*.gguf
================================================
FILE: examples/nbc-headlines/1_scrape.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# NBC News Headlines: Scraper\n",
"\n",
"This notebooks implements a scraper for [NBC News](https://www.nbcnews.com) headlines. It uses [this sitemap](https://www.nbcnews.com/archive/articles/2024/march), which provides a list of article headlines + URLs\n",
"for every month for the past few years. \n",
"\n",
"This dataset is mostly to get a simple, real-world small text dataset for testing embeddings. \n",
"They're small pieces of text (~dozen words), have a wide range of semantic meaning, and are more \"real-world\"\n",
"them some other embeddings datasets out there.\n",
"\n",
"This notebook uses [Deno](https://deno.com/), [linkedom](https://github.com/WebReflection/linkedom), and a few \n",
"SQLite extensions to scrape the headlines for a given date range. It creates a single SQL table, `articles`, \n",
"with a few columns like `headline` and `url`. By default it will get all article headlines from January 2024 -> present\n",
"and save them to a database called `headlines-2024.db`. Feel free to copy+paste this code into your own custom scraper. \n",
"\n",
"This notebook also just scrapes the data into a SQLite database, it does NOT do any embeddings + vector search. \n",
"For those examples of those, see [`./2_build.ipynb`](./2_build.ipynb) and [`./3_search.ipynb`](./3_search.ipynb)."
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"import { Database, Statement } from \"jsr:@db/sqlite@0.11\";\n",
"import { parseHTML } from \"npm:linkedom\";\n",
"import * as d3 from \"npm:d3-time\";\n",
"import * as sqlitePath from \"npm:sqlite-path\";\n",
"import * as sqliteUrl from \"npm:sqlite-url\";\n",
"import * as sqliteRegex from \"npm:sqlite-regex\";\n"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": [
"const months = [\"january\", \"february\", \"march\", \"april\", \"may\", \"june\", \"july\", \"august\", \"september\", \"october\", \"november\", \"december\"]\n",
"\n",
"class Db {\n",
" db: Database;\n",
" #stmtInsertArticle: Statement;\n",
"\n",
" constructor(path:string) {\n",
" this.db = new Database(path);\n",
" this.db.enableLoadExtension = true;\n",
" this.db.loadExtension(sqlitePath.getLoadablePath());\n",
" this.db.loadExtension(sqliteUrl.getLoadablePath());\n",
" this.db.loadExtension(sqliteRegex.getLoadablePath());\n",
" this.db.enableLoadExtension = false;\n",
"\n",
" this.db.exec(`\n",
" CREATE TABLE IF NOT EXISTS articles(\n",
" id integer primary key autoincrement,\n",
" year integer,\n",
" month integer,\n",
" slug TEXT,\n",
" slug_id TEXT,\n",
" headline TEXT,\n",
" url TEXT,\n",
" category1 TEXT,\n",
" category2 TEXT\n",
" )\n",
" `);\n",
"\n",
" this.#stmtInsertArticle = this.db.prepare(`\n",
" insert into articles(year, month, slug, slug_id, headline, url, category1, category2)\n",
" select\n",
" :year as year,\n",
" :month as month,\n",
" regex_capture(\n",
" '(?P.+)-(?P[^-]+)$',\n",
" path_at(url_path(:url), -1),\n",
" 'slug'\n",
" ) as slug,\n",
" regex_capture(\n",
" '(?P.+)-(?P[^-]+)$',\n",
" path_at(url_path(:url), -1),\n",
" 'id'\n",
" ) as slug_id,\n",
" :headline as headline,\n",
" :url as url,\n",
" path_at(url_path(:url), 0) as category1,\n",
" iif(\n",
" path_length(url_path(:url)) > 2,\n",
" path_at(url_path(:url), 1),\n",
" null\n",
" ) as category2\n",
" `);\n",
" }\n",
"\n",
" insertArticles(year:number, month:text, articles:{url: string, year: number, month: number}[]) {\n",
" const tx = this.db.transaction((year, month, articles) => {\n",
" for(const article of articles) {\n",
" this.#stmtInsertArticle.run({...article, year, month})\n",
" }\n",
" });\n",
" tx(year, month, articles);\n",
" }\n",
"}\n",
"\n",
"async function insertMonth(db: Db, year:number, month: text) {\n",
" let url = `https://www.nbcnews.com/archive/articles/${year}/${month}`;\n",
" while(true) {\n",
" const monthPage = await fetch(url).then(r=>r.text())\n",
" const {document:monthPageDoc} = parseHTML(monthPage);\n",
" const monthEntries = monthPageDoc\n",
" .querySelectorAll('.MonthPage a')\n",
" .map(a => ({headline: a.innerText, url: a.getAttribute('href')}));\n",
" db.insertArticles(year, months.findIndex(m => m === month)+1, monthEntries);\n",
" const next = monthPageDoc.querySelector('a.Pagination__next.Pagination__enable');\n",
" if(!next) {\n",
" break;\n",
" }\n",
" url = `https://www.nbcnews.com${next.getAttribute('href')}`;\n",
" }\n",
"\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [],
"source": [
"\n",
"async function backfill(db, start: Date, end: Date) {\n",
" const targets = d3.timeMonths(start, end)\n",
" .map(date => ({year: date.getFullYear(), monthIndex: date.getMonth()}));\n",
" for(const target of targets) {\n",
" console.log(`${target.year} ${target.monthIndex}`)\n",
" await insertMonth(db, target.year, months[target.monthIndex]);\n",
" }\n",
"}\n"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"2024 0\n",
"2024 1\n",
"2024 2\n",
"2024 3\n",
"2024 4\n",
"2024 5\n",
"2024 6\n",
"2024 7\n"
]
},
{
"data": {
"text/plain": [
"\u001b[33m1\u001b[39m"
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"const db = new Db(\":memory:\");\n",
"await backfill(db, new Date('2024-01-01'), new Date())\n",
"db.db.exec(\"vacuum into 'headlines-2024.db'\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Deno",
"language": "typescript",
"name": "deno"
},
"language_info": {
"codemirror_mode": "typescript",
"file_extension": ".ts",
"mimetype": "text/x.typescript",
"name": "typescript",
"nbconvert_exporter": "script",
"pygments_lexer": "typescript",
"version": "5.5.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
================================================
FILE: examples/nbc-headlines/2_build.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# NBC News Headlines: Building FTS5 + `vec0` indexes\n",
"\n",
"Using the dataset built in [the previous `./1_scrape.ipynb` notebook](./1_scrape.ipynb), \n",
"this notebook will enrich that dataset with a full-text search index and a semantic search index,\n",
"using [FTS5](https://www.sqlite.org/fts5.html), \n",
"[`sqlite-vec`](https://github.com/asg017/sqlite-vec), and \n",
"[`sqlite-lembed`](https://github.com/asg017/sqlite-lembed).\n",
"\n",
"This example will use pure SQL for everything. You can do the same exact thing in Python/JavaScript/Go/Rust/etc., or use\n",
"your own embeddings providers like Ollama/llamafile/OpenAI/etc. The core mechanics of FTS5 and `sqlite-vec` will remain the same. \n",
"\n",
"We will use the [Snowflake Artic Embed v1.5](https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5) embeddings model to generate embeddings. "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[no code]"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
".open tmp-artic2.db"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Step 1: Create a FTS5 index\n",
"\n",
"Creating a full-text search index is as simple as 3 SQL commands! We already have the headlines stored in the `articles` \n",
"table under the `headline` column, so it's just a matter of initializing the FTS5 virtual table and inserting the data."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
"\n",
"\n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
"0 row × 0 column\n",
"
\n",
"
\n"
],
"text/plain": [
"\u001b[0m┌\u001b[0m\u001b[0m\u001b[0m├\u001b[0m\u001b[0m"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"create virtual table fts_articles using fts5(\n",
" headline,\n",
" content='articles', content_rowid='id'\n",
");\n",
"\n",
"insert into fts_articles(rowid, headline)\n",
" select rowid, headline\n",
" from articles;\n",
"\n",
"insert into fts_articles(fts_articles) values('optimize');"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"By convention we name the FTS5 table `fts_articles`, where the `fts_` prefix says \"this virtual table is full-text search of the `articles` table\". We are only searching the `headline` column, the rest can be ignored. \n",
"\n",
"Here we are using the [\"external content tables\"](https://www.sqlite.org/fts5.html#external_content_tables)\n",
"feature in FTS5 tables, which will avoid storing the headlines a 2nd time, since they already exist in the `articles` table. \n",
"This part isn't required, but saves us a bit of storage. \n",
"\n",
"We also use the [`'optimize'`](https://www.sqlite.org/fts5.html#the_optimize_command) command\n",
" to keep things tidy. This doesn't do much on such a small dataset, but is important to remember for larger tables!"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
"\n",
"\n",
"\n",
"headline\n",
" \n",
" \n",
" \n",
"\n",
"\n",
"\n",
"Kamala Harris visits Planned Parenthood clinic\n",
" \n",
" \n",
"\n",
"\n",
"Former Marine sentenced to 9 years in prison for firebombing Planned Parenthood clinic\n",
" \n",
" \n",
" \n",
"
\n",
"
\n",
"2 rows × 1 column\n",
"
\n",
"
\n"
],
"text/plain": [
"\u001b[0m┌\u001b[0m\u001b[0m────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┐\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mheadline\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m├\u001b[0m\u001b[0m────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┤\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mKamala Harris visits Planned Parenthood clinic \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mFormer Marine sentenced to 9 years in prison for firebombing Planned Parenthood clinic\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m└\u001b[0m\u001b[0m────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┘\n",
"\u001b[0m\u001b[0m"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"select *\n",
"from fts_articles\n",
"where headline match 'planned parenthood'\n",
"limit 10;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Step 2: Create a \"semantic index\"\n",
"\n",
"\"Semantic index\" in this case is just a fancy way of saying \"vector store\", which we will do with a `sqlite-vec` `vec0` virtual table. \n",
"\n",
"Now, `sqlite-vec` just stores vectors, it doesn't generate embeddings for us. There are hundreds of different remote APIs or local inference runtimes you can use to generate embeddings,\n",
"but here we will use [`sqlite-lembed`](https://github.com/asg017/sqlite-lembed) to keep everything local and everything in pure SQL. \n",
"\n",
"We will need to choose an embeddings model in the [GGUF format](https://huggingface.co/docs/hub/en/gguf),\n",
"since `sqlite-lembed` uses [llama.cpp](https://github.com/ggerganov/llama.cpp) under the hood. \n",
"Here we will use [`Snowflake/snowflake-arctic-embed-m-v1.5`](https://huggingface.co/Snowflake/snowflake-arctic-embed-m-v1.5),\n",
"where we can find a GGUF version [here](https://huggingface.co/asg017/sqlite-lembed-model-examples/tree/main/snowflake-arctic-embed-m-v1.5). \n",
"This model is small-sh (`436MB` full-sized, `118MB` at `Q8_0` quantized), and is trained on fairly recent data so it understands\n",
"recent events like \"COVID-19\" or \"Kamala Harris\". \n",
"\n",
"You can download a `.gguf` quantized version of this model with:\n",
"\n",
"```bash\n",
"wget https://huggingface.co/asg017/sqlite-lembed-model-examples/resolve/main/snowflake-arctic-embed-m-v1.5/snowflake-arctic-embed-m-v1.5.d70deb40.f16.gguf\n",
"```\n",
"\n",
"And we can configure `sqlite-lembed` to use this model like so:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
"\n",
"\n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
"0 row × 0 column\n",
"
\n",
"
\n"
],
"text/plain": [
"\u001b[0m┌\u001b[0m\u001b[0m\u001b[0m├\u001b[0m\u001b[0m"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
".load ./lembed0\n",
".load ../../dist/vec0\n",
"\n",
"insert into lembed_models(name, model) values\n",
" ('default', lembed_model_from_file('./snowflake-arctic-embed-m-v1.5.d70deb40.f16.gguf'));"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It's embeddings time! We can use the `lembed()` function, which takes in text and returns a vector representation of that text,\n",
"as an embeddings BLOB that we can insert directly into a `vec0` virtul table. \n",
"\n",
"We'll declare this new `vec_articles` table, using the `vec_` prefix as convention. This matches the `fts_articles` table above. \n",
"The Snowflake embedding model generate vectors with `768` dimensions, which we we store as-as. \n",
"\n",
"Embedding and inserting into this vector store is as easy as a single `INSERT INTO` and `lembed()` call."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
"\n",
"\n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
"0 row × 0 column\n",
"
\n",
"
\n"
],
"text/plain": [
"\u001b[0m┌\u001b[0m\u001b[0m\u001b[0m├\u001b[0m\u001b[0m"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"create virtual table vec_articles using vec0(\n",
" article_id integer primary key,\n",
" headline_embedding float[768]\n",
");\n",
"\n",
"insert into vec_articles(article_id, headline_embedding)\n",
"select\n",
" rowid,\n",
" lembed(headline)\n",
"from articles;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This took ~13 minutes for ~14,500 embeddings on my older 2019 Macbook, but newer computers with better CPUs will finish quicker (it took `2m20s` on my newer Mac M1 Mini). \n",
"\n",
"Once the `vec_articles` is ready, we can perform a KNN query like so:"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
"\n",
"\n",
"\n",
"headline\n",
" \n",
"\n",
"distance\n",
" \n",
" \n",
" \n",
"\n",
"\n",
"\n",
"Kamala Harris visits Planned Parenthood clinic\n",
" \n",
"\n",
"0.492593914270401\n",
" \n",
" \n",
"\n",
"\n",
"After Dobbs decision, more women are managing their own abortions\n",
" \n",
"\n",
"0.5789032578468323\n",
" \n",
" \n",
"\n",
"\n",
"Transforming Healthcare\n",
" \n",
"\n",
"0.5822411179542542\n",
" \n",
" \n",
"\n",
"\n",
"A timeline of Trump's many, many positions on abortion\n",
" \n",
"\n",
"0.6101462841033936\n",
" \n",
" \n",
"\n",
"\n",
"How a network of abortion pill providers works together in the wake of new threats\n",
" \n",
"\n",
"0.6196886897087097\n",
" \n",
" \n",
"\n",
"\n",
"'Major hurdles': The reality check behind Biden's big abortion promise\n",
" \n",
"\n",
"0.6198344826698303\n",
" \n",
" \n",
"\n",
"\n",
"Trump's conflicting abortion stances are coming back to haunt him — and his party\n",
" \n",
"\n",
"0.6198986768722534\n",
" \n",
" \n",
"\n",
"\n",
"Where abortion rights could be on the ballot this fall: From the Politics Desk\n",
" \n",
"\n",
"0.6201764345169067\n",
" \n",
" \n",
"\n",
"\n",
"How the Biden campaign quickly mobilized on Trump's abortion stance\n",
" \n",
"\n",
"0.633980393409729\n",
" \n",
" \n",
"\n",
"\n",
"Battle over abortion heats up in Arizona — and could be on the 2024 ballot\n",
" \n",
"\n",
"0.6341449022293091\n",
" \n",
" \n",
" \n",
"
\n",
"
\n",
"10 rows × 2 columns\n",
"
\n",
"
\n"
],
"text/plain": [
"\u001b[0m┌\u001b[0m\u001b[0m────────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m────────────────────\u001b[0m\u001b[0m┐\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mheadline\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mdistance\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m├\u001b[0m\u001b[0m────────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m────────────────────\u001b[0m\u001b[0m┤\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mKamala Harris visits Planned Parenthood clinic \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0.492593914270401\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mAfter Dobbs decision, more women are managing their own abortions \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.5789032578468323\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mTransforming Healthcare \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.5822411179542542\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mA timeline of Trump's many, many positions on abortion \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.6101462841033936\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mHow a network of abortion pill providers works together in the wake of new threats\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.6196886897087097\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m'Major hurdles': The reality check behind Biden's big abortion promise \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.6198344826698303\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mTrump's conflicting abortion stances are coming back to haunt him — and his party \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.6198986768722534\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mWhere abortion rights could be on the ballot this fall: From the Politics Desk \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.6201764345169067\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mHow the Biden campaign quickly mobilized on Trump's abortion stance \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0.633980393409729\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mBattle over abortion heats up in Arizona — and could be on the 2024 ballot \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.6341449022293091\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m└\u001b[0m\u001b[0m────────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m────────────────────\u001b[0m\u001b[0m┘\n",
"\u001b[0m\u001b[0m"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"select\n",
" articles.headline,\n",
" vec_articles.distance\n",
"from vec_articles\n",
"left join articles on articles.rowid = vec_articles.article_id\n",
"where headline_embedding match lembed(\"planned parenthood\")\n",
" and k = 10;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Slim it down with Binary Quantization\n",
"\n",
"The vectors in the `vec_articles` table take up a lot of space. A vector with `768` dimensions take up `786 * 4 = 3072` bytes of space each, or around `45MB` of space for these ~14,500 entries. \n",
"\n",
"That's a lot — the original text dataset was only `~4MB`!\n",
"\n",
"If you want to make the database smaller, there's a number of quantization or other methods to do so, by trading accuracy. \n",
"Here's an example of performing [binary quantization](https://alexgarcia.xyz/sqlite-vec/guides/binary-quant.html)\n",
"on this dataset, storing 768-dimensional bit-vectors instead of floating-point vectors, a `32x` size reduction, at the expense of accuracy. \n",
"\n",
"We'll keep the current SQLite database as-is, and instead make a copy into a new SQLite database file, and change the `vec_articles` table\n",
"to store bit-vectors instead. \n",
"\n",
"First, we'll make a copy of the current database into a new file:"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
"\n",
"\n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
"0 row × 0 column\n",
"
\n",
"
\n"
],
"text/plain": [
"\u001b[0m┌\u001b[0m\u001b[0m\u001b[0m├\u001b[0m\u001b[0m"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vacuum into 'tmp-artic2.slim.db';"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we'll make a connection to this new file, and drop the old `vec_articles` table that contains the large `float[768]` vectors."
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
"\n",
"\n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
"0 row × 0 column\n",
"
\n",
"
\n"
],
"text/plain": [
"\u001b[0m┌\u001b[0m\u001b[0m\u001b[0m├\u001b[0m\u001b[0m"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"attach database 'tmp-artic2.slim.db' as slim;\n",
"drop table slim.vec_articles;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we can create a new `vec0` table, storing `bit[768]` vectors instead! \n",
"We can insert the original `float[768]` from the `main.vec_articles` table (original table),\n",
"calling [`vec_quantize_binary()`](https://alexgarcia.xyz/sqlite-vec/api-reference.html#vec_quantize_binary) to convert the floats to bits. "
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
"\n",
"\n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
"0 row × 0 column\n",
"
\n",
"
\n"
],
"text/plain": [
"\u001b[0m┌\u001b[0m\u001b[0m\u001b[0m├\u001b[0m\u001b[0m"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"\n",
"create virtual table slim.vec_articles using vec0(\n",
" article_id integer primary key,\n",
" headline_embedding bit[768]\n",
");\n",
"\n",
"insert into slim.vec_articles(article_id, headline_embedding)\n",
"select\n",
" article_id,\n",
" vec_quantize_binary(headline_embedding)\n",
"from main.vec_articles;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Then we can `VACUUM` the new `slim` database to shrink the file, delete the `DROP`'ed pages from the older `vec0` table. "
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
"\n",
"\n",
" \n",
" \n",
"\n",
" \n",
"
\n",
"
\n",
"0 row × 0 column\n",
"
\n",
"
\n"
],
"text/plain": [
"\u001b[0m┌\u001b[0m\u001b[0m\u001b[0m├\u001b[0m\u001b[0m"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vacuum slim;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"And there we have it! This file is `7.1MB`, a large reduction from the original `53MB` table. \n",
"\n",
"KNN queries are similar, only adding the `vec_quantize_binary()` function to the query vector."
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
"\n",
"\n",
"\n",
"headline\n",
" \n",
"\n",
"distance\n",
" \n",
" \n",
" \n",
"\n",
"\n",
"\n",
"Kamala Harris visits Planned Parenthood clinic\n",
" \n",
"\n",
"139\n",
" \n",
" \n",
"\n",
"\n",
"How a network of abortion pill providers works together in the wake of new threats\n",
" \n",
"\n",
"151\n",
" \n",
" \n",
"\n",
"\n",
"After Dobbs decision, more women are managing their own abortions\n",
" \n",
"\n",
"153\n",
" \n",
" \n",
"\n",
"\n",
"A timeline of Trump's many, many positions on abortion\n",
" \n",
"\n",
"156\n",
" \n",
" \n",
"\n",
"\n",
"Two of the country’s largest transgender rights organizations will merge\n",
" \n",
"\n",
"158\n",
" \n",
" \n",
"\n",
"\n",
"Transforming Healthcare\n",
" \n",
"\n",
"158\n",
" \n",
" \n",
"\n",
"\n",
"With Harris and Walz, Democrats put abortion rights at the top of the agenda\n",
" \n",
"\n",
"159\n",
" \n",
" \n",
"\n",
"\n",
"In states with strict abortion policies, simply seeing an OB/GYN for regular care can be difficult\n",
" \n",
"\n",
"160\n",
" \n",
" \n",
"\n",
"\n",
"Where abortion rights could be on the ballot this fall: From the Politics Desk\n",
" \n",
"\n",
"161\n",
" \n",
" \n",
"\n",
"\n",
"Map: Where medication abortion is and isn’t legal\n",
" \n",
"\n",
"162\n",
" \n",
" \n",
" \n",
"
\n",
"
\n",
"10 rows × 2 columns\n",
"
\n",
"
\n"
],
"text/plain": [
"\u001b[0m┌\u001b[0m\u001b[0m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m──────────\u001b[0m\u001b[0m┐\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mheadline\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mdistance\u001b[0m\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m├\u001b[0m\u001b[0m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m──────────\u001b[0m\u001b[0m┤\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mKamala Harris visits Planned Parenthood clinic \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 139\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mHow a network of abortion pill providers works together in the wake of new threats \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 151\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mAfter Dobbs decision, more women are managing their own abortions \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 153\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mA timeline of Trump's many, many positions on abortion \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 156\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mTwo of the country’s largest transgender rights organizations will merge \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 158\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mTransforming Healthcare \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 158\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mWith Harris and Walz, Democrats put abortion rights at the top of the agenda \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 159\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mIn states with strict abortion policies, simply seeing an OB/GYN for regular care can be difficult\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 160\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mWhere abortion rights could be on the ballot this fall: From the Politics Desk \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 161\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mMap: Where medication abortion is and isn’t legal \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 162\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m└\u001b[0m\u001b[0m────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m──────────\u001b[0m\u001b[0m┘\n",
"\u001b[0m\u001b[0m"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"select\n",
" slim.articles.headline,\n",
" slim.vec_articles.distance\n",
"from slim.vec_articles\n",
"left join slim.articles on slim.articles.rowid = slim.vec_articles.article_id\n",
"where headline_embedding match vec_quantize_binary(lembed(\"planned parenthood\"))\n",
" and k = 10;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You'll notice the results differ slightly to the full-sized query from above. Some results are ordered differently, some are missing. \n",
"The `distance` in this binary KNN search is hamming distance, not the default L2 distance. "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Solite",
"language": "sql",
"name": "solite"
},
"language_info": {
"file_extension": ".sql",
"mimetype": "text/x.sqlite",
"name": "sql",
"nb_converter": "script",
"pygments_lexer": "sql",
"version": "TODO"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
================================================
FILE: examples/nbc-headlines/3_search.ipynb
================================================
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# NBC News Headlines: Exploring Hybrod FTS5 + Vector Search\n",
"\n",
"This notebooks explore a few different ways one could combine FTS5 and vector search results, when querying \n",
"[FTS5](https://www.sqlite.org/fts5.html) and\n",
"[`sqlite-vec`](https://github.com/asg017/sqlite-vec) virtual table.\n",
"\n",
"This dataset is a small list of headines scraped from NBC News, found in the [`./1_scrape.ipynb`](./1_scrape.ipynb) notebook.\n",
"To see how the `fts_articles` and `vec_articles` tables were created, see the [`./3_search.ipynb`](./3_search.ipynb) notebook."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
"\n",
"\n",
"\n",
"vec_version()\n",
" \n",
"\n",
"lembed_version()\n",
" \n",
" \n",
" \n",
"\n",
"\n",
"\n",
"v0.1.3-alpha.2\n",
" \n",
"\n",
"v0.0.1-alpha.8\n",
" \n",
" \n",
" \n",
"
\n",
"
\n",
"1 row × 2 columns\n",
"
\n",
"
\n"
],
"text/plain": [
"\u001b[0m┌\u001b[0m\u001b[0m────────────────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m──────────────────\u001b[0m\u001b[0m┐\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mvec_version()\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mlembed_version()\u001b[0m\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m├\u001b[0m\u001b[0m────────────────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m──────────────────\u001b[0m\u001b[0m┤\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mv0.1.3-alpha.2\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mv0.0.1-alpha.8 \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m└\u001b[0m\u001b[0m────────────────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m──────────────────\u001b[0m\u001b[0m┘\n",
"\u001b[0m\u001b[0m"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
".open tmp-artic2.db\n",
"\n",
".load ../../dist/vec0\n",
".load ./lembed0\n",
"\n",
"insert into lembed_models(name, model)\n",
" values (\n",
" 'default',\n",
" lembed_model_from_file('snowflake-arctic-embed-m-v1.5.d70deb40.f16.gguf')\n",
" );\n",
"\n",
"select vec_version(), lembed_version();"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Full-text Search Only\n",
"\n",
"A simple FTS query on the `fts_articles` virtual table can be made like so:"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
"\n",
"\n",
"\n",
"rowid\n",
" \n",
"\n",
"headline\n",
" \n",
"\n",
"rank\n",
" \n",
" \n",
" \n",
"\n",
"\n",
"\n",
"4666\n",
" \n",
"\n",
"Kamala Harris visits Planned Parenthood clinic\n",
" \n",
"\n",
"-18.9139950477264\n",
" \n",
" \n",
"\n",
"\n",
"6521\n",
" \n",
"\n",
"Former Marine sentenced to 9 years in prison for firebombing Planned Parenthood clinic\n",
" \n",
"\n",
"-14.807022703838651\n",
" \n",
" \n",
" \n",
"
\n",
"
\n",
"2 rows × 3 columns\n",
"
\n",
"
\n"
],
"text/plain": [
"\u001b[0m┌\u001b[0m\u001b[0m───────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m─────────────────────\u001b[0m\u001b[0m┐\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mrowid\u001b[0m\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mheadline\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mrank\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m├\u001b[0m\u001b[0m───────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m─────────────────────\u001b[0m\u001b[0m┤\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 4666\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mKamala Harris visits Planned Parenthood clinic \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m -18.9139950477264\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 6521\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mFormer Marine sentenced to 9 years in prison for firebombing Planned Parenthood clinic\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m-14.807022703838651\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m└\u001b[0m\u001b[0m───────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m─────────────────────\u001b[0m\u001b[0m┘\n",
"\u001b[0m\u001b[0m"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
".param set query planned parenthood\n",
"\n",
"select\n",
" rowid,\n",
" headline,\n",
" rank\n",
"from fts_articles\n",
"where headline match :query\n",
"order by rank\n",
"limit 10;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `rank` column is the negative BM25 score of the query + document. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Vector Search Only\n",
"\n",
"A KNN vector search can be made on the `vec_articles` virtual table like so:"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
"\n",
"\n",
"\n",
"article_id\n",
" \n",
"\n",
"headline\n",
" \n",
"\n",
"distance\n",
" \n",
" \n",
" \n",
"\n",
"\n",
"\n",
"4666\n",
" \n",
"\n",
"Kamala Harris visits Planned Parenthood clinic\n",
" \n",
"\n",
"0.492593914270401\n",
" \n",
" \n",
"\n",
"\n",
"13928\n",
" \n",
"\n",
"After Dobbs decision, more women are managing their own abortions\n",
" \n",
"\n",
"0.5789032578468323\n",
" \n",
" \n",
"\n",
"\n",
"12636\n",
" \n",
"\n",
"Transforming Healthcare\n",
" \n",
"\n",
"0.5822411179542542\n",
" \n",
" \n",
"\n",
"\n",
"6979\n",
" \n",
"\n",
"A timeline of Trump's many, many positions on abortion\n",
" \n",
"\n",
"0.6101462841033936\n",
" \n",
" \n",
"\n",
"\n",
"7038\n",
" \n",
"\n",
"How a network of abortion pill providers works together in the wake of new threats\n",
" \n",
"\n",
"0.6196886897087097\n",
" \n",
" \n",
"\n",
"\n",
"6914\n",
" \n",
"\n",
"'Major hurdles': The reality check behind Biden's big abortion promise\n",
" \n",
"\n",
"0.6198344826698303\n",
" \n",
" \n",
"\n",
"\n",
"6794\n",
" \n",
"\n",
"Trump's conflicting abortion stances are coming back to haunt him — and his party\n",
" \n",
"\n",
"0.6198986768722534\n",
" \n",
" \n",
"\n",
"\n",
"7381\n",
" \n",
"\n",
"Where abortion rights could be on the ballot this fall: From the Politics Desk\n",
" \n",
"\n",
"0.6201764345169067\n",
" \n",
" \n",
"\n",
"\n",
"6871\n",
" \n",
"\n",
"How the Biden campaign quickly mobilized on Trump's abortion stance\n",
" \n",
"\n",
"0.633980393409729\n",
" \n",
" \n",
"\n",
"\n",
"5496\n",
" \n",
"\n",
"Battle over abortion heats up in Arizona — and could be on the 2024 ballot\n",
" \n",
"\n",
"0.6341449022293091\n",
" \n",
" \n",
" \n",
"
\n",
"
\n",
"10 rows × 3 columns\n",
"
\n",
"
\n"
],
"text/plain": [
"\u001b[0m┌\u001b[0m\u001b[0m────────────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m────────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m────────────────────\u001b[0m\u001b[0m┐\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1marticle_id\u001b[0m\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mheadline\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mdistance\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m├\u001b[0m\u001b[0m────────────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m────────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m────────────────────\u001b[0m\u001b[0m┤\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 4666\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mKamala Harris visits Planned Parenthood clinic \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0.492593914270401\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 13928\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mAfter Dobbs decision, more women are managing their own abortions \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.5789032578468323\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 12636\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mTransforming Healthcare \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.5822411179542542\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 6979\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mA timeline of Trump's many, many positions on abortion \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.6101462841033936\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 7038\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mHow a network of abortion pill providers works together in the wake of new threats\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.6196886897087097\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 6914\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m'Major hurdles': The reality check behind Biden's big abortion promise \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.6198344826698303\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 6794\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mTrump's conflicting abortion stances are coming back to haunt him — and his party \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.6198986768722534\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 7381\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mWhere abortion rights could be on the ballot this fall: From the Politics Desk \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.6201764345169067\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 6871\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mHow the Biden campaign quickly mobilized on Trump's abortion stance \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0.633980393409729\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 5496\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mBattle over abortion heats up in Arizona — and could be on the 2024 ballot \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.6341449022293091\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m└\u001b[0m\u001b[0m────────────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m────────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m────────────────────\u001b[0m\u001b[0m┘\n",
"\u001b[0m\u001b[0m"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
".param set query planned parenthood\n",
"\n",
"select\n",
" article_id,\n",
" articles.headline,\n",
" distance\n",
"from vec_articles\n",
"left join articles on articles.rowid = vec_articles.article_id\n",
"where headline_embedding match lembed(:query)\n",
" and k = 10;"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `distance` column is the L2 distance between the query vector and the headline embedding. \n",
"\n",
"The rest of this notebook explores different ways of combining these FTS5 and vector search results. \n",
"The core queries are similar, and differ only in their `JOIN` or `ORDER BY` techniques."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Combination Technique #1: Keyword-first\n",
"\n",
"In many search-engine cases, you may want to display keyword matches first, and supplement the rest with vector search results. \n",
"This makes some intuitive sense — keyword matches are what users expect, but you'll want to display more results if there are only a few matching documents. \n"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
"\n",
"\n",
"\n",
"id\n",
" \n",
"\n",
"headline\n",
" \n",
"\n",
"match_type\n",
" \n",
"\n",
"article_id\n",
" \n",
"\n",
"rank_number\n",
" \n",
"\n",
"score\n",
" \n",
" \n",
" \n",
"\n",
"\n",
"\n",
"10098\n",
" \n",
"\n",
"Kamala Harris says abortion bans are creating 'a health care crisis'\n",
" \n",
"\n",
"fts\n",
" \n",
"\n",
"10098\n",
" \n",
"\n",
"1\n",
" \n",
"\n",
"-10.678829270936067\n",
" \n",
" \n",
"\n",
"\n",
"9776\n",
" \n",
"\n",
"States with abortion bans saw birth control prescriptions fall post-Dobbs, study finds\n",
" \n",
"\n",
"fts\n",
" \n",
"\n",
"9776\n",
" \n",
"\n",
"2\n",
" \n",
"\n",
"-10.016316725971112\n",
" \n",
" \n",
"\n",
"\n",
"2292\n",
" \n",
"\n",
"Ohio GOP Senate candidates pitch federal abortion bans even after voters protected reproductive rights\n",
" \n",
"\n",
"fts\n",
" \n",
"\n",
"2292\n",
" \n",
"\n",
"3\n",
" \n",
"\n",
"-9.7149595994016\n",
" \n",
" \n",
"\n",
"\n",
"452\n",
" \n",
"\n",
"64K women and girls became pregnant due to rape in states with abortion bans, study estimates\n",
" \n",
"\n",
"fts\n",
" \n",
"\n",
"452\n",
" \n",
"\n",
"4\n",
" \n",
"\n",
"-9.163558569425538\n",
" \n",
" \n",
"\n",
"\n",
"9187\n",
" \n",
"\n",
"Abortion bans drive away up to half of young talent, CNBC/Generation Lab youth survey finds\n",
" \n",
"\n",
"fts\n",
" \n",
"\n",
"9187\n",
" \n",
"\n",
"5\n",
" \n",
"\n",
"-9.163558569425538\n",
" \n",
" \n",
"\n",
"\n",
"6989\n",
" \n",
"\n",
"Trump says abortion restrictions should be left to states, dodging a national ban\n",
" \n",
"\n",
"vec\n",
" \n",
"\n",
"6989\n",
" \n",
"\n",
"1\n",
" \n",
"\n",
"0.4930749833583832\n",
" \n",
" \n",
"\n",
"\n",
"13928\n",
" \n",
"\n",
"After Dobbs decision, more women are managing their own abortions\n",
" \n",
"\n",
"vec\n",
" \n",
"\n",
"13928\n",
" \n",
"\n",
"2\n",
" \n",
"\n",
"0.5120846629142761\n",
" \n",
" \n",
"\n",
"\n",
"11822\n",
" \n",
"\n",
"Iowa now bans most abortions after about 6 weeks\n",
" \n",
"\n",
"vec\n",
" \n",
"\n",
"11822\n",
" \n",
"\n",
"3\n",
" \n",
"\n",
"0.512569785118103\n",
" \n",
" \n",
"\n",
"\n",
"7381\n",
" \n",
"\n",
"Where abortion rights could be on the ballot this fall: From the Politics Desk\n",
" \n",
"\n",
"vec\n",
" \n",
"\n",
"7381\n",
" \n",
"\n",
"4\n",
" \n",
"\n",
"0.5168291926383972\n",
" \n",
" \n",
"\n",
"\n",
"14009\n",
" \n",
"\n",
"Trump signals openness to banning abortion pill\n",
" \n",
"\n",
"vec\n",
" \n",
"\n",
"14009\n",
" \n",
"\n",
"5\n",
" \n",
"\n",
"0.5288293957710266\n",
" \n",
" \n",
"\n",
"\n",
"4426\n",
" \n",
"\n",
"Medication abortions rose in year after Dobbs decision, report finds\n",
" \n",
"\n",
"vec\n",
" \n",
"\n",
"4426\n",
" \n",
"\n",
"6\n",
" \n",
"\n",
"0.5305097699165344\n",
" \n",
" \n",
"\n",
"\n",
"4328\n",
" \n",
"\n",
"Trump signals support for a national 15-week abortion ban\n",
" \n",
"\n",
"vec\n",
" \n",
"\n",
"4328\n",
" \n",
"\n",
"7\n",
" \n",
"\n",
"0.532848060131073\n",
" \n",
" \n",
"\n",
"\n",
"6979\n",
" \n",
"\n",
"A timeline of Trump's many, many positions on abortion\n",
" \n",
"\n",
"vec\n",
" \n",
"\n",
"6979\n",
" \n",
"\n",
"8\n",
" \n",
"\n",
"0.533357560634613\n",
" \n",
" \n",
"\n",
"\n",
"2092\n",
" \n",
"\n",
"For the first time in years, Sen. Graham hasn't introduced a national abortion ban\n",
" \n",
"\n",
"vec\n",
" \n",
"\n",
"2092\n",
" \n",
"\n",
"9\n",
" \n",
"\n",
"0.5336830615997314\n",
" \n",
" \n",
"\n",
"\n",
"6794\n",
" \n",
"\n",
"Trump's conflicting abortion stances are coming back to haunt him — and his party\n",
" \n",
"\n",
"vec\n",
" \n",
"\n",
"6794\n",
" \n",
"\n",
"10\n",
" \n",
"\n",
"0.5347095131874084\n",
" \n",
" \n",
" \n",
"
\n",
"
\n",
"15 rows × 6 columns\n",
"
\n",
"
\n"
],
"text/plain": [
"\u001b[0m┌\u001b[0m\u001b[0m───────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m────────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m────────────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m────────────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m─────────────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m─────────────────────\u001b[0m\u001b[0m┐\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mid\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mheadline\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mmatch_type\u001b[0m\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1marticle_id\u001b[0m\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mrank_number\u001b[0m\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mscore\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m├\u001b[0m\u001b[0m───────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m────────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m────────────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m────────────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m─────────────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m─────────────────────\u001b[0m\u001b[0m┤\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m10098\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mKamala Harris says abortion bans are creating 'a health care crisis' \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mfts \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 10098\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 1\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m-10.678829270936067\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 9776\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mStates with abortion bans saw birth control prescriptions fall post-Dobbs, study finds \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mfts \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 9776\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 2\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m-10.016316725971112\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 2292\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mOhio GOP Senate candidates pitch federal abortion bans even after voters protected reproductive rights\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mfts \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 2292\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 3\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m -9.7149595994016\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 452\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m64K women and girls became pregnant due to rape in states with abortion bans, study estimates \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mfts \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 452\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 4\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m -9.163558569425538\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 9187\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mAbortion bans drive away up to half of young talent, CNBC/Generation Lab youth survey finds \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mfts \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 9187\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 5\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m -9.163558569425538\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 6989\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mTrump says abortion restrictions should be left to states, dodging a national ban \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mvec \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 6989\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 1\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0.4930749833583832\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m13928\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mAfter Dobbs decision, more women are managing their own abortions \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mvec \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 13928\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 2\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0.5120846629142761\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m11822\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mIowa now bans most abortions after about 6 weeks \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mvec \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 11822\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 3\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0.512569785118103\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 7381\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mWhere abortion rights could be on the ballot this fall: From the Politics Desk \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mvec \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 7381\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 4\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0.5168291926383972\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m14009\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mTrump signals openness to banning abortion pill \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mvec \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 14009\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 5\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0.5288293957710266\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 4426\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mMedication abortions rose in year after Dobbs decision, report finds \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mvec \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 4426\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 6\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0.5305097699165344\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 4328\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mTrump signals support for a national 15-week abortion ban \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mvec \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 4328\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 7\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0.532848060131073\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 6979\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mA timeline of Trump's many, many positions on abortion \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mvec \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 6979\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 8\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0.533357560634613\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 2092\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mFor the first time in years, Sen. Graham hasn't introduced a national abortion ban \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mvec \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 2092\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 9\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0.5336830615997314\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 6794\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mTrump's conflicting abortion stances are coming back to haunt him — and his party \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mvec \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 6794\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 10\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0.5347095131874084\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m└\u001b[0m\u001b[0m───────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m────────────────────────────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m────────────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m────────────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m─────────────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m─────────────────────\u001b[0m\u001b[0m┘\n",
"\u001b[0m\u001b[0m"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
".param set query abortion bans\n",
".param set k 10\n",
"\n",
"\n",
"with fts_matches as (\n",
" select\n",
" rowid as article_id,\n",
" row_number() over (order by rank) as rank_number,\n",
" rank as score\n",
" from fts_articles\n",
" where headline match :query\n",
" limit :k\n",
"),\n",
"vec_matches as (\n",
" select\n",
" article_id,\n",
" row_number() over (order by distance) as rank_number,\n",
" distance as score\n",
" from vec_articles\n",
" where\n",
" headline_embedding match lembed(:query)\n",
" and k = :k\n",
" order by distance\n",
"),\n",
"combined as (\n",
" select 'fts' as match_type, * from fts_matches\n",
" union all\n",
" select 'vec' as match_type, * from vec_matches\n",
"),\n",
"final as (\n",
" select\n",
" articles.id,\n",
" articles.headline,\n",
" combined.*\n",
" from combined\n",
" left join articles on articles.rowid = combined.article_id\n",
")\n",
"select * from final;\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We do this with a verbose CTE: one step for the FTS5 query, another for the vector search, one to \"combine\" the results with a `UNION ALL`, and one last one to `LEFT JOIN` back to the base `articles` table to get the headline.\n",
"\n",
"Here we have 5 FTS results and 10 additional vector results. This seems pretty natural: a fallback to vector search when keyword matches are sparse.\n",
"\n",
"One note: this example doesn't do any de-duplication, so you may get the same results twice. So you may want to add a `DISTINCT` or `GROUP BY` somewhere to handle that. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Combination Technique #2: Reciprocal Rank Fusion (RRF)\n",
"\n",
"[Reciprocal Rank Fusion](https://learn.microsoft.com/en-us/azure/search/hybrid-search-ranking) \n",
"is another combination technique, where matches that are both FTS matches and vector matches\n",
"are ranked higher than others. The CTE logic is a bit more involved, but can still be represented in a few steps:\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
"\n",
"\n",
"\n",
"id\n",
" \n",
"\n",
"headline\n",
" \n",
"\n",
"vec_rank\n",
" \n",
"\n",
"fts_rank\n",
" \n",
"\n",
"combined_rank\n",
" \n",
"\n",
"vec_distance\n",
" \n",
"\n",
"fts_score\n",
" \n",
" \n",
" \n",
"\n",
"\n",
"\n",
"4328\n",
" \n",
"\n",
"Trump signals support for a national 15-week abortion ban\n",
" \n",
"\n",
"2\n",
" \n",
"\n",
"3\n",
" \n",
"\n",
"0.03200204813108039\n",
" \n",
"\n",
"0.5334203839302063\n",
" \n",
"\n",
"-9.841645168493953\n",
" \n",
" \n",
"\n",
"\n",
"5769\n",
" \n",
"\n",
"Mitch McConnell shies away from supporting national abortion ban\n",
" \n",
"\n",
"8\n",
" \n",
"\n",
"2\n",
" \n",
"\n",
"0.030834914611005692\n",
" \n",
"\n",
"0.5501425266265869\n",
" \n",
"\n",
"-10.19017787567105\n",
" \n",
" \n",
"\n",
"\n",
"9507\n",
" \n",
"\n",
"Arizona Senate passes repeal of 1864 abortion ban\n",
" \n",
"\n",
"\n",
" \n",
"\n",
"1\n",
" \n",
"\n",
"0.01639344262295082\n",
" \n",
"\n",
"\n",
" \n",
"\n",
"-10.564302831642667\n",
" \n",
" \n",
"\n",
"\n",
"6989\n",
" \n",
"\n",
"Trump says abortion restrictions should be left to states, dodging a national ban\n",
" \n",
"\n",
"1\n",
" \n",
"\n",
"\n",
" \n",
"\n",
"0.01639344262295082\n",
" \n",
"\n",
"0.5142395496368408\n",
" \n",
"\n",
"\n",
" \n",
" \n",
"\n",
"\n",
"10717\n",
" \n",
"\n",
"Supreme Court rejects bid to restrict access to abortion pill\n",
" \n",
"\n",
"3\n",
" \n",
"\n",
"\n",
" \n",
"\n",
"0.015873015873015872\n",
" \n",
"\n",
"0.5351248383522034\n",
" \n",
"\n",
"\n",
" \n",
" \n",
"\n",
"\n",
"5981\n",
" \n",
"\n",
"Arizona state House passes bill to repeal 1864 abortion ban\n",
" \n",
"\n",
"\n",
" \n",
"\n",
"4\n",
" \n",
"\n",
"0.015625\n",
" \n",
"\n",
"\n",
" \n",
"\n",
"-9.841645168493953\n",
" \n",
" \n",
"\n",
"\n",
"14009\n",
" \n",
"\n",
"Trump signals openness to banning abortion pill\n",
" \n",
"\n",
"4\n",
" \n",
"\n",
"\n",
" \n",
"\n",
"0.015625\n",
" \n",
"\n",
"0.5364335179328918\n",
" \n",
"\n",
"\n",
" \n",
" \n",
"\n",
"\n",
"6375\n",
" \n",
"\n",
"Arizona Republicans again quash effort to repeal 1864 abortion ban\n",
" \n",
"\n",
"\n",
" \n",
"\n",
"5\n",
" \n",
"\n",
"0.015384615384615385\n",
" \n",
"\n",
"\n",
" \n",
"\n",
"-9.841645168493953\n",
" \n",
" \n",
"\n",
"\n",
"7381\n",
" \n",
"\n",
"Where abortion rights could be on the ballot this fall: From the Politics Desk\n",
" \n",
"\n",
"5\n",
" \n",
"\n",
"\n",
" \n",
"\n",
"0.015384615384615385\n",
" \n",
"\n",
"0.5462378859519958\n",
" \n",
"\n",
"\n",
" \n",
" \n",
"\n",
"\n",
"9443\n",
" \n",
"\n",
"Arizona Gov. Katie Hobbs signs repeal of 1864 abortion ban\n",
" \n",
"\n",
"\n",
" \n",
"\n",
"6\n",
" \n",
"\n",
"0.015151515151515152\n",
" \n",
"\n",
"\n",
" \n",
"\n",
"-9.841645168493953\n",
" \n",
" \n",
"\n",
"\n",
"13928\n",
" \n",
"\n",
"After Dobbs decision, more women are managing their own abortions\n",
" \n",
"\n",
"6\n",
" \n",
"\n",
"\n",
" \n",
"\n",
"0.015151515151515152\n",
" \n",
"\n",
"0.5467031002044678\n",
" \n",
"\n",
"\n",
" \n",
" \n",
"\n",
"\n",
"1821\n",
" \n",
"\n",
"Dominican women fight child marriage, teen pregancy amid total abortion ban\n",
" \n",
"\n",
"\n",
" \n",
"\n",
"7\n",
" \n",
"\n",
"0.014925373134328358\n",
" \n",
"\n",
"\n",
" \n",
"\n",
"-9.51616557526609\n",
" \n",
" \n",
"\n",
"\n",
"2092\n",
" \n",
"\n",
"For the first time in years, Sen. Graham hasn't introduced a national abortion ban\n",
" \n",
"\n",
"7\n",
" \n",
"\n",
"\n",
" \n",
"\n",
"0.014925373134328358\n",
" \n",
"\n",
"0.5477523803710938\n",
" \n",
"\n",
"\n",
" \n",
" \n",
"\n",
"\n",
"7150\n",
" \n",
"\n",
"Tennessee court weighs challenge to abortion ban’s narrow medical exception\n",
" \n",
"\n",
"\n",
" \n",
"\n",
"8\n",
" \n",
"\n",
"0.014705882352941176\n",
" \n",
"\n",
"\n",
" \n",
"\n",
"-9.51616557526609\n",
" \n",
" \n",
"\n",
"\n",
"8690\n",
" \n",
"\n",
"Arizona Supreme Court pushes back enforcement date for 1864 abortion ban\n",
" \n",
"\n",
"\n",
" \n",
"\n",
"9\n",
" \n",
"\n",
"0.014492753623188406\n",
" \n",
"\n",
"\n",
" \n",
"\n",
"-9.51616557526609\n",
" \n",
" \n",
"\n",
"\n",
"11822\n",
" \n",
"\n",
"Iowa now bans most abortions after about 6 weeks\n",
" \n",
"\n",
"9\n",
" \n",
"\n",
"\n",
" \n",
"\n",
"0.014492753623188406\n",
" \n",
"\n",
"0.5557170510292053\n",
" \n",
"\n",
"\n",
" \n",
" \n",
"\n",
"\n",
"2646\n",
" \n",
"\n",
"Trump campaign scrambles over abortion ban report as Democrats seize the moment\n",
" \n",
"\n",
"\n",
" \n",
"\n",
"10\n",
" \n",
"\n",
"0.014285714285714285\n",
" \n",
"\n",
"\n",
" \n",
"\n",
"-9.211525101866211\n",
" \n",
" \n",
"\n",
"\n",
"5538\n",
" \n",
"\n",
"Map: Where medication abortion is and isn’t legal\n",
" \n",
"\n",
"10\n",
" \n",
"\n",
"\n",
" \n",
"\n",
"0.014285714285714285\n",
" \n",
"\n",
"0.5588464140892029\n",
" \n",
"\n",
"\n",
" \n",
" \n",
" \n",
"
\n",
"
\n",
"18 rows × 7 columns\n",
"
\n",
"
\n"
],
"text/plain": [
"\u001b[0m┌\u001b[0m\u001b[0m───────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m────────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m──────────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m──────────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m──────────────────────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m────────────────────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m─────────────────────\u001b[0m\u001b[0m┐\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mid\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mheadline\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mvec_rank\u001b[0m\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mfts_rank\u001b[0m\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mcombined_rank\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mvec_distance\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mfts_score\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m├\u001b[0m\u001b[0m───────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m────────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m──────────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m──────────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m──────────────────────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m────────────────────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m─────────────────────\u001b[0m\u001b[0m┤\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 4328\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mTrump signals support for a national 15-week abortion ban \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 2\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 3\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0.03200204813108039\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.5334203839302063\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m -9.841645168493953\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 5769\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mMitch McConnell shies away from supporting national abortion ban \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 8\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 2\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.030834914611005692\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.5501425266265869\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m -10.19017787567105\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 9507\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mArizona Senate passes repeal of 1864 abortion ban \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 1\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0.01639344262295082\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m-10.564302831642667\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 6989\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mTrump says abortion restrictions should be left to states, dodging a national ban \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 1\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0.01639344262295082\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.5142395496368408\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m10717\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mSupreme Court rejects bid to restrict access to abortion pill \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 3\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.015873015873015872\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.5351248383522034\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 5981\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mArizona state House passes bill to repeal 1864 abortion ban \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 4\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0.015625\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m -9.841645168493953\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m14009\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mTrump signals openness to banning abortion pill \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 4\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 0.015625\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.5364335179328918\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 6375\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mArizona Republicans again quash effort to repeal 1864 abortion ban \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 5\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.015384615384615385\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m -9.841645168493953\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 7381\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mWhere abortion rights could be on the ballot this fall: From the Politics Desk \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 5\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.015384615384615385\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.5462378859519958\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 9443\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mArizona Gov. Katie Hobbs signs repeal of 1864 abortion ban \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 6\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.015151515151515152\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m -9.841645168493953\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m13928\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mAfter Dobbs decision, more women are managing their own abortions \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 6\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.015151515151515152\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.5467031002044678\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 1821\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mDominican women fight child marriage, teen pregancy amid total abortion ban \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 7\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.014925373134328358\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m -9.51616557526609\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 2092\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mFor the first time in years, Sen. Graham hasn't introduced a national abortion ban\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 7\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.014925373134328358\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.5477523803710938\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 7150\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mTennessee court weighs challenge to abortion ban’s narrow medical exception \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 8\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.014705882352941176\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m -9.51616557526609\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 8690\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mArizona Supreme Court pushes back enforcement date for 1864 abortion ban \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 9\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.014492753623188406\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m -9.51616557526609\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m11822\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mIowa now bans most abortions after about 6 weeks \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 9\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.014492753623188406\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.5557170510292053\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 2646\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mTrump campaign scrambles over abortion ban report as Democrats seize the moment \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 10\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.014285714285714285\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m -9.211525101866211\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 5538\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mMap: Where medication abortion is and isn’t legal \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 10\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.014285714285714285\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m0.5588464140892029\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m└\u001b[0m\u001b[0m───────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m────────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m──────────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m──────────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m──────────────────────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m────────────────────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m─────────────────────\u001b[0m\u001b[0m┘\n",
"\u001b[0m\u001b[0m"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
".param set query abortion ban\n",
"\n",
"\n",
".param set k 10\n",
".param set rrf_k 60\n",
".param set weight_fts 1.0\n",
".param set weight_vec 1.0\n",
"\n",
"with vec_matches as (\n",
" select\n",
" article_id,\n",
" row_number() over (order by distance) as rank_number,\n",
" distance\n",
" from vec_articles\n",
" where\n",
" headline_embedding match lembed(:query)\n",
" and k = :k\n",
"),\n",
"fts_matches as (\n",
" select\n",
" rowid,\n",
" row_number() over (order by rank) as rank_number,\n",
" rank as score\n",
" from fts_articles\n",
" where headline match :query\n",
" limit :k\n",
"),\n",
"final as (\n",
" select\n",
" articles.id,\n",
" articles.headline,\n",
" vec_matches.rank_number as vec_rank,\n",
" fts_matches.rank_number as fts_rank,\n",
" coalesce(1.0 / (:rrf_k + fts_matches.rank_number), 0.0) * :weight_fts\n",
" + coalesce(1.0 / (:rrf_k + vec_matches.rank_number), 0.0) * :weight_vec\n",
" as combined_rank,\n",
" vec_matches.distance as vec_distance,\n",
" fts_matches.score as fts_score\n",
" from fts_matches\n",
" full outer join vec_matches on vec_matches.article_id = fts_matches.rowid\n",
" join articles on articles.rowid = coalesce(fts_matches.rowid, vec_matches.article_id)\n",
" order by combined_rank desc\n",
")\n",
"select * from final;\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The first two CTE steps are identical to the \"keyword-first\" approach: just normal FTS5 and vector KNN queries. \n",
"\n",
"The combination CTE step is more involved, and is described in detail in [this \"Hybrid Search\" Supabase docs page](https://supabase.com/docs/guides/ai/hybrid-search). \n",
"What's nice about this approach is that you can configure the \"weights\" of FTS or vector results with a normal SQL parameter. \n",
"\n",
"In this query, we can see the top result `\"Trump signals support for a national 15-week abortion ban\"` was neither the top FTS result nor the top vector result — it only ranked `3` and `2` respectively. \n",
"But since it appeared in both the FTS and vector results, it's ranked higher than others, same with `\"Mitch McConnell shies away from supporting national abortion ban\"`. The rest of the results are\n",
"FTS + vector results interwoven together, pretty nice!"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Combination Technique #3: Re-rank by semantics\n",
"\n",
"Here we use the FTS5 results as the \"source of truth\", but we re-order them based on semantic similarity between the query and each headline."
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"
\n",
"\n",
"\n",
"\n",
"id\n",
" \n",
"\n",
"headline\n",
" \n",
"\n",
"rowid\n",
" \n",
"\n",
"fts_rank_number\n",
" \n",
"\n",
"score\n",
" \n",
" \n",
" \n",
"\n",
"\n",
"\n",
"4328\n",
" \n",
"\n",
"Trump signals support for a national 15-week abortion ban\n",
" \n",
"\n",
"4328\n",
" \n",
"\n",
"3\n",
" \n",
"\n",
"-9.841645168493953\n",
" \n",
" \n",
"\n",
"\n",
"5769\n",
" \n",
"\n",
"Mitch McConnell shies away from supporting national abortion ban\n",
" \n",
"\n",
"5769\n",
" \n",
"\n",
"2\n",
" \n",
"\n",
"-10.19017787567105\n",
" \n",
" \n",
"\n",
"\n",
"2646\n",
" \n",
"\n",
"Trump campaign scrambles over abortion ban report as Democrats seize the moment\n",
" \n",
"\n",
"2646\n",
" \n",
"\n",
"10\n",
" \n",
"\n",
"-9.211525101866211\n",
" \n",
" \n",
"\n",
"\n",
"7150\n",
" \n",
"\n",
"Tennessee court weighs challenge to abortion ban’s narrow medical exception\n",
" \n",
"\n",
"7150\n",
" \n",
"\n",
"8\n",
" \n",
"\n",
"-9.51616557526609\n",
" \n",
" \n",
"\n",
"\n",
"1821\n",
" \n",
"\n",
"Dominican women fight child marriage, teen pregancy amid total abortion ban\n",
" \n",
"\n",
"1821\n",
" \n",
"\n",
"7\n",
" \n",
"\n",
"-9.51616557526609\n",
" \n",
" \n",
"\n",
"\n",
"6375\n",
" \n",
"\n",
"Arizona Republicans again quash effort to repeal 1864 abortion ban\n",
" \n",
"\n",
"6375\n",
" \n",
"\n",
"5\n",
" \n",
"\n",
"-9.841645168493953\n",
" \n",
" \n",
"\n",
"\n",
"9507\n",
" \n",
"\n",
"Arizona Senate passes repeal of 1864 abortion ban\n",
" \n",
"\n",
"9507\n",
" \n",
"\n",
"1\n",
" \n",
"\n",
"-10.564302831642667\n",
" \n",
" \n",
"\n",
"\n",
"8690\n",
" \n",
"\n",
"Arizona Supreme Court pushes back enforcement date for 1864 abortion ban\n",
" \n",
"\n",
"8690\n",
" \n",
"\n",
"9\n",
" \n",
"\n",
"-9.51616557526609\n",
" \n",
" \n",
"\n",
"\n",
"5981\n",
" \n",
"\n",
"Arizona state House passes bill to repeal 1864 abortion ban\n",
" \n",
"\n",
"5981\n",
" \n",
"\n",
"4\n",
" \n",
"\n",
"-9.841645168493953\n",
" \n",
" \n",
"\n",
"\n",
"9443\n",
" \n",
"\n",
"Arizona Gov. Katie Hobbs signs repeal of 1864 abortion ban\n",
" \n",
"\n",
"9443\n",
" \n",
"\n",
"6\n",
" \n",
"\n",
"-9.841645168493953\n",
" \n",
" \n",
" \n",
"
\n",
"
\n",
"10 rows × 5 columns\n",
"
\n",
"
\n"
],
"text/plain": [
"\u001b[0m┌\u001b[0m\u001b[0m──────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m─────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m───────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m─────────────────\u001b[0m\u001b[0m┬\u001b[0m\u001b[0m─────────────────────\u001b[0m\u001b[0m┐\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mid\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mheadline\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mrowid\u001b[0m\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mfts_rank_number\u001b[0m\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m\u001b[0m\u001b[1mscore\u001b[0m \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m├\u001b[0m\u001b[0m──────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m─────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m───────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m─────────────────\u001b[0m\u001b[0m┼\u001b[0m\u001b[0m─────────────────────\u001b[0m\u001b[0m┤\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m4328\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mTrump signals support for a national 15-week abortion ban \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 4328\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 3\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m -9.841645168493953\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m5769\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mMitch McConnell shies away from supporting national abortion ban \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 5769\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 2\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m -10.19017787567105\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m2646\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mTrump campaign scrambles over abortion ban report as Democrats seize the moment\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 2646\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 10\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m -9.211525101866211\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m7150\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mTennessee court weighs challenge to abortion ban’s narrow medical exception \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 7150\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 8\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m -9.51616557526609\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m1821\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mDominican women fight child marriage, teen pregancy amid total abortion ban \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 1821\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 7\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m -9.51616557526609\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m6375\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mArizona Republicans again quash effort to repeal 1864 abortion ban \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 6375\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 5\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m -9.841645168493953\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m9507\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mArizona Senate passes repeal of 1864 abortion ban \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 9507\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 1\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m-10.564302831642667\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m8690\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mArizona Supreme Court pushes back enforcement date for 1864 abortion ban \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 8690\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 9\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m -9.51616557526609\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m5981\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mArizona state House passes bill to repeal 1864 abortion ban \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 5981\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 4\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m -9.841645168493953\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m9443\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0mArizona Gov. Katie Hobbs signs repeal of 1864 abortion ban \u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 9443\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m 6\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m \u001b[0m\u001b[0m\u001b[0m -9.841645168493953\u001b[0m \u001b[0m\u001b[0m│\u001b[0m\u001b[0m\n",
"\u001b[0m\u001b[0m\u001b[0m└\u001b[0m\u001b[0m──────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m─────────────────────────────────────────────────────────────────────────────────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m───────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m─────────────────\u001b[0m\u001b[0m┴\u001b[0m\u001b[0m─────────────────────\u001b[0m\u001b[0m┘\n",
"\u001b[0m\u001b[0m"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
".param set query abortion ban\n",
".param set k 10\n",
"\n",
"\n",
"with fts_matches as (\n",
" select\n",
" rowid,\n",
" row_number() over (order by rank) as fts_rank_number,\n",
" rank as score\n",
" from fts_articles\n",
" where headline match :query\n",
" limit :k\n",
"),\n",
"final as (\n",
" select\n",
" articles.id,\n",
" articles.headline,\n",
" fts_matches.*\n",
" from fts_matches\n",
" left join articles on articles.rowid = fts_matches.rowid\n",
" order by vec_distance_cosine(lembed(:query), lembed(articles.headline))\n",
")\n",
"select * from final;\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Solite",
"language": "sql",
"name": "solite"
},
"language_info": {
"file_extension": ".sql",
"mimetype": "text/x.sqlite",
"name": "sql",
"nb_converter": "script",
"pygments_lexer": "sql",
"version": "TODO"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
================================================
FILE: examples/nbc-headlines/Makefile
================================================
# Download the all-MiniLM-L6-v2 GGUF model file.
# --fail makes curl exit non-zero on an HTTP error instead of writing the
# server's error page to $@, which make would otherwise treat as an
# up-to-date (but corrupt) model file on later runs.
all-MiniLM-L6-v2.e4ce9877.q8_0.gguf:
	curl -L --fail -o $@ https://huggingface.co/asg017/sqlite-lembed-model-examples/resolve/main/all-MiniLM-L6-v2/all-MiniLM-L6-v2.e4ce9877.q8_0.gguf
================================================
FILE: examples/nbc-headlines/README.md
================================================
- `headlines-2024.db`
- 14.5k rows
- 4.4MB
================================================
FILE: examples/python-recipes/openai-sample.py
================================================
# pip install openai sqlite-vec
from openai import OpenAI
import sqlite3
import sqlite_vec
import struct
from typing import List
def serialize(vector: List[float]) -> bytes:
    """Pack a list of floats into a compact "raw bytes" blob.

    Every element is encoded with the ``struct`` format code ``f``, so the
    result is ``len(vector)`` consecutive packed floats; an empty list
    yields ``b""``.
    """
    layout = f"{len(vector)}f"
    return struct.pack(layout, *vector)
# Sample documents to index; each one is stored as text and as an embedding.
sentences = [
    "Capri-Sun is a brand of juice concentrate–based drinks manufactured by the German company Wild and regional licensees.",
    "George V was King of the United Kingdom and the British Dominions, and Emperor of India, from 6 May 1910 until his death in 1936.",
    "Alaqua Cox is a Native American (Menominee) actress.",
    "Shohei Ohtani is a Japanese professional baseball pitcher and designated hitter for the Los Angeles Dodgers of Major League Baseball.",
    "Tamarindo, also commonly known as agua de tamarindo, is a non-alcoholic beverage made of tamarind, sugar, and water.",
]

# Single source of truth for the embedding model: the stored vectors and the
# query vector must come from the same model to be comparable.
EMBEDDING_MODEL = "text-embedding-3-small"

client = OpenAI()

# change ':memory:' to a filepath to persist data
db = sqlite3.connect(":memory:")
# Extension loading is switched on only for as long as it takes to load
# sqlite-vec, then switched back off.
db.enable_load_extension(True)
sqlite_vec.load(db)
db.enable_load_extension(False)

# Plain table holding the source text, keyed by the sentence's list index.
db.execute(
    """
    CREATE TABLE sentences(
      id INTEGER PRIMARY KEY,
      sentence TEXT
    );
    """
)

with db:
    # enumerate() yields (index, sentence) pairs, matching the two placeholders.
    db.executemany(
        "INSERT INTO sentences(id, sentence) VALUES(?, ?)",
        enumerate(sentences),
    )

# Vector table; the declared width (1536) has to match the length of the
# embeddings returned for EMBEDDING_MODEL.
db.execute(
    """
    CREATE VIRTUAL TABLE vec_sentences USING vec0(
      id INTEGER PRIMARY KEY,
      sentence_embedding FLOAT[1536]
    );
    """
)

with db:
    sentence_rows = db.execute("SELECT id, sentence FROM sentences").fetchall()
    # One batched API call for all sentences; results are paired with their
    # source rows positionally below.
    response = client.embeddings.create(
        input=[row[1] for row in sentence_rows], model=EMBEDDING_MODEL
    )
    db.executemany(
        "INSERT INTO vec_sentences(id, sentence_embedding) VALUES(?, ?)",
        (
            (sentence_id, serialize(item.embedding))
            for (sentence_id, _), item in zip(sentence_rows, response.data)
        ),
    )

# Embed the search text with the same model, then fetch the 3 nearest sentences.
query = "fruity liquids"
query_embedding = (
    client.embeddings.create(input=query, model=EMBEDDING_MODEL)
    .data[0]
    .embedding
)

results = db.execute(
    """
    SELECT
      vec_sentences.id,
      distance,
      sentence
    FROM vec_sentences
    LEFT JOIN sentences ON sentences.id = vec_sentences.id
    WHERE sentence_embedding MATCH ?
      AND k = 3
    ORDER BY distance
    """,
    [serialize(query_embedding)],
).fetchall()

for row in results:
    print(row)
================================================
FILE: examples/simple-bun/.gitignore
================================================
node_modules/
bun.lockb
================================================
FILE: examples/simple-bun/demo.ts
================================================
import { Database } from "bun:sqlite";
// Use a custom SQLite library instead of the one bundled with Bun.
// NOTE(review): this path is machine-specific — adjust for your install.
Database.setCustomSQLite("/usr/local/opt/sqlite3/lib/libsqlite3.dylib");

const db = new Database(":memory:");
//sqliteVec.load(db);
db.loadExtension("../../dist/vec0");

// Report the SQLite and sqlite-vec versions in use.
const versionRow = db
  .prepare(
    "select sqlite_version() as sqlite_version, vec_version() as vec_version;",
  )
  .get();
const { sqlite_version, vec_version } = versionRow;
console.log(`sqlite_version=${sqlite_version}, vec_version=${vec_version}`);

// [rowid, embedding] pairs to store.
const items = [
  [1, [0.1, 0.1, 0.1, 0.1]],
  [2, [0.2, 0.2, 0.2, 0.2]],
  [3, [0.3, 0.3, 0.3, 0.3]],
  [4, [0.4, 0.4, 0.4, 0.4]],
  [5, [0.5, 0.5, 0.5, 0.5]],
];
const query = [0.3, 0.3, 0.3, 0.3];

db.exec("CREATE VIRTUAL TABLE vec_items USING vec0(embedding float[4])");

const insertStmt = db.prepare(
  "INSERT INTO vec_items(rowid, embedding) VALUES (?, vec_f32(?))",
);

// Insert every pair inside a single transaction.
const insertVectors = db.transaction((batch) => {
  batch.forEach(([rowid, embedding]) => {
    insertStmt.run(BigInt(rowid), new Float32Array(embedding));
  });
});
insertVectors(items);

// KNN query: the 3 rows whose embedding is closest to `query`.
const knnStmt = db.prepare(
  `
  SELECT
    rowid,
    distance
  FROM vec_items
  WHERE embedding MATCH ?
  ORDER BY distance
  LIMIT 3
`,
);
const rows = knnStmt.all(new Float32Array(query));
console.log(rows);
================================================
FILE: examples/simple-bun/package.json
================================================
{
"name": "simple-bun",
"module": "index.ts",
"type": "module",
"dependencies": {
"sqlite-vec": "latest"
}
}
================================================
FILE: examples/simple-c/.gitignore
================================================
demo
================================================
FILE: examples/simple-c/Makefile
================================================
# Build the `demo` binary from demo.c, compiling the sqlite-vec source and the
# vendored SQLite amalgamation directly into it (-DSQLITE_CORE is defined for
# the build). Include paths point at the repository root and its vendor/
# directory.
demo: demo.c
gcc \
-O3 -DSQLITE_CORE \
-I../../ -I../../vendor \
demo.c ../../sqlite-vec.c ../../vendor/sqlite3.c \
-o $@
================================================
FILE: examples/simple-c/demo.c
================================================
#include "sqlite3.h"
#include "sqlite-vec.h"
#include
#include
#include
/*
 * Minimal sqlite-vec walkthrough: register the extension, create a vec0
 * virtual table, insert five 4-dimensional float vectors and print the three
 * rows closest to a query vector.
 *
 * Every SQLite return code is checked with assert(), so the demo aborts at the
 * first failing call. Returns 0 on success.
 */
int main(int argc, char *argv[]) {
  (void)argc;
  (void)argv;

  int rc = SQLITE_OK;
  sqlite3 *db;
  sqlite3_stmt *stmt;

  /* Register sqlite-vec so every connection opened afterwards has it loaded.
   * sqlite3_auto_extension() takes a void(*)(void) entry point. */
  rc = sqlite3_auto_extension((void (*)(void))sqlite3_vec_init);
  assert(rc == SQLITE_OK);

  rc = sqlite3_open(":memory:", &db);
  assert(rc == SQLITE_OK);

  rc = sqlite3_prepare_v2(db, "SELECT sqlite_version(), vec_version()", -1, &stmt, NULL);
  assert(rc == SQLITE_OK);

  /* The column accessors below are only meaningful once a row is available. */
  rc = sqlite3_step(stmt);
  assert(rc == SQLITE_ROW);
  printf("sqlite_version=%s, vec_version=%s\n", sqlite3_column_text(stmt, 0), sqlite3_column_text(stmt, 1));
  sqlite3_finalize(stmt);

  static const struct {
    sqlite3_int64 id;
    float vector[4];
  } items[] = {
    {1, {0.1, 0.1, 0.1, 0.1}},
    {2, {0.2, 0.2, 0.2, 0.2}},
    {3, {0.3, 0.3, 0.3, 0.3}},
    {4, {0.4, 0.4, 0.4, 0.4}},
    {5, {0.5, 0.5, 0.5, 0.5}},
  };
  float query[4] = {0.3, 0.3, 0.3, 0.3};

  rc = sqlite3_prepare_v2(db, "CREATE VIRTUAL TABLE vec_items USING vec0(embedding float[4])", -1, &stmt, NULL);
  assert(rc == SQLITE_OK);
  rc = sqlite3_step(stmt);
  assert(rc == SQLITE_DONE);
  sqlite3_finalize(stmt);

  /* Insert all vectors inside one explicit transaction. */
  rc = sqlite3_exec(db, "BEGIN", NULL, NULL, NULL);
  assert(rc == SQLITE_OK);
  rc = sqlite3_prepare_v2(db, "INSERT INTO vec_items(rowid, embedding) VALUES (?, ?)", -1, &stmt, NULL);
  assert(rc == SQLITE_OK);
  for (unsigned long i = 0; i < sizeof(items) / sizeof(items[0]); i++) {
    rc = sqlite3_bind_int64(stmt, 1, items[i].id);
    assert(rc == SQLITE_OK);
    /* The raw float array is bound as a BLOB; SQLITE_STATIC is safe because
     * `items` has static storage duration. */
    rc = sqlite3_bind_blob(stmt, 2, items[i].vector, sizeof(items[i].vector), SQLITE_STATIC);
    assert(rc == SQLITE_OK);
    rc = sqlite3_step(stmt);
    assert(rc == SQLITE_DONE);
    sqlite3_reset(stmt);
  }
  sqlite3_finalize(stmt);
  rc = sqlite3_exec(db, "COMMIT", NULL, NULL, NULL);
  assert(rc == SQLITE_OK);

  /* KNN query: the 3 rows closest to `query`. */
  rc = sqlite3_prepare_v2(db,
    "SELECT "
    "  rowid, "
    "  distance "
    "FROM vec_items "
    "WHERE embedding MATCH ?1 "
    "ORDER BY distance "
    "LIMIT 3 "
  , -1, &stmt, NULL);
  assert(rc == SQLITE_OK);

  /* `query` outlives the statement, so SQLITE_STATIC is safe here too. */
  rc = sqlite3_bind_blob(stmt, 1, query, sizeof(query), SQLITE_STATIC);
  assert(rc == SQLITE_OK);

  while (1) {
    rc = sqlite3_step(stmt);
    if (rc == SQLITE_DONE) break;
    assert(rc == SQLITE_ROW);
    sqlite3_int64 rowid = sqlite3_column_int64(stmt, 0);
    double distance = sqlite3_column_double(stmt, 1);
    printf("rowid=%lld distance=%f\n", rowid, distance);
  }
  sqlite3_finalize(stmt);
  sqlite3_close(db);
  return 0;
}
================================================
FILE: examples/simple-deno/demo.ts
================================================
import { Database } from "jsr:@db/sqlite@0.11";
import * as sqliteVec from "npm:sqlite-vec@0.0.1-alpha.9";
const db = new Database(":memory:");

// Extension loading is enabled only while sqlite-vec is being loaded.
db.enableLoadExtension = true;
sqliteVec.load(db);
db.enableLoadExtension = false;

// Report the SQLite and sqlite-vec versions in use.
const versionStmt = db.prepare("select sqlite_version(), vec_version()");
const [sqlite_version, vec_version] = versionStmt.value<[string, string]>()!;
console.log(`sqlite_version=${sqlite_version}, vec_version=${vec_version}`);

/** Returns the raw bytes of `values` stored as 32-bit floats. */
const toBlob = (values: number[]) =>
  new Uint8Array(new Float32Array(values).buffer);

// [rowid, embedding] pairs to store.
const items: [number, number[]][] = [
  [1, [0.1, 0.1, 0.1, 0.1]],
  [2, [0.2, 0.2, 0.2, 0.2]],
  [3, [0.3, 0.3, 0.3, 0.3]],
  [4, [0.4, 0.4, 0.4, 0.4]],
  [5, [0.5, 0.5, 0.5, 0.5]],
];
const query = [0.3, 0.3, 0.3, 0.3];

db.exec("CREATE VIRTUAL TABLE vec_items USING vec0(embedding float[4])");

const insertStmt = db.prepare(
  "INSERT INTO vec_items(rowid, embedding) VALUES (?, ?)"
);

// Insert every pair inside a single transaction.
const insertVectors = db.transaction((batch: [number, number[]][]) => {
  batch.forEach(([rowid, embedding]) => {
    insertStmt.run(BigInt(rowid), toBlob(embedding));
  });
});
insertVectors(items);

// KNN query: up to 5 rows ordered by distance to `query`.
const knnStmt = db.prepare(
  `
  SELECT
    rowid,
    distance
  FROM vec_items
  WHERE embedding MATCH ?
  ORDER BY distance
  LIMIT 5
`
);
const rows = knnStmt.all([toBlob(query)]);
console.log(rows);

db.close();
================================================
FILE: examples/simple-go-cgo/.gitignore
================================================
demo
================================================
FILE: examples/simple-go-cgo/Makefile
================================================
# Build the `demo` binary; it is rebuilt whenever demo.go, go.mod or go.sum
# changes. `$@` expands to the target name (demo).
demo: demo.go go.mod go.sum
go build -o $@
================================================
FILE: examples/simple-go-cgo/demo.go
================================================
package main
import (
"database/sql"
"fmt"
"log"
sqlite_vec "github.com/asg017/sqlite-vec-go-bindings/cgo"
_ "github.com/mattn/go-sqlite3"
)
// main demonstrates sqlite-vec through database/sql with the mattn/go-sqlite3
// driver: it registers the extension, creates a vec0 virtual table, inserts
// five 4-dimensional vectors and prints the three rows nearest to a query
// vector. Any error terminates the program via log.Fatal.
func main() {
	// Register sqlite-vec before any connection is opened.
	sqlite_vec.Auto()

	db, err := sql.Open("sqlite3", ":memory:")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	var sqliteVersion string
	var vecVersion string
	err = db.QueryRow("select sqlite_version(), vec_version()").Scan(&sqliteVersion, &vecVersion)
	if err != nil {
		log.Fatal(err)
	}
	fmt.Printf("sqlite_version=%s, vec_version=%s\n", sqliteVersion, vecVersion)

	_, err = db.Exec("CREATE VIRTUAL TABLE vec_items USING vec0(embedding float[4])")
	if err != nil {
		log.Fatal(err)
	}

	// rowid -> embedding.
	items := map[int][]float32{
		1: {0.1, 0.1, 0.1, 0.1},
		2: {0.2, 0.2, 0.2, 0.2},
		3: {0.3, 0.3, 0.3, 0.3},
		4: {0.4, 0.4, 0.4, 0.4},
		5: {0.5, 0.5, 0.5, 0.5},
	}
	q := []float32{0.3, 0.3, 0.3, 0.3}

	for id, values := range items {
		// Vectors are serialized before being bound as parameters.
		v, err := sqlite_vec.SerializeFloat32(values)
		if err != nil {
			log.Fatal(err)
		}
		_, err = db.Exec("INSERT INTO vec_items(rowid, embedding) VALUES (?, ?)", id, v)
		if err != nil {
			log.Fatal(err)
		}
	}

	query, err := sqlite_vec.SerializeFloat32(q)
	if err != nil {
		log.Fatal(err)
	}

	// KNN query: the 3 rows closest to the query vector.
	rows, err := db.Query(`
		SELECT
			rowid,
			distance
		FROM vec_items
		WHERE embedding MATCH ?
		ORDER BY distance
		LIMIT 3
	`, query)
	if err != nil {
		log.Fatal(err)
	}
	// Release the result set even if iteration stops early.
	defer rows.Close()

	for rows.Next() {
		var rowid int64
		var distance float64
		err = rows.Scan(&rowid, &distance)
		if err != nil {
			log.Fatal(err)
		}
		fmt.Printf("rowid=%d, distance=%f\n", rowid, distance)
	}
	// rows.Next() returns false on both exhaustion and error; tell them apart.
	if err = rows.Err(); err != nil {
		log.Fatal(err)
	}
}
================================================
FILE: examples/simple-go-cgo/go.mod
================================================
module github.com/asg017/sqlite-vec/examples/go
go 1.22.5
require github.com/mattn/go-sqlite3 v1.14.22
require github.com/asg017/sqlite-vec-go-bindings v0.0.1-alpha.36 // indirect
================================================
FILE: examples/simple-go-cgo/go.sum
================================================
github.com/asg017/sqlite-vec-go-bindings v0.0.1-alpha.36 h1:FMGkKAA7nZL8gr/dvIx1uc54J3v2gbLVa+mLqZDCvjk=
github.com/asg017/sqlite-vec-go-bindings v0.0.1-alpha.36/go.mod h1:A8+cTt/nKFsYCQF6OgzSNpKZrzNo5gQsXBTfsXHXY0Q=
github.com/asg017/sqlite-vec/bindings/go/cgo v0.0.0-20240511043328-3d763f499859 h1:6jeFy/tSnyNJUrTHoIaFTYkjrHtwVAojvCGkr9G8d4o=
github.com/asg017/sqlite-vec/bindings/go/cgo v0.0.0-20240511043328-3d763f499859/go.mod h1:Go89G54PaautWRwxvAa1fmKeYoSuUyIvSYpvlfXQaNU=
github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU=
github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y=
================================================
FILE: examples/simple-go-ncruces/.gitignore
================================================
demo
*.wasm
================================================
FILE: examples/simple-go-ncruces/Makefile
================================================
# Build the demo binary from demo.go ($< is the first prerequisite).
demo: demo.go
	go build -o $@ $<

# Remove the built binary. -f keeps `make clean` from failing when the binary
# has not been built yet.
clean:
	rm -f demo

.PHONY: clean
================================================
FILE: examples/simple-go-ncruces/demo.go
================================================
package main
import (
_ "embed"
"log"
sqlite_vec "github.com/asg017/sqlite-vec-go-bindings/ncruces"
"github.com/ncruces/go-sqlite3"
)
// main demonstrates sqlite-vec with the ncruces/go-sqlite3 driver: it creates
// a vec0 virtual table, inserts five 4-dimensional vectors and logs the three
// rows nearest to a query vector. Any error terminates the program via
// log.Fatal.
func main() {
	db, err := sqlite3.Open(":memory:")
	if err != nil {
		log.Fatal(err)
	}

	stmt, _, err := db.Prepare(`SELECT sqlite_version(), vec_version()`)
	if err != nil {
		log.Fatal(err)
	}
	// Step reports false when no row is available, so the columns must not be
	// read unless it returned true.
	if !stmt.Step() {
		log.Fatalf("version query returned no row: %v", stmt.Err())
	}
	log.Printf("sqlite_version=%s, vec_version=%s\n", stmt.ColumnText(0), stmt.ColumnText(1))
	if err := stmt.Close(); err != nil {
		log.Fatal(err)
	}

	err = db.Exec("CREATE VIRTUAL TABLE vec_items USING vec0(embedding float[4])")
	if err != nil {
		log.Fatal(err)
	}

	// rowid -> embedding.
	items := map[int][]float32{
		1: {0.1, 0.1, 0.1, 0.1},
		2: {0.2, 0.2, 0.2, 0.2},
		3: {0.3, 0.3, 0.3, 0.3},
		4: {0.4, 0.4, 0.4, 0.4},
		5: {0.5, 0.5, 0.5, 0.5},
	}
	q := []float32{0.3, 0.3, 0.3, 0.3}

	stmt, _, err = db.Prepare("INSERT INTO vec_items(rowid, embedding) VALUES (?, ?)")
	if err != nil {
		log.Fatal(err)
	}
	for id, values := range items {
		// Vectors are serialized before being bound as BLOB parameters.
		v, err := sqlite_vec.SerializeFloat32(values)
		if err != nil {
			log.Fatal(err)
		}
		if err := stmt.BindInt(1, id); err != nil {
			log.Fatal(err)
		}
		if err := stmt.BindBlob(2, v); err != nil {
			log.Fatal(err)
		}
		if err := stmt.Exec(); err != nil {
			log.Fatal(err)
		}
		// Reset so the prepared statement can be reused for the next row.
		if err := stmt.Reset(); err != nil {
			log.Fatal(err)
		}
	}
	if err := stmt.Close(); err != nil {
		log.Fatal(err)
	}

	// KNN query: the 3 rows closest to the query vector.
	stmt, _, err = db.Prepare(`
		SELECT
			rowid,
			distance
		FROM vec_items
		WHERE embedding MATCH ?
		ORDER BY distance
		LIMIT 3
	`)
	if err != nil {
		log.Fatal(err)
	}

	query, err := sqlite_vec.SerializeFloat32(q)
	if err != nil {
		log.Fatal(err)
	}
	if err := stmt.BindBlob(1, query); err != nil {
		log.Fatal(err)
	}

	for stmt.Step() {
		rowid := stmt.ColumnInt64(0)
		distance := stmt.ColumnFloat(1)
		log.Printf("rowid=%d, distance=%f\n", rowid, distance)
	}
	// Step returns false on both exhaustion and error; tell them apart.
	if err := stmt.Err(); err != nil {
		log.Fatal(err)
	}

	err = stmt.Close()
	if err != nil {
		log.Fatal(err)
	}

	err = db.Close()
	if err != nil {
		log.Fatal(err)
	}
}
================================================
FILE: examples/simple-go-ncruces/go.mod
================================================
module asg017.com/ex1
go 1.22.5
require (
github.com/asg017/sqlite-vec-go-bindings v0.0.1-alpha.37
github.com/ncruces/go-sqlite3 v0.17.2-0.20240711235451-21de85e849b7
)
require (
github.com/ncruces/julianday v1.0.0 // indirect
github.com/tetratelabs/wazero v1.7.3 // indirect
golang.org/x/sys v0.22.0 // indirect
)
================================================
FILE: examples/simple-go-ncruces/go.sum
================================================
github.com/asg017/sqlite-vec-go-bindings v0.0.1-alpha.37 h1:Gz6YkDCs60k5VwbBPKDfAPPeIBcuaN3qriAozAaIIZI=
github.com/asg017/sqlite-vec-go-bindings v0.0.1-alpha.37/go.mod h1:A8+cTt/nKFsYCQF6OgzSNpKZrzNo5gQsXBTfsXHXY0Q=
github.com/ncruces/go-sqlite3 v0.17.2-0.20240711235451-21de85e849b7 h1:ssM02uUFDfz0V2TMg2du2BjbW9cpOhFJK0kpDN+X768=
github.com/ncruces/go-sqlite3 v0.17.2-0.20240711235451-21de85e849b7/go.mod h1:FnCyui8SlDoL0mQZ5dTouNo7s7jXS0kJv9lBt1GlM9w=
github.com/ncruces/julianday v1.0.0 h1:fH0OKwa7NWvniGQtxdJRxAgkBMolni2BjDHaWTxqt7M=
github.com/ncruces/julianday v1.0.0/go.mod h1:Dusn2KvZrrovOMJuOt0TNXL6tB7U2E8kvza5fFc9G7g=
github.com/tetratelabs/wazero v1.7.3 h1:PBH5KVahrt3S2AHgEjKu4u+LlDbbk+nsGE3KLucy6Rw=
github.com/tetratelabs/wazero v1.7.3/go.mod h1:ytl6Zuh20R/eROuyDaGPkp82O9C/DJfXAwJfQ3X6/7Y=
golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI=
golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4=
golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI=
================================================
FILE: examples/simple-node/.gitignore
================================================
node_modules/
package-lock.json
================================================
FILE: examples/simple-node/demo.mjs
================================================
import * as sqliteVec from "sqlite-vec";
import Database from "better-sqlite3";
const db = new Database(":memory:");
sqliteVec.load(db);

// Report the SQLite and sqlite-vec versions in use.
const versionRow = db
  .prepare(
    "select sqlite_version() as sqlite_version, vec_version() as vec_version;",
  )
  .get();
const { sqlite_version, vec_version } = versionRow;
console.log(`sqlite_version=${sqlite_version}, vec_version=${vec_version}`);

// [rowid, embedding] pairs to store.
const items = [
  [1, [0.1, 0.1, 0.1, 0.1]],
  [2, [0.2, 0.2, 0.2, 0.2]],
  [3, [0.3, 0.3, 0.3, 0.3]],
  [4, [0.4, 0.4, 0.4, 0.4]],
  [5, [0.5, 0.5, 0.5, 0.5]],
];
const query = [0.3, 0.3, 0.3, 0.3];

db.exec("CREATE VIRTUAL TABLE vec_items USING vec0(embedding float[4])");

const insertStmt = db.prepare(
  "INSERT INTO vec_items(rowid, embedding) VALUES (?, ?)",
);

// Insert every pair inside a single transaction.
const insertVectors = db.transaction((batch) => {
  batch.forEach(([rowid, embedding]) => {
    insertStmt.run(BigInt(rowid), new Float32Array(embedding));
  });
});
insertVectors(items);

// KNN query: the 3 rows whose embedding is closest to `query`.
const knnStmt = db.prepare(
  `
  SELECT
    rowid,
    distance
  FROM vec_items
  WHERE embedding MATCH ?
  ORDER BY distance
  LIMIT 3
`,
);
const rows = knnStmt.all(new Float32Array(query));
console.log(rows);
================================================
FILE: examples/simple-node/package.json
================================================
{
"name": "node",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"better-sqlite3": "^9.6.0",
"sqlite-vec": "latest"
}
}
================================================
FILE: examples/simple-node2/.gitignore
================================================
node_modules/
package-lock.json
================================================
FILE: examples/simple-node2/demo.mjs
================================================
/**
* This demo Node.js script shows how you can use sqlite-vec with
* the new builtin node:sqlite module.
* Note that this requires Node v23.5.0 or above.
*/
import { DatabaseSync } from "node:sqlite";
import * as sqliteVec from "sqlite-vec";
// The `allowExtension` option is required to enable extension support
const db = new DatabaseSync(":memory:", { allowExtension: true });
sqliteVec.load(db);

// Report the SQLite and sqlite-vec versions in use.
const versionRow = db
  .prepare(
    "select sqlite_version() as sqlite_version, vec_version() as vec_version;",
  )
  .get();
const { sqlite_version, vec_version } = versionRow;
console.log(`sqlite_version=${sqlite_version}, vec_version=${vec_version}`);

// node:sqlite requires Uint8Array for BLOB values, so float vectors are
// passed as a byte view over a Float32Array.
const toBlob = (values) => new Uint8Array(new Float32Array(values).buffer);

// [rowid, embedding] pairs to store.
const items = [
  [1, [0.1, 0.1, 0.1, 0.1]],
  [2, [0.2, 0.2, 0.2, 0.2]],
  [3, [0.3, 0.3, 0.3, 0.3]],
  [4, [0.4, 0.4, 0.4, 0.4]],
  [5, [0.5, 0.5, 0.5, 0.5]],
];
const query = [0.3, 0.3, 0.3, 0.3];

db.exec("CREATE VIRTUAL TABLE vec_items USING vec0(embedding float[4])");

const insertStmt = db.prepare(
  "INSERT INTO vec_items(rowid, embedding) VALUES (?, ?)",
);

// TODO node:sqlite doesn't have `.transaction()` support yet
items.forEach(([rowid, embedding]) => {
  insertStmt.run(BigInt(rowid), toBlob(embedding));
});

// KNN query: the 3 rows whose embedding is closest to `query`.
const knnStmt = db.prepare(
  `
  SELECT
    rowid,
    distance
  FROM vec_items
  WHERE embedding MATCH ?
  ORDER BY distance
  LIMIT 3
`,
);
const rows = knnStmt.all(toBlob(query));
console.log(rows);
================================================
FILE: examples/simple-node2/package.json
================================================
{
"name": "simple-node2",
"version": "1.0.0",
"main": "demo.mjs",
"engines": {
"node": ">=23.5.0"
},
"dependencies": {
"sqlite-vec": "latest"
}
}
================================================
FILE: examples/simple-node2/tmp.mjs
================================================
import { DatabaseSync } from "node:sqlite";
import * as sqliteVec from "sqlite-vec";
// Open an in-memory database with extension loading allowed, then load
// sqlite-vec into it.
const db = new DatabaseSync(":memory:", { allowExtension: true });
sqliteVec.load(db);

const embedding = new Float32Array([0.1, 0.2, 0.3, 0.4]);

// Bind the vector as a byte view over the Float32Array's buffer and ask
// sqlite-vec for its element count.
const lengthStmt = db.prepare("select vec_length(?) as result");
const embeddingBytes = new Uint8Array(embedding.buffer);
const { result } = lengthStmt.get(embeddingBytes);

console.log(result); // 4
================================================
FILE: examples/simple-python/.gitignore
================================================
.venv
================================================
FILE: examples/simple-python/demo.py
================================================
import sqlite3
import sqlite_vec
from typing import List
import struct
def serialize_f32(vector: List[float]) -> bytes:
    """Pack ``vector`` into raw bytes, four bytes (one 32-bit float) per element.

    The values are packed with ``struct`` using its default (native) byte
    order; an empty list yields ``b""``.
    """
    fmt = f"{len(vector)}f"
    return struct.pack(fmt, *vector)
db = sqlite3.connect(":memory:")
# Extension loading is switched on only while sqlite-vec is being loaded.
db.enable_load_extension(True)
sqlite_vec.load(db)
db.enable_load_extension(False)

# Report the SQLite and sqlite-vec versions in use.
version_row = db.execute("select sqlite_version(), vec_version()").fetchone()
sqlite_version, vec_version = version_row
print(f"sqlite_version={sqlite_version}, vec_version={vec_version}")

# (rowid, embedding) pairs to store.
items = [
    (1, [0.1, 0.1, 0.1, 0.1]),
    (2, [0.2, 0.2, 0.2, 0.2]),
    (3, [0.3, 0.3, 0.3, 0.3]),
    (4, [0.4, 0.4, 0.4, 0.4]),
    (5, [0.5, 0.5, 0.5, 0.5]),
]
query = [0.3, 0.3, 0.3, 0.3]

db.execute("CREATE VIRTUAL TABLE vec_items USING vec0(embedding float[4])")

# Insert every vector inside one transaction, serialized to raw float bytes.
with db:
    for rowid, embedding in items:
        db.execute(
            "INSERT INTO vec_items(rowid, embedding) VALUES (?, ?)",
            [rowid, serialize_f32(embedding)],
        )

# KNN query: the 3 rows whose embedding is closest to `query`.
knn_cursor = db.execute(
    """
      SELECT
        rowid,
        distance
      FROM vec_items
      WHERE embedding MATCH ?
      ORDER BY distance
      LIMIT 3
    """,
    [serialize_f32(query)],
)
rows = knn_cursor.fetchall()
print(rows)
================================================
FILE: examples/simple-python/requirements.txt
================================================
sqlite-vec
================================================
FILE: examples/simple-ruby/.gitignore
================================================
Gemfile.lock
================================================
FILE: examples/simple-ruby/Gemfile
================================================
source 'https://rubygems.org'
ruby '>= 3.0'
gem 'sqlite3', '~> 2.0', '>= 2.0.1'
gem 'sqlite-vec'
================================================
FILE: examples/simple-ruby/demo.rb
================================================
require 'sqlite3'
require 'sqlite_vec'
db = SQLite3::Database.new(':memory:')

# Extension loading is switched on only while sqlite-vec is being loaded.
db.enable_load_extension(true)
SqliteVec.load(db)
db.enable_load_extension(false)

# Report the SQLite and sqlite-vec versions in use.
version_row = db.execute("select sqlite_version(), vec_version()").first
sqlite_version, vec_version = version_row
puts "sqlite_version=#{sqlite_version}, vec_version=#{vec_version}"

db.execute("CREATE VIRTUAL TABLE vec_items USING vec0(embedding float[4])")

# [rowid, embedding] pairs to store.
items = [
  [1, [0.1, 0.1, 0.1, 0.1]],
  [2, [0.2, 0.2, 0.2, 0.2]],
  [3, [0.3, 0.3, 0.3, 0.3]],
  [4, [0.4, 0.4, 0.4, 0.4]],
  [5, [0.5, 0.5, 0.5, 0.5]],
]

# Insert every vector inside one transaction; pack("f*") turns the array into
# a string of single-precision floats.
db.transaction do
  items.each do |rowid, embedding|
    db.execute("INSERT INTO vec_items(rowid, embedding) VALUES (?, ?)", [rowid, embedding.pack("f*")])
  end
end

query = [0.3, 0.3, 0.3, 0.3]

# KNN query: the 3 rows whose embedding is closest to `query`.
knn_sql = <<-SQL
  SELECT
    rowid,
    distance
  FROM vec_items
  WHERE embedding MATCH ?
  ORDER BY distance
  LIMIT 3
SQL
rows = db.execute(knn_sql, [query.pack("f*")])
puts rows
================================================
FILE: examples/simple-rust/.gitignore
================================================
target/
Cargo.lock
================================================
FILE: examples/simple-rust/Cargo.toml
================================================
[package]
name = "sqlite-vec-demo"
edition = "2021"
[dependencies]
sqlite-vec={version="0.0.1-alpha.7"}
rusqlite = {version="0.31.0", features=["bundled"]}
zerocopy = "0.7.33"
[[bin]]
name="demo"
path="demo.rs"
================================================
FILE: examples/simple-rust/demo.rs
================================================
use rusqlite::{ffi::sqlite3_auto_extension, Connection, Result};
use sqlite_vec::sqlite3_vec_init;
use zerocopy::AsBytes;
fn main() -> Result<()> {
unsafe {
sqlite3_auto_extension(Some(std::mem::transmute(sqlite3_vec_init as *const ())));
}
let db = Connection::open_in_memory()?;
let v: Vec = vec![0.1, 0.2, 0.3];
let (sqlite_version, vec_version, x): (String, String, String) = db.query_row(
"select sqlite_version(), vec_version(), vec_to_json(?)",
&[v.as_bytes()],
|x| Ok((x.get(0)?, x.get(1)?, x.get(2)?)),
)?;
println!("sqlite_version={sqlite_version}, vec_version={vec_version}");
let items: Vec<(usize, Vec)> = vec![
(1, vec![0.1, 0.1, 0.1, 0.1]),
(2, vec![0.2, 0.2, 0.2, 0.2]),
(3, vec![0.3, 0.3, 0.3, 0.3]),
(4, vec![0.4, 0.4, 0.4, 0.4]),
(5, vec![0.5, 0.5, 0.5, 0.5]),
];
println!("{x}");
db.execute(
"CREATE VIRTUAL TABLE vec_items USING vec0(embedding float[4])",
[],
)?;
let mut stmt = db.prepare("INSERT INTO vec_items(rowid, embedding) VALUES (?, ?)")?;
for item in items {
stmt.execute(rusqlite::params![item.0, item.1.as_bytes()])?;
}
let query: Vec = vec![0.3, 0.3, 0.3, 0.3];
let result: Vec<(i64, f64)> = db
.prepare(
r"
SELECT
rowid,
distance
FROM vec_items
WHERE embedding MATCH ?1
ORDER BY distance
LIMIT 3
",
)?
.query_map([query.as_bytes()], |r| Ok((r.get(0)?, r.get(1)?)))?
.collect::, _>>()?;
println!("{:?}", result);
Ok(())
}
================================================
FILE: examples/simple-sqlite/demo.sql
================================================
-- Load the compiled sqlite-vec extension and make the CLI print tables with
-- column headers.
.load ../../dist/vec0
.mode table
.header on

-- Report the SQLite and sqlite-vec versions in use.
SELECT sqlite_version(), vec_version();

CREATE VIRTUAL TABLE vec_items USING vec0(embedding float[4]);

-- Each JSON array element is a [rowid, vector] pair: element 0 becomes the
-- rowid and element 1 (the vector, as JSON text) becomes the embedding.
INSERT INTO vec_items(rowid, embedding)
  SELECT
    value ->> 0,
    value ->> 1
  FROM json_each('[
    [1, [0.1, 0.1, 0.1, 0.1]],
    [2, [0.2, 0.2, 0.2, 0.2]],
    [3, [0.3, 0.3, 0.3, 0.3]],
    [4, [0.4, 0.4, 0.4, 0.4]],
    [5, [0.5, 0.5, 0.5, 0.5]]
  ]');

-- KNN query: the 3 rows whose embedding is closest to the given vector.
SELECT
  rowid,
  distance
FROM vec_items
WHERE embedding MATCH '[0.3, 0.3, 0.3, 0.3]'
ORDER BY distance
LIMIT 3;
================================================
FILE: examples/simple-wasm/index.html
================================================
sqlite-vec demo/simple-wasm
`
================================================
FILE: examples/sqlite3-cli/README.md
================================================
# `sqlite-vec` statically compiled in the SQLite CLI
You can compile your own version of the `sqlite3` CLI with `sqlite-vec` builtin.
The process is not well documented, but the special `SQLITE_EXTRA_INIT` compile
option can be used to "inject" code at initialization time. See the `Makefile`
at the root of this project for some more info.
The `core_init.c` file here demonstrates auto-loading the `sqlite-vec`
entrypoints at startup.
================================================
FILE: examples/sqlite3-cli/core_init.c
================================================
#include "sqlite3.h"
#include "sqlite-vec.h"
#include
/*
 * Initialization hook that registers sqlite-vec as an auto-extension, so it is
 * loaded into every database connection opened afterwards.
 *
 * `dummy` is not used. Returns the result code of sqlite3_auto_extension().
 */
int core_init(const char *dummy) {
  (void)dummy;
  /* sqlite3_auto_extension() takes a void(*)(void) entry point; cast to that
   * function-pointer type rather than to the object pointer type `void *`. */
  return sqlite3_auto_extension((void (*)(void))sqlite3_vec_init);
}
================================================
FILE: examples/wasm/README.md
================================================
# `sqlite-vec` statically compiled into WASM builds
You can compile your own version of SQLite's WASM build with `sqlite-vec`
builtin. Dynamically loading SQLite extensions is not supported in the official
WASM build yet, but you can statically compile extensions in. It's not well
documented, but the `sqlite3_wasm_extra_init` option in the SQLite `ext/wasm`
Makefile allows you to inject your own code at initialization time. See the
`Makefile` at the root of the project for more info.
The `wasm.c` file here demonstrates auto-loading the `sqlite-vec` entrypoints at
startup.
================================================
FILE: examples/wasm/wasm.c
================================================
#include "sqlite3.h"
#include "sqlite-vec.h"
/*
 * Extra-init hook for the SQLite WASM build: registers sqlite-vec as an
 * auto-extension. The argument is ignored; the return value is the result
 * code of sqlite3_auto_extension().
 */
int sqlite3_wasm_extra_init(const char *unused) {
  void (*entry_point)(void) = (void (*)(void))sqlite3_vec_init;
  return sqlite3_auto_extension(entry_point);
}
================================================
FILE: reference.yaml
================================================
sections:
constructors:
title: Constructors
desc: |
SQL functions that "construct" vectors with different element types.
Currently, only `float32`, `int8`, and `bit` vectors are supported.
op:
title: Operations
desc: |
Different operations and utilities for working with vectors.
distance:
title: Distance functions
desc: Various algorithms to calculate distance between two vectors.
quantization:
title: Quantization
desc: Various techniques to "compress" a vector by reducing precision and accuracy.
numpy:
title: "NumPy Utilities"
desc: Functions to read data from or work with [NumPy arrays](https://numpy.org/doc/stable/reference/generated/numpy.array.html).
meta:
title: Meta
desc: Helper functions to debug `sqlite-vec` installations.
entrypoints:
title: Entrypoints
desc: All the named entrypoints that load in different `sqlite-vec` functions and options.
# vec0:
# title: "vec0 Virtual Table"
# desc: TODO
meta:
vec_version:
params: []
desc: Returns a version string of the current `sqlite-vec` installation.
example: select vec_version();
vec_debug:
params: []
desc: Returns debugging information of the current `sqlite-vec` installation.
example: select vec_debug();
constructors:
vec_f32:
params: [vector]
desc: |
Creates a float vector from a BLOB or JSON text. If a BLOB is provided,
the length must be divisible by 4, as a float takes up 4 bytes of space each.
The returned value is a BLOB with 4 bytes per element, with a special [subtype](https://www.sqlite.org/c3ref/result_subtype.html)
of `223`.
example:
- select vec_f32('[.1, .2, .3, 4]');
- select subtype(vec_f32('[.1, .2, .3, 4]'));
- select vec_f32(X'AABBCCDD');
- select vec_to_json(vec_f32(X'AABBCCDD'));
- select vec_f32(X'AA');
vec_int8:
params: [vector]
desc: |
Creates an 8-bit integer vector from a BLOB or JSON text. If a BLOB is provided,
each byte is interpreted as one 8-bit integer element.
If JSON text is provided, each element must be an integer between -128 and 127 inclusive.
The returned value is a BLOB with 1 byte per element, with a special [subtype](https://www.sqlite.org/c3ref/result_subtype.html)
of `225`.
example:
- select vec_int8('[1, 2, 3, 4]');
- select subtype(vec_int8('[1, 2, 3, 4]'));
- select vec_int8(X'AABBCCDD');
- select vec_to_json(vec_int8(X'AABBCCDD'));
- select vec_int8('[999]');
vec_bit:
params: [vector]
desc: |
Creates a binary vector from a BLOB.
The returned value is a BLOB with 1 byte per 8 elements, with a special [subtype](https://www.sqlite.org/c3ref/result_subtype.html)
of `224`.
example:
- select vec_bit(X'F0');
- select subtype(vec_bit(X'F0'));
- select vec_to_json(vec_bit(X'F0'));
op:
vec_length:
params: [vector]
desc: |
Returns the number of elements in the given vector.
The vector can be `JSON`, `BLOB`, or the result of a [constructor function](#constructors).
This function will return an error if `vector` is invalid.
example:
- select vec_length('[.1, .2]');
- select vec_length(X'AABBCCDD');
- select vec_length(vec_int8(X'AABBCCDD'));
- select vec_length(vec_bit(X'AABBCCDD'));
- select vec_length(X'CCDD');
vec_type:
params: [vector]
desc: |
Returns the name of the type of `vector` as text. One of `'float32'`, `'int8'`, or `'bit'`.
This function will return an error if `vector` is invalid.
example:
- select vec_type('[.1, .2]');
- select vec_type(X'AABBCCDD');
- select vec_type(vec_int8(X'AABBCCDD'));
- select vec_type(vec_bit(X'AABBCCDD'));
- select vec_type(X'CCDD');
vec_add:
params: [a, b]
desc: |
Adds every element in vector `a` with vector `b`, returning a new vector `c`. Both vectors
must be of the same type and same length. Only `float32` and `int8` vectors are supported.
An error is raised if either `a` or `b` are invalid, or if they are not the same type or same length.
See also [`vec_sub()`](#vec_sub).
example:
- |
select vec_add(
'[.1, .2, .3]',
'[.4, .5, .6]'
);
- |
select vec_to_json(
vec_add(
'[.1, .2, .3]',
'[.4, .5, .6]'
)
);
- |
select vec_to_json(
vec_add(
vec_int8('[1, 2, 3]'),
vec_int8('[4, 5, 6]')
)
);
- select vec_add('[.1]', vec_int8('[1]'));
- select vec_add(vec_bit(X'AA'), vec_bit(X'BB'));
vec_sub:
params: [a, b]
desc: |
Subtracts every element in vector `a` with vector `b`, returning a new vector `c`. Both vectors
must be of the same type and same length. Only `float32` and `int8` vectors are supported.
An error is raised if either `a` or `b` are invalid, or if they are not the same type or same length.
See also [`vec_add()`](#vec_add).
example:
- |
select vec_sub(
'[.1, .2, .3]',
'[.4, .5, .6]'
);
- |
select vec_to_json(
vec_sub(
'[.1, .2, .3]',
'[.4, .5, .6]'
)
);
- |
select vec_to_json(
vec_sub(
vec_int8('[1, 2, 3]'),
vec_int8('[4, 5, 6]')
)
);
- select vec_sub('[.1]', vec_int8('[1]'));
- select vec_sub(vec_bit(X'AA'), vec_bit(X'BB'));
vec_normalize:
params: [vector]
desc: |
Performs L2 normalization on the given vector. Only float32 vectors are currently supported.
Returns an error if the input is an invalid vector or not a float32 vector.
example:
- select vec_normalize('[2, 3, 1, -4]');
- |
select vec_to_json(
vec_normalize('[2, 3, 1, -4]')
);
- |
-- for matryoshka embeddings - slice then normalize
select vec_to_json(
vec_normalize(
vec_slice('[2, 3, 1, -4]', 0, 2)
)
);
vec_slice:
params: [vector, start, end]
desc: |
Extract a subset of `vector` from the `start` element (inclusive) to the `end` element (exclusive). TODO check
This is especially useful for [Matryoshka embeddings](#TODO), also known as "adaptive length" embeddings.
Use with [`vec_normalize()`](#vec_normalize) to get proper results.
Returns an error in the following conditions:
- If `vector` is not a valid vector
- If `start` is less than zero or greater than or equal to `end`
- If `end` is greater than the length of `vector`, or less than or equal to `start`.
- If `vector` is a bitvector, `start` and `end` must be divisible by 8.
example:
- select vec_slice('[1, 2,3, 4]', 0, 2);
- |
select vec_to_json(
vec_slice('[1, 2,3, 4]', 0, 2)
);
- |
select vec_to_json(
vec_slice('[1, 2,3, 4]', 2, 4)
);
- |
select vec_to_json(
vec_slice('[1, 2,3, 4]', -1, 4)
);
- |
select vec_to_json(
vec_slice('[1, 2,3, 4]', 0, 5)
);
- |
select vec_to_json(
vec_slice('[1, 2,3, 4]', 0, 0)
);
vec_to_json:
params: [vector]
desc: |
Represents a vector as JSON text. The input vector can be a vector BLOB or JSON text.
Returns an error if `vector` is an invalid vector, or when memory cannot be allocated.
example:
- select vec_to_json(X'AABBCCDD');
- select vec_to_json(vec_int8(X'AABBCCDD'));
- select vec_to_json(vec_bit(X'AABBCCDD'));
- select vec_to_json('[1,2,3,4]');
- select vec_to_json('invalid');
vec_each:
params: [vector]
desc: |
A table function to iterate through every element in a vector. One row is returned per element in a vector.
```sql
CREATE TABLE vec_each(
rowid int, -- The
vector HIDDEN -- input parameter: A well-formed vector value
)
```
Returns an error if `vector` is not a valid vector.
example:
- select rowid, value from vec_each('[1,2,3,4]');
- select rowid, value from vec_each(X'AABBCCDD00112233');
- select rowid, value from vec_each(vec_int8(X'AABBCCDD'));
- select rowid, value from vec_each(vec_bit(X'F0'));
distance:
vec_distance_L2:
params: [a, b]
desc: |
Calculates the L2 Euclidean distance between vectors `a` and `b`. Only valid for float32 or int8 vectors.
Returns an error under the following conditions:
- `a` or `b` are invalid vectors
- `a` or `b` do not share the same vector element types (ex float32 or int8)
- `a` or `b` are bit vectors. Use [`vec_distance_hamming()`](#vec_distance_hamming) for distance calculations between two bitvectors.
- `a` or `b` do not have the same length.
example:
- select vec_distance_L2('[1, 1]', '[2, 2]');
- select vec_distance_L2('[1, 1]', '[-2, -2]');
- select vec_distance_L2('[1.1, 2.2, 3.3]', '[4.4, 5.5, 6.6]');
- select vec_distance_L2(X'AABBCCDD', X'00112233');
- select vec_distance_L2('[1, 1]', vec_int8('[2, 2]'));
- select vec_distance_L2(vec_bit(X'AA'), vec_bit(X'BB'));
vec_distance_cosine:
params: [a, b]
desc: |
Calculates the cosine distance between vectors `a` and `b`. Only valid for float32 or int8 vectors.
Returns an error under the following conditions:
- `a` or `b` are invalid vectors
- `a` or `b` do not share the same vector element types (ex float32 or int8)
- `a` or `b` are bit vectors. Use [`vec_distance_hamming()`](#vec_distance_hamming) for distance calculations between two bitvectors.
- `a` or `b` do not have the same length.
example:
- select vec_distance_cosine('[1, 1]', '[2, 2]');
- select vec_distance_cosine('[1, 1]', '[-2, -2]');
- select vec_distance_cosine('[1.1, 2.2, 3.3]', '[4.4, 5.5, 6.6]');
- select vec_distance_cosine(X'AABBCCDD', X'00112233');
- select vec_distance_cosine('[1, 1]', vec_int8('[2, 2]'));
- select vec_distance_cosine(vec_bit(X'AA'), vec_bit(X'BB'));
vec_distance_hamming:
params: [a, b]
desc: |
Calculates the hamming distance between two bitvectors `a` and `b`. Only valid for bitvectors.
Returns an error under the following conditions:
- `a` or `b` are not bitvectors
- `a` and `b` do not share the same length
- Memory cannot be allocated
example:
- select vec_distance_hamming(vec_bit(X'00'), vec_bit(X'FF'));
- select vec_distance_hamming(vec_bit(X'FF'), vec_bit(X'FF'));
- select vec_distance_hamming(vec_bit(X'F0'), vec_bit(X'44'));
- select vec_distance_hamming('[1, 1]', '[0, 0]');
quantization:
vec_quantize_binary:
params: [vector]
desc: |
Quantize a float32 or int8 vector into a bitvector.
For every element in the vector, a `1` is assigned to positive numbers and a `0` is assigned to negative numbers.
These values are then packed into a bit vector.
Returns an error if `vector` is invalid, or if `vector` is not a float32 or int8 vector.
example:
- select vec_quantize_binary('[1, 2, 3, 4, 5, 6, 7, 8]');
- select vec_quantize_binary('[1, 2, 3, 4, -5, -6, -7, -8]');
- select vec_quantize_binary('[-1, -2, -3, -4, -5, -6, -7, -8]');
- select vec_quantize_binary('[-1, -2, -3, -4, -5, -6, -7, -8]');
- select vec_quantize_binary(vec_int8(X'11223344'));
- select vec_quantize_binary(vec_bit(X'FF'));
vec_quantize_i8:
params: [vector, "[start]", "[end]"]
desc: x
example: select 'todo';
vec0:
vec0:
params: []
desc: TODO
example:
- |
create virtual table vec_items using vec0(
contents_embedding float[4]
);
- |
insert into vec_items(rowid, contents_embedding)
values (1, '[1, 1, 1, 1]'),
(2, '[2, 2, 2, 2]'),
(3, '[3, 3, 3, 3]');
================================================
FILE: scripts/progress.ts
================================================
// Counts the TODO markers left in sqlite-vec.c and prints a text progress bar
// towards the v0.1.0 release. Run with Deno from the repository root.
const src = Deno.readTextFileSync("sqlite-vec.c");

/**
 * Returns how many times `rg` matches in the source file.
 * `rg` must carry the `g` flag, as `String.prototype.matchAll` requires.
 */
function numOccurrences(rg: RegExp): number {
  return [...src.matchAll(rg)].length;
}

const numAsserts = numOccurrences(/todo_assert/g);
const numComments = numOccurrences(/TODO/g);
const numHandles = numOccurrences(/todo\(/g);
const realTodos = numOccurrences(/TODO\(/g);
// /TODO/ also matches every "TODO(", so those are subtracted back out.
const numTotal = numAsserts + numComments + numHandles - realTodos;

console.log("Number of todo_assert()'s: ", numAsserts);
console.log('Number of "// TODO" comments: ', numComments);
console.log("Number of todo panics: ", numHandles);
console.log("Total TODOs: ", numTotal);
console.log();

const TOTAL = 246; // as of e5b0f4c0c5 (2024-04-20)
const progress = (TOTAL - numTotal) / TOTAL;
const width = 60;

// Clamp to [0, 1]: String.prototype.repeat throws a RangeError for a negative
// count, which the unclamped empty segment hit whenever progress exceeded 1.
const clamped = Math.min(1, Math.max(0, progress));
const filled = Math.floor(clamped * width);
// Derive the empty segment from the filled one so the bar is always exactly
// `width` cells wide (two independently truncated counts could sum to width-1).
const empty = width - filled;

console.log(
  "▓".repeat(filled) +
    "░".repeat(empty) +
    ` (${TOTAL - numTotal}/${TOTAL})`,
);
console.log();
console.log(
  // toFixed avoids the exponent notation toPrecision(2) yields at 100 ("1.0e+2").
  `${(progress * 100.0).toFixed(1)}% complete to sqlite-vec v0.1.0`,
);
================================================
FILE: scripts/publish-release.sh
================================================
#!/bin/bash
# Commits the current tree as "v<VERSION>", tags it, pushes the branch and tag,
# and creates the matching GitHub release (a pre-release for alpha/beta versions).
#
# Each long option needs its own `-o`. In `set -euo pipefail xtrace` the
# trailing word only became positional parameter $1, so tracing was never on.
set -euo pipefail
set -o xtrace

# Refuse to publish when anything other than VERSION / sqlite-dist.toml changed.
if [[ -n $(git status --porcelain | grep -v VERSION | grep -v sqlite-dist.toml) ]]; then
    echo "❌ There are other un-staged changes to the repository besides VERSION and sqlite-dist.toml"
    exit 1
fi

VERSION="$(cat VERSION)"

echo "Publishing version v$VERSION..."

# Run the Makefile's `version` target before committing.
make version
git add --all
git commit -m "v$VERSION"
git tag "v$VERSION"
git push origin main "v$VERSION"

# Versions containing "alpha" or "beta" are published as pre-releases.
if grep -qE "alpha|beta" VERSION; then
    gh release create "v$VERSION" --title="v$VERSION" --prerelease
else
    gh release create "v$VERSION" --title="v$VERSION"
fi

echo "✅ Published! version v$VERSION"
================================================
FILE: scripts/vendor.sh
================================================
#!/bin/bash
# Downloads the SQLite 3.45.3 amalgamation and unpacks its files into vendor/.
#
# Fail fast: without this a failed download would still run the mv/rm steps.
set -euo pipefail

mkdir -p vendor

# --fail makes curl exit non-zero on an HTTP error instead of saving the error page.
curl --fail -o sqlite-amalgamation.zip https://www.sqlite.org/2024/sqlite-amalgamation-3450300.zip

# (A stray argument-less `unzip -d` used to sit here; it could only error out.)
unzip sqlite-amalgamation.zip
mv sqlite-amalgamation-3450300/* vendor/
rmdir sqlite-amalgamation-3450300
rm sqlite-amalgamation.zip
================================================
FILE: site/.gitignore
================================================
node_modules
.vitepress/cache
================================================
FILE: site/.vitepress/config.mts
================================================
import { DefaultTheme, defineConfig, HeadConfig } from "vitepress";
import { readFileSync } from "node:fs";
import { dirname, join } from "node:path";
import { fileURLToPath } from "node:url";
// Project name, interpolated into the site title and every external link.
const PROJECT = "sqlite-vec";
const description = "A vector search SQLite extension that runs anywhere!";

// Directory that holds this config file (site/.vitepress).
const CONFIG_DIR = dirname(fileURLToPath(import.meta.url));

// Version string read from the repository-root VERSION file. It is trimmed so
// a trailing newline in the file cannot leak into the nav label or the URLs
// that are built from it.
const VERSION = readFileSync(
  join(CONFIG_DIR, "..", "..", "VERSION"),
  "utf8"
).trim();

// TextMate grammar for SQLite, registered as an extra markdown highlight language.
const sqliteLanuage = JSON.parse(
  readFileSync(join(CONFIG_DIR, "..", "sqlite.tmlanguage.json"), "utf8")
);
/**
 * Builds the extra `<head>` entries for every page: the SVG favicon link and
 * the deferred analytics script loaded from plausible.io.
 */
function head(): HeadConfig[] {
  const favicon: HeadConfig = [
    "link",
    {
      rel: "shortcut icon",
      type: "image/svg+xml",
      href: "./logo.light.svg",
    },
  ];

  const analytics: HeadConfig = [
    "script",
    {
      defer: "",
      "data-domain": "alexgarcia.xyz/sqlite-vec",
      src: "https://plausible.io/js/script.js",
    },
  ];

  return [favicon, analytics];
}
// Sidebar group listing the guide pages; included as one entry of the array
// returned by sidebar(). Starts collapsed.
const guides = {
text: "Guides",
collapsed: true,
items: [
{ text: "Performance", link: "/guides/performance" },
{
text: "Vector operations",
items: [
{ text: "Vector Arithmetic", link: "/guides/arithmetic" },
{ text: "Binary Quantization", link: "/guides/binary-quant" },
{ text: "Scalar Quantization", link: "/guides/scalar-quant" },
{
text: "Matryoshka Embeddings",
link: "/guides/matryoshka",
},
],
},
// Disabled group of "build with" guides, kept for when those pages exist.
/* {
text: "Build with sqlite-vec",
items: [
{ text: "Semantic Search", link: "/guides/semantic-search" },
{ text: "Hybrid Search", link: "/guides/hybrid-search" },
{ text: "Retrieval Augmented Generation (RAG)", link: "/guides/rag" },
{ text: "Classifiers", link: "/guides/classifiers" },
],
},*/
],
};
/**
 * Builds the top navigation bar: the API reference link, the sponsor link and
 * a version dropdown that points at the GitHub release and the published
 * language bindings / plugins.
 */
function nav(): DefaultTheme.NavItem[] {
  return [
    { text: "API Reference", link: "/api-reference" },
    { text: "♥ Sponsor", link: "https://github.com/sponsors/asg017" },
    {
      text: `v${VERSION}`,
      items: [
        {
          text: "Github Release",
          // Releases are tagged "v<VERSION>" and GitHub serves a single
          // release at /releases/tag/<tag>; /releases/<version> does not resolve.
          link: `https://github.com/asg017/${PROJECT}/releases/tag/v${VERSION}`,
        },
        {
          text: "Bindings",
          items: [
            {
              text: "Python: PyPi package",
              link: `https://pypi.org/project/${PROJECT}`,
            },
            {
              text: "Node.js: NPM package",
              link: `https://www.npmjs.com/package/${PROJECT}`,
            },
            {
              text: "Ruby: Ruby gem",
              link: `https://rubygems.org/gems/${PROJECT}`,
            },
            {
              text: "Rust: Cargo crate",
              link: `https://crates.io/crates/${PROJECT}`,
            },
            {
              text: "Golang: Go module (CGO)",
              link: `https://pkg.go.dev/github.com/asg017/${PROJECT}-go-bindings/cgo`,
            },
            {
              text: "Golang: Go module (WASM ncruces)",
              link: `https://pkg.go.dev/github.com/asg017/${PROJECT}-go-bindings/ncruces`,
            },
            {
              text: "Datasette: Plugin",
              link: `https://datasette.io/plugins/datasette-${PROJECT}`,
            },
            {
              text: "sqlite-utils: Plugin",
              // NOTE(review): this is the same URL as the Datasette plugin
              // above and looks copy-pasted; confirm the sqlite-utils plugin's
              // real home before changing it.
              link: `https://datasette.io/plugins/datasette-${PROJECT}`,
            },
          ],
        },
      ],
    },
  ];
}
/**
 * Builds the documentation sidebar: getting-started pages, per-language usage
 * pages, feature pages, the shared `guides` group, reference pages and links
 * to related projects.
 */
function sidebar(): DefaultTheme.SidebarItem[] {
  return [
    {
      text: "Getting Started",
      collapsed: true,
      items: [
        {
          text: "Introduction",
          link: "/introduction",
        },
        {
          text: "Installation",
          link: "/installation",
        },
      ],
    },
    {
      text: "Using with...",
      collapsed: true,
      items: [
        { text: "Python", link: "/python" },
        { text: "JavaScript", link: "/js" },
        { text: "Ruby", link: "/ruby" },
        { text: "Rust", link: "/rust" },
        { text: "Go", link: "/go" },
        { text: "C/C++", link: "/c" },
        { text: "Browser (WASM)", link: "/wasm" },
        { text: "Datasette", link: "/datasette" },
        { text: "sqlite-utils", link: "/sqlite-utils" },
        { text: "rqlite", link: "/rqlite" },
        { text: "Android+iOS", link: "/android-ios" },
      ],
    },
    {
      text: "Features",
      collapsed: true,
      items: [
        { text: "Vector formats", link: "/features/vector-formats" },
        { text: "KNN queries", link: "/features/knn" },
        {
          text: "vec0 Virtual Tables",
          link: "/features/vec0",
          items: [
            // NOTE(review): the two "#TODO" anchors below are placeholders;
            // replace them once the matching headings exist on the vec0 page.
            { text: "Constructor", link: "/features/vec0#TODO" },
            // The leftover child entry { text: "a", link: "" } was dropped: it
            // rendered a dead, meaningless item under "KNN Queries".
            { text: "KNN Queries", link: "/features/vec0#TODO" },
            { text: "Metadata Columns", link: "/features/vec0#metadata" },
            { text: "Partition Keys", link: "/features/vec0#partition-keys" },
            { text: "Auxiliary Columns", link: "/features/vec0#aux" },
          ],
        },
        //{ text: "Static blobs", link: "/features/static-blobs" },
      ],
    },
    guides,
    {
      text: "Documentation",
      items: [
        { text: "Compiling", link: "/compiling" },
        { text: "API Reference", link: "/api-reference" },
      ],
    },
    {
      text: "See also",
      items: [
        {
          text: "sqlite-ecosystem",
          link: "https://github.com/asg017/sqlite-ecosystem",
        },
        {
          text: "sqlite-lembed",
          link: "https://github.com/asg017/sqlite-lembed",
        },
        {
          text: "sqlite-rembed",
          link: "https://github.com/asg017/sqlite-rembed",
        },
      ],
    },
  ];
}
// Site-wide VitePress configuration, assembled from the head(), nav() and
// sidebar() builders defined in this file.
// https://vitepress.dev/reference/site-config
export default defineConfig({
title: `${PROJECT}`,
description,
lastUpdated: true,
head: head(),
// The site is served from the /sqlite-vec/ sub-path.
base: "/sqlite-vec/",
themeConfig: {
// NOTE(review): the light theme uses logo.dark.svg and the dark theme uses
// logo.light.svg — presumably so the mark contrasts with the background; confirm.
logo: {
light: "/logo.dark.svg",
dark: "/logo.light.svg",
alt: "sqlite-vec logo",
},
nav: nav(),
sidebar: sidebar(),
footer: {
message: "MIT/Apache-2 License",
copyright:
'Copyright © 2024 Alex Garcia ',
},
outline: "deep",
search: {
provider: "local",
},
socialLinks: [
{ icon: "github", link: `https://github.com/asg017/${PROJECT}` },
{ icon: "discord", link: `https://discord.gg/Ve7WeCJFXk` },
],
editLink: {
pattern: `https://github.com/asg017/${PROJECT}/edit/main/site/:path`,
},
},
// Route rewrites: pages stored under using/ and getting-started/ are mapped
// to root-level paths (e.g. using/python.md -> python.md).
rewrites: {
"using/:pkg.md": ":pkg.md",
"getting-started/:pkg.md": ":pkg.md",
//"guides/:pkg.md": ":pkg.md",
},
markdown: {
// Adds the SQLite grammar loaded above as a highlight language.
languages: [sqliteLanuage],
},
});
================================================
FILE: site/.vitepress/theme/HeroImg.vue
================================================
-- store 768-dimensional vectors in a vec0 virtual table
create virtual table vec_movies using vec0(
synopsis_embedding float [768]
);
-- insert vectors into the table, as JSON or compact BLOBs
insert into vec_movies(rowid, synopsis_embedding)
select
rowid,
embed(synopsis) as synopsis_embedding
from movies;
-- KNN search!
select
rowid,
distance
from vec_movies
where synopsis_embedding match embed( 'scary futuristic movies' )
order by distance
limit 20 ;
================================================
FILE: site/.vitepress/theme/Sponsors.vue
================================================
================================================
FILE: site/.vitepress/theme/index.ts
================================================
// https://vitepress.dev/guide/custom-theme
import { h } from "vue";
import type { Theme } from "vitepress";
import DefaultTheme from "vitepress/theme";
import "./style.css";
import Sponsors from "./Sponsors.vue";
import HeroImg from "./HeroImg.vue";
/** Renders the scrolling work-in-progress banner shown in the "layout-top" slot. */
const renderBanner = () =>
  h("marquee", { class: "banner", scrollamount: "10" }, [
    "🚧🚧🚧 This documentation is a work-in-progress! 🚧🚧🚧",
  ]);

/** Renders the sponsors component shown in the "aside-ads-before" slot. */
const renderSponsors = () => h(Sponsors);

// Custom theme: the default VitePress theme with two layout slots filled in.
export default {
  extends: DefaultTheme,
  // https://vitepress.dev/guide/extending-default-theme#layout-slots
  Layout: () =>
    h(DefaultTheme.Layout, null, {
      "layout-top": renderBanner,
      //"home-hero-image": () => h(HeroImg),
      "aside-ads-before": renderSponsors,
    }),
  enhanceApp({ app, router, siteData }) {
    // Intentionally empty: no app-level setup is needed yet.
  },
} satisfies Theme;
================================================
FILE: site/.vitepress/theme/style.css
================================================
/*@import "https://code.cdn.mozilla.net/fonts/zilla-slab.css";*/
@font-face {
font-family: "ZillaSlab-SemiBold";
src: url("/fonts/ZillaSlab-SemiBold.woff");
src: url("/fonts/ZillaSlab-SemiBold.woff2") format("woff2"),
url("/fonts/ZillaSlab-SemiBold.woff") format("woff"),
url("/fonts/ZillaSlab(-SemiBold).otf") format("opentype"),
url("/fonts/ZillaSlab-SemiBold.ttf") format("truetype");
font-weight: 600;
font-style: normal;
}
.VPHero h1,
.VPNavBarTitle .title {
font-family: "ZillaSlab-SemiBold";
font-size: 1.5rem;
}
/**
* Customize default theme styling by overriding CSS variables:
* https://github.com/vuejs/vitepress/blob/main/src/client/theme-default/styles/vars.css
*/
/**
* Colors
*
* Each color has the exact same color scale system: 3 levels of solid
* colors with different brightness, and 1 soft color.
*
* - `XXX-1`: The most solid color used mainly for colored text. It must
* satisfy the contrast ratio when used on top of `XXX-soft`.
*
* - `XXX-2`: The color used mainly for hover state of the button.
*
* - `XXX-3`: The color for solid background, such as bg color of the button.
* It must satisfy the contrast ratio with pure white (#ffffff) text on
* top of it.
*
* - `XXX-soft`: The color used for subtle background such as custom container
* or badges. It must satisfy the contrast ratio when putting `XXX-1` colors
* on top of it.
*
* The soft color must use a semi-transparent alpha channel. This is crucial
* because it allows adding multiple "soft" colors on top of each other
* to create an accent, such as when having an inline code block inside
* custom containers.
*
* - `default`: The color used purely for subtle indication without any
* special meanings attached to it such as bg color for menu hover state.
*
* - `brand`: Used for primary brand colors, such as link text, button with
* brand theme, etc.
*
* - `tip`: Used to indicate useful information. The default theme uses the
* brand color for this by default.
*
* - `warning`: Used to indicate warning to the users. Used in custom
* container, badges, etc.
*
* - `danger`: Used to show error, or dangerous message to the users. Used
* in custom container, badges, etc.
* -------------------------------------------------------------------------- */
:root {
--vp-c-default-1: var(--vp-c-gray-1);
--vp-c-default-2: var(--vp-c-gray-2);
--vp-c-default-3: var(--vp-c-gray-3);
--vp-c-default-soft: var(--vp-c-gray-soft);
--vp-c-brand-1: var(--vp-c-indigo-1);
--vp-c-brand-2: var(--vp-c-indigo-2);
--vp-c-brand-3: var(--vp-c-indigo-3);
--vp-c-brand-soft: var(--vp-c-indigo-soft);
--vp-c-tip-1: var(--vp-c-brand-1);
--vp-c-tip-2: var(--vp-c-brand-2);
--vp-c-tip-3: var(--vp-c-brand-3);
--vp-c-tip-soft: var(--vp-c-brand-soft);
--vp-c-warning-1: var(--vp-c-yellow-1);
--vp-c-warning-2: var(--vp-c-yellow-2);
--vp-c-warning-3: var(--vp-c-yellow-3);
--vp-c-warning-soft: var(--vp-c-yellow-soft);
--vp-c-danger-1: var(--vp-c-red-1);
--vp-c-danger-2: var(--vp-c-red-2);
--vp-c-danger-3: var(--vp-c-red-3);
--vp-c-danger-soft: var(--vp-c-red-soft);
--vp-c-brand-1x: #a6d189;
--vp-c-brand-1x: #a6da95;
--vp-c-brand-1x: #a6e3a1;
}
:root {
--vp-c-brand-1: #1e66f5;
}
.dark {
--vp-c-brand-1: #89b4fa;
}
/**
* Component: Button
* -------------------------------------------------------------------------- */
:root {
--vp-button-brand-border: transparent;
--vp-button-brand-text: var(--vp-c-white);
--vp-button-brand-bg: var(--vp-c-brand-3);
--vp-button-brand-hover-border: transparent;
--vp-button-brand-hover-text: var(--vp-c-white);
--vp-button-brand-hover-bg: var(--vp-c-brand-2);
--vp-button-brand-active-border: transparent;
--vp-button-brand-active-text: var(--vp-c-white);
--vp-button-brand-active-bg: var(--vp-c-brand-1);
}
/**
* Component: Home
* -------------------------------------------------------------------------- */
:root {
--vp-home-hero-name-color: transparent;
--vp-home-hero-name-background: black;
/*
--vp-home-hero-image-background-image: linear-gradient(
-45deg,
#bd34fe 50%,
#47caff 50%
);
--vp-home-hero-image-filter: blur(44px);
*/
}
.dark {
--vp-home-hero-name-background: white;
}
@media (min-width: 640px) {
:root {
--vp-home-hero-image-filter: blur(56px);
}
}
@media (min-width: 960px) {
:root {
--vp-home-hero-image-filter: blur(68px);
}
}
.VPContent.is-home .language-sqlite {
max-width: 640px;
margin: 0 auto;
margin-top: 2rem;
}
/**
* Component: Custom Block
* -------------------------------------------------------------------------- */
:root {
--vp-custom-block-tip-border: transparent;
--vp-custom-block-tip-text: var(--vp-c-text-1);
--vp-custom-block-tip-bg: var(--vp-c-brand-soft);
--vp-custom-block-tip-code-bg: var(--vp-c-brand-soft);
}
/**
* Component: Algolia
* -------------------------------------------------------------------------- */
.DocSearch {
--docsearch-primary-color: var(--vp-c-brand-1) !important;
}
:root {
--vp-layout-top-height: 30px;
}
.banner {
position: fixed;
z-index: var(--vp-z-index-layout-top);
top: 0;
left: 0;
right: 0;
text-align: center;
height: var(--vp-layout-top-height);
line-height: var(--vp-layout-top-height);
background: #f9e2af;
color: black;
}
================================================
FILE: site/api-reference.md
================================================
---
outline: 2
---
# API Reference
A complete reference to all the SQL scalar functions, table functions, and virtual tables inside `sqlite-vec`.
::: warning
sqlite-vec is pre-v1, so expect breaking changes.
:::
[[toc]]
## Constructors {#constructors}
SQL functions that "construct" vectors with different element types.
Currently, only `float32`, `int8`, and `bit` vectors are supported.
### `vec_f32(vector)` {#vec_f32}
Creates a float vector from a BLOB or JSON text. If a BLOB is provided,
the length must be divisible by 4, as a float takes up 4 bytes of space each.
The returned value is a BLOB with 4 bytes per element, with a special [subtype](https://www.sqlite.org/c3ref/result_subtype.html)
of `223`.
```sql
select vec_f32('[.1, .2, .3, 4]');
-- X'CDCCCC3DCDCC4C3E9A99993E00008040'
select subtype(vec_f32('[.1, .2, .3, 4]'));
-- 223
select vec_f32(X'AABBCCDD');
-- X'AABBCCDD'
select vec_to_json(vec_f32(X'AABBCCDD'));
-- '[-1844071490169864000.000000]'
select vec_f32(X'AA');
-- ❌ invalid float32 vector BLOB length. Must be divisible by 4, found 1
```
### `vec_int8(vector)` {#vec_int8}
Creates an 8-bit integer vector from a BLOB or JSON text. If a BLOB is provided,
each byte is interpreted as one signed 8-bit integer element.
If JSON text is provided, each element must be an integer between -128 and 127 inclusive.
The returned value is a BLOB with 1 byte per element, with a special [subtype](https://www.sqlite.org/c3ref/result_subtype.html)
of `225`.
```sql
select vec_int8('[1, 2, 3, 4]');
-- X'01020304'
select subtype(vec_int8('[1, 2, 3, 4]'));
-- 225
select vec_int8(X'AABBCCDD');
-- X'AABBCCDD'
select vec_to_json(vec_int8(X'AABBCCDD'));
-- '[-86,-69,-52,-35]'
select vec_int8('[999]');
-- ❌ JSON parsing error: value out of range for int8
```
### `vec_bit(vector)` {#vec_bit}
Creates a binary vector from a BLOB.
The returned value is a BLOB with 1 byte per 8 elements, with a special [subtype](https://www.sqlite.org/c3ref/result_subtype.html)
of `224`.
```sql
select vec_bit(X'F0');
-- X'F0'
select subtype(vec_bit(X'F0'));
-- 224
select vec_to_json(vec_bit(X'F0'));
-- '[0,0,0,0,1,1,1,1]'
```
## Operations {#op}
Different operations and utilities for working with vectors.
### `vec_length(vector)` {#vec_length}
Returns the number of elements in the given vector.
The vector can be `JSON`, `BLOB`, or the result of a [constructor function](#constructors).
This function will return an error if `vector` is invalid.
```sql
select vec_length('[.1, .2]');
-- 2
select vec_length(X'AABBCCDD');
-- 1
select vec_length(vec_int8(X'AABBCCDD'));
-- 4
select vec_length(vec_bit(X'AABBCCDD'));
-- 32
select vec_length(X'CCDD');
-- ❌ invalid float32 vector BLOB length. Must be divisible by 4, found 2
```
### `vec_type(vector)` {#vec_type}
Returns the name of the type of `vector` as text. One of `'float32'`, `'int8'`, or `'bit'`.
This function will return an error if `vector` is invalid.
```sql
select vec_type('[.1, .2]');
-- 'float32'
select vec_type(X'AABBCCDD');
-- 'float32'
select vec_type(vec_int8(X'AABBCCDD'));
-- 'int8'
select vec_type(vec_bit(X'AABBCCDD'));
-- 'bit'
select vec_type(X'CCDD');
-- ❌ invalid float32 vector BLOB length. Must be divisible by 4, found 2
```
### `vec_add(a, b)` {#vec_add}
Adds every element in vector `a` with vector `b`, returning a new vector `c`. Both vectors
must be of the same type and same length. Only `float32` and `int8` vectors are supported.
An error is raised if either `a` or `b` are invalid, or if they are not the same type or same length.
See also [`vec_sub()`](#vec_sub).
```sql
select vec_add(
'[.1, .2, .3]',
'[.4, .5, .6]'
);
-- X'0000003F3333333F6766663F'
select vec_to_json(
vec_add(
'[.1, .2, .3]',
'[.4, .5, .6]'
)
);
-- '[0.500000,0.700000,0.900000]'
select vec_to_json(
vec_add(
vec_int8('[1, 2, 3]'),
vec_int8('[4, 5, 6]')
)
);
-- '[5,7,9]'
select vec_add('[.1]', vec_int8('[1]'));
-- ❌ Vector type mistmatch. First vector has type float32, while the second has type int8.
select vec_add(vec_bit(X'AA'), vec_bit(X'BB'));
-- ❌ Cannot add two bitvectors together.
```
### `vec_sub(a, b)` {#vec_sub}
Subtracts every element in vector `a` with vector `b`, returning a new vector `c`. Both vectors
must be of the same type and same length. Only `float32` and `int8` vectors are supported.
An error is raised if either `a` or `b` are invalid, or if they are not the same type or same length.
See also [`vec_add()`](#vec_add).
```sql
select vec_sub(
'[.1, .2, .3]',
'[.4, .5, .6]'
);
-- X'9A9999BE9A9999BE9A9999BE'
select vec_to_json(
vec_sub(
'[.1, .2, .3]',
'[.4, .5, .6]'
)
);
-- '[-0.300000,-0.300000,-0.300000]'
select vec_to_json(
vec_sub(
vec_int8('[1, 2, 3]'),
vec_int8('[4, 5, 6]')
)
);
-- '[-3,-3,-3]'
select vec_sub('[.1]', vec_int8('[1]'));
-- ❌ Vector type mistmatch. First vector has type float32, while the second has type int8.
select vec_sub(vec_bit(X'AA'), vec_bit(X'BB'));
-- ❌ Cannot subtract two bitvectors together.
```
### `vec_normalize(vector)` {#vec_normalize}
Performs L2 normalization on the given vector. Only float32 vectors are currently supported.
Returns an error if the input is an invalid vector or not a float32 vector.
```sql
select vec_normalize('[2, 3, 1, -4]');
-- X'BAF4BA3E8B370C3FBAF43A3EBAF43ABF'
select vec_to_json(
vec_normalize('[2, 3, 1, -4]')
);
-- '[0.365148,0.547723,0.182574,-0.730297]'
-- for matryoshka embeddings - slice then normalize
select vec_to_json(
vec_normalize(
vec_slice('[2, 3, 1, -4]', 0, 2)
)
);
-- '[0.554700,0.832050]'
```
### `vec_slice(vector, start, end)` {#vec_slice}
Extract a subset of `vector` from the `start` element (inclusive) to the `end` element (exclusive).
This is especially useful for [Matryoshka embeddings](/guides/matryoshka), also known as "adaptive length" embeddings.
Use with [`vec_normalize()`](#vec_normalize) to get proper results.
Returns an error in the following conditions:
- If `vector` is not a valid vector
- If `start` is less than zero or greater than or equal to `end`
- If `end` is greater than the length of `vector`, or less than or equal to `start`.
- If `vector` is a bitvector and `start` or `end` is not divisible by 8.
```sql
select vec_slice('[1, 2,3, 4]', 0, 2);
-- X'0000803F00000040'
select vec_to_json(
vec_slice('[1, 2,3, 4]', 0, 2)
);
-- '[1.000000,2.000000]'
select vec_to_json(
vec_slice('[1, 2,3, 4]', 2, 4)
);
-- '[3.000000,4.000000]'
select vec_to_json(
vec_slice('[1, 2,3, 4]', -1, 4)
);
-- ❌ slice 'start' index must be a postive number.
select vec_to_json(
vec_slice('[1, 2,3, 4]', 0, 5)
);
-- ❌ slice 'end' index is greater than the number of dimensions
select vec_to_json(
vec_slice('[1, 2,3, 4]', 0, 0)
);
-- ❌ slice 'start' index is equal to the 'end' index, vectors must have non-zero length
```
### `vec_to_json(vector)` {#vec_to_json}
Represents a vector as JSON text. The input vector can be a vector BLOB or JSON text.
Returns an error if `vector` is an invalid vector, or when memory cannot be allocated.
```sql
select vec_to_json(X'AABBCCDD');
-- '[-1844071490169864000.000000]'
select vec_to_json(vec_int8(X'AABBCCDD'));
-- '[-86,-69,-52,-35]'
select vec_to_json(vec_bit(X'AABBCCDD'));
-- '[0,1,0,1,0,1,0,1,1,1,0,1,1,1,0,1,0,0,1,1,0,0,1,1,1,0,1,1,1,0,1,1]'
select vec_to_json('[1,2,3,4]');
-- '[1.000000,2.000000,3.000000,4.000000]'
select vec_to_json('invalid');
-- ❌ JSON array parsing error: Input does not start with '['
```
### `vec_each(vector)` {#vec_each}
A table function to iterate through every element in a vector. One row is returned per element in the vector.
```sql
CREATE TABLE vec_each(
rowid int, -- The index of the element within the vector, starting at 0
vector HIDDEN -- input parameter: A well-formed vector value
)
```
Returns an error if `vector` is not a valid vector.
```sql
select rowid, value from vec_each('[1,2,3,4]');
/*
┌───────┬───────┐
│ rowid │ value │
├───────┼───────┤
│ 0 │ 1 │
├───────┼───────┤
│ 1 │ 2 │
├───────┼───────┤
│ 2 │ 3 │
├───────┼───────┤
│ 3 │ 4 │
└───────┴───────┘
*/
select rowid, value from vec_each(X'AABBCCDD00112233');
/*
┌───────┬──────────────────────┐
│ rowid │ value │
├───────┼──────────────────────┤
│ 0 │ -1844071490169864200 │
├───────┼──────────────────────┤
│ 1 │ 3.773402568185702e-8 │
└───────┴──────────────────────┘
*/
select rowid, value from vec_each(vec_int8(X'AABBCCDD'));
/*
┌───────┬───────┐
│ rowid │ value │
├───────┼───────┤
│ 0 │ -86 │
├───────┼───────┤
│ 1 │ -69 │
├───────┼───────┤
│ 2 │ -52 │
├───────┼───────┤
│ 3 │ -35 │
└───────┴───────┘
*/
select rowid, value from vec_each(vec_bit(X'F0'));
/*
┌───────┬───────┐
│ rowid │ value │
├───────┼───────┤
│ 0 │ 1 │
├───────┼───────┤
│ 1 │ 1 │
├───────┼───────┤
│ 2 │ 1 │
├───────┼───────┤
│ 3 │ 1 │
├───────┼───────┤
│ 4 │ 0 │
├───────┼───────┤
│ 5 │ 0 │
├───────┼───────┤
│ 6 │ 0 │
├───────┼───────┤
│ 7 │ 0 │
└───────┴───────┘
*/
```
## Distance functions {#distance}
Various algorithms to calculate distance between two vectors.
### `vec_distance_L2(a, b)` {#vec_distance_L2}
Calculates the L2 Euclidean distance between vectors `a` and `b`. Only valid for float32 or int8 vectors.
Returns an error under the following conditions:
- `a` or `b` are invalid vectors
- `a` and `b` do not share the same vector element type (ex float32 or int8)
- `a` or `b` are bit vectors. Use [`vec_distance_hamming()`](#vec_distance_hamming) for distance calculations between two bitvectors.
- `a` and `b` do not have the same length.
```sql
select vec_distance_L2('[1, 1]', '[2, 2]');
-- 1.4142135381698608
select vec_distance_L2('[1, 1]', '[-2, -2]');
-- 4.242640495300293
select vec_distance_L2('[1.1, 2.2, 3.3]', '[4.4, 5.5, 6.6]');
-- 5.7157673835754395
select vec_distance_L2(X'AABBCCDD', X'00112233');
-- 1844071490169864200
select vec_distance_L2('[1, 1]', vec_int8('[2, 2]'));
-- ❌ Vector type mistmatch. First vector has type float32, while the second has type int8.
select vec_distance_L2(vec_bit(X'AA'), vec_bit(X'BB'));
-- ❌ Cannot calculate L2 distance between two bitvectors.
```
### `vec_distance_cosine(a, b)` {#vec_distance_cosine}
Calculates the cosine distance between vectors `a` and `b`. Only valid for float32 or int8 vectors.
Returns an error under the following conditions:
- `a` or `b` are invalid vectors
- `a` and `b` do not share the same vector element type (ex float32 or int8)
- `a` or `b` are bit vectors. Use [`vec_distance_hamming()`](#vec_distance_hamming) for distance calculations between two bitvectors.
- `a` and `b` do not have the same length.
```sql
select vec_distance_cosine('[1, 1]', '[2, 2]');
-- 2.220446049250313e-16
select vec_distance_cosine('[1, 1]', '[-2, -2]');
-- 2
select vec_distance_cosine('[1.1, 2.2, 3.3]', '[4.4, 5.5, 6.6]');
-- 0.02536807395517826
select vec_distance_cosine(X'AABBCCDD', X'00112233');
-- 2
select vec_distance_cosine('[1, 1]', vec_int8('[2, 2]'));
-- ❌ Vector type mistmatch. First vector has type float32, while the second has type int8.
select vec_distance_cosine(vec_bit(X'AA'), vec_bit(X'BB'));
-- ❌ Cannot calculate cosine distance between two bitvectors.
```
### `vec_distance_hamming(a, b)` {#vec_distance_hamming}
Calculates the hamming distance between two bitvectors `a` and `b`. Only valid for bitvectors.
Returns an error under the following conditions:
- `a` or `b` are not bitvectors
- `a` and `b` do not share the same length
- Memory cannot be allocated
```sql
select vec_distance_hamming(vec_bit(X'00'), vec_bit(X'FF'));
-- 8
select vec_distance_hamming(vec_bit(X'FF'), vec_bit(X'FF'));
-- 0
select vec_distance_hamming(vec_bit(X'F0'), vec_bit(X'44'));
-- 4
select vec_distance_hamming('[1, 1]', '[0, 0]');
-- ❌ Cannot calculate hamming distance between two float32 vectors.
```
## Quantization {#quantization}
Various techniques to "compress" a vector by reducing precision and accuracy.
### `vec_quantize_binary(vector)` {#vec_quantize_binary}
Quantize a float32 or int8 vector into a bitvector.
For every element in the vector, a `1` is assigned to positive numbers and a `0` is assigned to negative numbers.
These values are then packed into a bit vector.
Returns an error if `vector` is invalid, or if `vector` is not a float32 or int8 vector.
```sql
select vec_quantize_binary('[1, 2, 3, 4, 5, 6, 7, 8]');
-- X'FF'
select vec_quantize_binary('[1, 2, 3, 4, -5, -6, -7, -8]');
-- X'0F'
select vec_quantize_binary('[-1, -2, -3, -4, -5, -6, -7, -8]');
-- X'00'
select vec_quantize_binary('[-1, -2, -3, -4, -5, -6, -7, -8]');
-- X'00'
select vec_quantize_binary(vec_int8(X'11223344'));
-- ❌ Binary quantization requires vectors with a length divisible by 8
select vec_quantize_binary(vec_bit(X'FF'));
-- ❌ Can only binary quantize float or int8 vectors
```
### `vec_quantize_i8(vector, [start], [end])` {#vec_quantize_i8}
x
```sql
select 'todo';
-- 'todo'
```
## NumPy Utilities {#numpy}
Functions to read data from or work with [NumPy arrays](https://numpy.org/doc/stable/reference/generated/numpy.array.html).
### `vec_npy_each(vector)` {#vec_npy_each}
xxx
```sql
-- db.execute('select quote(?)', [to_npy(np.array([[1.0], [2.0], [3.0]], dtype=np.float32))]).fetchone()
select
rowid,
vector,
vec_type(vector),
vec_to_json(vector)
from vec_npy_each(
X'934E554D5059010076007B276465736372273A20273C6634272C2027666F727472616E5F6F72646572273A2046616C73652C20277368617065273A2028332C2031292C207D202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020200A0000803F0000004000004040'
)
/*
┌───────┬─────────────┬──────────────────┬─────────────────────┐
│ rowid │ vector │ vec_type(vector) │ vec_to_json(vector) │
├───────┼─────────────┼──────────────────┼─────────────────────┤
│ 0 │ X'0000803F' │ 'float32' │ '[1.000000]' │
├───────┼─────────────┼──────────────────┼─────────────────────┤
│ 1 │ X'00000040' │ 'float32' │ '[2.000000]' │
├───────┼─────────────┼──────────────────┼─────────────────────┤
│ 2 │ X'00004040' │ 'float32' │ '[3.000000]' │
└───────┴─────────────┴──────────────────┴─────────────────────┘
*/
-- db.execute('select quote(?)', [to_npy(np.array([[1.0], [2.0], [3.0]], dtype=np.float32))]).fetchone()
select
rowid,
vector,
vec_type(vector),
vec_to_json(vector)
from vec_npy_each(
X'934E554D5059010076007B276465736372273A20273C6634272C2027666F727472616E5F6F72646572273A2046616C73652C20277368617065273A2028332C2031292C207D202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020202020200A0000803F0000004000004040'
)
/*
┌───────┬─────────────┬──────────────────┬─────────────────────┐
│ rowid │ vector │ vec_type(vector) │ vec_to_json(vector) │
├───────┼─────────────┼──────────────────┼─────────────────────┤
│ 0 │ X'0000803F' │ 'float32' │ '[1.000000]' │
├───────┼─────────────┼──────────────────┼─────────────────────┤
│ 1 │ X'00000040' │ 'float32' │ '[2.000000]' │
├───────┼─────────────┼──────────────────┼─────────────────────┤
│ 2 │ X'00004040' │ 'float32' │ '[3.000000]' │
└───────┴─────────────┴──────────────────┴─────────────────────┘
*/
```
## Meta {#meta}
Helper functions to debug `sqlite-vec` installations.
### `vec_version()` {#vec_version}
Returns a version string of the current `sqlite-vec` installation.
```sql
select vec_version();
-- 'v0.0.1-alpha.37'
```
### `vec_debug()` {#vec_debug}
Returns debugging information of the current `sqlite-vec` installation.
```sql
select vec_debug();
/*
'Version: v0.0.1-alpha.37
Date: 2024-07-23T14:09:43Z-0700
Commit: 77f9b0374c8129056b344854de2dff6b103e5729
Build flags: avx '
*/
```
## Entrypoints {#entrypoints}
All the named entrypoints that load in different `sqlite-vec` functions and options.
================================================
FILE: site/build-ref.mjs
================================================
import Database from "better-sqlite3";
import { load } from "js-yaml";
import { fileURLToPath } from "node:url";
import { dirname, resolve } from "node:path";
import { readFileSync, writeFileSync } from "node:fs";
import * as v from "valibot";
import { table } from "table";
// Front matter and introduction written at the top of the generated
// api-reference.md, before any section.
const HEADER = `---
outline: 2
---
# API Reference
A complete reference to all the SQL scalar functions, table functions, and virtual tables inside \`sqlite-vec\`.
::: warning
sqlite-vec is pre-v1, so expect breaking changes.
:::
[[toc]]
`;
// YAML file describing every documented function, one directory above this script.
const REF_PATH = resolve(
dirname(fileURLToPath(import.meta.url)),
"../reference.yaml",
);
// Path handed to db.loadExtension() so the examples run against the built extension.
const EXT_PATH = resolve(
dirname(fileURLToPath(import.meta.url)),
"../dist/vec0",
);
// Shape of reference.yaml: `sections` maps a section id to its title and
// description; every other top-level key maps function names to their
// parameter list, description and one or more SQL examples.
const DocSchema = v.objectWithRest(
{
sections: v.record(
v.string(),
v.object({
title: v.string(),
desc: v.string(),
}),
),
},
v.record(
v.string(),
v.object({
params: v.array(v.string()),
desc: v.string(),
example: v.union([v.string(), v.array(v.string())]),
}),
),
);
// Box-drawing border characters passed to table() when rendering result sets.
const tableConfig = {
border: {
topBody: `─`,
topJoin: `┬`,
topLeft: `┌`,
topRight: `┐`,
bottomBody: `─`,
bottomJoin: `┴`,
bottomLeft: `└`,
bottomRight: `┘`,
bodyLeft: `│`,
bodyRight: `│`,
bodyJoin: `│`,
joinBody: `─`,
joinLeft: `├`,
joinRight: `┤`,
joinJoin: `┼`,
},
};
/**
 * Formats the lone value of a one-row, one-column result as the SQL comment
 * printed beneath an example.
 *
 * Strings are single-quoted with embedded quotes doubled; a multi-line string
 * is wrapped in a block comment instead of a `--` line comment. Numbers, NULL,
 * byte arrays (as X'..' hex literals) and other objects (as indented JSON) get
 * a `--` prefix. Any other type yields `undefined`.
 */
function formatSingleValue(value) {
  switch (typeof value) {
    case "string": {
      const quoted = "'" + value.split("'").join("''") + "'";
      return quoted.includes("\n") ? `/*\n${quoted}\n*/` : `-- ${quoted}`;
    }
    case "number":
      return `-- ${value.toString()}`;
    case "object":
      break;
    default:
      return undefined;
  }

  if (value === null) return "-- NULL";

  if (value instanceof Uint8Array) {
    const hex = Array.from(value, (byte) =>
      byte.toString(16).toUpperCase().padStart(2, "0")
    ).join("");
    return `-- X'${hex}'`;
  }

  return "-- " + JSON.stringify(value, null, 2);
}
/**
 * Formats one cell of a result table as a SQL-style literal.
 *
 * - strings are single-quoted, with embedded single quotes doubled so the
 *   output is a valid SQL literal (matching formatSingleValue)
 * - numbers are returned unchanged
 * - `null` becomes `NULL`
 * - byte arrays become an uppercase `X'..'` hex literal
 * - any other object is rendered as indented JSON
 *
 * Returns `undefined` for any other type.
 */
function formatValue(value) {
  if (typeof value === "string") return `'${value.replace(/'/g, "''")}'`;
  if (typeof value === "number") return value;
  if (value === null) return "NULL";
  if (value instanceof Uint8Array) {
    let s = "X'";
    for (const byte of value) {
      s += byte.toString(16).toUpperCase().padStart(2, "0");
    }
    s += "'";
    return s;
  }
  // Arrays are objects too, so this single check covers both.
  if (typeof value === "object") {
    return JSON.stringify(value, null, 2);
  }
}
/**
 * Render a result set as a bordered text table.
 *
 * The first table row holds the column names reported by `stmt.columns()`;
 * each following row holds the values of one result row, passed through
 * `formatValue()`.
 *
 * @param {object} stmt statement exposing `columns()` (objects with a `name`)
 * @param {Array<Array<*>>} results result rows, each an array of values
 * @returns {string} the output of `table()` using `tableConfig`
 */
function tableize(stmt, results) {
  const header = stmt.columns().map(({ name }) => name);
  const body = [];
  for (const row of results) {
    body.push(row.map((cell) => formatValue(cell)));
  }
  return table([header, ...body], tableConfig);
}
/**
 * Run the example SQL of one reference entry and render a fenced `sql`
 * markdown block containing each statement followed by its result.
 *
 * For every example the statement text is appended, then one of:
 *   - `-- ❌ <message>` when executing the statement throws,
 *   - a block-commented table when there is more than one row or column,
 *   - `-- (no rows)` when a single-column query returns nothing,
 *   - the single value formatted by `formatSingleValue()` otherwise.
 *
 * @param {object} db database handle exposing `prepare(sql)`
 * @param {string} name entry name, used only in error messages
 * @param {string|string[]} example one example or a list of examples; each
 *   may optionally be wrapped in a ```sql ... ``` fence
 * @returns {string} the markdown for all examples
 * @throws {Error} when a statement cannot be prepared
 */
function renderExamples(db, name, example) {
  const examples = Array.isArray(example) ? example : [example];
  let md = "```sql\n";

  for (const rawExample of examples) {
    const sql = rawExample
      /* Strip any '```sql' markdown (and leading whitespace) at the beginning */
      .replace(/^\s*```sql/, "")
      /* Strip any '```' markdown (and trailing whitespace) at the end */
      .replace(/```\s*$/, "")
      .trim();

    let stmt;
    try {
      stmt = db.prepare(sql);
    } catch (err) {
      console.error(`Error preparing statement for ${name}:`);
      console.error(err);
      // Keep the original failure attached instead of throwing a blank Error.
      throw new Error(
        `Error preparing statement for ${name}: ${err?.message ?? err}`,
        { cause: err },
      );
    }

    try {
      // Ask for rows as arrays so values can be addressed by position.
      stmt.raw(true);
    } catch {
      // Deliberately ignored: presumably raw mode is rejected for statements
      // that return no data; `all()` below reports the failure for those.
    }

    let results = null;
    let errorMessage;
    try {
      results = stmt.all();
    } catch (err) {
      errorMessage = err.message;
    }

    md += sql + "\n";
    if (!results) {
      md += `-- ❌ ${errorMessage}\n\n`;
      continue;
    }

    let rendered;
    if (results.length > 1 || stmt.columns().length > 1) {
      rendered = `/*\n${tableize(stmt, results)}\n*/\n`;
    } else if (results.length === 0) {
      // Single-column query with no rows: there is no results[0] to read.
      rendered = "-- (no rows)";
    } else {
      rendered = formatSingleValue(results[0][0]);
    }
    md += rendered + "\n\n";
  }

  md += "\n```\n\n";
  return md;
}
// Load and validate the reference YAML, then open an in-memory database with
// the extension loaded so that the examples can actually be executed.
const doc = v.parse(DocSchema, load(readFileSync(REF_PATH, "utf8")));
const db = new Database();
db.loadExtension(EXT_PATH);

// Build the page: one `##` heading per section, one `###` heading per entry,
// each entry followed by its description and rendered examples.
let page = HEADER;
for (const [sectionId, sectionMeta] of Object.entries(doc.sections)) {
  page += `## ${sectionMeta.title} {#${sectionId}} \n\n`;
  page += `${sectionMeta.desc}\n\n`;

  for (const [name, entry] of Object.entries(doc[sectionId])) {
    const { params, desc, example } = entry;
    const signature = (params ?? []).join(", ");
    page += "### " + `\`${name}(${signature})\` {#${name}}` + "\n\n";
    page += `${desc}\n\n`;
    page += renderExamples(db, name, example);
  }
}

// The output path is relative to the current working directory.
writeFileSync("api-reference.md", page, "utf8");
console.log("done");
================================================
FILE: site/compiling.md
================================================
# Compiling `sqlite-vec`
`sqlite-vec` is easy to compile yourself! It's a single C file with no dependencies, so the process should be straightforward.
## From Source
To compile `sqlite-vec` as a loadable SQLite extension, you can `git clone` the source repository and run the following commands:
```bash
git clone https://github.com/asg017/sqlite-vec
cd sqlite-vec
./scripts/vendor.sh
make loadable
```
The `./scripts/vendor.sh` command will download a recent version of [SQLite's amalgamation builds](https://www.sqlite.org/amalgamation.html), to ensure you have an up-to-date `sqlite3ext.h` available on your system.
Then `make loadable` will generate the `sqlite-vec.h` file and a dynamically loadable library at `dist/vec0.$SUFFIX`. The suffix will be `dylib` for MacOS, `so` for Linux, and `dll` for Windows.
## From the amalgamation build
The "amalgamation" build of `sqlite-vec` is a `.zip` or `.tar.gz` archive with the pre-configured `sqlite-vec.c` and `sqlite-vec.h` source files.
The amalgamation builds can be found in [`sqlite-vec` Releases](https://github.com/asg017/sqlite-vec/releases). You can also download the latest amalgamation build with this command:
```-vue
wget https://github.com/asg017/sqlite-vec/releases/download/v{{data.VERSION}}/sqlite-vec-{{data.VERSION}}-amalgamation.zip
unzip sqlite-vec-{{data.VERSION}}-amalgamation.zip
```
There will now be `sqlite-vec.c` and `sqlite-vec.h` available in your current directory. To compile it manually, follow the [official SQLite extension compilation instructions](https://www.sqlite.org/loadext.html#compiling_a_loadable_extension), which will be something like:
```bash
# Linux
gcc -g -fPIC -shared sqlite-vec.c -o vec0.so
# MacOS
gcc -g -fPIC -dynamiclib sqlite-vec.c -o vec0.dylib
# Windows, MSVC compiler
cl sqlite-vec.c -link -dll -out:sqlite-vec.dll
# Windows, MinGW
gcc -g -shared sqlite-vec.c -o vec0.dll
```
Different platforms, compiler, or architectures may require different compilation flags.
## Compile-time options
There are a few compilation options available for `sqlite-vec`, but they're currently unstable and may change in the future. They aren't tracked with [`sqlite-vec`'s semantic versioning policy](./versioning.md), so options may break in patch version updates.
The current compile-time flags are:
- `SQLITE_VEC_ENABLE_AVX`, enables AVX CPU instructions for some vector search operations
- `SQLITE_VEC_ENABLE_NEON`, enables NEON CPU instructions for some vector search operations
- `SQLITE_VEC_OMIT_FS`, removes some obscure SQL functions and features that use the filesystem, meant for some WASM builds where there's no available filesystem
- `SQLITE_VEC_STATIC`, meant for statically linking `sqlite-vec`
================================================
FILE: site/features/knn.md
================================================
# KNN queries
The most common use-case for vectors in databases is for K-nearest-neighbors (KNN) queries.
You'll have a table of vectors, and you'll want to find the K closest vectors to a given query vector.
Currently there are two ways to perform KNN queries with `sqlite-vec`:
With `vec0` virtual tables and "manually" with regular tables.
The `vec0` virtual table is faster and more compact, but is less flexible and requires `JOIN`s back to your source tables.
The "manual" method is more flexible and works directly on your regular tables, but is slower and less compact.
## `vec0` virtual tables
```sql
create virtual table vec_documents using vec0(
document_id integer primary key,
contents_embedding float[768]
);
insert into vec_documents(document_id, contents_embedding)
select id, embed(contents)
from documents;
```
```sql
select
document_id,
distance
from vec_documents
where contents_embedding match :query
and k = 10;
```
```sql
-- This example ONLY works in SQLite versions 3.41+
-- Otherwise, use the `k = 10` method described above!
select
document_id,
distance
from vec_documents
where contents_embedding match :query
limit 10; -- LIMIT only works on SQLite versions 3.41+
```
```sql
with knn_matches as (
select
document_id,
distance
from vec_documents
where contents_embedding match :query
and k = 10
)
select
documents.id,
documents.contents,
knn_matches.distance
from knn_matches
left join documents on documents.id = knn_matches.document_id
```
```sql
create virtual table vec_documents using vec0(
document_id integer primary key,
contents_embedding float[768] distance_metric=cosine
);
-- insert vectors into vec_documents...
-- this MATCH will now use cosine distance instead of the default L2 distance
select
document_id,
distance
from vec_documents
where contents_embedding match :query
and k = 10;
```
## Manually with SQL scalar functions
You don't need a `vec0` virtual table to perform KNN searches with `sqlite-vec`.
You could store vectors in regular columns in regular tables, like so:
```sql
create table documents(
id integer primary key,
contents text,
-- a 4-dimensional floating-point vector
contents_embedding blob
);
insert into documents values
(1, 'alex', vec_f32('[1.1, 1.1, 1.1, 1.1]')),
(2, 'brian', vec_f32('[2.2, 2.2, 2.2, 2.2]')),
(3, 'craig', vec_f32('[3.3, 3.3, 3.3, 3.3]'));
```
When you want to find similar vectors, you can manually use
[`vec_distance_L2()`](../api-reference.md#vec_distance_l2),
[`vec_distance_L1()`](../api-reference.md#vec_distance_l1),
or [`vec_distance_cosine()`](../api-reference.md#vec_distance_cosine),
and an `ORDER BY` clause to perform a brute-force KNN query.
```sql
select
id,
contents,
vec_distance_L2(contents_embedding, '[2.2, 2.2, 2.2, 2.2]') as distance
from documents
order by distance;
/*
┌────┬──────────┬──────────────────┐
│ id │ contents │ distance │
├────┼──────────┼──────────────────┤
│ 2 │ 'brian' │ 0.0 │
│ 3 │ 'craig' │ 2.19999980926514 │
│ 1 │ 'alex' │ 2.20000004768372 │
└────┴──────────┴──────────────────┘
*/
```
If you choose this approach, it is recommended to define the "vector column" with its element type (`float`, `bit`, etc.) and dimension, for better documentation.
It's also recommended to include a
[`CHECK` constraint](https://www.sqlite.org/lang_createtable.html#check_constraints),
to ensure only vectors of the correct element type and dimension exist in the table.
```sql
create table documents(
id integer primary key,
contents text,
contents_embedding float[4]
check(
typeof(contents_embedding) == 'blob'
and vec_length(contents_embedding) == 4
)
);
-- ❌ Fails, needs to be a BLOB input
insert into documents values (1, 'alex', '[1.1, 1.1, 1.1, 1.1]');
-- ❌ Fails, 3 dimensions, needs 4
insert into documents values (1, 'alex', vec_f32('[1.1, 1.1, 1.1]'));
-- ❌ Fails, needs to be a float32 vector
insert into documents values (1, 'alex', vec_bit('[1.1, 1.1, 1.1, 1.1]'));
-- ✅ Success!
insert into documents values (1, 'alex', vec_f32('[1.1, 1.1, 1.1, 1.1]'));
```
Keep in mind: **SQLite does not support custom types.**
The example above may make it look like the `contents_embedding` column has a "custom type"
of `float[4]`, but SQLite allows for *anything* to appear as a "column type".
```sql
-- these "column types" are totally legal in SQLite
create table students(
name ham_sandwich,
age minions[42]
);
```
See [Datatypes in SQLite](https://www.sqlite.org/datatype3.html) for more info.
So by itself, `float[4]` as a "column type" is not enforced by SQLite at all.
This is why we recommend including `CHECK` constraints, to enforce that values in your vector column
are of the correct type and length.
For [strict tables](https://www.sqlite.org/stricttables.html), use the `BLOB` type and include the same `CHECK` constraints.
```sql
create table documents(
id integer primary key,
contents text,
contents_embedding blob check(vec_length(contents_embedding) == 4)
) strict;
```
================================================
FILE: site/features/vec0.md
================================================
# `vec0` Virtual Table
## Metadata in `vec0` Virtual Tables {#vec0_metadata}
There are three ways to store non-vector columns in `vec0` virtual tables:
metadata columns, partition keys, and auxiliary columns. Each option has its
own benefits and limitations.
```sql
create virtual table vec_chunks using vec0(
document_id integer primary key,
contents_embedding float[768],
-- partition key column, denoted by 'partition key'
user_id integer partition key,
-- metadata column, appears as normal column definition
label text,
-- auxiliary column, denoted by '+'
+contents text
);
```
A quick summary of each option:
| Column Type | Description | Benefits | Limitations |
| ----------------- | ----------------------------------------------------------------------- | ---------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------- |
| Metadata columns | Stores boolean, integer, floating point, or text data alongside vectors | Can be included in the `WHERE` clause of a KNN query | Slower full scan, slightly inefficient with long strings (`> 12` characters) |
| Auxiliary columns | Stores any kind of data in a separate internal table | Eliminates need for an external `JOIN` | Cannot appear in the `WHERE` clause of a KNN query |
| Partition Key | Internally shards vector index on a given key | Make selective queries much faster | Can cause oversharding and slow KNN if not used carefully. Should be +100's of vectors per unique partition key value |
### Metadata Columns {#metadata}
Metadata columns are extra "regular" columns that you can include in a `vec0`
table definition. These columns will be indexed along with declared vector
columns, and allow you to include extra `WHERE` constraints during KNN queries.
```sql
create virtual table vec_movies using vec0(
movie_id integer primary key,
synopsis_embedding float[1024],
genre text,
num_reviews int,
mean_rating float,
contains_violence boolean
);
```
In the `vec0` constructor, the `genre`, `num_reviews`, `mean_rating`, and
`contains_violence` columns are metadata columns, with their specified types.
A sample KNN query on this table could look like:
```sql
select *
from vec_movies
where synopsis_embedding match '[...]'
and k = 5
and genre = 'scifi'
and num_reviews between 100 and 500
and mean_rating > 3.5
and contains_violence = false;
```
The first two conditions in the `WHERE` clause (`synopsis_embedding match` and
`k = 5`) denote that the query is a KNN query. The other conditions are metadata
constraints that `sqlite-vec` will recognize and apply during the KNN
calculation. In other words, for the above query, a maximum of 5 rows would be
returned, all of which would match all the `WHERE` constraints for their
metadata column values.
#### Metadata Column Declaration
Metadata columns are declared in the `vec0` constructor just like regular
column definitions, with the column name first then the column type.
Only the following column types are supported in metadata columns. All these
columns are strictly typed.
- `TEXT` for text and strings
- `INTEGER` for 8-byte integers
- `FLOAT` for 8-byte floating-point numbers
- `BOOLEAN` for 1-bit `0` or `1`
Other column types may be supported in the future. Column type names are case
insensitive.
Additional column constraints like `UNIQUE` or `NOT NULL` are not supported.
A maximum of 16 metadata columns can be declared in a `vec0` virtual table.
#### Supported operations
Metadata column `WHERE` conditions in a KNN query will only work on the
following operators:
- `=` Equals to
- `!=` Not equals to
- `>` Greater than
- `>=` Greater than or equal to
- `<` Less than
- `<=` Less than or equal to
Using any other operator like `IS NULL`, `LIKE`, `GLOB`, `REGEXP`, or any scalar
function will result in an error or incorrect results.
Boolean columns only support `=` and `!=` operators.
### Partition Key Columns {#partition-keys}
Partition key columns allow one to internally shard a vector index based on a
given key. Any `=` constraint in a `WHERE` clause on a partition key column will
restrict the search to the matching partition.
For example, say you're performing vector search on a large dataset of
documents. However, each document belongs to a user, and users can only search
their own documents. It would be wasteful to perform a brute-force search over all
documents if you only care about 1 user at a time. So, you can partition the
vector index based on user ID like so:
```sql
create virtual table vec_documents using vec0(
document_id integer primary key,
user_id integer partition key,
contents_embedding float[1024]
)
```
Then, during a KNN query, you can constrain results to a specific user in the
`WHERE` clause like so:
```sql
select
document_id,
user_id,
distance
from vec_documents
where contents_embedding match :query
and k = 20
and user_id = 123;
```
`sqlite-vec` will recognize the `user_id = 123` constraint and pre-filter
vectors during a KNN search. Vectors with the same partition key values are
collocated together, so this is a fast operation.
Another example: say you're performing vector search on a large dataset of news
headlines of the past 100 years. However, in your application, most users only
want to search a subset of articles based on when they were written, like "in
the past ten years" or "during the Obama administration." You can partition
based on published date like so:
```sql
create virtual table vec_articles using vec0(
article_id integer primary key,
published_date text partition key,
headline_embedding float[1024]
);
```
And a KNN query:
```sql
select
article_id,
published_date,
distance
from vec_articles
where headline_embedding match :query
and published_date between '2009-01-20' and '2017-01-20'; -- Obama administration
```
But be careful! Over-using partition key columns can lead to over-sharding and
slower KNN queries. As a rule of thumb, make sure that every unique partition
key value has ~100s of vectors associated with it. In the above examples, make
sure that every user has on the order of dozens or hundreds of documents
each, or that there are dozens or, preferably, hundreds of articles per day. If they
don't and you're noticing slow queries, try a more broad partition key value,
like `organization_id` or `published_month`.
A maximum of 4 partition key columns can be declared in a `vec0` virtual table,
but use caution if you find yourself using more than 1 partition key column. Vectors are sharded
along each unique combination, so over-sharding is more common with more
partition key columns.
### Auxiliary Columns {#aux}
Auxiliary columns store additional unindexed data separate from the internal
vector index. They are meant for larger metadata that will never appear in a
`WHERE` clause of a KNN query, but can be retrieved in the result set without needing a separate `JOIN`.
Auxiliary columns are denoted by a `+` prefix in their column definition, like
so:
```sql
create virtual table vec_chunks using vec0(
contents_embedding float[1024],
+contents text
);
select
rowid,
contents,
distance
from vec_chunks
where contents_embedding match :query
and k = 10;
```
Here we store the text contents of each chunk in the `contents` auxiliary
column. When we perform a KNN query, we can reference the `contents` column in
the `SELECT` clause, to get the raw text contents of the most relevant chunks.
A similar approach can be used for image embeddings:
```sql
create virtual table vec_image_chunks using vec0(
image_embedding float[1024],
+image blob
);
select
rowid,
image,
distance
from vec_image_chunks
where image_embedding match :query
and k = 10;
```
Here the `image` auxiliary column can store the raw image file in a large `BLOB`
column. It can appear in the `SELECT` clause of the KNN query, to get the most
relevant raw images.
In general, auxiliary columns are good for large text, blobs, URLs, or other
datatypes that won't be a part of a `WHERE` clause of a KNN query. Auxiliary columns are a good fit for columns
that will appear often in a `SELECT` clause but not in the `WHERE` clause.
A maximum of 16 auxiliary columns can be declared in a `vec0` virtual table.
================================================
FILE: site/getting-started/installation.md
================================================
# Installing
You have several options to include `sqlite-vec` into your projects, including
PyPi packages for Python, NPM packages for Node.js, Gems for Ruby, and more.
## With popular package managers
::: code-group
```bash [Python]
pip install sqlite-vec
```
```bash [Node.js]
npm install sqlite-vec
```
```bash [Bun]
bun install sqlite-vec
```
```bash [Deno]
deno add npm:sqlite-vec
```
```bash [Ruby]
gem install sqlite-vec
```
```bash [Rust]
cargo add sqlite-vec
```
```bash [Go (CGO)]
go get -u github.com/asg017/sqlite-vec-go-bindings/cgo
```
```bash [Go (ncruces WASM)]
go get -u github.com/asg017/sqlite-vec-go-bindings/ncruces
```
```bash [Datasette]
datasette install datasette-sqlite-vec
```
```bash [sqlite-utils]
sqlite-utils install sqlite-utils-sqlite-vec
```
:::
## Pre-compiled extensions
Alternatively, you can download pre-compiled loadable extensions from the
[`sqlite-vec` Github Releases](https://github.com/asg017/sqlite-vec/releases/latest).
There's also an `install.sh` script that will automatically download the appropriate pre-compiled extension from Github Releases to your machine.
```sh
# yolo
curl -L 'https://github.com/asg017/sqlite-vec/releases/latest/download/install.sh' | sh
```
```sh
# ok lets play it safe
curl -o install.sh -L https://github.com/asg017/sqlite-vec/releases/latest/download/install.sh
# inspect your scripts
cat install.sh
# TODO Test if execute permissions?
./install.sh
```
## Compiling
`sqlite-vec` is a single `sqlite-vec.c` and `sqlite-vec.h`, and can be easily compiled for different platforms, or statically linked into larger applications.
See [*Compiling `sqlite-vec`*](/compiling) for more information.
================================================
FILE: site/getting-started/introduction.md
================================================
# Introduction to `sqlite-vec`
## Intro to Vector Databases
## Vector Search in SQLite with `sqlite-vec`
## Getting help
================================================
FILE: site/guides/arithmetic.md
================================================
# Vector Arithmetic
- `vec_add()`
- `vec_sub()`
- `vec_mean()`
================================================
FILE: site/guides/binary-quant.md
================================================
# Binary Quantization
"Quantization" refers to a variety of methods and techniques for reducing the
size of vectors in a vector index. **Binary quantization** (BQ) refers to a
specific technique where each individual floating point element in a vector is
reduced to a single bit, typically by assigning `0` to negative numbers and `1`
to positive numbers.
For example, in this 8-dimensional `float32` vector:
```json
[-0.73, -0.80, 0.12, -0.73, 0.79, -0.11, 0.23, 0.97]
```
Applying binary quantization would result in the following `bit` vector:
```json
[0, 0, 1, 0, 1, 0, 1, 1]
```
The original 8-dimensional `float32` vector requires `8 * 4 = 32` bytes of space
to store. For 1 million vectors, that would be `32MB`. On the other hand, the
binary quantized 8-dimensional vector can be stored in a single byte — one bit
per element. For 1 million vectors, that would be just `1MB`, a 32x reduction!
Though keep in mind, you're bound to lose a lot of quality when reducing 32 bits of
information to 1 bit. [Oversampling and re-scoring](#re-scoring) will help a
lot.
The main goal of BQ is to dramatically reduce the size of your vector index,
resulting in faster searches with fewer resources. This is especially useful in
`sqlite-vec`, which is (currently) brute-force only and meant to run on small
devices. BQ is an easy low-cost method to make larger vector datasets easier to
manage.
## Binary Quantization `sqlite-vec`
The `sqlite-vec` extension offers a `vec_quantize_binary()` SQL scalar function,
which applies binary quantization to a `float32` or `int8` vector. For every
element in a given vector, it will apply `0` to negative values and `1` to
positive values, and pack them into a `BLOB`.
```sqlite
select vec_quantize_binary(
'[-0.73, -0.80, 0.12, -0.73, 0.79, -0.11, 0.23, 0.97]'
);
-- X'd4'
```
The single byte `0xd4` in hexadecimal is `11010100` in binary.
## Demo
Here's an end-to-end example of using binary quantization with `vec0` virtual
tables in `sqlite-vec`.
```sqlite
create virtual table vec_movies using vec0(
synopsis_embedding bit[768]
);
```
```sqlite
insert into vec_movies(rowid, synopsis_embedding)
VALUES (:id, vec_quantize_binary(:vector));
```
```sqlite
select
rowid,
distance
from vec_movies
where synopsis_embedding match vec_quantize_binary(:query)
order by distance
limit 20;
```
### Re-scoring
```sqlite
create virtual table vec_movies using vec0(
synopsis_embedding float[768],
synopsis_embedding_coarse bit[768]
);
```
```sqlite
insert into vec_movies(rowid, synopsis_embedding, synopsis_embedding_coarse)
VALUES (:id, :vector, vec_quantize_binary(:vector));
```
```sqlite
with coarse_matches as (
select
rowid,
synopsis_embedding
from vec_movies
where synopsis_embedding_coarse match vec_quantize_binary(:query)
order by distance
limit 20 * 8
)
select
rowid,
vec_distance_L2(synopsis_embedding, :query)
from coarse_matches
order by 2
limit 20;
```
## Benchmarks
## Model support
Certain embedding models, like [Nomic](https://nomic.ai/)'s
[`nomic-embed-text-v1.5`](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5)
text embedding model and
[mixedbread.ai](https://www.mixedbread.ai/blog/mxbai-embed-2d-large-v1)'s
[`mxbai-embed-large-v1`](https://huggingface.co/mixedbread-ai/mxbai-embed-large-v1)
are specifically trained to perform well after binary quantization.
Other embedding models may not, but you can still try BQ and see if it works
for your datasets. If your vectors are normalized (i.e. between
`-1.0` and `1.0`), there's a good chance you will see acceptable results with BQ.
================================================
FILE: site/guides/classifiers.md
================================================
================================================
FILE: site/guides/hybrid-search.md
================================================
================================================
FILE: site/guides/matryoshka.md
================================================
# Matryoshka (Adaptive-Length) Embeddings
Matryoshka embeddings are a new class of embedding models introduced in the
2022 paper [_Matryoshka Representation Learning_](https://arxiv.org/abs/2205.13147). They allow one
to truncate excess dimensions in large vectors, without sacrificing much quality.
Let's say your embedding model generates 1024-dimensional vectors. If you have 1
million of these 1024-dimensional vectors, they would take up `4.096 GB` of
space! You're not able to reduce the dimensions without losing a lot of
quality - if you were to remove half of the dimensions to get 512-dimensional vectors,
you could expect to also lose 50% or more of the quality of results. There are
other dimensional-reduction techniques, like [PCA](#TODO) or [Product Quantization](#TODO), but they typically require
complicated and expensive training processes.
Matryoshka embeddings, on the other hand, _can_ be truncated, without losing much
quality. Using [`mixedbread.ai`](#TODO) `mxbai-embed-large-v1` model, they claim
that
They are called "Matryoshka" embeddings because ... TODO
## Matryoshka Embeddings with `sqlite-vec`
You can use a combination of [`vec_slice()`](../api-reference.md#vec_slice) and
[`vec_normalize()`](../api-reference.md#vec_normalize) on Matryoshka embeddings to
truncate.
```sql
select
vec_normalize(
vec_slice(title_embeddings, 0, 256)
) as title_embeddings_256d
from vec_articles;
```
[`vec_slice()`](../api-reference.md#vec_slice) will cut down the vector to the first 256 dimensions. Then [`vec_normalize()`](../api-reference.md#vec_normalize) will normalize that truncated vector, which is typically a required step for Matryoshka embeddings.
## Benchmarks
## Supported Models
https://supabase.com/blog/matryoshka-embeddings#which-granularities-were-openais-text-embedding-3-models-trained-on
`text-embedding-3-small`: 1536, 512 `text-embedding-3-large`: 3072, 1024, 256
https://x.com/ZainHasan6/status/1757519325202686255
`text-embeddings-3-large:` 3072, 1536, 1024, 512
https://www.mixedbread.ai/blog/binary-mrl
`mxbai-embed-large-v1`: 1024, 512, 256, 128, 64
`nomic-embed-text-v1.5`: 768, 512, 256, 128, 64
```
# TODO new snowflake model
```
================================================
FILE: site/guides/performance.md
================================================
- page_size
- memory mapping
- in-memory index
- chunk_size (?)
================================================
FILE: site/guides/rag.md
================================================
# Retrieval Augmented Generation (RAG)
- "memories"?
- chunking
================================================
FILE: site/guides/scalar-quant.md
================================================
# Scalar Quantization (SQ)
"Quantization" refers to a variety of methods and techniques for reducing the
size of vectors in a vector index. **Scalar quantization** (SQ) refers to a
specific technique where each individual floating point element in a vector is
scaled to a smaller element type, like `float16` or `int8`.
Most embedding models generate `float32` vectors. Each `float32` takes up 4
bytes of space. This can add up, especially when working with a large amount of
vectors or vectors with many dimensions. However, if you scale them to `float16`
or `int8` vectors, they only take up 2 bytes of space and 1 byte of space
respectively, saving you precious space at the expense of some quality.
```sql
select vec_quantize_float16(vec_f32('[]'), 'unit');
select vec_quantize_int8(vec_f32('[]'), 'unit');
select vec_quantize('float16', vec_f32('...'));
select vec_quantize('int8', vec_f32('...'));
select vec_quantize('bit', vec_f32('...'));
select vec_quantize('sqf16', vec_f32('...'));
select vec_quantize('sqi8', vec_f32('...'));
select vec_quantize('bq2', vec_f32('...'));
```
## Benchmarks
================================================
FILE: site/guides/semantic-search.md
================================================
================================================
FILE: site/index.md
================================================
---
# https://vitepress.dev/reference/default-theme-home-page
layout: home
hero:
name: "sqlite-vec"
text: ""
tagline: A vector search SQLite extension that runs anywhere!
actions:
- theme: brand
text: Getting Started
link: /introduction
- theme: alt
text: API Reference
link: /api-reference
features:
- title: Runs everywhere
details: On laptops, servers, mobile devices, browsers with WASM, Raspberry Pis, and more!
- title: Bindings for many languages
details: Python, Ruby, Node.js/Deno/Bun, Go, Rust, and more!
- title: Pure SQL
details: No extra configuration or server required — only CREATE, INSERT, and SELECT statements
---
```sqlite
-- store 768-dimensional vectors in a vec0 virtual table
create virtual table vec_movies using vec0(
synopsis_embedding float[768]
);
-- insert vectors into the table, as JSON or compact BLOBs
insert into vec_movies(rowid, synopsis_embedding)
select
rowid,
embed(synopsis) as synopsis_embedding
from movies;
-- KNN search!
select
rowid,
distance
from vec_movies
where synopsis_embedding match embed('scary futuristic movies')
order by distance
limit 20;
```
================================================
FILE: site/package.json
================================================
{
"scripts": {
"ref": "node build-ref.mjs",
"dev": "vitepress dev",
"build": "vitepress build",
"preview": "vitepress preview"
},
"devDependencies": {
"vue": "^3.4.26"
},
"dependencies": {
"@types/node": "^20.12.8",
"better-sqlite3": "^11.1.2",
"js-yaml": "^4.1.0",
"table": "^6.8.2",
"valibot": "^0.36.0",
"vitepress": "^1.1.4"
}
}
================================================
FILE: site/project.data.ts
================================================
import { readFileSync } from "node:fs";
import { dirname, join } from "node:path";
import { fileURLToPath } from "node:url";
// Project name exposed to pages through the data loader below.
const PROJECT = "sqlite-vec";

// Contents of the `VERSION` file located one directory above this file.
// The text is trimmed because a version file may end with a newline, and
// pages interpolate this value into URLs and file names (for example
// `v{{data.VERSION}}`), where stray whitespace would break the result.
const VERSION = readFileSync(
  join(dirname(fileURLToPath(import.meta.url)), "..", "VERSION"),
  "utf8",
).trim();

// Data loader: `load()` returns the values made available to pages.
export default {
  load() {
    return {
      PROJECT,
      VERSION,
    };
  },
};
================================================
FILE: site/sqlite.tmlanguage.json
================================================
{
"information_for_contributors": [
"This file has been converted from https://github.com/microsoft/vscode-mssql/blob/master/syntaxes/SQL.plist",
"If you want to provide a fix or improvement, please create a pull request against the original repository.",
"Once accepted there, we are happy to receive an update request."
],
"version": "https://github.com/microsoft/vscode-mssql/commit/3929516cce0a570e91ee1be74b09ed886cb360f4",
"name": "sqlite",
"scopeName": "source.sqlite",
"patterns": [
{
"match": "((?]?=|<>|<|>",
"name": "keyword.operator.comparison.sql"
},
{
"match": "-|\\+|/",
"name": "keyword.operator.math.sql"
},
{
"match": "\\|\\|",
"name": "keyword.operator.concatenator.sql"
},
{
"match": "(?i)\\b(approx_count_distinct|approx_percentile_cont|approx_percentile_disc|avg|checksum_agg|count|count_big|group|grouping|grouping_id|max|min|sum|stdev|stdevp|var|varp)\\b\\s*\\(",
"captures": {
"1": {
"name": "support.function.aggregate.sql"
}
}
},
{
"match": "(?i)\\b(cume_dist|first_value|lag|last_value|lead|percent_rank|percentile_cont|percentile_disc)\\b\\s*\\(",
"captures": {
"1": {
"name": "support.function.analytic.sql"
}
}
},
{
"match": "(?i)\\b(bit_count|get_bit|left_shift|right_shift|set_bit)\\b\\s*\\(",
"captures": {
"1": {
"name": "support.function.bitmanipulation.sql"
}
}
},
{
"match": "(?i)\\b(cast|convert|parse|try_cast|try_convert|try_parse)\\b\\s*\\(",
"captures": {
"1": {
"name": "support.function.conversion.sql"
}
}
},
{
"match": "(?i)\\b(collationproperty|tertiary_weights)\\b\\s*\\(",
"captures": {
"1": {
"name": "support.function.collation.sql"
}
}
},
{
"match": "(?i)\\b(asymkey_id|asymkeyproperty|certproperty|cert_id|crypt_gen_random|decryptbyasymkey|decryptbycert|decryptbykey|decryptbykeyautoasymkey|decryptbykeyautocert|decryptbypassphrase|encryptbyasymkey|encryptbycert|encryptbykey|encryptbypassphrase|hashbytes|is_objectsigned|key_guid|key_id|key_name|signbyasymkey|signbycert|symkeyproperty|verifysignedbycert|verifysignedbyasymkey)\\b\\s*\\(",
"captures": {
"1": {
"name": "support.function.cryptographic.sql"
}
}
},
{
"match": "(?i)\\b(cursor_status)\\b\\s*\\(",
"captures": {
"1": {
"name": "support.function.cursor.sql"
}
}
},
{
"match": "(?i)\\b(sysdatetime|sysdatetimeoffset|sysutcdatetime|current_time(stamp)?|getdate|getutcdate|datename|datepart|day|month|year|datefromparts|datetime2fromparts|datetimefromparts|datetimeoffsetfromparts|smalldatetimefromparts|timefromparts|datediff|dateadd|datetrunc|eomonth|switchoffset|todatetimeoffset|isdate|date_bucket)\\b\\s*\\(",
"captures": {
"1": {
"name": "support.function.datetime.sql"
}
}
},
{
"match": "(?i)\\b(datalength|ident_current|ident_incr|ident_seed|identity|sql_variant_property)\\b\\s*\\(",
"captures": {
"1": {
"name": "support.function.datatype.sql"
}
}
},
{
"match": "(?i)\\b(coalesce|nullif)\\b\\s*\\(",
"captures": {
"1": {
"name": "support.function.expression.sql"
}
}
},
{
"match": "(?(new Uint8Array(embedding.buffer)!);
console.log(result); // 4
```
See
[`simple-deno/demo.ts`](https://github.com/asg017/sqlite-vec/blob/main/examples/simple-deno/demo.ts)
for a more complete Deno demo.
The `better-sqlite3` example above also works in Deno, when the `better-sqlite3` import is prefixed with `npm:`:
```js
import Database from "better-sqlite3"; // [!code --]
import Database from "npm:better-sqlite3"; // [!code ++]
```
## Bun
Here's a quick recipe of using `sqlite-vec` with
[`bun:sqlite`](https://bun.sh/docs/api/sqlite) in Bun.
```ts
import { Database } from "bun:sqlite";
import * as sqliteVec from "sqlite-vec";
// MacOS *might* have to do this, as the builtin SQLite library on MacOS doesn't allow extensions
Database.setCustomSQLite("/usr/local/opt/sqlite3/lib/libsqlite3.dylib");
const db = new Database(":memory:");
sqliteVec.load(db);
const embedding = new Float32Array([0.1, 0.2, 0.3, 0.4]);
const { result } = db
.prepare("select vec_length(?) as result",)
.get(embedding);
console.log(result); // 4
```
See
[`simple-bun/demo.ts`](https://github.com/asg017/sqlite-vec/blob/main/examples/simple-bun/demo.ts)
for a more complete Bun demo.
The `better-sqlite3`
example above also works with Bun.
================================================
FILE: site/using/python.md
================================================
---
title: sqlite-vec in Python
---
# Using `sqlite-vec` in Python
[](https://pypi.org/project/sqlite-vec/)
To use `sqlite-vec` from Python, install the
[`sqlite-vec` PyPi package](https://pypi.org/project/sqlite-vec/) using your
favorite Python package manager:
```bash
pip install sqlite-vec
```
Once installed, use the `sqlite_vec.load()` function to load `sqlite-vec` SQL
functions into a SQLite connection.
```python
import sqlite3
import sqlite_vec
db = sqlite3.connect(":memory:")
db.enable_load_extension(True)
sqlite_vec.load(db)
db.enable_load_extension(False)
vec_version, = db.execute("select vec_version()").fetchone()
print(f"vec_version={vec_version}")
```
See
[`simple-python/demo.py`](https://github.com/asg017/sqlite-vec/blob/main/examples/simple-python/demo.py)
for a more complete Python demo.
## Working with Vectors
### Lists
If your vectors in Python are provided as a list of floats, you can
convert them into the compact BLOB format that `sqlite-vec` uses with
`serialize_float32()`. This internally calls [`struct.pack()`](https://docs.python.org/3/library/struct.html#struct.pack).
```python
from sqlite_vec import serialize_float32
embedding = [0.1, 0.2, 0.3, 0.4]
result = db.execute('select vec_length(?)', [serialize_float32(embedding)])
print(result.fetchone()[0]) # 4
```
### NumPy Arrays
If your vectors are NumPy arrays, the Python SQLite package allows you to
pass it along as-is, since NumPy arrays implement [the Buffer protocol](https://docs.python.org/3/c-api/buffer.html). Make sure you cast your array elements to 32-bit floats
with
[`.astype(np.float32)`](https://numpy.org/doc/stable/reference/generated/numpy.ndarray.astype.html),
as some embeddings will use `np.float64`.
```python
import numpy as np
embedding = np.array([0.1, 0.2, 0.3, 0.4])
db.execute(
"SELECT vec_length(?)", [embedding.astype(np.float32)]
) # 4
```
## Using an up-to-date version of SQLite {#updated-sqlite}
Some features of `sqlite-vec` will require an up-to-date SQLite library. You can
see what version of SQLite your Python environment uses with
[`sqlite3.sqlite_version`](https://docs.python.org/3/library/sqlite3.html#sqlite3.sqlite_version),
or with this one-line command:
```bash
python -c 'import sqlite3; print(sqlite3.sqlite_version)'
```
Currently, **SQLite version 3.41 or higher** is recommended but not required.
`sqlite-vec` will work with older versions, but certain features and queries will
only work correctly in >=3.41.
To "upgrade" the SQLite version your Python installation uses, you have a few
options.
### Compile your own SQLite version
You can compile an up-to-date version of SQLite and use some system environment
variables (like `LD_PRELOAD` and `DYLD_LIBRARY_PATH`) to force Python to use a
different SQLite library.
[This guide](https://til.simonwillison.net/sqlite/sqlite-version-macos-python)
goes into this approach in more detail.
Although compiling SQLite can be straightforward, there are a lot of different
compilation options to consider, which makes it confusing. This also doesn't
work with Windows, which statically compiles its own SQLite library.
### Use `pysqlite3`
[`pysqlite3`](https://github.com/coleifer/pysqlite3) is a 3rd party PyPi package
that bundles an up-to-date SQLite library as a separate pip package.
While it's mostly compatible with the Python `sqlite3` module, there are a few
rare edge cases where the APIs don't match.
### Upgrading your Python version
Sometimes installing the latest version of Python will "magically" upgrade your
SQLite version as well. This is a nuclear option, as upgrading Python
installations can be quite the hassle, but most Python 3.12 builds will have a
very recent SQLite version.
## MacOS blocks SQLite extensions by default
The default SQLite library that is bundled with Mac operating systems does not include support for SQLite extensions. That means the default Python installation that is bundled with MacOS also does not support SQLite extensions.
This is the case if you come across the following error message:
```
AttributeError: 'sqlite3.Connection' object has no attribute 'enable_load_extension'
```
As a workaround, use the Homebrew version of Python (`brew install python`, new version at `/opt/homebrew/bin/python3`), which will use the Homebrew version of SQLite that allows SQLite extensions.
Other workarounds can be found at [Using an up-to-date version of SQLite](#updated-sqlite).
================================================
FILE: site/using/rqlite.md
================================================
# Using `sqlite-vec` in rqlite
[rqlite](https://rqlite.io/) users can use `sqlite-vec` with rqlite by loading the extension when they launch their rqlite node:
```bash
# Download a sqlite-vec release.
curl -L https://github.com/asg017/sqlite-vec/releases/download/v0.1.1/sqlite-vec-0.1.1-loadable-linux-x86_64.tar.gz -o sqlite-vec.tar.gz
# Tell rqlite to load sqlite-vec at launch time.
rqlited -extensions-path=sqlite-vec.tar.gz data
```
Once loaded you can use `sqlite-vec` functionality within rqlite. For example, you can perform searches via the [rqlite shell](https://rqlite.io/docs/cli/):
```
$ rqlite
Welcome to the rqlite CLI.
Enter ".help" for usage hints.
Connected to http://127.0.0.1:4001 running version 8
127.0.0.1:4001> create virtual table vec_examples using vec0(sample_embedding float[8]);
1 row affected
127.0.0.1:4001> insert into vec_examples(rowid, sample_embedding) values (1, '[-0.200, 0.250, 0.341, -0.211, 0.645, 0.935, -0.316, -0.924]'), (2, '[0.443, -0.501, 0.355, -0.771, 0.707, -0.708, -0.185, 0.362]'), (3, '[0.716, -0.927, 0.134, 0.052, -0.669, 0.793, -0.634, -0.162]'), (4, '[-0.710, 0.330, 0.656, 0.041, -0.990, 0.726, 0.385, -0.958]')
4 rows affected
127.0.0.1:4001> select rowid, distance from vec_examples where sample_embedding match '[0.890, 0.544, 0.825, 0.961, 0.358, 0.0196, 0.521, 0.175]' order by distance limit 2
+-------+-------------------+
| rowid | distance |
+-------+-------------------+
| 2 | 2.386873722076416 |
+-------+-------------------+
| 1 | 2.389785051345825 |
+-------+-------------------+
```
You can learn more from the [rqlite website](https://rqlite.io/docs/guides/extensions/).
================================================
FILE: site/using/ruby.md
================================================
# Using `sqlite-vec` in Ruby

Ruby developers can use `sqlite-vec` with the [`sqlite-vec` Gem](https://rubygems.org/gems/sqlite-vec).
```bash
gem install sqlite-vec
```
You can then use `SqliteVec.load()` to load `sqlite-vec` SQL functions in a given SQLite connection.
```ruby
require 'sqlite3'
require 'sqlite_vec'
db = SQLite3::Database.new(':memory:')
db.enable_load_extension(true)
SqliteVec.load(db)
db.enable_load_extension(false)
result = db.execute('SELECT vec_version()')
puts result.first.first
```
See
[`simple-ruby/demo.rb`](https://github.com/asg017/sqlite-vec/blob/main/examples/simple-ruby/demo.rb)
for a more complete Ruby demo.
## Working with vectors in Ruby
If your embeddings are provided as a list of numbers, use `.pack("f*")` to convert them into the compact BLOB format that `sqlite-vec` uses.
```ruby
embedding = [0.1, 0.2, 0.3, 0.4]
result = db.execute("SELECT vec_length(?)", [embedding.pack("f*")])
puts result.first.first # 4
```
================================================
FILE: site/using/rust.md
================================================
# Using `sqlite-vec` in Rust
[](https://crates.io/crates/sqlite-vec)
You can embed `sqlite-vec` into your Rust projects using the official
[`sqlite-vec` crate](https://crates.io/crates/sqlite-vec).
```bash
cargo add sqlite-vec
```
The crate embeds the `sqlite-vec` C source code, and uses the
[`cc` crate](https://crates.io/crates/cc) to compile and statically link
`sqlite-vec` at build-time.
The `sqlite-vec` crate exposes a single function `sqlite3_vec_init`, which is
the C entrypoint for the SQLite extension. You can "register" it with your Rust
SQLite library's `sqlite3_auto_extension()` function.
This example registers sqlite-vec using [rusqlite](https://docs.rs/rusqlite/0.32.1/rusqlite/). First, enable the `"bundled"` feature in your Cargo file entry for rusqlite:
```diff
# Cargo.toml
[dependencies]
+ rusqlite = { version = "VERSION", features = ["bundled"] }
```
Then, you can verify your installation was successful by embedding your first vector. This example uses [zerocopy](https://docs.rs/zerocopy/latest/zerocopy/) to efficiently pass the vector as bytes, and prints the resulting vector and library version as Strings:
```rs
use sqlite_vec::sqlite3_vec_init;
use rusqlite::{ffi::sqlite3_auto_extension, Connection, Result};
use zerocopy::AsBytes;
fn main()-> Result<()> {
unsafe {
sqlite3_auto_extension(Some(std::mem::transmute(sqlite3_vec_init as *const ())));
}
let db = Connection::open_in_memory()?;
let v: Vec<f32> = vec![0.1, 0.2, 0.3];
let (vec_version, embedding): (String, String) = db.query_row(
"select vec_version(), vec_to_json(?)",
&[v.as_bytes()],
|x| Ok((x.get(0)?, x.get(1)?)),
)?;
println!("vec_version={vec_version}, embedding={embedding}");
Ok(())
}
```
See
[`simple-rust/demo.rs`](https://github.com/asg017/sqlite-vec/blob/main/examples/simple-rust/demo.rs)
for a more complete Rust demo.
## Working with vectors in Rust
If your vectors are provided as a `Vec<f32>` type, the [`zerocopy` crate](https://crates.io/crates/zerocopy) is recommended, specifically `zerocopy::AsBytes`. This will allow you to pass vectors into `sqlite-vec` without any copying.
```rs
let query: Vec<f32> = vec![0.1, 0.2, 0.3, 0.4];
let mut stmt = db.prepare("SELECT vec_length(?)")?;
stmt.execute(&[query.as_bytes()])?;
```
================================================
FILE: site/using/sqlite-utils.md
================================================
# Using `sqlite-vec` in `sqlite-utils`

[`sqlite-utils`](https://sqlite-utils.datasette.io/en/stable/) users can install `sqlite-vec` into their `sqlite-utils` projects with the `sqlite-utils-sqlite-vec` plugin:
```bash
sqlite-utils install sqlite-utils-sqlite-vec
```
================================================
FILE: site/using/wasm.md
================================================
# `sqlite-vec` in the Browser with WebAssembly
`sqlite-vec` can be statically compiled into [official SQLite WASM](https://sqlite.org/wasm/doc/trunk/index.md) builds. The process is a bit complicated, but the result is a vector search in the browser, which is pretty cool!
```html
```
[*Open in CodePen*](https://codepen.io/asg017_ucsd/pen/MWMpJNY)
It's not possible to dynamically load a SQLite extension into a WASM build of SQLite, so `sqlite-vec` must be statically compiled into custom WASM builds.
## The `sqlite-vec-wasm-demo` NPM package
A **demonstration** of `sqlite-vec` in WASM is provided with the `sqlite-vec-wasm-demo` NPM package. This package is a demonstration and may change at any time. It doesn't follow the [Semantic Versioning of `sqlite-vec`](./versioning.md).
See
[`simple-wasm/index.html`](https://github.com/asg017/sqlite-vec/blob/main/examples/simple-wasm/index.html)
for a more complete WASM demo using this package.
================================================
FILE: site/versioning.md
================================================
# Semantic Versioning for `sqlite-vec`
`sqlite-vec` is pre-v1, so according to the rules of
[Semantic Versioning](https://semver.org/), a "minor" release like "0.2.0" or
"0.3.0" may contain breaking changes.
Only SQL functions, table functions, and virtual tables that are defined in the default `sqlite3_vec_init` entrypoint are considered as the `sqlite-vec` API for semantic versioning. This means that other entrypoints and other SQL functions should be considered unstable, untested, and possibly dangerous.
For the SQL API, a "breaking change" would include:
- Removing a function or module
- Changing the number or types of arguments for an SQL function
- Changing the required arguments or argument positions of a table function
- Changing the `CREATE VIRTUAL TABLE` constructor of a virtual table in a backwards-incompatible way
- Removing columns from a virtual table or table function
The official "bindings" to `sqlite-vec`, including those for Python/Node.js/Ruby/Go/Rust, are subject to change and are not covered by semantic versioning.
Though I have no plans to change or break them, and would include notes in changelogs if that ever needs to happen.
================================================
FILE: sqlite-dist.toml
================================================
[package]
name = "sqlite-vec"
license = "MIT OR Apache"
homepage = "https://alexgarcia.xyz/sqlite-vec"
repo = "https://github.com/asg017/sqlite-vec"
description = "A vector search SQLite extension."
authors = ["Alex Garcia"]
git_tag_format = "v$VERSION"
[targets]
github_releases = {}
sqlpkg = {}
spm = {}
amalgamation = {include=["sqlite-vec.c", "sqlite-vec.h"]}
pip = { extra_init_py = "bindings/python/extra_init.py" }
datasette = {}
sqlite_utils = {}
npm = {}
gem = { module_name = "SqliteVec" }
================================================
FILE: sqlite-vec.c
================================================
#include "sqlite-vec.h"
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#ifndef SQLITE_VEC_OMIT_FS
#include
#endif
#ifndef SQLITE_CORE
#include "sqlite3ext.h"
SQLITE_EXTENSION_INIT1
#else
#include "sqlite3.h"
#endif
#ifndef UINT32_TYPE
#ifdef HAVE_UINT32_T
#define UINT32_TYPE uint32_t
#else
#define UINT32_TYPE unsigned int
#endif
#endif
#ifndef UINT16_TYPE
#ifdef HAVE_UINT16_T
#define UINT16_TYPE uint16_t
#else
#define UINT16_TYPE unsigned short int
#endif
#endif
#ifndef INT16_TYPE
#ifdef HAVE_INT16_T
#define INT16_TYPE int16_t
#else
#define INT16_TYPE short int
#endif
#endif
#ifndef UINT8_TYPE
#ifdef HAVE_UINT8_T
#define UINT8_TYPE uint8_t
#else
#define UINT8_TYPE unsigned char
#endif
#endif
#ifndef INT8_TYPE
#ifdef HAVE_INT8_T
#define INT8_TYPE int8_t
#else
#define INT8_TYPE signed char
#endif
#endif
#ifndef LONGDOUBLE_TYPE
#define LONGDOUBLE_TYPE long double
#endif
#ifndef _WIN32
#ifndef __EMSCRIPTEN__
#ifndef __COSMOPOLITAN__
#ifndef __wasi__
typedef u_int8_t uint8_t;
typedef u_int16_t uint16_t;
typedef u_int64_t uint64_t;
#endif
#endif
#endif
#endif
typedef int8_t i8;
typedef uint8_t u8;
typedef int16_t i16;
typedef int32_t i32;
typedef sqlite3_int64 i64;
typedef uint32_t u32;
typedef uint64_t u64;
typedef float f32;
typedef size_t usize;
#ifndef UNUSED_PARAMETER
#define UNUSED_PARAMETER(X) (void)(X)
#endif
// sqlite3_vtab_in() was added in SQLite version 3.38 (2022-02-22)
// https://www.sqlite.org/changes.html#version_3_38_0
#if SQLITE_VERSION_NUMBER >= 3038000
#define COMPILER_SUPPORTS_VTAB_IN 1
#endif
#ifndef SQLITE_SUBTYPE
#define SQLITE_SUBTYPE 0x000100000
#endif
#ifndef SQLITE_RESULT_SUBTYPE
#define SQLITE_RESULT_SUBTYPE 0x001000000
#endif
#ifndef SQLITE_INDEX_CONSTRAINT_LIMIT
#define SQLITE_INDEX_CONSTRAINT_LIMIT 73
#endif
#ifndef SQLITE_INDEX_CONSTRAINT_OFFSET
#define SQLITE_INDEX_CONSTRAINT_OFFSET 74
#endif
#define countof(x) (sizeof(x) / sizeof((x)[0]))
#define min(a, b) (((a) <= (b)) ? (a) : (b))
// Identifies the element type stored in a vector: 32-bit floats, single bits,
// or signed 8-bit integers.
// NOTE(review): the values are offset from 223 rather than starting at 0;
// presumably so they can double as SQLite value subtypes without colliding
// with other extensions — confirm against the call sites.
enum VectorElementType {
// clang-format off
SQLITE_VEC_ELEMENT_TYPE_FLOAT32 = 223 + 0,
SQLITE_VEC_ELEMENT_TYPE_BIT = 223 + 1,
SQLITE_VEC_ELEMENT_TYPE_INT8 = 223 + 2,
// clang-format on
};
#ifdef SQLITE_VEC_ENABLE_AVX
#include
#define PORTABLE_ALIGN32 __attribute__((aligned(32)))
#define PORTABLE_ALIGN64 __attribute__((aligned(64)))
// Euclidean (L2) distance between two float32 vectors, computed with AVX.
// Despite the "sqr" in the name, the value returned is the square ROOT of the
// summed squared differences.
//
// pVect1v / pVect2v: the two f32 arrays to compare.
// qty_ptr: points to a size_t holding the number of elements in each array.
//
// Only the first (qty / 16) * 16 elements are visited and there is no scalar
// tail loop, so any remainder is ignored. distance_l2_sqr_float() only routes
// here when the element count is a multiple of 16.
static f32 l2_sqr_float_avx(const void *pVect1v, const void *pVect2v,
const void *qty_ptr) {
f32 *pVect1 = (f32 *)pVect1v;
f32 *pVect2 = (f32 *)pVect2v;
size_t qty = *((size_t *)qty_ptr);
// 32-byte aligned scratch buffer, required by the aligned store below.
f32 PORTABLE_ALIGN32 TmpRes[8];
size_t qty16 = qty >> 4;
const f32 *pEnd1 = pVect1 + (qty16 << 4);
__m256 diff, v1, v2;
__m256 sum = _mm256_set1_ps(0);
// Each iteration consumes 16 floats as two unrolled 8-lane steps.
while (pVect1 < pEnd1) {
v1 = _mm256_loadu_ps(pVect1);
pVect1 += 8;
v2 = _mm256_loadu_ps(pVect2);
pVect2 += 8;
diff = _mm256_sub_ps(v1, v2);
sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
v1 = _mm256_loadu_ps(pVect1);
pVect1 += 8;
v2 = _mm256_loadu_ps(pVect2);
pVect2 += 8;
diff = _mm256_sub_ps(v1, v2);
sum = _mm256_add_ps(sum, _mm256_mul_ps(diff, diff));
}
// Spill the 8 partial sums and reduce them horizontally in scalar code.
_mm256_store_ps(TmpRes, sum);
return sqrt(TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] +
TmpRes[5] + TmpRes[6] + TmpRes[7]);
}
#endif
#ifdef SQLITE_VEC_ENABLE_NEON
#include
#define PORTABLE_ALIGN32 __attribute__((aligned(32)))
// thx https://github.com/nmslib/hnswlib/pull/299/files
// Euclidean (L2) distance between two float32 vectors, computed with NEON.
// Despite the "sqr" in the name, the value returned is the square ROOT of the
// summed squared differences.
//
// pVect1v / pVect2v: the two f32 arrays to compare.
// qty_ptr: points to a size_t holding the number of elements in each array.
//
// The bulk of the input is processed 16 floats per iteration using four
// independent 4-lane accumulators; the remaining (qty % 16) elements are
// handled by a scalar tail loop.
static f32 l2_sqr_float_neon(const void *pVect1v, const void *pVect2v,
const void *qty_ptr) {
f32 *pVect1 = (f32 *)pVect1v;
f32 *pVect2 = (f32 *)pVect2v;
size_t qty = *((size_t *)qty_ptr);
size_t qty16 = qty >> 4;
// End of the region that can be consumed in whole 16-element steps.
const f32 *pEnd1 = pVect1 + (qty16 << 4);
float32x4_t diff, v1, v2;
float32x4_t sum0 = vdupq_n_f32(0);
float32x4_t sum1 = vdupq_n_f32(0);
float32x4_t sum2 = vdupq_n_f32(0);
float32x4_t sum3 = vdupq_n_f32(0);
while (pVect1 < pEnd1) {
v1 = vld1q_f32(pVect1);
pVect1 += 4;
v2 = vld1q_f32(pVect2);
pVect2 += 4;
diff = vsubq_f32(v1, v2);
// fused multiply-add: sum0 += diff * diff
sum0 = vfmaq_f32(sum0, diff, diff);
v1 = vld1q_f32(pVect1);
pVect1 += 4;
v2 = vld1q_f32(pVect2);
pVect2 += 4;
diff = vsubq_f32(v1, v2);
sum1 = vfmaq_f32(sum1, diff, diff);
v1 = vld1q_f32(pVect1);
pVect1 += 4;
v2 = vld1q_f32(pVect2);
pVect2 += 4;
diff = vsubq_f32(v1, v2);
sum2 = vfmaq_f32(sum2, diff, diff);
v1 = vld1q_f32(pVect1);
pVect1 += 4;
v2 = vld1q_f32(pVect2);
pVect2 += 4;
diff = vsubq_f32(v1, v2);
sum3 = vfmaq_f32(sum3, diff, diff);
}
// Combine the four accumulators, then add across lanes into one scalar.
f32 sum_scalar =
vaddvq_f32(vaddq_f32(vaddq_f32(sum0, sum1), vaddq_f32(sum2, sum3)));
// Scalar tail for the elements that did not fill a 16-wide step.
const f32 *pEnd2 = pVect1 + (qty - (qty16 << 4));
while (pVect1 < pEnd2) {
f32 diff = *pVect1 - *pVect2;
sum_scalar += diff * diff;
pVect1++;
pVect2++;
}
return sqrt(sum_scalar);
}
// Euclidean (L2) distance between two int8 vectors, computed with NEON.
// Despite the "sqr" in the name, the value returned is the square ROOT of the
// summed squared differences.
//
// pVect1v / pVect2v: the two int8 arrays to compare.
// qty_ptr: points to a size_t holding the number of elements in each array.
//
// Elements are consumed 8 at a time; whatever is left over is handled by a
// scalar tail loop.
static f32 l2_sqr_int8_neon(const void *pVect1v, const void *pVect2v,
                            const void *qty_ptr) {
  i8 *pVect1 = (i8 *)pVect1v;
  i8 *pVect2 = (i8 *)pVect2v;
  size_t qty = *((size_t *)qty_ptr);
  const i8 *pEnd1 = pVect1 + qty;
  i32 sum_scalar = 0;

  // Compare remaining length instead of computing `pEnd1 - 7`, which would
  // point before the buffer for inputs shorter than 7 elements.
  while (pEnd1 - pVect1 >= 8) {
    // loading 8 at a time
    int8x8_t v1 = vld1_s8(pVect1);
    int8x8_t v2 = vld1_s8(pVect2);
    pVect1 += 8;
    pVect2 += 8;

    // widen to 16 bits: an int8 difference spans [-255, 255]
    int16x8_t v1_wide = vmovl_s8(v1);
    int16x8_t v2_wide = vmovl_s8(v2);
    int16x8_t diff = vsubq_s16(v1_wide, v2_wide);

    // The square of a difference can reach 65025, which does not fit in
    // int16, so multiply with widening to 32 bits rather than vmulq_s16.
    int32x4_t sq_lo = vmull_s16(vget_low_s16(diff), vget_low_s16(diff));
    int32x4_t sq_hi = vmull_s16(vget_high_s16(diff), vget_high_s16(diff));

    sum_scalar += vaddvq_s32(vaddq_s32(sq_lo, sq_hi));
  }

  // handle leftovers
  while (pVect1 < pEnd1) {
    i16 diff = (i16)*pVect1 - (i16)*pVect2;
    sum_scalar += diff * diff;
    pVect1++;
    pVect2++;
  }

  return sqrtf(sum_scalar);
}
// Sum of absolute differences over 16 int8 lanes, widened to four int32
// partial sums. vabdq_s8 yields |a - b| as an 8-bit pattern in [0, 255], so it
// must be reinterpreted as unsigned before the pairwise widening adds.
static inline int32x4_t l1_int8_neon_absdiff16(const i8 *p1, const i8 *p2) {
  uint8x16_t abs_diff = vreinterpretq_u8_s8(vabdq_s8(vld1q_s8(p1), vld1q_s8(p2)));
  return vreinterpretq_s32_u32(vpaddlq_u16(vpaddlq_u8(abs_diff)));
}

// Manhattan (L1) distance between two int8 vectors, computed with NEON.
//
// pVect1v / pVect2v: the two int8 arrays to compare.
// qty_ptr: points to a size_t holding the number of elements in each array.
//
// Processes 64 elements per iteration with four accumulators, then 16 at a
// time, then finishes the remainder in scalar code.
static i32 l1_int8_neon(const void *pVect1v, const void *pVect2v,
                        const void *qty_ptr) {
  i8 *pVect1 = (i8 *)pVect1v;
  i8 *pVect2 = (i8 *)pVect2v;
  size_t qty = *((size_t *)qty_ptr);

  const int8_t *pEnd1 = pVect1 + qty;

  int32x4_t acc1 = vdupq_n_s32(0);
  int32x4_t acc2 = vdupq_n_s32(0);
  int32x4_t acc3 = vdupq_n_s32(0);
  int32x4_t acc4 = vdupq_n_s32(0);

  // Compare remaining length rather than forming `pEnd1 - 63`, which would
  // point before the buffer for short inputs.
  while (pEnd1 - pVect1 >= 64) {
    acc1 = vaddq_s32(acc1, l1_int8_neon_absdiff16(pVect1, pVect2));
    acc2 = vaddq_s32(acc2, l1_int8_neon_absdiff16(pVect1 + 16, pVect2 + 16));
    acc3 = vaddq_s32(acc3, l1_int8_neon_absdiff16(pVect1 + 32, pVect2 + 32));
    acc4 = vaddq_s32(acc4, l1_int8_neon_absdiff16(pVect1 + 48, pVect2 + 48));
    pVect1 += 64;
    pVect2 += 64;
  }

  while (pEnd1 - pVect1 >= 16) {
    acc1 = vaddq_s32(acc1, l1_int8_neon_absdiff16(pVect1, pVect2));
    pVect1 += 16;
    pVect2 += 16;
  }

  int32x4_t acc = vaddq_s32(vaddq_s32(acc1, acc2), vaddq_s32(acc3, acc4));

  // scalar tail for the last (qty % 16) elements
  int32_t sum = 0;
  while (pVect1 < pEnd1) {
    int32_t diff = abs((int32_t)*pVect1 - (int32_t)*pVect2);
    sum += diff;
    pVect1++;
    pVect2++;
  }

  return vaddvq_s32(acc) + sum;
}
// Manhattan (L1) distance between two float32 vectors, computed with NEON.
//
// pVect1v / pVect2v: the two f32 arrays to compare.
// qty_ptr: points to a size_t holding the number of elements in each array.
//
// Four floats are loaded per iteration, widened to doubles, and their absolute
// differences accumulated in a 2-lane double accumulator. The remaining
// (qty % 4) elements are handled by a scalar tail loop. Returns a double.
static double l1_f32_neon(const void *pVect1v, const void *pVect2v,
const void *qty_ptr) {
f32 *pVect1 = (f32 *)pVect1v;
f32 *pVect2 = (f32 *)pVect2v;
size_t qty = *((size_t *)qty_ptr);
const f32 *pEnd1 = pVect1 + qty;
float64x2_t acc = vdupq_n_f64(0);
// NOTE(review): `pEnd1 - 3` assumes qty >= 3; distance_l1_f32() only calls
// this function for more than 3 elements.
while (pVect1 < pEnd1 - 3) {
float32x4_t v1 = vld1q_f32(pVect1);
float32x4_t v2 = vld1q_f32(pVect2);
pVect1 += 4;
pVect2 += 4;
// f32x4 -> f64x2 pad for overflow
float64x2_t low_diff = vabdq_f64(vcvt_f64_f32(vget_low_f32(v1)),
vcvt_f64_f32(vget_low_f32(v2)));
float64x2_t high_diff =
vabdq_f64(vcvt_high_f64_f32(v1), vcvt_high_f64_f32(v2));
acc = vaddq_f64(acc, vaddq_f64(low_diff, high_diff));
}
// Scalar tail for the elements that did not fill a 4-wide step.
double sum = 0;
while (pVect1 < pEnd1) {
sum += fabs((double)*pVect1 - (double)*pVect2);
pVect1++;
pVect2++;
}
// Add the two vector lanes together with the scalar remainder.
return vaddvq_f64(acc) + sum;
}
#endif
// Portable scalar Euclidean (L2) distance between two float32 vectors.
// Note that the result is the square root of the summed squared differences,
// not the squared distance the name suggests.
//
// pVect1v / pVect2v: the two f32 arrays to compare.
// qty_ptr: points to a size_t holding the number of elements in each array.
static f32 l2_sqr_float(const void *pVect1v, const void *pVect2v,
                        const void *qty_ptr) {
  const f32 *lhs = (const f32 *)pVect1v;
  const f32 *rhs = (const f32 *)pVect2v;
  const size_t count = *((const size_t *)qty_ptr);

  f32 acc = 0;
  size_t k = 0;
  while (k < count) {
    f32 delta = lhs[k] - rhs[k];
    acc += delta * delta;
    k++;
  }
  return sqrt(acc);
}
// Portable scalar Euclidean (L2) distance between two int8 vectors.
// Returns the square root of the summed squared differences.
//
// pA / pB: the two int8 arrays to compare.
// pD: points to a size_t holding the number of elements in each array.
static f32 l2_sqr_int8(const void *pA, const void *pB, const void *pD) {
  const i8 *lhs = (const i8 *)pA;
  const i8 *rhs = (const i8 *)pB;
  const size_t count = *((const size_t *)pD);

  f32 acc = 0;
  size_t k = 0;
  while (k < count) {
    // the subtraction happens in int, then is stored as a float
    f32 delta = lhs[k] - rhs[k];
    acc += delta * delta;
    k++;
  }
  return sqrt(acc);
}
// Dispatches the float32 L2 distance to the best implementation compiled in:
// NEON for more than 16 elements, AVX when the element count is a multiple of
// 16, and the scalar loop otherwise.
//
// a / b: the two f32 arrays; d: points to a size_t element count.
static f32 distance_l2_sqr_float(const void *a, const void *b, const void *d) {
  const size_t n = *(const size_t *)d;
  (void)n; // unused when neither SIMD path is compiled in

#ifdef SQLITE_VEC_ENABLE_NEON
  if (n > 16)
    return l2_sqr_float_neon(a, b, d);
#endif

#ifdef SQLITE_VEC_ENABLE_AVX
  if ((n % 16) == 0)
    return l2_sqr_float_avx(a, b, d);
#endif

  return l2_sqr_float(a, b, d);
}
// Dispatches the int8 L2 distance: NEON when compiled in and the vectors have
// more than 7 elements, the scalar loop otherwise.
//
// a / b: the two int8 arrays; d: points to a size_t element count.
static f32 distance_l2_sqr_int8(const void *a, const void *b, const void *d) {
  const size_t n = *(const size_t *)d;
  (void)n; // unused when the NEON path is not compiled in

#ifdef SQLITE_VEC_ENABLE_NEON
  if (n > 7)
    return l2_sqr_int8_neon(a, b, d);
#endif

  return l2_sqr_int8(a, b, d);
}
// Portable scalar Manhattan (L1) distance between two int8 vectors.
//
// pA / pB: the two int8 arrays to compare.
// pD: points to a size_t holding the number of elements in each array.
// Returns the sum of absolute element-wise differences.
static i32 l1_int8(const void *pA, const void *pB, const void *pD) {
  const i8 *lhs = (const i8 *)pA;
  const i8 *rhs = (const i8 *)pB;
  const size_t count = *((const size_t *)pD);

  i32 total = 0;
  size_t k = 0;
  while (k < count) {
    total += abs(lhs[k] - rhs[k]);
    k++;
  }
  return total;
}
// Dispatches the int8 L1 distance: NEON when compiled in and the vectors have
// more than 15 elements, the scalar loop otherwise.
//
// a / b: the two int8 arrays; d: points to a size_t element count.
static i32 distance_l1_int8(const void *a, const void *b, const void *d) {
  const size_t n = *(const size_t *)d;
  (void)n; // unused when the NEON path is not compiled in

#ifdef SQLITE_VEC_ENABLE_NEON
  if (n > 15)
    return l1_int8_neon(a, b, d);
#endif

  return l1_int8(a, b, d);
}
// Portable scalar Manhattan (L1) distance between two float32 vectors.
// Each element is widened to double before subtracting, and the sum is
// accumulated and returned as a double.
//
// pA / pB: the two f32 arrays to compare.
// pD: points to a size_t holding the number of elements in each array.
static double l1_f32(const void *pA, const void *pB, const void *pD) {
  const f32 *lhs = (const f32 *)pA;
  const f32 *rhs = (const f32 *)pB;
  const size_t count = *((const size_t *)pD);

  double total = 0;
  size_t k = 0;
  while (k < count) {
    total += fabs((double)lhs[k] - (double)rhs[k]);
    k++;
  }
  return total;
}
// Dispatches the float32 L1 distance: NEON when compiled in and the vectors
// have more than 3 elements, the scalar loop otherwise.
//
// a / b: the two f32 arrays; d: points to a size_t element count.
static double distance_l1_f32(const void *a, const void *b, const void *d) {
  const size_t n = *(const size_t *)d;
  (void)n; // unused when the NEON path is not compiled in

#ifdef SQLITE_VEC_ENABLE_NEON
  if (n > 3)
    return l1_f32_neon(a, b, d);
#endif

  return l1_f32(a, b, d);
}
// Cosine distance between two float32 vectors:
//   1 - dot(a, b) / (|a| * |b|)
// A zero-magnitude input makes the denominator zero; no special handling is
// applied for that case.
//
// pVect1v / pVect2v: the two f32 arrays to compare.
// qty_ptr: points to a size_t holding the number of elements in each array.
static f32 distance_cosine_float(const void *pVect1v, const void *pVect2v,
                                 const void *qty_ptr) {
  const f32 *lhs = (const f32 *)pVect1v;
  const f32 *rhs = (const f32 *)pVect2v;
  const size_t count = *((const size_t *)qty_ptr);

  f32 dot = 0;
  f32 lhs_sq = 0;
  f32 rhs_sq = 0;
  size_t k = 0;
  while (k < count) {
    dot += lhs[k] * rhs[k];
    lhs_sq += lhs[k] * lhs[k];
    rhs_sq += rhs[k] * rhs[k];
    k++;
  }
  return 1 - (dot / (sqrt(lhs_sq) * sqrt(rhs_sq)));
}
// Cosine distance between two int8 vectors:
//   1 - dot(a, b) / (|a| * |b|)
// Products are formed in int and accumulated in float. A zero-magnitude input
// makes the denominator zero; no special handling is applied for that case.
//
// pA / pB: the two int8 arrays to compare.
// pD: points to a size_t holding the number of elements in each array.
static f32 distance_cosine_int8(const void *pA, const void *pB,
                                const void *pD) {
  const i8 *lhs = (const i8 *)pA;
  const i8 *rhs = (const i8 *)pB;
  const size_t count = *((const size_t *)pD);

  f32 dot = 0;
  f32 lhs_sq = 0;
  f32 rhs_sq = 0;
  size_t k = 0;
  while (k < count) {
    dot += lhs[k] * rhs[k];
    lhs_sq += lhs[k] * lhs[k];
    rhs_sq += rhs[k] * rhs[k];
    k++;
  }
  return 1 - (dot / (sqrt(lhs_sq) * sqrt(rhs_sq)));
}
// https://github.com/facebookresearch/faiss/blob/77e2e79cd0a680adc343b9840dd865da724c579e/faiss/utils/hamming_distance/common.h#L34
// Lookup table: hamdist_table[v] is the number of set bits in the byte v
// (0 for 0x00 up to 8 for 0xFF). Indexing it with the XOR of two bytes gives
// the number of bit positions in which they differ.
static u8 hamdist_table[256] = {
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4,
2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4,
2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5,
3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
// Hamming distance between two byte arrays, one byte at a time.
//
// a / b: the two byte arrays to compare.
// n: number of bytes in each array.
// Returns the total number of differing bits, as a float.
static f32 distance_hamming_u8(u8 *a, u8 *b, size_t n) {
  int differing = 0;
  unsigned long pos = 0;
  while (pos < n) {
    // XOR leaves a 1 in every bit position where the two bytes disagree
    u8 mask = a[pos] ^ b[pos];
    differing += hamdist_table[mask];
    pos++;
  }
  return (f32)differing;
}
#ifdef _MSC_VER
#if !defined(__clang__) && (defined(_M_ARM) || defined(_M_ARM64))
// From
// https://github.com/ngtcp2/ngtcp2/blob/b64f1e77b5e0d880b93d31f474147fae4a1d17cc/lib/ngtcp2_ringbuf.c,
// line 34-43
// Fallback population count for MSVC on ARM/ARM64, where neither the GCC
// builtin nor the __popcnt64 intrinsic is available.
//
// The parameter is 64 bits wide because the caller passes the XOR of two
// 64-bit words; a 32-bit parameter would silently drop the upper half.
//
// Uses the "clear the lowest set bit" trick: each `x &= x - 1` removes exactly
// one set bit, so the loop runs once per set bit.
static unsigned int __builtin_popcountl(unsigned long long x) {
  unsigned int c = 0;
  for (; x; ++c) {
    x &= x - 1;
  }
  return c;
}
#else
#include
#define __builtin_popcountl __popcnt64
#endif
#endif
// Hamming distance between two arrays of 64-bit words.
//
// a / b: the two word arrays to compare.
// n: number of 64-bit words in each array.
// Returns the total number of differing bits, as a float.
static f32 distance_hamming_u64(u64 *a, u64 *b, size_t n) {
  int differing = 0;
  for (size_t i = 0; i < n; i++) {
    // XOR leaves a 1 in every bit position where the two words disagree
    u64 mask = a[i] ^ b[i];
    if (sizeof(unsigned long) >= sizeof(u64)) {
      // `unsigned long` holds the whole word: one popcount is enough.
      differing += (int)__builtin_popcountl(mask);
    } else {
      // `unsigned long` is only 32 bits here (e.g. wasm32, 32-bit ARM,
      // Windows), so passing the full word would drop its upper half. Count
      // each half separately instead. The condition is a compile-time
      // constant, so the unused branch is folded away.
      differing += (int)__builtin_popcountl((unsigned long)(mask & 0xFFFFFFFFu));
      differing += (int)__builtin_popcountl((unsigned long)(mask >> 32));
    }
  }
  return (f32)differing;
}
/**
 * @brief Calculate the hamming distance between two bitvectors.
 *
 * Bitvectors whose dimension count is a multiple of 64 are compared one
 * 64-bit word at a time; all others are compared byte by byte.
 *
 * @param a - first bitvector, MUST have *d dimensions (bits)
 * @param b - second bitvector, MUST have *d dimensions (bits)
 * @param d - pointer to a size_t holding the number of dimensions; that
 *            number MUST be divisible by CHAR_BIT
 * @return f32 - number of bit positions in which a and b differ
 */
static f32 distance_hamming(const void *a, const void *b, const void *d) {
  const size_t bit_count = *((size_t *)d);

  if ((bit_count % 64) != 0) {
    return distance_hamming_u8((u8 *)a, (u8 *)b, bit_count / CHAR_BIT);
  }

  const size_t word_count = bit_count / 8 / CHAR_BIT;
  return distance_hamming_u64((u64 *)a, (u64 *)b, word_count);
}
#ifdef SQLITE_VEC_TEST
// Test-only entry point (compiled under SQLITE_VEC_TEST): exposes the static
// distance_l2_sqr_float() with the dimension count passed by value.
f32 _test_distance_l2_sqr_float(const f32 *a, const f32 *b, size_t dims) {
return distance_l2_sqr_float(a, b, &dims);
}
// Test-only entry point (compiled under SQLITE_VEC_TEST): exposes the static
// distance_cosine_float() with the dimension count passed by value.
f32 _test_distance_cosine_float(const f32 *a, const f32 *b, size_t dims) {
return distance_cosine_float(a, b, &dims);
}
// Test-only entry point (compiled under SQLITE_VEC_TEST): exposes the static
// distance_hamming(). `dims` is forwarded as the bitvector's dimension count,
// i.e. a number of bits, not bytes.
f32 _test_distance_hamming(const u8 *a, const u8 *b, size_t dims) {
return distance_hamming(a, b, &dims);
}
#endif
// from SQLite source:
// https://github.com/sqlite/sqlite/blob/a509a90958ddb234d1785ed7801880ccb18b497e/src/json.c#L153
// 256-entry lookup table indexed by byte value. An entry is 1 for the four
// whitespace bytes 0x09 (tab), 0x0A (line feed), 0x0D (carriage return) and
// 0x20 (space), and 0 for every other byte.
static const char vecJsonIsSpaceX[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
// Nonzero when byte `x` is JSON whitespace according to vecJsonIsSpaceX.
// The argument is parenthesised so that the cast applies to the whole
// expression (e.g. `p[i] + 1`), not just to its first operand.
#define vecJsonIsspace(x) (vecJsonIsSpaceX[(unsigned char)(x)])
// Signature of a function that releases a vector buffer once the caller is
// done with it.
typedef void (*vector_cleanup)(void *p);
// A vector_cleanup that does nothing; used when the vector points at memory
// that was not allocated for it (e.g. it aliases a BLOB owned by SQLite, as in
// bitvec_from_value()).
void vector_cleanup_noop(void *_) { UNUSED_PARAMETER(_); }
#define JSON_SUBTYPE 74
// Replaces the error message on a virtual table with a printf-style formatted
// message (SQLite's sqlite3_vmprintf formatting rules apply).
//
// pVTab: the virtual table whose zErrMsg is replaced.
// zFormat, ...: format string and its arguments.
//
// The new message is formatted BEFORE the old one is freed, so a caller may
// safely pass the existing pVTab->zErrMsg as one of the format arguments.
// If formatting fails for lack of memory, zErrMsg ends up NULL.
void vtab_set_error(sqlite3_vtab *pVTab, const char *zFormat, ...) {
  va_list args;
  char *zNew;

  va_start(args, zFormat);
  zNew = sqlite3_vmprintf(zFormat, args);
  va_end(args);

  sqlite3_free(pVTab->zErrMsg);
  pVTab->zErrMsg = zNew;
}
// A growable, type-erased array of fixed-size elements. Managed with
// array_init(), array_append() and array_cleanup().
struct Array {
// size in bytes of a single element
size_t element_size;
// number of elements currently stored
size_t length;
// number of elements the buffer `z` has room for
size_t capacity;
// backing buffer, allocated with the SQLite allocator
void *z;
};
/**
 * @brief Initialize an array with the given element size and capacity.
 *
 * The backing buffer is zero-filled. The byte size is computed in size_t and
 * checked for overflow, so large requests fail cleanly instead of being
 * truncated to an int.
 *
 * @param array the array to initialize; its previous contents are overwritten
 * @param element_size size in bytes of one element
 * @param init_capacity number of elements to reserve room for
 * @return SQLITE_OK on success, error code on failure. Only error is
 * SQLITE_NOMEM (allocation failed, the requested size overflowed, or the
 * requested size was zero bytes)
 */
int array_init(struct Array *array, size_t element_size, size_t init_capacity) {
  // reject element_size * init_capacity overflowing size_t
  if (element_size != 0 && init_capacity > ((size_t)-1) / element_size) {
    return SQLITE_NOMEM;
  }
  size_t sz = element_size * init_capacity;

  // sqlite3_malloc64() returns NULL for a zero-byte request as well
  void *z = sqlite3_malloc64(sz);
  if (!z) {
    return SQLITE_NOMEM;
  }
  memset(z, 0, sz);

  array->element_size = element_size;
  array->length = 0;
  array->capacity = init_capacity;
  array->z = z;
  return SQLITE_OK;
}
// Appends a copy of one element to the end of the array, growing the backing
// buffer when it is full (new capacity = old capacity * 2 + 100).
//
// array: an array previously set up with array_init().
// element: points to array->element_size bytes to copy in.
// Returns SQLITE_OK, or SQLITE_NOMEM if the buffer could not be grown; in that
// case the array is left unchanged.
int array_append(struct Array *array, const void *element) {
  unsigned char *slot;

  if (array->length == array->capacity) {
    size_t grown = array->capacity * 2 + 100;
    void *resized = sqlite3_realloc64(array->z, array->element_size * grown);
    if (!resized) {
      return SQLITE_NOMEM;
    }
    array->capacity = grown;
    array->z = resized;
  }

  // byte address of the first unused element slot
  slot = &((unsigned char *)array->z)[array->length * array->element_size];
  memcpy(slot, element, array->element_size);
  array->length++;
  return SQLITE_OK;
}
// Frees the array's backing buffer and resets every field to zero/NULL.
// Passing NULL is allowed and does nothing. The struct itself is not freed.
void array_cleanup(struct Array *array) {
  if (array) {
    sqlite3_free(array->z);
    array->z = NULL;
    array->capacity = 0;
    array->length = 0;
    array->element_size = 0;
  }
}
// Maps a VectorElementType value to its human-readable name ("float32",
// "int8" or "bit"). Returns an empty string for any other value. The returned
// string is a literal and must not be modified or freed.
char *vector_subtype_name(int subtype) {
  if (subtype == SQLITE_VEC_ELEMENT_TYPE_FLOAT32) {
    return "float32";
  }
  if (subtype == SQLITE_VEC_ELEMENT_TYPE_INT8) {
    return "int8";
  }
  if (subtype == SQLITE_VEC_ELEMENT_TYPE_BIT) {
    return "bit";
  }
  return "";
}
// Maps an SQLite fundamental datatype code (SQLITE_INTEGER, SQLITE_BLOB,
// SQLITE_TEXT, SQLITE_FLOAT, SQLITE_NULL) to its name. Returns an empty string
// for any other value. The returned string is a literal and must not be
// modified or freed.
char *type_name(int type) {
  if (type == SQLITE_INTEGER) {
    return "INTEGER";
  }
  if (type == SQLITE_BLOB) {
    return "BLOB";
  }
  if (type == SQLITE_TEXT) {
    return "TEXT";
  }
  if (type == SQLITE_FLOAT) {
    return "FLOAT";
  }
  if (type == SQLITE_NULL) {
    return "NULL";
  }
  return "";
}
// Signature of a function that releases a float32 vector buffer produced by
// fvec_from_value().
typedef void (*fvec_cleanup)(void *vector);
// An fvec_cleanup that does nothing, for vectors whose memory the caller does
// not own.
void fvec_cleanup_noop(void *_) { UNUSED_PARAMETER(_); }
// Extracts a float32 vector from an SQLite value.
//
// Two input encodings are accepted:
//   - BLOB: raw float32 bytes ("compact format"). The length must be a
//     non-zero multiple of sizeof(f32). The bytes are copied.
//   - TEXT: a JSON-style array of numbers such as "[0.1, 0.2]". Leading
//     whitespace is skipped, numbers are parsed with strtod() and narrowed to
//     f32, and parsing stops at the first ']'.
//
// value:      the SQLite value to read.
// vector:     out - on success, a newly allocated f32 buffer.
// dimensions: out - on success, the number of elements in *vector.
// cleanup:    out - on success, the function the caller must invoke on
//             *vector to release it (sqlite3_free).
// pzErr:      out - on failure, an error message allocated with
//             sqlite3_mprintf() that the caller must free.
//
// Returns SQLITE_OK on success, SQLITE_NOMEM if an allocation failed, and
// SQLITE_ERROR for malformed, empty, or wrongly typed input.
static int fvec_from_value(sqlite3_value *value, f32 **vector,
                           size_t *dimensions, fvec_cleanup *cleanup,
                           char **pzErr) {
  int value_type = sqlite3_value_type(value);

  if (value_type == SQLITE_BLOB) {
    const void *blob = sqlite3_value_blob(value);
    int bytes = sqlite3_value_bytes(value);
    if (bytes == 0) {
      *pzErr = sqlite3_mprintf("zero-length vectors are not supported.");
      return SQLITE_ERROR;
    }
    if ((bytes % sizeof(f32)) != 0) {
      // sizeof yields a size_t; cast it so it matches the %d conversion
      *pzErr = sqlite3_mprintf("invalid float32 vector BLOB length. Must be "
                               "divisible by %d, found %d",
                               (int)sizeof(f32), bytes);
      return SQLITE_ERROR;
    }
    f32 *buf = sqlite3_malloc(bytes);
    if (!buf) {
      *pzErr = sqlite3_mprintf("out of memory");
      return SQLITE_NOMEM;
    }
    memcpy(buf, blob, bytes);
    *vector = buf;
    *dimensions = bytes / sizeof(f32);
    *cleanup = sqlite3_free;
    return SQLITE_OK;
  }

  if (value_type == SQLITE_TEXT) {
    const char *source = (const char *)sqlite3_value_text(value);
    int source_len = sqlite3_value_bytes(value);
    if (source_len == 0) {
      *pzErr = sqlite3_mprintf("zero-length vectors are not supported.");
      return SQLITE_ERROR;
    }
    int i = 0;

    struct Array x;
    // Every element needs at least one digit plus a separator, so half the
    // text length is an upper-bound guess for the element count.
    int rc = array_init(&x, sizeof(f32), ceil(source_len / 2.0));
    if (rc != SQLITE_OK) {
      // report the failure like every other error path does
      *pzErr = sqlite3_mprintf("out of memory");
      return rc;
    }

    // advance leading whitespace to first '['
    while (i < source_len) {
      if (vecJsonIsspace(source[i])) {
        i++;
        continue;
      }
      if (source[i] == '[') {
        break;
      }

      *pzErr = sqlite3_mprintf(
          "JSON array parsing error: Input does not start with '['");
      array_cleanup(&x);
      return SQLITE_ERROR;
    }
    // Reached when the input was all whitespace: source[i] is then the
    // terminating NUL of the text value.
    if (source[i] != '[') {
      *pzErr = sqlite3_mprintf(
          "JSON array parsing error: Input does not start with '['");
      array_cleanup(&x);
      return SQLITE_ERROR;
    }
    int offset = i + 1;

    while (offset < source_len) {
      char *ptr = (char *)&source[offset];
      char *endptr;

      errno = 0;
      double result = strtod(ptr, &endptr);
      if ((errno != 0 && result == 0) // some interval error?
          || (errno == ERANGE &&
              (result == HUGE_VAL || result == -HUGE_VAL)) // too big / smalls
      ) {
        sqlite3_free(x.z);
        *pzErr = sqlite3_mprintf("JSON parsing error");
        return SQLITE_ERROR;
      }

      // strtod consumed nothing: only a closing bracket is acceptable here
      if (endptr == ptr) {
        if (*ptr != ']') {
          sqlite3_free(x.z);
          *pzErr = sqlite3_mprintf("JSON parsing error");
          return SQLITE_ERROR;
        }
        goto done;
      }

      f32 res = (f32)result;
      // A failed append must not be ignored: the element would silently go
      // missing and the vector would come back with the wrong dimensions.
      rc = array_append(&x, (const void *)&res);
      if (rc != SQLITE_OK) {
        array_cleanup(&x);
        *pzErr = sqlite3_mprintf("out of memory");
        return rc;
      }

      offset += (endptr - ptr);
      // skip whitespace and separators up to the next number or ']'
      while (offset < source_len) {
        if (vecJsonIsspace(source[offset])) {
          offset++;
          continue;
        }
        if (source[offset] == ',') {
          offset++;
          continue;
        }
        if (source[offset] == ']')
          goto done;
        break;
      }
    }

  done:

    if (x.length > 0) {
      // ownership of the buffer passes to the caller
      *vector = (f32 *)x.z;
      *dimensions = x.length;
      *cleanup = sqlite3_free;
      return SQLITE_OK;
    }
    sqlite3_free(x.z);
    *pzErr = sqlite3_mprintf("zero-length vectors are not supported.");
    return SQLITE_ERROR;
  }

  *pzErr = sqlite3_mprintf(
      "Input must have type BLOB (compact format) or TEXT (JSON), found %s",
      type_name(value_type));
  return SQLITE_ERROR;
}
/**
 * @brief Read a bit vector from a SQL value. Only BLOB input is accepted.
 *
 * The returned pointer aliases the blob owned by `value` (no copy is made),
 * which is why the reported cleanup is the no-op destructor.
 *
 * @param value: the sqlite3_value to read from.
 * @param vector: Output pointer to the packed bits.
 * @param dimensions: Output number of bits (blob bytes * CHAR_BIT).
 * @param cleanup: Output destructor for *vector.
 * @param pzErr: Output sqlite3_mprintf() error message on failure.
 * @return int SQLITE_OK on success, SQLITE_ERROR otherwise.
 */
static int bitvec_from_value(sqlite3_value *value, u8 **vector,
                             size_t *dimensions, vector_cleanup *cleanup,
                             char **pzErr) {
  if (sqlite3_value_type(value) != SQLITE_BLOB) {
    *pzErr = sqlite3_mprintf("Unknown type for bitvector.");
    return SQLITE_ERROR;
  }

  const void *data = sqlite3_value_blob(value);
  int nBytes = sqlite3_value_bytes(value);
  if (nBytes == 0) {
    *pzErr = sqlite3_mprintf("zero-length vectors are not supported.");
    return SQLITE_ERROR;
  }

  *vector = (u8 *)data;
  *dimensions = nBytes * CHAR_BIT;
  *cleanup = vector_cleanup_noop;
  return SQLITE_OK;
}
/**
 * @brief Read an int8 vector out of a SQL value.
 *
 * Two encodings are accepted:
 *  - BLOB: one byte per element. The returned pointer aliases the blob owned
 *    by `value` and the reported cleanup is a no-op.
 *  - TEXT: a JSON-style array of integers such as "[1, -2, 3]", parsed with
 *    strtol() into a freshly allocated buffer released by sqlite3_free().
 *    Each element must fit in [INT8_MIN, INT8_MAX].
 *
 * @param value: the sqlite3_value to read from.
 * @param vector: Output pointer to the int8 elements (success only).
 * @param dimensions: Output number of elements (success only).
 * @param cleanup: Output destructor to call on *vector when done with it.
 * @param pzErr: Output error message built with sqlite3_mprintf(). Assigned on
 * every error return (it may still be NULL if that allocation itself fails);
 * the caller releases it with sqlite3_free().
 * @return int SQLITE_OK on success, an error code otherwise.
 */
static int int8_vec_from_value(sqlite3_value *value, i8 **vector,
                               size_t *dimensions, vector_cleanup *cleanup,
                               char **pzErr) {
  int value_type = sqlite3_value_type(value);

  if (value_type == SQLITE_BLOB) {
    const void *blob = sqlite3_value_blob(value);
    int bytes = sqlite3_value_bytes(value);
    if (bytes == 0) {
      *pzErr = sqlite3_mprintf("zero-length vectors are not supported.");
      return SQLITE_ERROR;
    }
    *vector = (i8 *)blob;
    *dimensions = bytes;
    *cleanup = vector_cleanup_noop;
    return SQLITE_OK;
  }

  if (value_type == SQLITE_TEXT) {
    const char *source = (const char *)sqlite3_value_text(value);
    int source_len = sqlite3_value_bytes(value);
    int i = 0;
    if (source_len == 0) {
      *pzErr = sqlite3_mprintf("zero-length vectors are not supported.");
      return SQLITE_ERROR;
    }
    struct Array x;
    int rc = array_init(&x, sizeof(i8), ceil(source_len / 2.0));
    if (rc != SQLITE_OK) {
      // Callers hand *pzErr straight to sqlite3_result_error() and
      // sqlite3_free(), so it must never be left unassigned on failure.
      *pzErr = sqlite3_mprintf("out of memory");
      return rc;
    }

    // advance leading whitespace to first '['
    while (i < source_len && vecJsonIsspace(source[i])) {
      i++;
    }
    if (i >= source_len || source[i] != '[') {
      *pzErr = sqlite3_mprintf(
          "JSON array parsing error: Input does not start with '['");
      array_cleanup(&x);
      return SQLITE_ERROR;
    }

    int offset = i + 1;
    while (offset < source_len) {
      char *ptr = (char *)&source[offset];
      char *endptr;
      errno = 0;
      long result = strtol(ptr, &endptr, 10);
      if ((errno != 0 && result == 0) ||
          (errno == ERANGE && (result == LONG_MAX || result == LONG_MIN))) {
        sqlite3_free(x.z);
        *pzErr = sqlite3_mprintf("JSON parsing error");
        return SQLITE_ERROR;
      }
      if (endptr == ptr) {
        // Nothing numeric here: only the closing bracket is acceptable.
        if (*ptr != ']') {
          sqlite3_free(x.z);
          *pzErr = sqlite3_mprintf("JSON parsing error");
          return SQLITE_ERROR;
        }
        goto done;
      }
      if (result < INT8_MIN || result > INT8_MAX) {
        sqlite3_free(x.z);
        *pzErr =
            sqlite3_mprintf("JSON parsing error: value out of range for int8");
        return SQLITE_ERROR;
      }
      i8 res = (i8)result;
      array_append(&x, (const void *)&res);
      offset += (endptr - ptr);

      // skip separators (whitespace and commas) up to the next element
      while (offset < source_len) {
        if (vecJsonIsspace(source[offset])) {
          offset++;
          continue;
        }
        if (source[offset] == ',') {
          offset++;
          continue;
        }
        if (source[offset] == ']')
          goto done;
        break;
      }
    }

  done:
    if (x.length > 0) {
      *vector = (i8 *)x.z;
      *dimensions = x.length;
      *cleanup = (vector_cleanup)sqlite3_free;
      return SQLITE_OK;
    }
    sqlite3_free(x.z);
    *pzErr = sqlite3_mprintf("zero-length vectors are not supported.");
    return SQLITE_ERROR;
  }

  *pzErr = sqlite3_mprintf("Unknown type for int8 vector.");
  return SQLITE_ERROR;
}
/**
 * @brief Extract a vector from a sqlite3_value, dispatching on the value's
 * subtype. The result can be a float32, int8, or bit vector.
 *
 * A value with no subtype, the float32 subtype, or the JSON subtype is read
 * as float32; the bit and int8 subtypes select their own readers. Any other
 * subtype is rejected.
 *
 * @param value: the sqlite3_value to read from.
 * @param vector: Output pointer to vector data.
 * @param dimensions: Output number of dimensions
 * @param element_type: Output vector element type (written on success only)
 * @param cleanup: Output destructor to call on *vector when done with it
 * @param pzErrorMessage: Output sqlite3_mprintf() error message on failure
 * @return int SQLITE_OK on success, error code otherwise
 */
int vector_from_value(sqlite3_value *value, void **vector, size_t *dimensions,
                      enum VectorElementType *element_type,
                      vector_cleanup *cleanup, char **pzErrorMessage) {
  int status;
  int subtype = sqlite3_value_subtype(value);

  if (!subtype || (subtype == SQLITE_VEC_ELEMENT_TYPE_FLOAT32) ||
      (subtype == JSON_SUBTYPE)) {
    status = fvec_from_value(value, (f32 **)vector, dimensions,
                             (fvec_cleanup *)cleanup, pzErrorMessage);
    if (status != SQLITE_OK) {
      return status;
    }
    *element_type = SQLITE_VEC_ELEMENT_TYPE_FLOAT32;
    return SQLITE_OK;
  }

  if (subtype == SQLITE_VEC_ELEMENT_TYPE_BIT) {
    status = bitvec_from_value(value, (u8 **)vector, dimensions, cleanup,
                               pzErrorMessage);
    if (status != SQLITE_OK) {
      return status;
    }
    *element_type = SQLITE_VEC_ELEMENT_TYPE_BIT;
    return SQLITE_OK;
  }

  if (subtype == SQLITE_VEC_ELEMENT_TYPE_INT8) {
    status = int8_vec_from_value(value, (i8 **)vector, dimensions, cleanup,
                                 pzErrorMessage);
    if (status != SQLITE_OK) {
      return status;
    }
    *element_type = SQLITE_VEC_ELEMENT_TYPE_INT8;
    return SQLITE_OK;
  }

  *pzErrorMessage = sqlite3_mprintf("Unknown subtype: %d", subtype);
  return SQLITE_ERROR;
}
/**
 * @brief Read two vectors and verify that they are comparable, i.e. that they
 * share the same element type and the same number of dimensions.
 *
 * On any failure both vectors that were successfully read are released with
 * their own cleanup callbacks before returning, so the caller must not clean
 * them up again.
 *
 * @param aValue: SQL value holding the 1st vector.
 * @param bValue: SQL value holding the 2nd vector.
 * @param a: Output pointer to the 1st vector's data.
 * @param b: Output pointer to the 2nd vector's data.
 * @param element_type: Output shared element type (success only).
 * @param dimensions: Output shared dimension count (success only).
 * @param outACleanup: Output destructor for *a (success only).
 * @param outBCleanup: Output destructor for *b (success only).
 * @param outError: Output sqlite3_mprintf() message on failure; the caller
 * releases it with sqlite3_free().
 * @return int SQLITE_OK on success, SQLITE_ERROR otherwise.
 */
int ensure_vector_match(sqlite3_value *aValue, sqlite3_value *bValue, void **a,
                        void **b, enum VectorElementType *element_type,
                        size_t *dimensions, vector_cleanup *outACleanup,
                        vector_cleanup *outBCleanup, char **outError) {
  int rc;
  enum VectorElementType aType, bType;
  size_t aDims, bDims;
  char *error = NULL;
  vector_cleanup aCleanup, bCleanup;

  rc = vector_from_value(aValue, a, &aDims, &aType, &aCleanup, &error);
  if (rc != SQLITE_OK) {
    *outError = sqlite3_mprintf("Error reading 1st vector: %s", error);
    sqlite3_free(error);
    return SQLITE_ERROR;
  }

  rc = vector_from_value(bValue, b, &bDims, &bType, &bCleanup, &error);
  if (rc != SQLITE_OK) {
    *outError = sqlite3_mprintf("Error reading 2nd vector: %s", error);
    sqlite3_free(error);
    aCleanup(*a);
    return SQLITE_ERROR;
  }

  if (aType != bType) {
    *outError =
        sqlite3_mprintf("Vector type mistmatch. First vector has type %s, "
                        "while the second has type %s.",
                        vector_subtype_name(aType), vector_subtype_name(bType));
    aCleanup(*a);
    bCleanup(*b);
    return SQLITE_ERROR;
  }
  if (aDims != bDims) {
    // size_t is not guaranteed to match `long` (e.g. 64-bit Windows), so the
    // counts are widened to sqlite3_int64 and printed with %lld.
    *outError = sqlite3_mprintf(
        "Vector dimension mistmatch. First vector has %lld dimensions, "
        "while the second has %lld dimensions.",
        (sqlite3_int64)aDims, (sqlite3_int64)bDims);
    aCleanup(*a);
    bCleanup(*b);
    return SQLITE_ERROR;
  }

  *element_type = aType;
  *dimensions = aDims;
  *outACleanup = aCleanup;
  *outBCleanup = bCleanup;
  return SQLITE_OK;
}
int _cmp(const void *a, const void *b) { return (*(i64 *)a - *(i64 *)b); }
// Payload carried by the SQL pointer value that vec_npy_file() returns under
// the SQLITE_VEC_NPY_FILE_NAME pointer type.
struct VecNpyFile {
// Path text taken from the SQL argument given to vec_npy_file().
char *path;
// Length of `path` in bytes, as reported by sqlite3_value_bytes().
size_t pathLength;
};
#define SQLITE_VEC_NPY_FILE_NAME "vec0-npy-file"
#ifndef SQLITE_VEC_OMIT_FS
/**
 * @brief SQL scalar function: wrap a file path in a VecNpyFile pointer value
 * tagged with SQLITE_VEC_NPY_FILE_NAME.
 *
 * The path text is copied into the same allocation as the struct. The buffer
 * returned by sqlite3_value_text() is only guaranteed to remain valid while
 * the argument value is alive and unchanged, whereas the pointer result
 * outlives this call, so keeping the original pointer would leave it dangling.
 * A single sqlite3_free() (the registered destructor) releases both the
 * struct and the copied path.
 *
 * @param context: SQL function context that receives the result.
 * @param argc: number of arguments, expected to be 1.
 * @param argv: argv[0] is the path.
 */
static void vec_npy_file(sqlite3_context *context, int argc,
                         sqlite3_value **argv) {
  assert(argc == 1);
  const char *path = (const char *)sqlite3_value_text(argv[0]);
  size_t pathLength = sqlite3_value_bytes(argv[0]);
  struct VecNpyFile *f;

  // struct + path bytes + NUL terminator, all in one block
  f = sqlite3_malloc((int)(sizeof(*f) + pathLength + 1));
  if (!f) {
    sqlite3_result_error_nomem(context);
    return;
  }
  memset(f, 0, sizeof(*f));

  // A NULL path (e.g. a SQL NULL argument) keeps the zeroed fields.
  if (path) {
    char *copy = (char *)&f[1];
    memcpy(copy, path, pathLength);
    copy[pathLength] = '\0';
    f->path = copy;
    f->pathLength = pathLength;
  }

  sqlite3_result_pointer(context, f, SQLITE_VEC_NPY_FILE_NAME, sqlite3_free);
}
#endif
#pragma region scalar functions
/**
 * @brief SQL scalar function: convert the argument to a float32 vector BLOB
 * tagged with the float32 subtype.
 *
 * Ownership of the parsed buffer passes to SQLite together with the cleanup
 * callback reported by fvec_from_value().
 *
 * @param context: SQL function context that receives the result or error.
 * @param argc: number of arguments, expected to be 1.
 * @param argv: argv[0] is the vector to convert.
 */
static void vec_f32(sqlite3_context *context, int argc, sqlite3_value **argv) {
  assert(argc == 1);
  int rc;
  f32 *vector = NULL;
  size_t dimensions;
  fvec_cleanup cleanup;
  // Start from NULL: not every failure path of the reader is guaranteed to
  // produce a message, and an indeterminate pointer must never be reported
  // or freed.
  char *errmsg = NULL;

  rc = fvec_from_value(argv[0], &vector, &dimensions, &cleanup, &errmsg);
  if (rc != SQLITE_OK) {
    if (errmsg) {
      sqlite3_result_error(context, errmsg, -1);
      sqlite3_free(errmsg);
    } else {
      sqlite3_result_error_code(context, rc);
    }
    return;
  }
  sqlite3_result_blob(context, vector, dimensions * sizeof(f32),
                      (void (*)(void *))cleanup);
  sqlite3_result_subtype(context, SQLITE_VEC_ELEMENT_TYPE_FLOAT32);
}
/**
 * @brief SQL scalar function: return the argument as a bit vector BLOB tagged
 * with the bit subtype.
 *
 * The bytes are handed to SQLite with SQLITE_TRANSIENT, so SQLite takes its
 * own copy before the source vector is released.
 *
 * @param context: SQL function context that receives the result or error.
 * @param argc: number of arguments, expected to be 1.
 * @param argv: argv[0] is the bit vector.
 */
static void vec_bit(sqlite3_context *context, int argc, sqlite3_value **argv) {
  assert(argc == 1);
  u8 *bits;
  size_t nBits;
  vector_cleanup release;
  char *message;

  int status = bitvec_from_value(argv[0], &bits, &nBits, &release, &message);
  if (status == SQLITE_OK) {
    sqlite3_result_blob(context, bits, nBits / CHAR_BIT, SQLITE_TRANSIENT);
    sqlite3_result_subtype(context, SQLITE_VEC_ELEMENT_TYPE_BIT);
    release(bits);
  } else {
    sqlite3_result_error(context, message, -1);
    sqlite3_free(message);
  }
}
/**
 * @brief SQL scalar function: convert the argument to an int8 vector BLOB
 * tagged with the int8 subtype.
 *
 * The bytes are handed to SQLite with SQLITE_TRANSIENT, so SQLite takes its
 * own copy before the source vector is released.
 *
 * @param context: SQL function context that receives the result or error.
 * @param argc: number of arguments, expected to be 1.
 * @param argv: argv[0] is the vector to convert.
 */
static void vec_int8(sqlite3_context *context, int argc, sqlite3_value **argv) {
  assert(argc == 1);
  int rc;
  i8 *vector;
  size_t dimensions;
  vector_cleanup cleanup;
  // Start from NULL: not every failure path of the reader is guaranteed to
  // produce a message, and an indeterminate pointer must never be reported
  // or freed.
  char *errmsg = NULL;

  rc = int8_vec_from_value(argv[0], &vector, &dimensions, &cleanup, &errmsg);
  if (rc != SQLITE_OK) {
    if (errmsg) {
      sqlite3_result_error(context, errmsg, -1);
      sqlite3_free(errmsg);
    } else {
      sqlite3_result_error_code(context, rc);
    }
    return;
  }
  sqlite3_result_blob(context, vector, dimensions, SQLITE_TRANSIENT);
  sqlite3_result_subtype(context, SQLITE_VEC_ELEMENT_TYPE_INT8);
  cleanup(vector);
}
/**
 * @brief SQL scalar function: return the number of dimensions of a vector of
 * any supported element type.
 *
 * @param context: SQL function context that receives the result or error.
 * @param argc: number of arguments, expected to be 1.
 * @param argv: argv[0] is the vector to measure.
 */
static void vec_length(sqlite3_context *context, int argc,
                       sqlite3_value **argv) {
  assert(argc == 1);
  int rc;
  void *vector;
  size_t dimensions;
  vector_cleanup cleanup;
  // Start from NULL: not every failure path of the reader is guaranteed to
  // produce a message, and an indeterminate pointer must never be reported
  // or freed.
  char *errmsg = NULL;
  enum VectorElementType elementType;

  rc = vector_from_value(argv[0], &vector, &dimensions, &elementType, &cleanup,
                         &errmsg);
  if (rc != SQLITE_OK) {
    if (errmsg) {
      sqlite3_result_error(context, errmsg, -1);
      sqlite3_free(errmsg);
    } else {
      sqlite3_result_error_code(context, rc);
    }
    return;
  }
  sqlite3_result_int64(context, dimensions);
  cleanup(vector);
}
/**
 * @brief SQL scalar function: cosine distance between two vectors of the same
 * element type and length. Bit vectors are rejected with an error.
 *
 * @param context: SQL function context that receives the result or error.
 * @param argc: number of arguments, expected to be 2.
 * @param argv: argv[0] and argv[1] are the vectors to compare.
 */
static void vec_distance_cosine(sqlite3_context *context, int argc,
                                sqlite3_value **argv) {
  assert(argc == 2);
  void *lhs = NULL;
  void *rhs = NULL;
  size_t nDims;
  vector_cleanup freeLhs, freeRhs;
  char *message;
  enum VectorElementType kind;

  int status = ensure_vector_match(argv[0], argv[1], &lhs, &rhs, &kind, &nDims,
                                   &freeLhs, &freeRhs, &message);
  if (status != SQLITE_OK) {
    sqlite3_result_error(context, message, -1);
    sqlite3_free(message);
    return;
  }

  if (kind == SQLITE_VEC_ELEMENT_TYPE_BIT) {
    sqlite3_result_error(
        context, "Cannot calculate cosine distance between two bitvectors.",
        -1);
  } else if (kind == SQLITE_VEC_ELEMENT_TYPE_FLOAT32) {
    f32 distance = distance_cosine_float(lhs, rhs, &nDims);
    sqlite3_result_double(context, distance);
  } else if (kind == SQLITE_VEC_ELEMENT_TYPE_INT8) {
    f32 distance = distance_cosine_int8(lhs, rhs, &nDims);
    sqlite3_result_double(context, distance);
  }

  // both inputs are released on every path once they have been read
  freeLhs(lhs);
  freeRhs(rhs);
}
/**
 * @brief SQL scalar function: L2 distance between two vectors of the same
 * element type and length, as computed by the distance_l2_sqr_* helpers.
 * Bit vectors are rejected with an error.
 *
 * @param context: SQL function context that receives the result or error.
 * @param argc: number of arguments, expected to be 2.
 * @param argv: argv[0] and argv[1] are the vectors to compare.
 */
static void vec_distance_l2(sqlite3_context *context, int argc,
                            sqlite3_value **argv) {
  assert(argc == 2);
  void *first = NULL;
  void *second = NULL;
  size_t count;
  vector_cleanup releaseFirst, releaseSecond;
  char *reason;
  enum VectorElementType type;

  if (ensure_vector_match(argv[0], argv[1], &first, &second, &type, &count,
                          &releaseFirst, &releaseSecond,
                          &reason) != SQLITE_OK) {
    sqlite3_result_error(context, reason, -1);
    sqlite3_free(reason);
    return;
  }

  switch (type) {
  case SQLITE_VEC_ELEMENT_TYPE_BIT:
    sqlite3_result_error(
        context, "Cannot calculate L2 distance between two bitvectors.", -1);
    break;
  case SQLITE_VEC_ELEMENT_TYPE_FLOAT32: {
    f32 d = distance_l2_sqr_float(first, second, &count);
    sqlite3_result_double(context, d);
    break;
  }
  case SQLITE_VEC_ELEMENT_TYPE_INT8: {
    f32 d = distance_l2_sqr_int8(first, second, &count);
    sqlite3_result_double(context, d);
    break;
  }
  }

  releaseFirst(first);
  releaseSecond(second);
}
/**
 * @brief SQL scalar function: L1 distance between two vectors of the same
 * element type and length. Bit vectors are rejected with an error.
 *
 * float32 inputs produce a REAL result; int8 inputs produce an INTEGER
 * result.
 *
 * @param context: SQL function context that receives the result or error.
 * @param argc: number of arguments, expected to be 2.
 * @param argv: argv[0] and argv[1] are the vectors to compare.
 */
static void vec_distance_l1(sqlite3_context *context, int argc,
                            sqlite3_value **argv) {
  assert(argc == 2);
  int rc;
  void *a = NULL, *b = NULL;
  size_t dimensions;
  vector_cleanup aCleanup, bCleanup;
  char *error;
  enum VectorElementType elementType;

  rc = ensure_vector_match(argv[0], argv[1], &a, &b, &elementType, &dimensions,
                           &aCleanup, &bCleanup, &error);
  if (rc != SQLITE_OK) {
    sqlite3_result_error(context, error, -1);
    sqlite3_free(error);
    return;
  }

  switch (elementType) {
  case SQLITE_VEC_ELEMENT_TYPE_BIT: {
    sqlite3_result_error(
        context, "Cannot calculate L1 distance between two bitvectors.", -1);
    goto finish;
  }
  case SQLITE_VEC_ELEMENT_TYPE_FLOAT32: {
    double result = distance_l1_f32(a, b, &dimensions);
    sqlite3_result_double(context, result);
    goto finish;
  }
  case SQLITE_VEC_ELEMENT_TYPE_INT8: {
    i64 result = distance_l1_int8(a, b, &dimensions);
    // The sum is 64-bit; sqlite3_result_int() would truncate it to 32 bits
    // for long vectors, so report it with the 64-bit setter.
    sqlite3_result_int64(context, result);
    goto finish;
  }
  }

finish:
  aCleanup(a);
  bCleanup(b);
  return;
}
/**
 * @brief SQL scalar function: hamming distance between two bit vectors of the
 * same length. float32 and int8 vectors are rejected with an error.
 *
 * @param context: SQL function context that receives the result or error.
 * @param argc: number of arguments, expected to be 2.
 * @param argv: argv[0] and argv[1] are the vectors to compare.
 */
static void vec_distance_hamming(sqlite3_context *context, int argc,
                                 sqlite3_value **argv) {
  assert(argc == 2);
  void *left = NULL;
  void *right = NULL;
  size_t nBits;
  vector_cleanup dropLeft, dropRight;
  char *why;
  enum VectorElementType kind;

  int status = ensure_vector_match(argv[0], argv[1], &left, &right, &kind,
                                   &nBits, &dropLeft, &dropRight, &why);
  if (status != SQLITE_OK) {
    sqlite3_result_error(context, why, -1);
    sqlite3_free(why);
    return;
  }

  if (kind == SQLITE_VEC_ELEMENT_TYPE_BIT) {
    sqlite3_result_double(context, distance_hamming(left, right, &nBits));
  } else if (kind == SQLITE_VEC_ELEMENT_TYPE_FLOAT32) {
    sqlite3_result_error(
        context,
        "Cannot calculate hamming distance between two float32 vectors.", -1);
  } else if (kind == SQLITE_VEC_ELEMENT_TYPE_INT8) {
    sqlite3_result_error(
        context, "Cannot calculate hamming distance between two int8 vectors.",
        -1);
  }

  dropLeft(left);
  dropRight(right);
}
/**
 * @brief Map a vector element type to its user-facing name.
 *
 * @param elementType: the element type to describe.
 * @return char* "float32", "int8" or "bit"; an empty string for any other
 * value. The returned strings are literals and must not be freed.
 */
char *vec_type_name(enum VectorElementType elementType) {
  char *name = "";
  if (elementType == SQLITE_VEC_ELEMENT_TYPE_FLOAT32) {
    name = "float32";
  } else if (elementType == SQLITE_VEC_ELEMENT_TYPE_INT8) {
    name = "int8";
  } else if (elementType == SQLITE_VEC_ELEMENT_TYPE_BIT) {
    name = "bit";
  }
  return name;
}
/**
 * @brief SQL scalar function: return the element type name of a vector, as
 * produced by vec_type_name().
 *
 * @param context: SQL function context that receives the result or error.
 * @param argc: number of arguments, expected to be 1.
 * @param argv: argv[0] is the vector to inspect.
 */
static void vec_type(sqlite3_context *context, int argc, sqlite3_value **argv) {
  assert(argc == 1);
  void *vector;
  size_t dimensions;
  vector_cleanup cleanup;
  // Start from NULL: not every failure path of the reader is guaranteed to
  // produce a message, and an indeterminate pointer must never be reported
  // or freed.
  char *pzError = NULL;
  enum VectorElementType elementType;

  int rc = vector_from_value(argv[0], &vector, &dimensions, &elementType,
                             &cleanup, &pzError);
  if (rc != SQLITE_OK) {
    if (pzError) {
      sqlite3_result_error(context, pzError, -1);
      sqlite3_free(pzError);
    } else {
      sqlite3_result_error_code(context, rc);
    }
    return;
  }
  // The name is a string literal, hence SQLITE_STATIC.
  sqlite3_result_text(context, vec_type_name(elementType), -1, SQLITE_STATIC);
  cleanup(vector);
}
/**
 * @brief SQL scalar function: binary-quantize a float32 or int8 vector into a
 * bit vector BLOB tagged with the bit subtype.
 *
 * Output bit i is 1 when element i is strictly greater than zero. Bits are
 * packed least-significant-bit first within each byte. The input length must
 * be a non-zero multiple of 8; bit vectors are rejected.
 *
 * @param context: SQL function context that receives the result or error.
 * @param argc: number of arguments, expected to be 1.
 * @param argv: argv[0] is the vector to quantize.
 */
static void vec_quantize_binary(sqlite3_context *context, int argc,
                                sqlite3_value **argv) {
  assert(argc == 1);
  void *vector;
  size_t dimensions;
  vector_cleanup vectorCleanup;
  // Start from NULL: not every failure path of the reader is guaranteed to
  // produce a message, and an indeterminate pointer must never be reported
  // or freed.
  char *pzError = NULL;
  enum VectorElementType elementType;
  int sz;
  u8 *out;

  int rc = vector_from_value(argv[0], &vector, &dimensions, &elementType,
                             &vectorCleanup, &pzError);
  if (rc != SQLITE_OK) {
    if (pzError) {
      sqlite3_result_error(context, pzError, -1);
      sqlite3_free(pzError);
    } else {
      sqlite3_result_error_code(context, rc);
    }
    return;
  }

  if (dimensions == 0) {
    sqlite3_result_error(context, "Zero length vectors are not supported.", -1);
    goto cleanup;
  }
  if ((dimensions % CHAR_BIT) != 0) {
    sqlite3_result_error(
        context,
        "Binary quantization requires vectors with a length divisible by 8",
        -1);
    goto cleanup;
  }
  // Reject bit vectors before allocating, and leave through the shared
  // cleanup label so the input is released on every path.
  if (elementType == SQLITE_VEC_ELEMENT_TYPE_BIT) {
    sqlite3_result_error(context,
                         "Can only binary quantize float or int8 vectors", -1);
    goto cleanup;
  }

  sz = dimensions / CHAR_BIT;
  out = sqlite3_malloc(sz);
  if (!out) {
    sqlite3_result_error_code(context, SQLITE_NOMEM);
    goto cleanup;
  }
  // bits are OR-ed in below, so the buffer must start zeroed
  memset(out, 0, sz);

  if (elementType == SQLITE_VEC_ELEMENT_TYPE_FLOAT32) {
    for (size_t i = 0; i < dimensions; i++) {
      int res = ((f32 *)vector)[i] > 0.0;
      out[i / 8] |= (res << (i % 8));
    }
  } else if (elementType == SQLITE_VEC_ELEMENT_TYPE_INT8) {
    for (size_t i = 0; i < dimensions; i++) {
      int res = ((i8 *)vector)[i] > 0;
      out[i / 8] |= (res << (i % 8));
    }
  }

  sqlite3_result_blob(context, out, sz, sqlite3_free);
  sqlite3_result_subtype(context, SQLITE_VEC_ELEMENT_TYPE_BIT);

cleanup:
  vectorCleanup(vector);
}
/**
 * @brief SQL scalar function: quantize a float32 vector into an int8 vector
 * BLOB tagged with the int8 subtype.
 *
 * Only the 'unit' mode is implemented: the interval [-1, 1] is divided into
 * 255 steps and shifted by -128. Results are clamped to [-128, 127]; NaN
 * inputs clamp to 127.
 *
 * @param context: SQL function context that receives the result or error.
 * @param argc: number of arguments, expected to be 2.
 * @param argv: argv[0] is the float32 vector, argv[1] must be the text
 * 'unit' (case-insensitive).
 */
static void vec_quantize_int8(sqlite3_context *context, int argc,
                              sqlite3_value **argv) {
  assert(argc == 2);
  f32 *srcVector;
  size_t dimensions;
  fvec_cleanup srcCleanup;
  // Start from NULL: not every failure path of the reader is guaranteed to
  // produce a message, and an indeterminate pointer must never be reported
  // or freed.
  char *err = NULL;
  i8 *out = NULL;
  int sz;
  f32 step;

  int rc = fvec_from_value(argv[0], &srcVector, &dimensions, &srcCleanup, &err);
  if (rc != SQLITE_OK) {
    if (err) {
      sqlite3_result_error(context, err, -1);
      sqlite3_free(err);
    } else {
      sqlite3_result_error_code(context, rc);
    }
    return;
  }

  // Validate the mode before allocating so the error path has nothing extra
  // to release. The text pointer is fetched before the byte count, the order
  // SQLite documents as safe.
  {
    int modeOk = 0;
    if (sqlite3_value_type(argv[1]) == SQLITE_TEXT) {
      const char *mode = (const char *)sqlite3_value_text(argv[1]);
      int modeLen = sqlite3_value_bytes(argv[1]);
      modeOk = mode && (modeLen == (int)strlen("unit")) &&
               (sqlite3_stricmp(mode, "unit") == 0);
    }
    if (!modeOk) {
      sqlite3_result_error(
          context, "2nd argument to vec_quantize_int8() must be 'unit'.", -1);
      goto cleanup;
    }
  }

  sz = dimensions * sizeof(i8);
  out = sqlite3_malloc(sz);
  if (!out) {
    sqlite3_result_error_nomem(context);
    goto cleanup;
  }
  memset(out, 0, sz);

  step = (1.0 - (-1.0)) / 255;
  for (size_t i = 0; i < dimensions; i++) {
    double val = ((srcVector[i] - (-1.0)) / step) - 128;
    if (!(val <= 127.0)) val = 127.0; /* also clamps NaN */
    if (!(val >= -128.0)) val = -128.0;
    out[i] = (i8)val;
  }

  sqlite3_result_blob(context, out, dimensions * sizeof(i8), sqlite3_free);
  sqlite3_result_subtype(context, SQLITE_VEC_ELEMENT_TYPE_INT8);

cleanup:
  srcCleanup(srcVector);
}
/**
 * @brief SQL scalar function: element-wise sum of two vectors of the same
 * element type and length. Bit vectors are rejected with an error.
 *
 * The result is a BLOB carrying the same subtype as the inputs.
 *
 * @param context: SQL function context that receives the result or error.
 * @param argc: number of arguments, expected to be 2.
 * @param argv: argv[0] and argv[1] are the operands.
 */
static void vec_add(sqlite3_context *context, int argc, sqlite3_value **argv) {
  assert(argc == 2);
  void *lhs = NULL;
  void *rhs = NULL;
  size_t count;
  vector_cleanup freeLhs, freeRhs;
  char *message;
  enum VectorElementType kind;

  int status = ensure_vector_match(argv[0], argv[1], &lhs, &rhs, &kind, &count,
                                   &freeLhs, &freeRhs, &message);
  if (status != SQLITE_OK) {
    sqlite3_result_error(context, message, -1);
    sqlite3_free(message);
    return;
  }

  switch (kind) {
  case SQLITE_VEC_ELEMENT_TYPE_BIT:
    sqlite3_result_error(context, "Cannot add two bitvectors together.", -1);
    break;

  case SQLITE_VEC_ELEMENT_TYPE_FLOAT32: {
    size_t nBytes = count * sizeof(f32);
    f32 *sum = sqlite3_malloc(nBytes);
    if (!sum) {
      sqlite3_result_error_nomem(context);
      break;
    }
    memset(sum, 0, nBytes);
    for (size_t k = 0; k < count; k++) {
      sum[k] = ((f32 *)lhs)[k] + ((f32 *)rhs)[k];
    }
    sqlite3_result_blob(context, sum, nBytes, sqlite3_free);
    sqlite3_result_subtype(context, SQLITE_VEC_ELEMENT_TYPE_FLOAT32);
    break;
  }

  case SQLITE_VEC_ELEMENT_TYPE_INT8: {
    size_t nBytes = count * sizeof(i8);
    i8 *sum = sqlite3_malloc(nBytes);
    if (!sum) {
      sqlite3_result_error_nomem(context);
      break;
    }
    memset(sum, 0, nBytes);
    for (size_t k = 0; k < count; k++) {
      sum[k] = ((i8 *)lhs)[k] + ((i8 *)rhs)[k];
    }
    sqlite3_result_blob(context, sum, nBytes, sqlite3_free);
    sqlite3_result_subtype(context, SQLITE_VEC_ELEMENT_TYPE_INT8);
    break;
  }
  }

  // inputs are released on every path once they have been read
  freeLhs(lhs);
  freeRhs(rhs);
}
/**
 * @brief SQL scalar function: element-wise difference (1st minus 2nd) of two
 * vectors of the same element type and length. Bit vectors are rejected with
 * an error.
 *
 * The result is a BLOB carrying the same subtype as the inputs.
 *
 * @param context: SQL function context that receives the result or error.
 * @param argc: number of arguments, expected to be 2.
 * @param argv: argv[0] is the minuend, argv[1] the subtrahend.
 */
static void vec_sub(sqlite3_context *context, int argc, sqlite3_value **argv) {
  assert(argc == 2);
  void *minuend = NULL;
  void *subtrahend = NULL;
  size_t n;
  vector_cleanup dropMinuend, dropSubtrahend;
  char *reason;
  enum VectorElementType type;

  if (ensure_vector_match(argv[0], argv[1], &minuend, &subtrahend, &type, &n,
                          &dropMinuend, &dropSubtrahend,
                          &reason) != SQLITE_OK) {
    sqlite3_result_error(context, reason, -1);
    sqlite3_free(reason);
    return;
  }

  if (type == SQLITE_VEC_ELEMENT_TYPE_BIT) {
    sqlite3_result_error(context, "Cannot subtract two bitvectors together.",
                         -1);
  } else if (type == SQLITE_VEC_ELEMENT_TYPE_FLOAT32) {
    size_t resultSize = n * sizeof(f32);
    f32 *diff = sqlite3_malloc(resultSize);
    if (diff) {
      memset(diff, 0, resultSize);
      for (size_t j = 0; j < n; j++) {
        diff[j] = ((f32 *)minuend)[j] - ((f32 *)subtrahend)[j];
      }
      sqlite3_result_blob(context, diff, resultSize, sqlite3_free);
      sqlite3_result_subtype(context, SQLITE_VEC_ELEMENT_TYPE_FLOAT32);
    } else {
      sqlite3_result_error_nomem(context);
    }
  } else if (type == SQLITE_VEC_ELEMENT_TYPE_INT8) {
    size_t resultSize = n * sizeof(i8);
    i8 *diff = sqlite3_malloc(resultSize);
    if (diff) {
      memset(diff, 0, resultSize);
      for (size_t j = 0; j < n; j++) {
        diff[j] = ((i8 *)minuend)[j] - ((i8 *)subtrahend)[j];
      }
      sqlite3_result_blob(context, diff, resultSize, sqlite3_free);
      sqlite3_result_subtype(context, SQLITE_VEC_ELEMENT_TYPE_INT8);
    } else {
      sqlite3_result_error_nomem(context);
    }
  }

  dropMinuend(minuend);
  dropSubtrahend(subtrahend);
}
/**
 * @brief SQL scalar function: return the sub-vector [start, end) of a vector.
 *
 * The result keeps the element type of the input. Both indexes must be
 * non-negative, no greater than the number of dimensions, and start must be
 * strictly less than end. For bit vectors both indexes must additionally be
 * multiples of 8, since whole bytes are copied.
 *
 * @param context: SQL function context that receives the result or error.
 * @param argc: number of arguments, expected to be 3.
 * @param argv: argv[0] is the vector, argv[1] the start index (inclusive),
 * argv[2] the end index (exclusive).
 */
static void vec_slice(sqlite3_context *context, int argc,
                      sqlite3_value **argv) {
  assert(argc == 3);
  void *vector;
  size_t dimensions;
  vector_cleanup cleanup;
  // Start from NULL: not every failure path of the reader is guaranteed to
  // produce a message, and an indeterminate pointer must never be reported
  // or freed.
  char *err = NULL;
  enum VectorElementType elementType;
  size_t n;

  int rc = vector_from_value(argv[0], &vector, &dimensions, &elementType,
                             &cleanup, &err);
  if (rc != SQLITE_OK) {
    if (err) {
      sqlite3_result_error(context, err, -1);
      sqlite3_free(err);
    } else {
      sqlite3_result_error_code(context, rc);
    }
    return;
  }

  int start = sqlite3_value_int(argv[1]);
  int end = sqlite3_value_int(argv[2]);

  if (start < 0) {
    sqlite3_result_error(context,
                         "slice 'start' index must be a postive number.", -1);
    goto done;
  }
  if (end < 0) {
    sqlite3_result_error(context, "slice 'end' index must be a postive number.",
                         -1);
    goto done;
  }
  if (((size_t)start) > dimensions) {
    sqlite3_result_error(
        context, "slice 'start' index is greater than the number of dimensions",
        -1);
    goto done;
  }
  if (((size_t)end) > dimensions) {
    sqlite3_result_error(
        context, "slice 'end' index is greater than the number of dimensions",
        -1);
    goto done;
  }
  if (start > end) {
    sqlite3_result_error(context,
                         "slice 'start' index is greater than 'end' index", -1);
    goto done;
  }
  if (start == end) {
    sqlite3_result_error(context,
                         "slice 'start' index is equal to the 'end' index, "
                         "vectors must have non-zero length",
                         -1);
    goto done;
  }

  n = end - start;

  switch (elementType) {
  case SQLITE_VEC_ELEMENT_TYPE_FLOAT32: {
    int outSize = n * sizeof(f32);
    f32 *out = sqlite3_malloc(outSize);
    if (!out) {
      sqlite3_result_error_nomem(context);
      goto done;
    }
    memset(out, 0, outSize);
    for (size_t i = 0; i < n; i++) {
      out[i] = ((f32 *)vector)[start + i];
    }
    sqlite3_result_blob(context, out, outSize, sqlite3_free);
    sqlite3_result_subtype(context, SQLITE_VEC_ELEMENT_TYPE_FLOAT32);
    goto done;
  }
  case SQLITE_VEC_ELEMENT_TYPE_INT8: {
    int outSize = n * sizeof(i8);
    i8 *out = sqlite3_malloc(outSize);
    if (!out) {
      sqlite3_result_error_nomem(context);
      // leave through `done` so the input vector is still released
      goto done;
    }
    memset(out, 0, outSize);
    for (size_t i = 0; i < n; i++) {
      out[i] = ((i8 *)vector)[start + i];
    }
    sqlite3_result_blob(context, out, outSize, sqlite3_free);
    sqlite3_result_subtype(context, SQLITE_VEC_ELEMENT_TYPE_INT8);
    goto done;
  }
  case SQLITE_VEC_ELEMENT_TYPE_BIT: {
    if ((start % CHAR_BIT) != 0) {
      sqlite3_result_error(context, "start index must be divisible by 8.", -1);
      goto done;
    }
    if ((end % CHAR_BIT) != 0) {
      sqlite3_result_error(context, "end index must be divisible by 8.", -1);
      goto done;
    }
    int outSize = n / CHAR_BIT;
    u8 *out = sqlite3_malloc(outSize);
    if (!out) {
      sqlite3_result_error_nomem(context);
      // leave through `done` so the input vector is still released
      goto done;
    }
    memset(out, 0, outSize);
    for (size_t i = 0; i < n / CHAR_BIT; i++) {
      out[i] = ((u8 *)vector)[(start / CHAR_BIT) + i];
    }
    sqlite3_result_blob(context, out, outSize, sqlite3_free);
    sqlite3_result_subtype(context, SQLITE_VEC_ELEMENT_TYPE_BIT);
    goto done;
  }
  }

done:
  cleanup(vector);
}
/**
 * @brief SQL scalar function: render a vector as JSON array text, tagged with
 * the JSON subtype.
 *
 * float32 elements are formatted with "%f" (NaN becomes `null`), int8
 * elements with "%d", and bit vectors emit one 0/1 per bit, reading each byte
 * least-significant-bit first.
 *
 * @param context: SQL function context that receives the result or error.
 * @param argc: number of arguments, expected to be 1.
 * @param argv: argv[0] is the vector to render.
 */
static void vec_to_json(sqlite3_context *context, int argc,
                        sqlite3_value **argv) {
  assert(argc == 1);
  void *vector;
  size_t dimensions;
  vector_cleanup cleanup;
  // Start from NULL: not every failure path of the reader is guaranteed to
  // produce a message, and an indeterminate pointer must never be reported
  // or freed.
  char *err = NULL;
  enum VectorElementType elementType;

  int rc = vector_from_value(argv[0], &vector, &dimensions, &elementType,
                             &cleanup, &err);
  if (rc != SQLITE_OK) {
    if (err) {
      sqlite3_result_error(context, err, -1);
      sqlite3_free(err);
    } else {
      sqlite3_result_error_code(context, rc);
    }
    return;
  }

  sqlite3_str *str = sqlite3_str_new(sqlite3_context_db_handle(context));
  sqlite3_str_appendall(str, "[");
  for (size_t i = 0; i < dimensions; i++) {
    if (i != 0) {
      sqlite3_str_appendall(str, ",");
    }
    if (elementType == SQLITE_VEC_ELEMENT_TYPE_FLOAT32) {
      f32 value = ((f32 *)vector)[i];
      if (isnan(value)) {
        sqlite3_str_appendall(str, "null");
      } else {
        sqlite3_str_appendf(str, "%f", value);
      }
    } else if (elementType == SQLITE_VEC_ELEMENT_TYPE_INT8) {
      sqlite3_str_appendf(str, "%d", ((i8 *)vector)[i]);
    } else if (elementType == SQLITE_VEC_ELEMENT_TYPE_BIT) {
      u8 b = (((u8 *)vector)[i / 8] >> (i % CHAR_BIT)) & 1;
      sqlite3_str_appendf(str, "%d", b);
    }
  }
  sqlite3_str_appendall(str, "]");

  // The length must be read before finish(), which destroys the builder.
  int len = sqlite3_str_length(str);
  char *s = sqlite3_str_finish(str);
  if (s) {
    sqlite3_result_text(context, s, len, sqlite3_free);
    sqlite3_result_subtype(context, JSON_SUBTYPE);
  } else {
    sqlite3_result_error_nomem(context);
  }
  cleanup(vector);
}
/**
 * @brief SQL scalar function: scale a float32 vector by the inverse of its
 * Euclidean norm and return it as a float32 vector BLOB.
 *
 * Only float32 vectors are supported. No special handling is applied to a
 * vector whose norm is zero; the division is performed as-is.
 *
 * @param context: SQL function context that receives the result or error.
 * @param argc: number of arguments, expected to be 1.
 * @param argv: argv[0] is the vector to normalize.
 */
static void vec_normalize(sqlite3_context *context, int argc,
                          sqlite3_value **argv) {
  assert(argc == 1);
  void *vector;
  size_t dimensions;
  vector_cleanup cleanup;
  // Start from NULL: not every failure path of the reader is guaranteed to
  // produce a message, and an indeterminate pointer must never be reported
  // or freed.
  char *err = NULL;
  enum VectorElementType elementType;

  int rc = vector_from_value(argv[0], &vector, &dimensions, &elementType,
                             &cleanup, &err);
  if (rc != SQLITE_OK) {
    if (err) {
      sqlite3_result_error(context, err, -1);
      sqlite3_free(err);
    } else {
      sqlite3_result_error_code(context, rc);
    }
    return;
  }

  if (elementType != SQLITE_VEC_ELEMENT_TYPE_FLOAT32) {
    sqlite3_result_error(
        context, "only float32 vectors are supported when normalizing", -1);
    cleanup(vector);
    return;
  }

  int outSize = dimensions * sizeof(f32);
  f32 *out = sqlite3_malloc(outSize);
  if (!out) {
    cleanup(vector);
    sqlite3_result_error_code(context, SQLITE_NOMEM);
    return;
  }
  memset(out, 0, outSize);

  f32 *v = (f32 *)vector;

  // norm = sqrt(sum of squares)
  f32 norm = 0;
  for (size_t i = 0; i < dimensions; i++) {
    norm += v[i] * v[i];
  }
  norm = sqrt(norm);

  for (size_t i = 0; i < dimensions; i++) {
    out[i] = v[i] / norm;
  }

  sqlite3_result_blob(context, out, dimensions * sizeof(f32), sqlite3_free);
  sqlite3_result_subtype(context, SQLITE_VEC_ELEMENT_TYPE_FLOAT32);
  cleanup(vector);
}
/**
 * @brief SQL scalar function that ignores its arguments and returns, as text,
 * the string registered as the function's user data.
 *
 * The text is passed with SQLITE_STATIC, i.e. SQLite does not copy it.
 *
 * @param context: SQL function context that receives the result.
 * @param argc: unused.
 * @param argv: unused.
 */
static void _static_text_func(sqlite3_context *context, int argc,
                              sqlite3_value **argv) {
  UNUSED_PARAMETER(argv);
  UNUSED_PARAMETER(argc);
  const char *text = sqlite3_user_data(context);
  sqlite3_result_text(context, text, -1, SQLITE_STATIC);
}
#pragma endregion
// Kinds of token produced by vec0_token_next().
enum Vec0TokenType {
// A letter followed by any run of letters, digits and underscores.
TOKEN_TYPE_IDENTIFIER,
// A run of one or more decimal digits.
TOKEN_TYPE_DIGIT,
// '['
TOKEN_TYPE_LBRACKET,
// ']'
TOKEN_TYPE_RBRACKET,
// '+'
TOKEN_TYPE_PLUS,
// '='
TOKEN_TYPE_EQ,
// '('
TOKEN_TYPE_LPAREN,
// ')'
TOKEN_TYPE_RPAREN,
// ','
TOKEN_TYPE_COMMA,
};
// A single token located inside the scanned source buffer (no copy is made).
struct Vec0Token {
enum Vec0TokenType token_type;
// For identifier and digit tokens: first character of the token. For the
// single-character tokens, vec0_token_next() sets this to the position just
// past the character (equal to `end`).
char *start;
// One past the last character of the token; scanning resumes from here.
char *end;
};
// Returns 1 when x is an ASCII letter (A-Z or a-z), 0 otherwise.
int is_alpha(char x) {
  if (x >= 'A' && x <= 'Z') {
    return 1;
  }
  if (x >= 'a' && x <= 'z') {
    return 1;
  }
  return 0;
}
// Returns 1 when x is an ASCII decimal digit (0-9), 0 otherwise.
int is_digit(char x) {
  if (x < '0' || x > '9') {
    return 0;
  }
  return 1;
}
// Returns 1 when x is a space, tab, newline or carriage return; 0 otherwise.
int is_whitespace(char x) {
  switch (x) {
  case ' ':
  case '\t':
  case '\n':
  case '\r':
    return 1;
  default:
    return 0;
  }
}
#define VEC0_TOKEN_RESULT_EOF 1
#define VEC0_TOKEN_RESULT_SOME 2
#define VEC0_TOKEN_RESULT_ERROR 3
int vec0_token_next(char *start, char *end, struct Vec0Token *out) {
char *ptr = start;
while (ptr < end) {
char curr = *ptr;
if (is_whitespace(curr)) {
ptr++;
continue;
} else if (curr == '+') {
ptr++;
out->start = ptr;
out->end = ptr;
out->token_type = TOKEN_TYPE_PLUS;
return VEC0_TOKEN_RESULT_SOME;
} else if (curr == '[') {
ptr++;
out->start = ptr;
out->end = ptr;
out->token_type = TOKEN_TYPE_LBRACKET;
return VEC0_TOKEN_RESULT_SOME;
} else if (curr == ']') {
ptr++;
out->start = ptr;
out->end = ptr;
out->token_type = TOKEN_TYPE_RBRACKET;
return VEC0_TOKEN_RESULT_SOME;
} else if (curr == '=') {
ptr++;
out->start = ptr;
out->end = ptr;
out->token_type = TOKEN_TYPE_EQ;
return VEC0_TOKEN_RESULT_SOME;
} else if (curr == '(') {
ptr++;
out->start = ptr;
out->end = ptr;
out->token_type = TOKEN_TYPE_LPAREN;
return VEC0_TOKEN_RESULT_SOME;
} else if (curr == ')') {
ptr++;
out->start = ptr;
out->end = ptr;
out->token_type = TOKEN_TYPE_RPAREN;
return VEC0_TOKEN_RESULT_SOME;
} else if (curr == ',') {
ptr++;
out->start = ptr;
out->end = ptr;
out->token_type = TOKEN_TYPE_COMMA;
return VEC0_TOKEN_RESULT_SOME;
} else if (is_alpha(curr)) {
char *start = ptr;
while (ptr < end && (is_alpha(*ptr) || is_digit(*ptr) || *ptr == '_')) {
ptr++;
}
out->start = start;
out->end = ptr;
out->token_type = TOKEN_TYPE_IDENTIFIER;
return VEC0_TOKEN_RESULT_SOME;
} else if (is_digit(curr)) {
char *start = ptr;
while (ptr < end && (is_digit(*ptr))) {
ptr++;
}
out->start = start;
out->end = ptr;
out->token_type = TOKEN_TYPE_DIGIT;
return VEC0_TOKEN_RESULT_SOME;
} else {
return VEC0_TOKEN_RESULT_ERROR;
}
}
return VEC0_TOKEN_RESULT_EOF;
}
// Cursor over a source buffer, consumed token by token with
// vec0_scanner_next().
struct Vec0Scanner {
// Position where the next token scan begins; vec0_scanner_next() advances it
// to the end of each token it returns.
char *start;
// One past the last byte of the source buffer.
char *end;
// Set to the beginning of the source by vec0_scanner_init().
char *ptr;
};
/**
 * @brief Point a scanner at the beginning of a source buffer.
 *
 * The buffer is not copied; the scanner only keeps pointers into it.
 *
 * @param scanner: scanner to initialize.
 * @param source: first byte of the text to scan.
 * @param source_length: number of bytes in `source`.
 */
void vec0_scanner_init(struct Vec0Scanner *scanner, const char *source,
                       int source_length) {
  char *begin = (char *)source;
  scanner->ptr = begin;
  scanner->start = begin;
  scanner->end = begin + source_length;
}
/**
 * @brief Fetch the next token from a scanner and advance past it.
 *
 * The scanner position only moves when a token was actually produced.
 *
 * @param scanner: scanner to read from.
 * @param out: receives the token on VEC0_TOKEN_RESULT_SOME.
 * @return int the vec0_token_next() result code.
 */
int vec0_scanner_next(struct Vec0Scanner *scanner, struct Vec0Token *out) {
  int result = vec0_token_next(scanner->start, scanner->end, out);
  if (result != VEC0_TOKEN_RESULT_SOME) {
    return result;
  }
  scanner->start = out->end;
  return result;
}
/**
 * @brief Parse a `key=value` table option, e.g. `chunk_size=8`.
 *
 * The key must be an identifier and the value an identifier or a run of
 * digits; nothing may follow the value.
 *
 * @param source: source string to parse
 * @param source_length: length of the source string
 * @param out_key: Output pointer to the key. Same lifetime as source, points
 * into it.
 * @param out_key_length: Length of out_key in bytes
 * @param out_value: Output pointer to the value. Same lifetime as source,
 * points into it.
 * @param out_value_length: Length of out_value in bytes
 * @return int: SQLITE_EMPTY if the input does not start with `identifier =`,
 * SQLITE_ERROR if it does but the value is invalid or followed by anything
 * else, SQLITE_OK otherwise. Outputs are written on SQLITE_OK only.
 */
int vec0_parse_table_option(const char *source, int source_length,
                            char **out_key, int *out_key_length,
                            char **out_value, int *out_value_length) {
  int rc;
  struct Vec0Scanner scanner;
  struct Vec0Token token;
  char *key;
  char *value;
  int keyLength, valueLength;

  vec0_scanner_init(&scanner, source, source_length);

  // Each check must fail when EITHER no token was produced OR the token has
  // the wrong type. Combining them with && would inspect a token the scanner
  // did not write and would let tokens of the wrong type through.
  rc = vec0_scanner_next(&scanner, &token);
  if (rc != VEC0_TOKEN_RESULT_SOME ||
      token.token_type != TOKEN_TYPE_IDENTIFIER) {
    return SQLITE_EMPTY;
  }
  key = token.start;
  keyLength = token.end - token.start;

  rc = vec0_scanner_next(&scanner, &token);
  if (rc != VEC0_TOKEN_RESULT_SOME || token.token_type != TOKEN_TYPE_EQ) {
    return SQLITE_EMPTY;
  }

  rc = vec0_scanner_next(&scanner, &token);
  if (rc != VEC0_TOKEN_RESULT_SOME ||
      !((token.token_type == TOKEN_TYPE_IDENTIFIER) ||
        (token.token_type == TOKEN_TYPE_DIGIT))) {
    return SQLITE_ERROR;
  }
  value = token.start;
  valueLength = token.end - token.start;

  // the value must be the last token
  rc = vec0_scanner_next(&scanner, &token);
  if (rc == VEC0_TOKEN_RESULT_EOF) {
    *out_key = key;
    *out_key_length = keyLength;
    *out_value = value;
    *out_value_length = valueLength;
    return SQLITE_OK;
  }
  return SQLITE_ERROR;
}
/**
 * @brief Parse an argv[i] entry of a vec0 virtual table definition, and see if
 * it's a PARTITION KEY definition, ie `[name] [type] partition key` like
 * `user_id integer partition key`.
 *
 * Keywords are matched case-insensitively and must match in full.
 *
 * @param source: argv[i] source string
 * @param source_length: length of the source string
 * @param out_column_name: If it is a partition key, the output column name.
 * Same lifetime as source, points to specific char *
 * @param out_column_name_length: Length of out_column_name in bytes
 * @param out_column_type: SQLITE_TEXT or SQLITE_INTEGER.
 * @return int: SQLITE_EMPTY if not a PK, SQLITE_OK if it is. Outputs are
 * written on SQLITE_OK only.
 */
int vec0_parse_partition_key_definition(const char *source, int source_length,
                                        char **out_column_name,
                                        int *out_column_name_length,
                                        int *out_column_type) {
  struct Vec0Scanner scanner;
  struct Vec0Token token;
  char *column_name;
  int column_name_length;
  int column_type;
  int token_length;
  int rc;

  vec0_scanner_init(&scanner, source, source_length);

  // Every token check below must fail when EITHER no token was produced OR
  // the token has the wrong type; && would inspect a token the scanner did
  // not write. Keyword comparisons also check the token length, because
  // sqlite3_strnicmp() limited to the token length alone accepts any prefix
  // of the keyword.

  // Check first token is identifier, will be the column name
  rc = vec0_scanner_next(&scanner, &token);
  if (rc != VEC0_TOKEN_RESULT_SOME ||
      token.token_type != TOKEN_TYPE_IDENTIFIER) {
    return SQLITE_EMPTY;
  }
  column_name = token.start;
  column_name_length = token.end - token.start;

  // Check the next token matches "text" or "integer", as column type
  rc = vec0_scanner_next(&scanner, &token);
  if (rc != VEC0_TOKEN_RESULT_SOME ||
      token.token_type != TOKEN_TYPE_IDENTIFIER) {
    return SQLITE_EMPTY;
  }
  token_length = (int)(token.end - token.start);
  if (token_length == 4 && sqlite3_strnicmp(token.start, "text", 4) == 0) {
    column_type = SQLITE_TEXT;
  } else if ((token_length == 3 &&
              sqlite3_strnicmp(token.start, "int", 3) == 0) ||
             (token_length == 7 &&
              sqlite3_strnicmp(token.start, "integer", 7) == 0)) {
    column_type = SQLITE_INTEGER;
  } else {
    return SQLITE_EMPTY;
  }

  // Check the next token is identifier and matches "partition"
  rc = vec0_scanner_next(&scanner, &token);
  if (rc != VEC0_TOKEN_RESULT_SOME ||
      token.token_type != TOKEN_TYPE_IDENTIFIER) {
    return SQLITE_EMPTY;
  }
  token_length = (int)(token.end - token.start);
  if (token_length != 9 ||
      sqlite3_strnicmp(token.start, "partition", 9) != 0) {
    return SQLITE_EMPTY;
  }

  // Check the next token is identifier and matches "key"
  rc = vec0_scanner_next(&scanner, &token);
  if (rc != VEC0_TOKEN_RESULT_SOME ||
      token.token_type != TOKEN_TYPE_IDENTIFIER) {
    return SQLITE_EMPTY;
  }
  token_length = (int)(token.end - token.start);
  if (token_length != 3 || sqlite3_strnicmp(token.start, "key", 3) != 0) {
    return SQLITE_EMPTY;
  }

  *out_column_name = column_name;
  *out_column_name_length = column_name_length;
  *out_column_type = column_type;
  return SQLITE_OK;
}
/**
 * @brief Parse an argv[i] entry of a vec0 virtual table definition, and see if
 * it's an auxiliary column definition, ie `+[name] [type]` like `+contents text`
 *
 * Type keywords are matched case-insensitively and must match in full.
 *
 * @param source: argv[i] source string
 * @param source_length: length of the source string
 * @param out_column_name: If it is an auxiliary column, the output column
 * name. Same lifetime as source, points to specific char *
 * @param out_column_name_length: Length of out_column_name in bytes
 * @param out_column_type: SQLITE_TEXT, SQLITE_INTEGER, SQLITE_FLOAT, or SQLITE_BLOB.
 * @return int: SQLITE_EMPTY if not an aux column, SQLITE_OK if it is. Outputs
 * are written on SQLITE_OK only.
 */
int vec0_parse_auxiliary_column_definition(const char *source, int source_length,
                                           char **out_column_name,
                                           int *out_column_name_length,
                                           int *out_column_type) {
  struct Vec0Scanner scanner;
  struct Vec0Token token;
  char *column_name;
  int column_name_length;
  int column_type;
  int token_length;
  int rc;

  vec0_scanner_init(&scanner, source, source_length);

  // Every token check below must fail when EITHER no token was produced OR
  // the token has the wrong type; && would inspect a stale token. Keyword
  // comparisons also check the token length, because sqlite3_strnicmp()
  // limited to the token length alone accepts any prefix of the keyword.

  // Check first token is '+', which denotes aux columns
  rc = vec0_scanner_next(&scanner, &token);
  if (rc != VEC0_TOKEN_RESULT_SOME ||
      token.token_type != TOKEN_TYPE_PLUS) {
    return SQLITE_EMPTY;
  }

  // Next token is an identifier, will be the column name
  rc = vec0_scanner_next(&scanner, &token);
  if (rc != VEC0_TOKEN_RESULT_SOME ||
      token.token_type != TOKEN_TYPE_IDENTIFIER) {
    return SQLITE_EMPTY;
  }
  column_name = token.start;
  column_name_length = token.end - token.start;

  // Check the next token names a supported column type
  rc = vec0_scanner_next(&scanner, &token);
  if (rc != VEC0_TOKEN_RESULT_SOME ||
      token.token_type != TOKEN_TYPE_IDENTIFIER) {
    return SQLITE_EMPTY;
  }
  token_length = (int)(token.end - token.start);
  if (token_length == 4 && sqlite3_strnicmp(token.start, "text", 4) == 0) {
    column_type = SQLITE_TEXT;
  } else if ((token_length == 3 &&
              sqlite3_strnicmp(token.start, "int", 3) == 0) ||
             (token_length == 7 &&
              sqlite3_strnicmp(token.start, "integer", 7) == 0)) {
    column_type = SQLITE_INTEGER;
  } else if ((token_length == 5 &&
              sqlite3_strnicmp(token.start, "float", 5) == 0) ||
             (token_length == 6 &&
              sqlite3_strnicmp(token.start, "double", 6) == 0)) {
    column_type = SQLITE_FLOAT;
  } else if (token_length == 4 &&
             sqlite3_strnicmp(token.start, "blob", 4) == 0) {
    column_type = SQLITE_BLOB;
  } else {
    return SQLITE_EMPTY;
  }

  *out_column_name = column_name;
  *out_column_name_length = column_name_length;
  *out_column_type = column_type;
  return SQLITE_OK;
}
// Value kinds a vec0 metadata column can be declared with. Produced by
// vec0_parse_metadata_column_definition() from the declared type keyword.
typedef enum {
// declared as `boolean` or `bool`
VEC0_METADATA_COLUMN_KIND_BOOLEAN,
// declared as `int64`, `integer64`, `integer` or `int`
VEC0_METADATA_COLUMN_KIND_INTEGER,
// declared as `float`, `double`, `float64` or `f64`
VEC0_METADATA_COLUMN_KIND_FLOAT,
// declared as `text`
VEC0_METADATA_COLUMN_KIND_TEXT,
// future: blob, date, datetime
} vec0_metadata_column_kind;
/**
 * @brief Decide whether an argv[i] entry of a vec0 virtual table definition
 * is a metadata column definition, ie `[name] [type]` like
 * `is_released boolean`.
 *
 * @param source: argv[i] source string
 * @param source_length: length of the source string
 * @param out_column_name: On success, the column name. Points into source and
 * shares its lifetime.
 * @param out_column_name_length: Length of out_column_name in bytes
 * @param out_column_type: On success, one of vec0_metadata_column_kind
 * @return int: SQLITE_EMPTY if not a metadata column, SQLITE_OK if it is.
 */
int vec0_parse_metadata_column_definition(const char *source, int source_length,
                                          char **out_column_name,
                                          int *out_column_name_length,
                                          vec0_metadata_column_kind *out_column_type) {
  // Accepted type keywords, tried top to bottom; the first hit wins.
  static const struct {
    const char *keyword;
    vec0_metadata_column_kind kind;
  } type_keywords[] = {
      {"boolean", VEC0_METADATA_COLUMN_KIND_BOOLEAN},
      {"bool", VEC0_METADATA_COLUMN_KIND_BOOLEAN},
      {"int64", VEC0_METADATA_COLUMN_KIND_INTEGER},
      {"integer64", VEC0_METADATA_COLUMN_KIND_INTEGER},
      {"integer", VEC0_METADATA_COLUMN_KIND_INTEGER},
      {"int", VEC0_METADATA_COLUMN_KIND_INTEGER},
      {"float", VEC0_METADATA_COLUMN_KIND_FLOAT},
      {"double", VEC0_METADATA_COLUMN_KIND_FLOAT},
      {"float64", VEC0_METADATA_COLUMN_KIND_FLOAT},
      {"f64", VEC0_METADATA_COLUMN_KIND_FLOAT},
      {"text", VEC0_METADATA_COLUMN_KIND_TEXT},
  };
  struct Vec0Scanner scanner;
  struct Vec0Token name_token;
  struct Vec0Token type_token;

  vec0_scanner_init(&scanner, source, source_length);

  // first token: the column name
  if (vec0_scanner_next(&scanner, &name_token) != VEC0_TOKEN_RESULT_SOME ||
      name_token.token_type != TOKEN_TYPE_IDENTIFIER) {
    return SQLITE_EMPTY;
  }

  // second token: the declared type
  if (vec0_scanner_next(&scanner, &type_token) != VEC0_TOKEN_RESULT_SOME ||
      type_token.token_type != TOKEN_TYPE_IDENTIFIER) {
    return SQLITE_EMPTY;
  }

  const char *type_text = type_token.start;
  int type_length = type_token.end - type_token.start;
  size_t keyword_count = sizeof(type_keywords) / sizeof(type_keywords[0]);
  for (size_t k = 0; k < keyword_count; k++) {
    if (sqlite3_strnicmp(type_text, type_keywords[k].keyword, type_length) !=
        0) {
      continue;
    }
    *out_column_name = name_token.start;
    *out_column_name_length = name_token.end - name_token.start;
    *out_column_type = type_keywords[k].kind;
    return SQLITE_OK;
  }
  return SQLITE_EMPTY;
}
/**
 * @brief Parse an argv[i] entry of a vec0 virtual table definition, and see if
 * it's a PRIMARY KEY definition, ie `[name] [type] primary key`.
 *
 * @param source: argv[i] source string
 * @param source_length: length of the source string
 * @param out_column_name: If it is a PK, the output column name. Same lifetime
 * as source, points to specific char *
 * @param out_column_name_length: Length of out_column_name in bytes
 * @param out_column_type: SQLITE_TEXT or SQLITE_INTEGER.
 * @return int: SQLITE_EMPTY if not a PK, SQLITE_OK if it is. The out
 * parameters are only written when SQLITE_OK is returned.
 */
int vec0_parse_primary_key_definition(const char *source, int source_length,
                                      char **out_column_name,
                                      int *out_column_name_length,
                                      int *out_column_type) {
  struct Vec0Scanner scanner;
  struct Vec0Token token;
  char *column_name;
  int column_name_length;
  int column_type;
  vec0_scanner_init(&scanner, source, source_length);

  // Every step below requires BOTH a successfully scanned token AND the
  // expected token type, hence `||` in the rejection tests. With `&&` a failed
  // scan could fall through and inspect a token the scanner never produced.

  // Check first token is identifier, will be the column name
  int rc = vec0_scanner_next(&scanner, &token);
  if (rc != VEC0_TOKEN_RESULT_SOME ||
      token.token_type != TOKEN_TYPE_IDENTIFIER) {
    return SQLITE_EMPTY;
  }
  column_name = token.start;
  column_name_length = token.end - token.start;

  // Check the next token matches "text" or "integer", as column type
  rc = vec0_scanner_next(&scanner, &token);
  if (rc != VEC0_TOKEN_RESULT_SOME ||
      token.token_type != TOKEN_TYPE_IDENTIFIER) {
    return SQLITE_EMPTY;
  }
  if (sqlite3_strnicmp(token.start, "text", token.end - token.start) == 0) {
    column_type = SQLITE_TEXT;
  } else if (sqlite3_strnicmp(token.start, "int", token.end - token.start) ==
                 0 ||
             sqlite3_strnicmp(token.start, "integer",
                              token.end - token.start) == 0) {
    column_type = SQLITE_INTEGER;
  } else {
    return SQLITE_EMPTY;
  }

  // Check the next token is identifier and matches "primary"
  rc = vec0_scanner_next(&scanner, &token);
  if (rc != VEC0_TOKEN_RESULT_SOME ||
      token.token_type != TOKEN_TYPE_IDENTIFIER) {
    return SQLITE_EMPTY;
  }
  if (sqlite3_strnicmp(token.start, "primary", token.end - token.start) != 0) {
    return SQLITE_EMPTY;
  }

  // Check the next token is identifier and matches "key"
  rc = vec0_scanner_next(&scanner, &token);
  if (rc != VEC0_TOKEN_RESULT_SOME ||
      token.token_type != TOKEN_TYPE_IDENTIFIER) {
    return SQLITE_EMPTY;
  }
  if (sqlite3_strnicmp(token.start, "key", token.end - token.start) != 0) {
    return SQLITE_EMPTY;
  }

  *out_column_name = column_name;
  *out_column_name_length = column_name_length;
  *out_column_type = column_type;
  return SQLITE_OK;
}
// Distance metric of a vec0 vector column, selected with the column option
// `distance_metric=...`. vec0_parse_vector_column() defaults to L2 when the
// option is absent.
enum Vec0DistanceMetrics {
// `distance_metric=l2`
VEC0_DISTANCE_METRIC_L2 = 1,
// `distance_metric=cosine`
VEC0_DISTANCE_METRIC_COSINE = 2,
// `distance_metric=l1`
VEC0_DISTANCE_METRIC_L1 = 3,
};
// A parsed vec0 vector column, ex `contents_embedding float[768]`.
// Filled in by vec0_parse_vector_column().
struct VectorColumnDefinition {
// Column name; vec0_parse_vector_column() allocates it with
// sqlite3_mprintf().
char *name;
// Length of name in bytes
int name_length;
// Number of elements declared between the brackets
size_t dimensions;
// Element type of the vector (float32, int8 or bit)
enum VectorElementType element_type;
// Metric from the `distance_metric=` option, L2 by default
enum Vec0DistanceMetrics distance_metric;
};
// A vec0 partition key column, ie "user_id integer partition key".
struct Vec0PartitionColumnDefinition {
// Column type; presumably SQLITE_TEXT or SQLITE_INTEGER as reported by the
// partition key parser -- TODO confirm against the code that fills this in
int type;
// Column name
char * name;
// Length of name in bytes
int name_length;
};
// A vec0 auxiliary column, ie "+contents text".
struct Vec0AuxiliaryColumnDefinition {
// Column type; presumably one of SQLITE_TEXT, SQLITE_INTEGER, SQLITE_FLOAT or
// SQLITE_BLOB as reported by vec0_parse_auxiliary_column_definition() --
// TODO confirm against the code that fills this in
int type;
// Column name
char * name;
// Length of name in bytes
int name_length;
};
// A vec0 metadata column, ie "is_released boolean".
struct Vec0MetadataColumnDefinition {
// Declared value kind of the column
vec0_metadata_column_kind kind;
// Column name
char * name;
// Length of name in bytes
int name_length;
};
/**
 * @brief Number of bytes needed to store one vector.
 *
 * @param element_type element type of the vector
 * @param dimensions number of elements in the vector
 * @return size_t byte size: 4 bytes per float32 element, 1 byte per int8
 * element, dimensions / CHAR_BIT for bit vectors, and 0 for any other
 * element type.
 */
size_t vector_byte_size(enum VectorElementType element_type,
                        size_t dimensions) {
  if (element_type == SQLITE_VEC_ELEMENT_TYPE_FLOAT32) {
    return dimensions * sizeof(f32);
  }
  if (element_type == SQLITE_VEC_ELEMENT_TYPE_INT8) {
    return dimensions * sizeof(i8);
  }
  if (element_type == SQLITE_VEC_ELEMENT_TYPE_BIT) {
    // bit vectors pack CHAR_BIT elements into each byte
    return dimensions / CHAR_BIT;
  }
  return 0;
}
size_t vector_column_byte_size(struct VectorColumnDefinition column) {
return vector_byte_size(column.element_type, column.dimensions);
}
/**
 * @brief Parse a vec0 vtab argv[i] column definition and see if
 * it's a vector column definition, ex `contents_embedding float[768]`,
 * optionally followed by column options like `distance_metric=cosine`.
 *
 * @param source vec0 argv[i] item
 * @param source_length length of source in bytes
 * @param outColumn Output the parsed vector column to this struct, on
 * success. outColumn->name is allocated with sqlite3_mprintf() and is only
 * written when SQLITE_OK is returned.
 * @return int SQLITE_OK on success, SQLITE_EMPTY if it's not a vector column
 * definition, SQLITE_ERROR on error.
 */
int vec0_parse_vector_column(const char *source, int source_length,
                             struct VectorColumnDefinition *outColumn) {
  // parses a vector column definition like so:
  // "abc float[123]", "abc_123 bit[1234]", etc.
  // https://github.com/asg017/sqlite-vec/issues/46
  int rc;
  struct Vec0Scanner scanner;
  struct Vec0Token token;

  char *name;
  int nameLength;
  enum VectorElementType elementType;
  enum Vec0DistanceMetrics distanceMetric = VEC0_DISTANCE_METRIC_L2;
  int dimensions;

  vec0_scanner_init(&scanner, source, source_length);

  // Every step below requires BOTH a successfully scanned token AND the
  // expected token type, hence `||` in the rejection tests. With `&&` a token
  // of the wrong type was silently accepted (ex `v float[3 x` parsed fine), and
  // a failed scan could fall through to a stale token.

  // starts with an identifier
  rc = vec0_scanner_next(&scanner, &token);
  if (rc != VEC0_TOKEN_RESULT_SOME ||
      token.token_type != TOKEN_TYPE_IDENTIFIER) {
    return SQLITE_EMPTY;
  }
  name = token.start;
  nameLength = token.end - token.start;

  // vector column type comes next: float, int, or bit
  rc = vec0_scanner_next(&scanner, &token);
  if (rc != VEC0_TOKEN_RESULT_SOME ||
      token.token_type != TOKEN_TYPE_IDENTIFIER) {
    return SQLITE_EMPTY;
  }
  if (sqlite3_strnicmp(token.start, "float", 5) == 0 ||
      sqlite3_strnicmp(token.start, "f32", 3) == 0) {
    elementType = SQLITE_VEC_ELEMENT_TYPE_FLOAT32;
  } else if (sqlite3_strnicmp(token.start, "int8", 4) == 0 ||
             sqlite3_strnicmp(token.start, "i8", 2) == 0) {
    elementType = SQLITE_VEC_ELEMENT_TYPE_INT8;
  } else if (sqlite3_strnicmp(token.start, "bit", 3) == 0) {
    elementType = SQLITE_VEC_ELEMENT_TYPE_BIT;
  } else {
    return SQLITE_EMPTY;
  }

  // left '[' bracket. Without it this is not a vector column at all (ex a
  // plain `score float` column), so report SQLITE_EMPTY rather than an error.
  rc = vec0_scanner_next(&scanner, &token);
  if (rc != VEC0_TOKEN_RESULT_SOME || token.token_type != TOKEN_TYPE_LBRACKET) {
    return SQLITE_EMPTY;
  }

  // digit, for vector dimension length. From here on the entry is committed
  // to being a vector column, so malformed input is SQLITE_ERROR.
  rc = vec0_scanner_next(&scanner, &token);
  if (rc != VEC0_TOKEN_RESULT_SOME || token.token_type != TOKEN_TYPE_DIGIT) {
    return SQLITE_ERROR;
  }
  dimensions = atoi(token.start);
  if (dimensions <= 0) {
    return SQLITE_ERROR;
  }

  // right ']' bracket
  rc = vec0_scanner_next(&scanner, &token);
  if (rc != VEC0_TOKEN_RESULT_SOME || token.token_type != TOKEN_TYPE_RBRACKET) {
    return SQLITE_ERROR;
  }

  // any other tokens left should be column-level options, ex `key=value`
  while (1) {
    // should be EOF or identifier (option key)
    rc = vec0_scanner_next(&scanner, &token);
    if (rc == VEC0_TOKEN_RESULT_EOF) {
      break;
    }
    if (rc != VEC0_TOKEN_RESULT_SOME ||
        token.token_type != TOKEN_TYPE_IDENTIFIER) {
      return SQLITE_ERROR;
    }

    char *key = token.start;
    int keyLength = token.end - token.start;

    if (sqlite3_strnicmp(key, "distance_metric", keyLength) == 0) {
      // a distance metric cannot be chosen for bit vectors
      if (elementType == SQLITE_VEC_ELEMENT_TYPE_BIT) {
        return SQLITE_ERROR;
      }

      // ensure equal sign after distance_metric
      rc = vec0_scanner_next(&scanner, &token);
      if (rc != VEC0_TOKEN_RESULT_SOME || token.token_type != TOKEN_TYPE_EQ) {
        return SQLITE_ERROR;
      }

      // distance_metric value, an identifier (L2, cosine, etc)
      rc = vec0_scanner_next(&scanner, &token);
      if (rc != VEC0_TOKEN_RESULT_SOME ||
          token.token_type != TOKEN_TYPE_IDENTIFIER) {
        return SQLITE_ERROR;
      }

      char *value = token.start;
      int valueLength = token.end - token.start;
      if (sqlite3_strnicmp(value, "l2", valueLength) == 0) {
        distanceMetric = VEC0_DISTANCE_METRIC_L2;
      } else if (sqlite3_strnicmp(value, "l1", valueLength) == 0) {
        distanceMetric = VEC0_DISTANCE_METRIC_L1;
      } else if (sqlite3_strnicmp(value, "cosine", valueLength) == 0) {
        distanceMetric = VEC0_DISTANCE_METRIC_COSINE;
      } else {
        return SQLITE_ERROR;
      }
    }
    // unknown key
    else {
      return SQLITE_ERROR;
    }
  }

  // copy the name out of `source` so the definition owns its own string
  outColumn->name = sqlite3_mprintf("%.*s", nameLength, name);
  if (!outColumn->name) {
    return SQLITE_ERROR;
  }
  outColumn->name_length = nameLength;
  outColumn->distance_metric = distanceMetric;
  outColumn->element_type = elementType;
  outColumn->dimensions = dimensions;
  return SQLITE_OK;
}
#pragma region vec_each table function
// Virtual table object of the vec_each table function. It carries no state
// beyond the sqlite3_vtab base that SQLite requires as the first member.
typedef struct vec_each_vtab vec_each_vtab;
struct vec_each_vtab {
sqlite3_vtab base;
};
// Cursor of the vec_each table function: walks the elements of one vector.
typedef struct vec_each_cursor vec_each_cursor;
struct vec_each_cursor {
sqlite3_vtab_cursor base;
// Index of the current element; also reported as the rowid
i64 iRowid;
// Element type of `vector`, as reported by vector_from_value()
enum VectorElementType vector_type;
// The vector being iterated; NULL until xFilter has run successfully
void *vector;
// Number of elements in `vector`
size_t dimensions;
// Called on `vector` when the cursor is re-filtered or closed
vector_cleanup cleanup;
};
/**
 * xConnect for vec_each: declares the table schema and allocates a
 * zero-initialized vec_each_vtab.
 *
 * Returns the sqlite3_declare_vtab() error if the declaration fails,
 * SQLITE_NOMEM if the allocation fails (with *ppVtab set to NULL), and
 * SQLITE_OK otherwise.
 */
static int vec_eachConnect(sqlite3 *db, void *pAux, int argc,
                           const char *const *argv, sqlite3_vtab **ppVtab,
                           char **pzErr) {
  UNUSED_PARAMETER(pAux);
  UNUSED_PARAMETER(argc);
  UNUSED_PARAMETER(argv);
  UNUSED_PARAMETER(pzErr);

  int rc = sqlite3_declare_vtab(db, "CREATE TABLE x(value, vector hidden)");
#define VEC_EACH_COLUMN_VALUE 0
#define VEC_EACH_COLUMN_VECTOR 1
  if (rc != SQLITE_OK) {
    return rc;
  }

  vec_each_vtab *table = sqlite3_malloc(sizeof(*table));
  *ppVtab = (sqlite3_vtab *)table;
  if (table == NULL) {
    return SQLITE_NOMEM;
  }
  memset(table, 0, sizeof(*table));
  return SQLITE_OK;
}
/**
 * xDisconnect for vec_each: releases the vtab object allocated by
 * vec_eachConnect(). Always returns SQLITE_OK.
 */
static int vec_eachDisconnect(sqlite3_vtab *pVtab) {
  vec_each_vtab *table = (vec_each_vtab *)pVtab;
  sqlite3_free(table);
  return SQLITE_OK;
}
/**
 * xOpen for vec_each: allocates a zero-initialized cursor and hands its base
 * back through ppCursor. Returns SQLITE_NOMEM if the allocation fails
 * (ppCursor is then left untouched), SQLITE_OK otherwise.
 */
static int vec_eachOpen(sqlite3_vtab *p, sqlite3_vtab_cursor **ppCursor) {
  UNUSED_PARAMETER(p);
  vec_each_cursor *cursor = sqlite3_malloc(sizeof(*cursor));
  if (cursor == NULL) {
    return SQLITE_NOMEM;
  }
  memset(cursor, 0, sizeof(*cursor));
  *ppCursor = &cursor->base;
  return SQLITE_OK;
}
/**
 * xClose for vec_each: releases the vector held by the cursor (if any) through
 * its cleanup callback, then frees the cursor. Always returns SQLITE_OK.
 */
static int vec_eachClose(sqlite3_vtab_cursor *cur) {
  vec_each_cursor *cursor = (vec_each_cursor *)cur;
  if (cursor->vector != NULL) {
    cursor->cleanup(cursor->vector);
  }
  sqlite3_free(cursor);
  return SQLITE_OK;
}
/**
 * xBestIndex for vec_each: requires a usable equality constraint on the hidden
 * `vector` column and routes it to xFilter as argv[0].
 *
 * Returns SQLITE_CONSTRAINT when no such constraint exists, SQLITE_OK
 * otherwise (with a fixed cost/row estimate).
 */
static int vec_eachBestIndex(sqlite3_vtab *pVTab,
                             sqlite3_index_info *pIdxInfo) {
  UNUSED_PARAMETER(pVTab);
  int vectorConstrained = 0;

  for (int idx = 0; idx < pIdxInfo->nConstraint; idx++) {
    const struct sqlite3_index_constraint *constraint =
        &pIdxInfo->aConstraint[idx];
    if (constraint->iColumn != VEC_EACH_COLUMN_VECTOR) {
      continue;
    }
    if (constraint->op != SQLITE_INDEX_CONSTRAINT_EQ || !constraint->usable) {
      continue;
    }
    vectorConstrained = 1;
    pIdxInfo->aConstraintUsage[idx].argvIndex = 1;
    pIdxInfo->aConstraintUsage[idx].omit = 1;
  }

  if (!vectorConstrained) {
    return SQLITE_CONSTRAINT;
  }
  pIdxInfo->estimatedCost = (double)100000;
  pIdxInfo->estimatedRows = 100000;
  return SQLITE_OK;
}
/**
 * xFilter for vec_each: loads the vector passed as argv[0] into the cursor and
 * rewinds to the first element.
 *
 * A vector left over from a previous filter is released first. Returns
 * SQLITE_ERROR if argv[0] cannot be converted by vector_from_value() (the
 * conversion's error message is discarded), SQLITE_OK otherwise.
 */
static int vec_eachFilter(sqlite3_vtab_cursor *pVtabCursor, int idxNum,
                          const char *idxStr, int argc, sqlite3_value **argv) {
  UNUSED_PARAMETER(idxNum);
  UNUSED_PARAMETER(idxStr);
  assert(argc == 1);
  vec_each_cursor *cursor = (vec_each_cursor *)pVtabCursor;

  if (cursor->vector != NULL) {
    cursor->cleanup(cursor->vector);
    cursor->vector = NULL;
  }

  char *errorMessage;
  if (vector_from_value(argv[0], &cursor->vector, &cursor->dimensions,
                        &cursor->vector_type, &cursor->cleanup,
                        &errorMessage) != SQLITE_OK) {
    sqlite3_free(errorMessage);
    return SQLITE_ERROR;
  }

  cursor->iRowid = 0;
  return SQLITE_OK;
}
/**
 * xRowid for vec_each: the rowid is the index of the current element.
 */
static int vec_eachRowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid) {
  const vec_each_cursor *cursor = (const vec_each_cursor *)cur;
  *pRowid = cursor->iRowid;
  return SQLITE_OK;
}
/**
 * xEof for vec_each: true once the element index has reached the number of
 * dimensions of the vector.
 */
static int vec_eachEof(sqlite3_vtab_cursor *cur) {
  const vec_each_cursor *cursor = (const vec_each_cursor *)cur;
  i64 elementCount = (i64)cursor->dimensions;
  return cursor->iRowid >= elementCount;
}
/**
 * xNext for vec_each: advances to the next element of the vector.
 */
static int vec_eachNext(sqlite3_vtab_cursor *cur) {
  vec_each_cursor *cursor = (vec_each_cursor *)cur;
  cursor->iRowid += 1;
  return SQLITE_OK;
}
/**
 * xColumn for vec_each: yields the current element for the `value` column.
 *
 * float32 elements are returned as doubles, int8 elements as integers, and
 * bit elements as 0/1 (most significant bit of each byte first). Any other
 * column index produces no result. Always returns SQLITE_OK.
 */
static int vec_eachColumn(sqlite3_vtab_cursor *cur, sqlite3_context *context,
                          int i) {
  vec_each_cursor *cursor = (vec_each_cursor *)cur;
  if (i != VEC_EACH_COLUMN_VALUE) {
    return SQLITE_OK;
  }

  if (cursor->vector_type == SQLITE_VEC_ELEMENT_TYPE_FLOAT32) {
    const f32 *elements = (const f32 *)cursor->vector;
    sqlite3_result_double(context, elements[cursor->iRowid]);
  } else if (cursor->vector_type == SQLITE_VEC_ELEMENT_TYPE_BIT) {
    // locate the byte holding the bit, then test it with a mask that starts
    // at the most significant bit (0x80)
    u8 packed = ((u8 *)cursor->vector)[cursor->iRowid / CHAR_BIT];
    int mask = 0x80 >> (cursor->iRowid % CHAR_BIT);
    sqlite3_result_int(context, (packed & mask) > 0);
  } else if (cursor->vector_type == SQLITE_VEC_ELEMENT_TYPE_INT8) {
    const i8 *elements = (const i8 *)cursor->vector;
    sqlite3_result_int(context, elements[cursor->iRowid]);
  }
  return SQLITE_OK;
}
// Module definition for the vec_each table function. Only the connect and
// read-side cursor callbacks are provided; xCreate is 0 and there is no
// xUpdate or transaction hook.
static sqlite3_module vec_eachModule = {
/* iVersion */ 0,
/* xCreate */ 0,
/* xConnect */ vec_eachConnect,
/* xBestIndex */ vec_eachBestIndex,
/* xDisconnect */ vec_eachDisconnect,
/* xDestroy */ 0,
/* xOpen */ vec_eachOpen,
/* xClose */ vec_eachClose,
/* xFilter */ vec_eachFilter,
/* xNext */ vec_eachNext,
/* xEof */ vec_eachEof,
/* xColumn */ vec_eachColumn,
/* xRowid */ vec_eachRowid,
/* xUpdate */ 0,
/* xBegin */ 0,
/* xSync */ 0,
/* xCommit */ 0,
/* xRollback */ 0,
/* xFindMethod */ 0,
/* xRename */ 0,
/* xSavepoint */ 0,
/* xRelease */ 0,
/* xRollbackTo */ 0,
/* xShadowName */ 0,
// the xIntegrity member only exists in SQLite 3.44.0 and later
#if SQLITE_VERSION_NUMBER >= 3044000
/* xIntegrity */ 0
#endif
};
#pragma endregion
#pragma region vec_npy_each table function
// Token kinds of the numpy (.npy) header, which is a Python dict literal.
enum NpyTokenType {
// NOTE(review): npy_token_next() never emits this kind as written
NPY_TOKEN_TYPE_IDENTIFIER,
// run of decimal digits
NPY_TOKEN_TYPE_NUMBER,
// '('
NPY_TOKEN_TYPE_LPAREN,
// ')'
NPY_TOKEN_TYPE_RPAREN,
// '{'
NPY_TOKEN_TYPE_LBRACE,
// '}'
NPY_TOKEN_TYPE_RBRACE,
// ':'
NPY_TOKEN_TYPE_COLON,
// ','
NPY_TOKEN_TYPE_COMMA,
// single-quoted string, quotes included in the token span
NPY_TOKEN_TYPE_STRING,
// the literal `False`
NPY_TOKEN_TYPE_FALSE,
};
// One token of a numpy header. The token text is the half-open byte range
// [start, end) inside the scanned buffer.
struct NpyToken {
enum NpyTokenType token_type;
// first byte of the token
unsigned char *start;
// one past the last byte of the token
unsigned char *end;
};
int npy_token_next(unsigned char *start, unsigned char *end,
struct NpyToken *out) {
unsigned char *ptr = start;
while (ptr < end) {
unsigned char curr = *ptr;
if (is_whitespace(curr)) {
ptr++;
continue;
} else if (curr == '(') {
out->start = ptr++;
out->end = ptr;
out->token_type = NPY_TOKEN_TYPE_LPAREN;
return VEC0_TOKEN_RESULT_SOME;
} else if (curr == ')') {
out->start = ptr++;
out->end = ptr;
out->token_type = NPY_TOKEN_TYPE_RPAREN;
return VEC0_TOKEN_RESULT_SOME;
} else if (curr == '{') {
out->start = ptr++;
out->end = ptr;
out->token_type = NPY_TOKEN_TYPE_LBRACE;
return VEC0_TOKEN_RESULT_SOME;
} else if (curr == '}') {
out->start = ptr++;
out->end = ptr;
out->token_type = NPY_TOKEN_TYPE_RBRACE;
return VEC0_TOKEN_RESULT_SOME;
} else if (curr == ':') {
out->start = ptr++;
out->end = ptr;
out->token_type = NPY_TOKEN_TYPE_COLON;
return VEC0_TOKEN_RESULT_SOME;
} else if (curr == ',') {
out->start = ptr++;
out->end = ptr;
out->token_type = NPY_TOKEN_TYPE_COMMA;
return VEC0_TOKEN_RESULT_SOME;
} else if (curr == '\'') {
unsigned char *start = ptr;
ptr++;
while (ptr < end) {
if ((*ptr) == '\'') {
break;
}
ptr++;
}
if (ptr >= end || (*ptr) != '\'') {
return VEC0_TOKEN_RESULT_ERROR;
}
out->start = start;
out->end = ++ptr;
out->token_type = NPY_TOKEN_TYPE_STRING;
return VEC0_TOKEN_RESULT_SOME;
} else if (curr == 'F' &&
strncmp((char *)ptr, "False", strlen("False")) == 0) {
out->start = ptr;
out->end = (ptr + (int)strlen("False"));
ptr = out->end;
out->token_type = NPY_TOKEN_TYPE_FALSE;
return VEC0_TOKEN_RESULT_SOME;
} else if (is_digit(curr)) {
unsigned char *start = ptr;
while (ptr < end && (is_digit(*ptr))) {
ptr++;
}
out->start = start;
out->end = ptr;
out->token_type = NPY_TOKEN_TYPE_NUMBER;
return VEC0_TOKEN_RESULT_SOME;
} else {
return VEC0_TOKEN_RESULT_ERROR;
}
}
return VEC0_TOKEN_RESULT_ERROR;
}
// Scanner state for tokenizing a numpy header.
struct NpyScanner {
// Where the next token scan begins; npy_scanner_next() advances it past
// each token it returns
unsigned char *start;
// One past the last byte of the header
unsigned char *end;
// Set to the beginning of the source by npy_scanner_init();
// npy_scanner_next() does not read or update it
unsigned char *ptr;
};
/**
 * @brief Point a scanner at the beginning of a numpy header.
 *
 * @param scanner scanner to initialize
 * @param source first byte of the header
 * @param source_length length of the header in bytes
 */
void npy_scanner_init(struct NpyScanner *scanner, const unsigned char *source,
                      int source_length) {
  unsigned char *begin = (unsigned char *)source;
  scanner->start = begin;
  scanner->ptr = begin;
  scanner->end = begin + source_length;
}
/**
 * @brief Read the next token from the scanner.
 *
 * The scanner only advances (past the returned token) when a token was
 * produced; on any other result its position is unchanged.
 *
 * @return int the result of npy_token_next()
 */
int npy_scanner_next(struct NpyScanner *scanner, struct NpyToken *out) {
  int result = npy_token_next(scanner->start, scanner->end, out);
  if (result != VEC0_TOKEN_RESULT_SOME) {
    return result;
  }
  scanner->start = out->end;
  return result;
}
#define NPY_PARSE_ERROR "Error parsing numpy array: "
int parse_npy_header(sqlite3_vtab *pVTab, const unsigned char *header,
size_t headerLength,
enum VectorElementType *out_element_type,
int *fortran_order, size_t *numElements,
size_t *numDimensions) {
struct NpyScanner scanner;
struct NpyToken token;
int rc;
npy_scanner_init(&scanner, header, headerLength);
if (npy_scanner_next(&scanner, &token) != VEC0_TOKEN_RESULT_SOME &&
token.token_type != NPY_TOKEN_TYPE_LBRACE) {
vtab_set_error(pVTab,
NPY_PARSE_ERROR "numpy header did not start with '{'");
return SQLITE_ERROR;
}
while (1) {
rc = npy_scanner_next(&scanner, &token);
if (rc != VEC0_TOKEN_RESULT_SOME) {
vtab_set_error(pVTab, NPY_PARSE_ERROR "expected key in numpy header");
return SQLITE_ERROR;
}
if (token.token_type == NPY_TOKEN_TYPE_RBRACE) {
break;
}
if (token.token_type != NPY_TOKEN_TYPE_STRING) {
vtab_set_error(pVTab, NPY_PARSE_ERROR
"expected a string as key in numpy header");
return SQLITE_ERROR;
}
unsigned char *key = token.start;
rc = npy_scanner_next(&scanner, &token);
if ((rc != VEC0_TOKEN_RESULT_SOME) ||
(token.token_type != NPY_TOKEN_TYPE_COLON)) {
vtab_set_error(pVTab, NPY_PARSE_ERROR
"expected a ':' after key in numpy header");
return SQLITE_ERROR;
}
if (strncmp((char *)key, "'descr'", strlen("'descr'")) == 0) {
rc = npy_scanner_next(&scanner, &token);
if ((rc != VEC0_TOKEN_RESULT_SOME) ||
(token.token_type != NPY_TOKEN_TYPE_STRING)) {
vtab_set_error(pVTab, NPY_PARSE_ERROR
"expected a string value after 'descr' key");
return SQLITE_ERROR;
}
if (strncmp((char *)token.start, "'maxChunks = 1024;
pCur->chunksBufferSize =
(vector_byte_size(element_type, numDimensions)) * pCur->maxChunks;
pCur->chunksBuffer = sqlite3_malloc(pCur->chunksBufferSize);
if (pCur->chunksBufferSize && !pCur->chunksBuffer) {
return SQLITE_NOMEM;
}
pCur->currentChunkSize =
fread(pCur->chunksBuffer, vector_byte_size(element_type, numDimensions),
pCur->maxChunks, file);
pCur->currentChunkIndex = 0;
pCur->elementType = element_type;
pCur->nElements = numElements;
pCur->nDimensions = numDimensions;
pCur->input_type = VEC_NPY_EACH_INPUT_FILE;
pCur->eof = pCur->currentChunkSize == 0;
pCur->file = file;
return SQLITE_OK;
}
#endif
/**
 * @brief Parse an in-memory numpy (.npy) array and locate its vector data.
 *
 * @param pVTab virtual table that receives the error message on failure
 * @param buffer the raw .npy bytes
 * @param bufferLength length of buffer in bytes
 * @param data on success, points at the first data byte inside buffer (no
 * copy is made, so it is only valid as long as buffer is)
 * @param numElements on success, number of vectors, as reported by
 * parse_npy_header()
 * @param numDimensions on success, dimensions per vector, as reported by
 * parse_npy_header()
 * @param element_type on success, element type of the vectors
 * @return int SQLITE_OK on success; SQLITE_ERROR (with the vtab error set
 * here) if the buffer is too short, lacks the magic prefix, declares a header
 * longer than the buffer, or its data size does not match the header; or the
 * error code of parse_npy_header().
 */
int parse_npy_buffer(sqlite3_vtab *pVTab, const unsigned char *buffer,
                     int bufferLength, void **data, size_t *numElements,
                     size_t *numDimensions,
                     enum VectorElementType *element_type) {
  // magic (6 bytes) + major + minor + 2-byte header length = 10 bytes
  if (bufferLength < 10) {
    // IMP: V03312_20150
    vtab_set_error(pVTab, "numpy array too short");
    return SQLITE_ERROR;
  }
  if (memcmp(NPY_MAGIC, buffer, sizeof(NPY_MAGIC)) != 0) {
    // V11954_28792
    vtab_set_error(pVTab, "numpy array does not contain the 'magic' header");
    return SQLITE_ERROR;
  }

  u8 major = buffer[6];
  u8 minor = buffer[7];
  // NOTE(review): the length is copied in host byte order; confirm behavior
  // on big-endian hosts.
  uint16_t headerLength = 0;
  memcpy(&headerLength, &buffer[8], sizeof(uint16_t));

  i32 totalHeaderLength = sizeof(NPY_MAGIC) + sizeof(major) + sizeof(minor) +
                          sizeof(headerLength) + headerLength;
  i32 dataSize = bufferLength - totalHeaderLength;
  if (dataSize < 0) {
    vtab_set_error(pVTab, "numpy array header length is invalid");
    return SQLITE_ERROR;
  }

  const unsigned char *header = &buffer[10];
  int fortran_order;
  int rc = parse_npy_header(pVTab, header, headerLength, element_type,
                            &fortran_order, numElements, numDimensions);
  if (rc != SQLITE_OK) {
    return rc;
  }

  // The header is untrusted: check numElements * vectorSize == dataSize
  // without computing the product, which could wrap around (or be truncated
  // when narrowed to 32 bits) and let an oversized array pass the check.
  size_t vectorSize = vector_byte_size(*element_type, *numDimensions);
  size_t actualDataSize = (size_t)dataSize;
  int sizeMatches;
  if (vectorSize == 0) {
    sizeMatches = (actualDataSize == 0);
  } else {
    sizeMatches = (actualDataSize % vectorSize == 0) &&
                  (actualDataSize / vectorSize == *numElements);
  }
  if (!sizeMatches) {
    // unsigned 64-bit product for the message only: wrap-around is
    // well-defined and merely affects the reported number
    sqlite3_uint64 expectedDataSize =
        (sqlite3_uint64)*numElements * (sqlite3_uint64)vectorSize;
    vtab_set_error(
        pVTab, "numpy array error: Expected a data size of %lld, found %lld",
        (sqlite3_int64)expectedDataSize, (sqlite3_int64)dataSize);
    return SQLITE_ERROR;
  }

  *data = (void *)&buffer[totalHeaderLength];
  return SQLITE_OK;
}
/**
 * xConnect for vec_npy_each: declares the table schema and allocates a
 * zero-initialized vec_npy_each_vtab.
 *
 * Returns the sqlite3_declare_vtab() error if the declaration fails,
 * SQLITE_NOMEM if the allocation fails (with *ppVtab set to NULL), and
 * SQLITE_OK otherwise.
 */
static int vec_npy_eachConnect(sqlite3 *db, void *pAux, int argc,
                               const char *const *argv, sqlite3_vtab **ppVtab,
                               char **pzErr) {
  UNUSED_PARAMETER(pAux);
  UNUSED_PARAMETER(argc);
  UNUSED_PARAMETER(argv);
  UNUSED_PARAMETER(pzErr);

  int rc = sqlite3_declare_vtab(db, "CREATE TABLE x(vector, input hidden)");
#define VEC_NPY_EACH_COLUMN_VECTOR 0
#define VEC_NPY_EACH_COLUMN_INPUT 1
  if (rc != SQLITE_OK) {
    return rc;
  }

  vec_npy_each_vtab *table = sqlite3_malloc(sizeof(*table));
  *ppVtab = (sqlite3_vtab *)table;
  if (table == NULL) {
    return SQLITE_NOMEM;
  }
  memset(table, 0, sizeof(*table));
  return SQLITE_OK;
}
/**
 * xDisconnect for vec_npy_each: releases the vtab object allocated by
 * vec_npy_eachConnect(). Always returns SQLITE_OK.
 */
static int vec_npy_eachDisconnect(sqlite3_vtab *pVtab) {
  vec_npy_each_vtab *table = (vec_npy_each_vtab *)pVtab;
  sqlite3_free(table);
  return SQLITE_OK;
}
/**
 * xOpen for vec_npy_each: allocates a zero-initialized cursor and hands its
 * base back through ppCursor. Returns SQLITE_NOMEM if the allocation fails
 * (ppCursor is then left untouched), SQLITE_OK otherwise.
 */
static int vec_npy_eachOpen(sqlite3_vtab *p, sqlite3_vtab_cursor **ppCursor) {
  UNUSED_PARAMETER(p);
  vec_npy_each_cursor *cursor = sqlite3_malloc(sizeof(*cursor));
  if (cursor == NULL) {
    return SQLITE_NOMEM;
  }
  memset(cursor, 0, sizeof(*cursor));
  *ppCursor = &cursor->base;
  return SQLITE_OK;
}
/**
 * xClose for vec_npy_each: closes the open file (when filesystem support is
 * compiled in), frees the chunk buffer, and frees the cursor. The `vector`
 * pointer is only cleared, never freed here. Always returns SQLITE_OK.
 */
static int vec_npy_eachClose(sqlite3_vtab_cursor *cur) {
  vec_npy_each_cursor *cursor = (vec_npy_each_cursor *)cur;
#ifndef SQLITE_VEC_OMIT_FS
  if (cursor->file != NULL) {
    fclose(cursor->file);
    cursor->file = NULL;
  }
#endif
  // sqlite3_free() is a harmless no-op on NULL
  sqlite3_free(cursor->chunksBuffer);
  cursor->chunksBuffer = NULL;
  cursor->vector = NULL;
  sqlite3_free(cursor);
  return SQLITE_OK;
}
/**
 * xBestIndex for vec_npy_each: requires a usable equality constraint on the
 * hidden `input` column and routes it to xFilter as argv[0].
 *
 * Returns SQLITE_ERROR with "input argument is required" as the vtab error
 * message when no such constraint exists, SQLITE_OK otherwise (with a fixed
 * cost/row estimate).
 */
static int vec_npy_eachBestIndex(sqlite3_vtab *pVTab,
                                 sqlite3_index_info *pIdxInfo) {
  // must start at 0: it is only ever set inside the loop, and is read
  // afterwards even when no constraint matched
  int hasInput = 0;
  for (int i = 0; i < pIdxInfo->nConstraint; i++) {
    const struct sqlite3_index_constraint *pCons = &pIdxInfo->aConstraint[i];
    if (pCons->iColumn != VEC_NPY_EACH_COLUMN_INPUT) {
      continue;
    }
    if (pCons->op == SQLITE_INDEX_CONSTRAINT_EQ && pCons->usable) {
      hasInput = 1;
      pIdxInfo->aConstraintUsage[i].argvIndex = 1;
      pIdxInfo->aConstraintUsage[i].omit = 1;
    }
  }
  if (!hasInput) {
    // release any earlier message before installing the new one, so it is
    // not leaked
    sqlite3_free(pVTab->zErrMsg);
    pVTab->zErrMsg = sqlite3_mprintf("input argument is required");
    return SQLITE_ERROR;
  }
  pIdxInfo->estimatedCost = (double)100000;
  pIdxInfo->estimatedRows = 100000;
  return SQLITE_OK;
}
/**
 * xFilter for vec_npy_each: (re)initializes the cursor from argv[0].
 *
 * argv[0] is either a pointer value tagged SQLITE_VEC_NPY_FILE_NAME naming a
 * .npy file on disk (only when filesystem support is compiled in), or a blob
 * holding the .npy contents. State left over from a previous filter (open
 * file, chunk buffer, vector pointer) is released first.
 *
 * Returns SQLITE_ERROR if the file cannot be opened, the error code of
 * parse_npy_file() / parse_npy_buffer() if parsing fails, SQLITE_OK otherwise.
 */
static int vec_npy_eachFilter(sqlite3_vtab_cursor *pVtabCursor, int idxNum,
                              const char *idxStr, int argc,
                              sqlite3_value **argv) {
  UNUSED_PARAMETER(idxNum);
  UNUSED_PARAMETER(idxStr);
  assert(argc == 1);
  int rc;

  vec_npy_each_cursor *pCur = (vec_npy_each_cursor *)pVtabCursor;

  // drop whatever a previous xFilter call left behind
#ifndef SQLITE_VEC_OMIT_FS
  if (pCur->file) {
    fclose(pCur->file);
    pCur->file = NULL;
  }
#endif
  if (pCur->chunksBuffer) {
    sqlite3_free(pCur->chunksBuffer);
    pCur->chunksBuffer = NULL;
  }
  if (pCur->vector) {
    pCur->vector = NULL;
  }

#ifndef SQLITE_VEC_OMIT_FS
  struct VecNpyFile *f = NULL;
  if ((f = sqlite3_value_pointer(argv[0], SQLITE_VEC_NPY_FILE_NAME))) {
    // .npy files are binary: open in binary mode so platforms that
    // distinguish text mode do not translate bytes or stop at an EOF marker
    FILE *file = fopen(f->path, "rb");
    if (!file) {
      vtab_set_error(pVtabCursor->pVtab, "Could not open numpy file");
      return SQLITE_ERROR;
    }

    rc = parse_npy_file(pVtabCursor->pVtab, file, pCur);
    if (rc != SQLITE_OK) {
      // presumably the cursor has not taken ownership of the handle on
      // failure, so it is closed here -- verify against parse_npy_file()
      fclose(file);
      return rc;
    }
  } else
#endif
  {
    const unsigned char *input = sqlite3_value_blob(argv[0]);
    int inputLength = sqlite3_value_bytes(argv[0]);
    void *data;
    size_t numElements;
    size_t numDimensions;
    enum VectorElementType element_type;

    rc = parse_npy_buffer(pVtabCursor->pVtab, input, inputLength, &data,
                          &numElements, &numDimensions, &element_type);
    if (rc != SQLITE_OK) {
      return rc;
    }

    // `data` points into the blob of argv[0]; nothing is copied
    pCur->vector = data;
    pCur->elementType = element_type;
    pCur->nElements = numElements;
    pCur->nDimensions = numDimensions;
    pCur->input_type = VEC_NPY_EACH_INPUT_BUFFER;
  }

  pCur->iRowid = 0;
  return SQLITE_OK;
}
/**
 * xRowid for vec_npy_each: the rowid is the index of the current vector.
 */
static int vec_npy_eachRowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid) {
  const vec_npy_each_cursor *cursor = (const vec_npy_each_cursor *)cur;
  *pRowid = cursor->iRowid;
  return SQLITE_OK;
}
/**
 * xEof for vec_npy_each.
 *
 * For buffer input the cursor is at EOF when the array is empty or the row
 * index has reached the number of elements. For any other input type the
 * cursor's `eof` flag is returned.
 */
static int vec_npy_eachEof(sqlite3_vtab_cursor *cur) {
  vec_npy_each_cursor *cursor = (vec_npy_each_cursor *)cur;
  if (cursor->input_type != VEC_NPY_EACH_INPUT_BUFFER) {
    return cursor->eof;
  }
  if (!cursor->nElements) {
    return 1;
  }
  return (size_t)cursor->iRowid >= cursor->nElements;
}
/**
 * xNext for vec_npy_each: advances to the next vector.
 *
 * Buffer input only needs the row index bumped. For file input (when
 * filesystem support is compiled in) the position inside the current chunk is
 * advanced as well; once the chunk is used up the next one is read from the
 * file, and the cursor is flagged EOF when that read returns nothing.
 * Always returns SQLITE_OK.
 */
static int vec_npy_eachNext(sqlite3_vtab_cursor *cur) {
  vec_npy_each_cursor *cursor = (vec_npy_each_cursor *)cur;
  cursor->iRowid += 1;
  if (cursor->input_type == VEC_NPY_EACH_INPUT_BUFFER) {
    return SQLITE_OK;
  }

#ifndef SQLITE_VEC_OMIT_FS
  // else: input is a file
  cursor->currentChunkIndex += 1;
  if (cursor->currentChunkIndex < cursor->currentChunkSize) {
    // still inside the chunk that is already loaded
    return SQLITE_OK;
  }

  // refill the buffer with up to maxChunks vectors
  cursor->currentChunkSize =
      fread(cursor->chunksBuffer,
            vector_byte_size(cursor->elementType, cursor->nDimensions),
            cursor->maxChunks, cursor->file);
  if (cursor->currentChunkSize == 0) {
    cursor->eof = 1;
  }
  cursor->currentChunkIndex = 0;
#endif
  return SQLITE_OK;
}
/**
 * Column accessor for buffer-backed vec_npy_each cursors.
 *
 * For the `vector` column the result subtype is set to the element type, and
 * float32 vectors are returned as a blob copied out of the input buffer; int8
 * and bit vectors produce an error result. Other columns produce no result.
 * Always returns SQLITE_OK.
 */
static int vec_npy_eachColumnBuffer(vec_npy_each_cursor *pCur,
                                    sqlite3_context *context, int i) {
  if (i != VEC_NPY_EACH_COLUMN_VECTOR) {
    return SQLITE_OK;
  }

  sqlite3_result_subtype(context, pCur->elementType);

  if (pCur->elementType == SQLITE_VEC_ELEMENT_TYPE_FLOAT32) {
    // vectors are stored back to back; locate the one for the current row
    unsigned char *bytes = (unsigned char *)pCur->vector;
    sqlite3_result_blob(
        context, &bytes[pCur->iRowid * pCur->nDimensions * sizeof(f32)],
        pCur->nDimensions * sizeof(f32), SQLITE_TRANSIENT);
  } else if (pCur->elementType == SQLITE_VEC_ELEMENT_TYPE_INT8 ||
             pCur->elementType == SQLITE_VEC_ELEMENT_TYPE_BIT) {
    // https://github.com/asg017/sqlite-vec/issues/42
    sqlite3_result_error(context,
                         "vec_npy_each only supports float32 vectors", -1);
  }
  return SQLITE_OK;
}
/**
 * Column accessor for file-backed vec_npy_each cursors.
 *
 * For the `vector` column, float32 vectors are returned as a blob copied out
 * of the currently loaded chunk buffer; int8 and bit vectors produce an error
 * result. Other columns produce no result. Always returns SQLITE_OK.
 */
static int vec_npy_eachColumnFile(vec_npy_each_cursor *pCur,
                                  sqlite3_context *context, int i) {
  if (i != VEC_NPY_EACH_COLUMN_VECTOR) {
    return SQLITE_OK;
  }

  if (pCur->elementType == SQLITE_VEC_ELEMENT_TYPE_FLOAT32) {
    // locate the current vector inside the loaded chunk
    unsigned char *chunkBytes = (unsigned char *)pCur->chunksBuffer;
    sqlite3_result_blob(
        context,
        &chunkBytes[pCur->currentChunkIndex * pCur->nDimensions * sizeof(f32)],
        pCur->nDimensions * sizeof(f32), SQLITE_TRANSIENT);
  } else if (pCur->elementType == SQLITE_VEC_ELEMENT_TYPE_INT8 ||
             pCur->elementType == SQLITE_VEC_ELEMENT_TYPE_BIT) {
    // https://github.com/asg017/sqlite-vec/issues/42
    sqlite3_result_error(context,
                         "vec_npy_each only supports float32 vectors", -1);
  }
  return SQLITE_OK;
}
/**
 * xColumn for vec_npy_each: dispatches to the buffer or file accessor based
 * on the cursor's input type. Returns SQLITE_ERROR for any other input type.
 */
static int vec_npy_eachColumn(sqlite3_vtab_cursor *cur,
                              sqlite3_context *context, int i) {
  vec_npy_each_cursor *cursor = (vec_npy_each_cursor *)cur;
  if (cursor->input_type == VEC_NPY_EACH_INPUT_BUFFER) {
    return vec_npy_eachColumnBuffer(cursor, context, i);
  }
  if (cursor->input_type == VEC_NPY_EACH_INPUT_FILE) {
    return vec_npy_eachColumnFile(cursor, context, i);
  }
  return SQLITE_ERROR;
}
// Module definition for the vec_npy_each table function. Only the connect and
// read-side cursor callbacks are provided; xCreate is 0 and there is no
// xUpdate or transaction hook.
static sqlite3_module vec_npy_eachModule = {
/* iVersion */ 0,
/* xCreate */ 0,
/* xConnect */ vec_npy_eachConnect,
/* xBestIndex */ vec_npy_eachBestIndex,
/* xDisconnect */ vec_npy_eachDisconnect,
/* xDestroy */ 0,
/* xOpen */ vec_npy_eachOpen,
/* xClose */ vec_npy_eachClose,
/* xFilter */ vec_npy_eachFilter,
/* xNext */ vec_npy_eachNext,
/* xEof */ vec_npy_eachEof,
/* xColumn */ vec_npy_eachColumn,
/* xRowid */ vec_npy_eachRowid,
/* xUpdate */ 0,
/* xBegin */ 0,
/* xSync */ 0,
/* xCommit */ 0,
/* xRollback */ 0,
/* xFindMethod */ 0,
/* xRename */ 0,
/* xSavepoint */ 0,
/* xRelease */ 0,
/* xRollbackTo */ 0,
/* xShadowName */ 0,
// the xIntegrity member only exists in SQLite 3.44.0 and later
#if SQLITE_VERSION_NUMBER >= 3044000
/* xIntegrity */ 0,
#endif
};
#pragma endregion
#pragma region vec0 virtual table
#define VEC0_COLUMN_ID 0
#define VEC0_COLUMN_USERN_START 1
#define VEC0_COLUMN_OFFSET_DISTANCE 1
#define VEC0_COLUMN_OFFSET_K 2
#define VEC0_SHADOW_INFO_NAME "\"%w\".\"%w_info\""
#define VEC0_SHADOW_CHUNKS_NAME "\"%w\".\"%w_chunks\""
/// 1) schema, 2) original vtab table name
#define VEC0_SHADOW_CHUNKS_CREATE \
"CREATE TABLE " VEC0_SHADOW_CHUNKS_NAME "(" \
"chunk_id INTEGER PRIMARY KEY AUTOINCREMENT," \
"size INTEGER NOT NULL," \
"validity BLOB NOT NULL," \
"rowids BLOB NOT NULL" \
");"
#define VEC0_SHADOW_ROWIDS_NAME "\"%w\".\"%w_rowids\""
/// 1) schema, 2) original vtab table name
#define VEC0_SHADOW_ROWIDS_CREATE_BASIC \
"CREATE TABLE " VEC0_SHADOW_ROWIDS_NAME "(" \
"rowid INTEGER PRIMARY KEY AUTOINCREMENT," \
"id," \
"chunk_id INTEGER," \
"chunk_offset INTEGER" \
");"
// vec0 tables with a text primary key are still backed by int64 primary keys,
// since a fixed-length rowid is required for vec0 chunks. But we add a new
// 'id text unique' column to emulate a text primary key interface.
#define VEC0_SHADOW_ROWIDS_CREATE_PK_TEXT \
"CREATE TABLE " VEC0_SHADOW_ROWIDS_NAME "(" \
"rowid INTEGER PRIMARY KEY AUTOINCREMENT," \
"id TEXT UNIQUE NOT NULL," \
"chunk_id INTEGER," \
"chunk_offset INTEGER" \
");"
/// 1) schema, 2) original vtab table name
#define VEC0_SHADOW_VECTOR_N_NAME "\"%w\".\"%w_vector_chunks%02d\""
/// 1) schema, 2) original vtab table name
//
// IMPORTANT: "rowid" is declared as PRIMARY KEY but WITHOUT the INTEGER type.
// This means it is NOT a true SQLite rowid alias — the user-defined "rowid"
// column and the internal SQLite rowid (_rowid_) are two separate values.
// When inserting, both must be set explicitly to keep them in sync. See the
// _rowid_ bindings in vec0_new_chunk() and the explanation in
// SHADOW_TABLE_ROWID_QUIRK below.
#define VEC0_SHADOW_VECTOR_N_CREATE \
"CREATE TABLE " VEC0_SHADOW_VECTOR_N_NAME "(" \
"rowid PRIMARY KEY," \
"vectors BLOB NOT NULL" \
");"
#define VEC0_SHADOW_AUXILIARY_NAME "\"%w\".\"%w_auxiliary\""
#define VEC0_SHADOW_METADATA_N_NAME "\"%w\".\"%w_metadatachunks%02d\""
#define VEC0_SHADOW_METADATA_TEXT_DATA_NAME "\"%w\".\"%w_metadatatext%02d\""
#define VEC_INTERAL_ERROR "Internal sqlite-vec error: "
#define REPORT_URL "https://github.com/asg017/sqlite-vec/issues/new"
typedef struct vec0_vtab vec0_vtab;
#define VEC0_MAX_VECTOR_COLUMNS 16
#define VEC0_MAX_PARTITION_COLUMNS 4
#define VEC0_MAX_AUXILIARY_COLUMNS 16
#define VEC0_MAX_METADATA_COLUMNS 16
#define SQLITE_VEC_VEC0_MAX_DIMENSIONS 8192
#define VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH 16
#define VEC0_METADATA_TEXT_VIEW_DATA_LENGTH 12
// The kind of each user-declared vec0 column, as stored in
// vec0_vtab.user_column_kinds.
typedef enum {
// vector column, ie "contents_embedding float[1024]"
SQLITE_VEC0_USER_COLUMN_KIND_VECTOR = 1,
// partition key column, ie "user_id integer partition key"
SQLITE_VEC0_USER_COLUMN_KIND_PARTITION = 2,
// auxiliary column, whose values are read from the _auxiliary shadow table
SQLITE_VEC0_USER_COLUMN_KIND_AUXILIARY = 3,
// metadata column that can be filtered, ie "genre text"
SQLITE_VEC0_USER_COLUMN_KIND_METADATA = 4,
} vec0_user_column_kind;
// In-memory state of one vec0 virtual table: column definitions, shadow table
// names, and lazily-prepared statements against the shadow tables.
struct vec0_vtab {
// SQLite virtual table base object; passed to vtab_set_error() to report
// errors.
sqlite3_vtab base;
// the SQLite connection of the host database
sqlite3 *db;
// True if the primary key of the vec0 table has a column type TEXT.
// Will change the schema of the _rowids table, and insert/query logic.
int pkIsText;
// number of defined vector columns.
int numVectorColumns;
// number of defined PARTITION KEY columns.
int numPartitionColumns;
// number of defined auxiliary columns
int numAuxiliaryColumns;
// number of defined metadata columns
int numMetadataColumns;
// Name of the schema the table exists on.
// Must be freed with sqlite3_free()
char *schemaName;
// Name of the vec0 virtual table itself.
// Must be freed with sqlite3_free()
char *tableName;
// Name of the _rowids shadow table.
// Must be freed with sqlite3_free()
char *shadowRowidsName;
// Name of the _chunks shadow table.
// Must be freed with sqlite3_free()
char *shadowChunksName;
// contains enum vec0_user_column_kind values for up to
// numVectorColumns + numPartitionColumns + numAuxiliaryColumns +
// numMetadataColumns entries, indexed by user column position.
vec0_user_column_kind user_column_kinds[VEC0_MAX_VECTOR_COLUMNS + VEC0_MAX_PARTITION_COLUMNS + VEC0_MAX_AUXILIARY_COLUMNS + VEC0_MAX_METADATA_COLUMNS];
// For each user column position, the index of that column within the
// array for its kind (vector_columns, paritition_columns, ...).
uint8_t user_column_idxs[VEC0_MAX_VECTOR_COLUMNS + VEC0_MAX_PARTITION_COLUMNS + VEC0_MAX_AUXILIARY_COLUMNS + VEC0_MAX_METADATA_COLUMNS];
// Name of all the vector chunk shadow tables.
// Ex '_vector_chunks00'
// Only the first numVectorColumns entries will be available.
// The first numVectorColumns entries must be freed with sqlite3_free()
char *shadowVectorChunksNames[VEC0_MAX_VECTOR_COLUMNS];
// Name of all metadata chunk shadow tables, ie `_metadatachunks00`
// Only the first numMetadataColumns entries will be available.
// The first numMetadataColumns entries must be freed with sqlite3_free()
char *shadowMetadataChunksNames[VEC0_MAX_METADATA_COLUMNS];
// Per-kind column definitions. Only the first num*Columns entries of each
// array are in use.
struct VectorColumnDefinition vector_columns[VEC0_MAX_VECTOR_COLUMNS];
struct Vec0PartitionColumnDefinition paritition_columns[VEC0_MAX_PARTITION_COLUMNS];
struct Vec0AuxiliaryColumnDefinition auxiliary_columns[VEC0_MAX_AUXILIARY_COLUMNS];
struct Vec0MetadataColumnDefinition metadata_columns[VEC0_MAX_METADATA_COLUMNS];
// Number of row slots in each chunk; sizes the validity bitmap, the rowids
// blob and the vector/metadata blobs of a new chunk.
int chunk_size;
// select latest chunk from _chunks, getting chunk_id
// SQL: "SELECT max(rowid) FROM _chunks", with one "partitionNN = ?"
// constraint per partition key column when the table has any.
sqlite3_stmt *stmtLatestChunk;
/**
* Statement to insert a row into the _rowids table, with a rowid.
* Parameters:
* 1: int64, rowid to insert
* Result columns: none
* SQL: "INSERT INTO _rowids(rowid) VALUES (?)"
*
* Must be cleaned up with sqlite3_finalize().
*/
sqlite3_stmt *stmtRowidsInsertRowid;
/**
* Statement to insert a row into the _rowids table, with an id.
* The id column isn't a traditional primary key, but instead a unique
* column to handle "text primary key" vec0 tables. The true int64 rowid
* can be retrieved after inserting with sqlite3_last_insert_rowid().
*
* Parameters:
* 1: text or null, id to insert
* Result columns: none
* SQL: "INSERT INTO _rowids(id) VALUES (?)"
*
* Must be cleaned up with sqlite3_finalize().
*/
sqlite3_stmt *stmtRowidsInsertId;
/**
* Statement to update the "position" columns chunk_id and chunk_offset for
* a given _rowids row. Used when the "next available" chunk position is found
* for a vector.
*
* Parameters:
* 1: int64, chunk_id value
* 2: int64, chunk_offset value
* 3: int64, rowid value
* Result columns: none
* SQL: "UPDATE _rowids SET chunk_id = ?, chunk_offset = ? WHERE rowid = ?"
*
* Must be cleaned up with sqlite3_finalize().
*/
sqlite3_stmt *stmtRowidsUpdatePosition;
/**
* Statement to quickly find the id, chunk_id + chunk_offset of a given row.
* Parameters:
* 1: rowid of the row/vector to lookup
* Result columns:
* 0: id (any type)
* 1: chunk_id (i64)
* 2: chunk_offset (i64)
* SQL: "SELECT id, chunk_id, chunk_offset FROM _rowids WHERE rowid = ?"
*
* Must be cleaned up with sqlite3_finalize().
*/
sqlite3_stmt *stmtRowidsGetChunkPosition;
};
/**
 * @brief Finalize every cached sqlite3_stmt held by a vec0_vtab and reset the
 * corresponding members to NULL, so they are lazily re-prepared on next use.
 *
 * @param p vec0_vtab pointer
 */
void vec0_free_resources(vec0_vtab *p) {
  // Addresses of the cached statement slots, in the order they are released.
  sqlite3_stmt **cached[] = {
      &p->stmtLatestChunk,
      &p->stmtRowidsInsertRowid,
      &p->stmtRowidsInsertId,
      &p->stmtRowidsUpdatePosition,
      &p->stmtRowidsGetChunkPosition,
  };
  int numCached = (int)(sizeof(cached) / sizeof(cached[0]));

  for (int i = 0; i < numCached; i++) {
    sqlite3_finalize(*cached[i]);
    *cached[i] = NULL;
  }
}
/**
 * @brief Free all memory and sqlite3_stmt members of a vec0_vtab
 *
 * Releases the cached statements, the schema/table/shadow-table name strings
 * and the name of every defined column. Every freed pointer is reset to NULL.
 * The vec0_vtab allocation itself is NOT freed here.
 *
 * @param p vec0_vtab pointer
 */
void vec0_free(vec0_vtab *p) {
  vec0_free_resources(p);

  sqlite3_free(p->schemaName);
  p->schemaName = NULL;
  sqlite3_free(p->tableName);
  p->tableName = NULL;
  sqlite3_free(p->shadowChunksName);
  p->shadowChunksName = NULL;
  sqlite3_free(p->shadowRowidsName);
  p->shadowRowidsName = NULL;

  for (int i = 0; i < p->numVectorColumns; i++) {
    sqlite3_free(p->shadowVectorChunksNames[i]);
    p->shadowVectorChunksNames[i] = NULL;

    sqlite3_free(p->vector_columns[i].name);
    p->vector_columns[i].name = NULL;
  }

  for (int i = 0; i < p->numPartitionColumns; i++) {
    sqlite3_free(p->paritition_columns[i].name);
    p->paritition_columns[i].name = NULL;
  }

  for (int i = 0; i < p->numAuxiliaryColumns; i++) {
    sqlite3_free(p->auxiliary_columns[i].name);
    p->auxiliary_columns[i].name = NULL;
  }

  for (int i = 0; i < p->numMetadataColumns; i++) {
    // The metadata chunk shadow table names are owned by the vtab as well
    // (see the shadowMetadataChunksNames member); release them alongside the
    // column names so they do not leak. sqlite3_free(NULL) is a no-op.
    sqlite3_free(p->shadowMetadataChunksNames[i]);
    p->shadowMetadataChunksNames[i] = NULL;

    sqlite3_free(p->metadata_columns[i].name);
    p->metadata_columns[i].name = NULL;
  }
}
/**
 * @brief Total number of user-declared columns on the vec0 table: vector,
 * partition key, auxiliary and metadata columns combined.
 */
int vec0_num_defined_user_columns(vec0_vtab *p) {
  int total = p->numVectorColumns;
  total += p->numPartitionColumns;
  total += p->numAuxiliaryColumns;
  total += p->numMetadataColumns;
  return total;
}
/**
 * @brief Column index of the hidden "distance" column of a vec0 table.
 *
 * Computed relative to the last user-defined column.
 *
 * @param p vec0 table
 * @return int distance column index
 */
int vec0_column_distance_idx(vec0_vtab *p) {
  int lastUserColumn =
      VEC0_COLUMN_USERN_START + (vec0_num_defined_user_columns(p) - 1);
  return lastUserColumn + VEC0_COLUMN_OFFSET_DISTANCE;
}
/**
 * @brief Column index of the hidden "k" column of a vec0 table.
 *
 * Computed relative to the last user-defined column.
 *
 * @param p vec0 table
 * @return int k column index
 */
int vec0_column_k_idx(vec0_vtab *p) {
  int lastUserColumn =
      VEC0_COLUMN_USERN_START + (vec0_num_defined_user_columns(p) - 1);
  return lastUserColumn + VEC0_COLUMN_OFFSET_K;
}
/**
 * Returns 1 if column_idx falls within the user-defined columns and that
 * column is a vector column, 0 otherwise.
 */
int vec0_column_idx_is_vector(vec0_vtab *pVtab, int column_idx) {
  if (column_idx < VEC0_COLUMN_USERN_START) {
    return 0;
  }
  // Bounds are checked before user_column_kinds is indexed.
  if (column_idx >
      (VEC0_COLUMN_USERN_START + vec0_num_defined_user_columns(pVtab) - 1)) {
    return 0;
  }
  return pVtab->user_column_kinds[column_idx - VEC0_COLUMN_USERN_START] ==
         SQLITE_VEC0_USER_COLUMN_KIND_VECTOR;
}
/**
 * Maps a table column index to its index among the vector columns.
 * ONLY call after validating column_idx with vec0_column_idx_is_vector.
 */
int vec0_column_idx_to_vector_idx(vec0_vtab *pVtab, int column_idx) {
  int user_idx = column_idx - VEC0_COLUMN_USERN_START;
  return pVtab->user_column_idxs[user_idx];
}
/**
 * Returns 1 if column_idx falls within the user-defined columns and that
 * column is a "partition key" column, 0 otherwise.
 */
int vec0_column_idx_is_partition(vec0_vtab *pVtab, int column_idx) {
  if (column_idx < VEC0_COLUMN_USERN_START) {
    return 0;
  }
  // Bounds are checked before user_column_kinds is indexed.
  if (column_idx >
      (VEC0_COLUMN_USERN_START + vec0_num_defined_user_columns(pVtab) - 1)) {
    return 0;
  }
  return pVtab->user_column_kinds[column_idx - VEC0_COLUMN_USERN_START] ==
         SQLITE_VEC0_USER_COLUMN_KIND_PARTITION;
}
/**
 * Maps a table column index to its index among the partition key columns.
 * ONLY call after validating column_idx with vec0_column_idx_is_partition.
 */
int vec0_column_idx_to_partition_idx(vec0_vtab *pVtab, int column_idx) {
  int user_idx = column_idx - VEC0_COLUMN_USERN_START;
  return pVtab->user_column_idxs[user_idx];
}
/**
 * Returns 1 if column_idx falls within the user-defined columns and that
 * column is an auxiliary column, 0 otherwise.
 */
int vec0_column_idx_is_auxiliary(vec0_vtab *pVtab, int column_idx) {
  if (column_idx < VEC0_COLUMN_USERN_START) {
    return 0;
  }
  // Bounds are checked before user_column_kinds is indexed.
  if (column_idx >
      (VEC0_COLUMN_USERN_START + vec0_num_defined_user_columns(pVtab) - 1)) {
    return 0;
  }
  return pVtab->user_column_kinds[column_idx - VEC0_COLUMN_USERN_START] ==
         SQLITE_VEC0_USER_COLUMN_KIND_AUXILIARY;
}
/**
 * Maps a table column index to its index among the auxiliary columns.
 * ONLY call after validating column_idx with vec0_column_idx_is_auxiliary.
 */
int vec0_column_idx_to_auxiliary_idx(vec0_vtab *pVtab, int column_idx) {
  int user_idx = column_idx - VEC0_COLUMN_USERN_START;
  return pVtab->user_column_idxs[user_idx];
}
/**
 * Returns 1 if column_idx falls within the user-defined columns and that
 * column is a metadata column, 0 otherwise.
 */
int vec0_column_idx_is_metadata(vec0_vtab *pVtab, int column_idx) {
  if (column_idx < VEC0_COLUMN_USERN_START) {
    return 0;
  }
  // Bounds are checked before user_column_kinds is indexed.
  if (column_idx >
      (VEC0_COLUMN_USERN_START + vec0_num_defined_user_columns(pVtab) - 1)) {
    return 0;
  }
  return pVtab->user_column_kinds[column_idx - VEC0_COLUMN_USERN_START] ==
         SQLITE_VEC0_USER_COLUMN_KIND_METADATA;
}
/**
 * Maps a table column index to its index among the metadata columns.
 * ONLY call after validating column_idx with vec0_column_idx_is_metadata.
 */
int vec0_column_idx_to_metadata_idx(vec0_vtab *pVtab, int column_idx) {
  int user_idx = column_idx - VEC0_COLUMN_USERN_START;
  return pVtab->user_column_idxs[user_idx];
}
/**
 * @brief Retrieve the chunk_id, chunk_offset, and possible "id" value
 * of a vec0_vtab row with the provided rowid
 *
 * Lazily prepares and caches p->stmtRowidsGetChunkPosition on first use. The
 * statement is always reset and its bindings cleared before returning.
 *
 * @param p vec0_vtab
 * @param rowid the rowid of the row to query
 * @param id output, optional sqlite3_value to provide the id.
 *  Useful for text PK rows. Must be freed with sqlite3_value_free().
 *  Only written when a row was found.
 * @param chunk_id output, optional, the chunk_id the row belongs to
 * @param chunk_offset output, optional, the offset within the chunk the row
 *  belongs to
 * @return SQLITE_OK on success, SQLITE_EMPTY if no row with that rowid
 *  exists, error code otherwise.
 */
int vec0_get_chunk_position(vec0_vtab *p, i64 rowid, sqlite3_value **id,
                            i64 *chunk_id, i64 *chunk_offset) {
  int rc;

  if (!p->stmtRowidsGetChunkPosition) {
    const char *zSql =
        sqlite3_mprintf("SELECT id, chunk_id, chunk_offset "
                        "FROM " VEC0_SHADOW_ROWIDS_NAME " WHERE rowid = ?",
                        p->schemaName, p->tableName);
    if (!zSql) {
      rc = SQLITE_NOMEM;
      goto cleanup;
    }
    rc = sqlite3_prepare_v2(p->db, zSql, -1, &p->stmtRowidsGetChunkPosition, 0);
    sqlite3_free((void *)zSql);
    if (rc != SQLITE_OK) {
      vtab_set_error(
          &p->base, VEC_INTERAL_ERROR
          "could not initialize 'rowids get chunk position' statement");
      goto cleanup;
    }
  }

  sqlite3_bind_int64(p->stmtRowidsGetChunkPosition, 1, rowid);
  rc = sqlite3_step(p->stmtRowidsGetChunkPosition);
  // special case: when no results, return SQLITE_EMPTY to convey "that chunk
  // position doesnt exist"
  if (rc == SQLITE_DONE) {
    rc = SQLITE_EMPTY;
    goto cleanup;
  }
  if (rc != SQLITE_ROW) {
    goto cleanup;
  }

  if (id) {
    // The column value is only valid until the statement is reset, so hand
    // the caller a private copy.
    sqlite3_value *value =
        sqlite3_column_value(p->stmtRowidsGetChunkPosition, 0);
    *id = sqlite3_value_dup(value);
    if (!*id) {
      rc = SQLITE_NOMEM;
      goto cleanup;
    }
  }

  if (chunk_id) {
    *chunk_id = sqlite3_column_int64(p->stmtRowidsGetChunkPosition, 1);
  }
  if (chunk_offset) {
    *chunk_offset = sqlite3_column_int64(p->stmtRowidsGetChunkPosition, 2);
  }

  rc = SQLITE_OK;

cleanup:
  // The statement is still NULL when the lazy initialization above failed.
  // sqlite3_clear_bindings() must not be called with a NULL statement, so
  // guard the cleanup like the other cached statements do.
  if (p->stmtRowidsGetChunkPosition) {
    sqlite3_reset(p->stmtRowidsGetChunkPosition);
    sqlite3_clear_bindings(p->stmtRowidsGetChunkPosition);
  }
  return rc;
}
/**
 * @brief Fetch the id column of the _rowids row whose rowid matches.
 *
 * Thin wrapper over vec0_get_chunk_position() that only requests the id.
 *
 * @param pVtab: vec0 table to query
 * @param rowid: rowid of the row to query.
 * @param out: Receives a dup'ed sqlite3_value of the id column. Might be null.
 *  Must be cleaned up with sqlite3_value_free().
 * @returns SQLITE_OK on success, error code on failure
 */
int vec0_get_id_value_from_rowid(vec0_vtab *pVtab, i64 rowid,
                                 sqlite3_value **out) {
  // PERF: different strategy than get_chunk_position?
  i64 *noChunkId = NULL;
  i64 *noChunkOffset = NULL;
  return vec0_get_chunk_position(pVtab, rowid, out, noChunkId, noChunkOffset);
}
/**
 * @brief Look up the int64 rowid of the _rowids row whose id column equals
 * the given value.
 *
 * @param p vec0 table to query
 * @param valueId id value to search for
 * @param rowid output, the matching rowid. Written once a row is found.
 * @return SQLITE_OK on success, SQLITE_EMPTY when no row has that id,
 *  otherwise the result code of the failing SQLite call.
 */
int vec0_rowid_from_id(vec0_vtab *p, sqlite3_value *valueId, i64 *rowid) {
  sqlite3_stmt *lookup = NULL;
  int rc;
  char *sql = sqlite3_mprintf("SELECT rowid"
                              " FROM " VEC0_SHADOW_ROWIDS_NAME " WHERE id = ?",
                              p->schemaName, p->tableName);
  if (sql == NULL) {
    return SQLITE_NOMEM;
  }

  rc = sqlite3_prepare_v2(p->db, sql, -1, &lookup, NULL);
  sqlite3_free(sql);

  if (rc == SQLITE_OK) {
    sqlite3_bind_value(lookup, 1, valueId);
    rc = sqlite3_step(lookup);
    if (rc == SQLITE_DONE) {
      // no row carries this id
      rc = SQLITE_EMPTY;
    } else if (rc == SQLITE_ROW) {
      *rowid = sqlite3_column_int64(lookup, 0);
      // step once more; anything other than "done" is passed back as-is
      rc = sqlite3_step(lookup);
      if (rc == SQLITE_DONE) {
        rc = SQLITE_OK;
      }
    }
  }

  sqlite3_finalize(lookup);
  return rc;
}
/**
 * @brief Set the SQL result of `context` to the primary key of a row.
 *
 * For integer-keyed tables the rowid itself is the result. For text-keyed
 * tables the id value is looked up in the _rowids shadow table.
 *
 * @param p vec0 table
 * @param context SQLite context that receives the result
 * @param rowid rowid of the row
 * @return SQLITE_OK on success, error code of the id lookup otherwise
 */
int vec0_result_id(vec0_vtab *p, sqlite3_context *context, i64 rowid) {
  sqlite3_value *idValue = NULL;
  int rc;

  if (p->pkIsText) {
    rc = vec0_get_id_value_from_rowid(p, rowid, &idValue);
    if (rc != SQLITE_OK) {
      return rc;
    }
    if (idValue) {
      sqlite3_result_value(context, idValue);
      sqlite3_value_free(idValue);
    } else {
      sqlite3_result_error_nomem(context);
    }
    return SQLITE_OK;
  }

  sqlite3_result_int64(context, rowid);
  return SQLITE_OK;
}
/**
 * @brief Read the raw bytes of one vector of one row out of the vector chunk
 * shadow table.
 *
 * Resolves the row's chunk position, opens the "vectors" blob of that chunk
 * read-only and copies the row's slice into a freshly allocated buffer.
 *
 * @param pVtab: virtual table to query
 * @param rowid: row to lookup
 * @param vector_column_idx: which vector column to query
 * @param outVector: Output pointer to the vector buffer.
 *    Must be sqlite3_free()'ed. Only written on success.
 * @param outVectorSize: Optional pointer to an int where the size in bytes of
 *    outVector will be stored.
 * @return int SQLITE_OK on success, SQLITE_EMPTY if the rowid does not exist,
 *    error code otherwise.
 */
int vec0_get_vector_data(vec0_vtab *pVtab, i64 rowid, int vector_column_idx,
                         void **outVector, int *outVectorSize) {
  sqlite3_blob *chunkBlob = NULL;
  void *vector = NULL;
  i64 chunk_id;
  i64 chunk_offset;
  size_t vectorBytes;
  int readOffset;
  int rc;
  int closeRc;

  assert((vector_column_idx >= 0) &&
         (vector_column_idx < pVtab->numVectorColumns));

  rc = vec0_get_chunk_position(pVtab, rowid, NULL, &chunk_id, &chunk_offset);
  if (rc != SQLITE_OK) {
    if (rc == SQLITE_EMPTY) {
      vtab_set_error(&pVtab->base, "Could not find a row with rowid %lld",
                     rowid);
    }
    goto cleanup;
  }

  // The chunk row is addressed by chunk_id; flags=0 opens it read-only.
  rc = sqlite3_blob_open(pVtab->db, pVtab->schemaName,
                         pVtab->shadowVectorChunksNames[vector_column_idx],
                         "vectors", chunk_id, 0, &chunkBlob);
  if (rc != SQLITE_OK) {
    vtab_set_error(&pVtab->base,
                   "Could not fetch vector data for %lld, opening blob failed",
                   rowid);
    rc = SQLITE_ERROR;
    goto cleanup;
  }

  // Vectors are laid out back to back, so the row's slice starts at
  // chunk_offset * (bytes per vector).
  vectorBytes =
      vector_column_byte_size(pVtab->vector_columns[vector_column_idx]);
  readOffset = chunk_offset * vectorBytes;

  vector = sqlite3_malloc(vectorBytes);
  if (vector == NULL) {
    rc = SQLITE_NOMEM;
    goto cleanup;
  }

  rc = sqlite3_blob_read(chunkBlob, vector, vectorBytes, readOffset);
  if (rc != SQLITE_OK) {
    sqlite3_free(vector);
    vector = NULL;
    vtab_set_error(
        &pVtab->base,
        "Could not fetch vector data for %lld, reading from blob failed",
        rowid);
    rc = SQLITE_ERROR;
    goto cleanup;
  }

  *outVector = vector;
  if (outVectorSize) {
    *outVectorSize = vectorBytes;
  }
  rc = SQLITE_OK;

cleanup:
  closeRc = sqlite3_blob_close(chunkBlob);
  // Only surface a close failure when everything else succeeded.
  if ((rc == SQLITE_OK) && (closeRc != SQLITE_OK)) {
    vtab_set_error(
        &pVtab->base, VEC_INTERAL_ERROR
        "unknown error, could not close vector blob, please file an issue");
    return closeRc;
  }
  return rc;
}
/**
 * @brief Fetch the value of one partition key column for a given row.
 *
 * The partition value lives on the row's chunk, so the row's chunk position
 * is resolved first and then the partitionNN column of that _chunks row is
 * read.
 *
 * @param pVtab - the vec0_vtab in question
 * @param rowid - rowid of target row
 * @param partition_idx - which partition column to retrieve
 * @param outValue - output, dup'ed sqlite3_value. Written only when the chunk
 *  row was found.
 * @return int - SQLITE_OK on success, otherwise error code
 */
int vec0_get_partition_value_for_rowid(vec0_vtab *pVtab, i64 rowid, int partition_idx, sqlite3_value ** outValue) {
  sqlite3_stmt *query = NULL;
  char *sql;
  i64 chunk_id;
  i64 chunk_offset;
  int rc;

  rc = vec0_get_chunk_position(pVtab, rowid, NULL, &chunk_id, &chunk_offset);
  if (rc != SQLITE_OK) {
    return rc;
  }

  sql = sqlite3_mprintf("SELECT partition%02d FROM " VEC0_SHADOW_CHUNKS_NAME " WHERE chunk_id = ?", partition_idx, pVtab->schemaName, pVtab->tableName);
  if (sql == NULL) {
    return SQLITE_NOMEM;
  }
  rc = sqlite3_prepare_v2(pVtab->db, sql, -1, &query, NULL);
  sqlite3_free(sql);
  if (rc != SQLITE_OK) {
    return rc;
  }

  sqlite3_bind_int64(query, 1, chunk_id);
  if (sqlite3_step(query) == SQLITE_ROW) {
    *outValue = sqlite3_value_dup(sqlite3_column_value(query, 0));
    rc = (*outValue) ? SQLITE_OK : SQLITE_NOMEM;
  } else {
    // a missing chunk row and a step failure are both reported as an error
    rc = SQLITE_ERROR;
  }

  sqlite3_finalize(query);
  return rc;
}
/**
 * @brief Fetch the value of one auxiliary column for a given row from the
 * _auxiliary shadow table.
 *
 * @param pVtab vec0_vtab
 * @param rowid the rowid of the row to lookup
 * @param auxiliary_idx aux index of the column we care about
 * @param outValue output, dup'ed sqlite3_value. Written only when the row was
 *  found.
 * @return int SQLITE_OK on success, error code otherwise
 */
int vec0_get_auxiliary_value_for_rowid(vec0_vtab *pVtab, i64 rowid, int auxiliary_idx, sqlite3_value ** outValue) {
  sqlite3_stmt *query = NULL;
  int rc;
  char *sql = sqlite3_mprintf("SELECT value%02d FROM " VEC0_SHADOW_AUXILIARY_NAME " WHERE rowid = ?", auxiliary_idx, pVtab->schemaName, pVtab->tableName);

  if (sql == NULL) {
    return SQLITE_NOMEM;
  }
  rc = sqlite3_prepare_v2(pVtab->db, sql, -1, &query, NULL);
  sqlite3_free(sql);
  if (rc != SQLITE_OK) {
    return rc;
  }

  sqlite3_bind_int64(query, 1, rowid);
  if (sqlite3_step(query) == SQLITE_ROW) {
    *outValue = sqlite3_value_dup(sqlite3_column_value(query, 0));
    rc = (*outValue) ? SQLITE_OK : SQLITE_NOMEM;
  } else {
    // a missing row and a step failure are both reported as an error
    rc = SQLITE_ERROR;
  }

  sqlite3_finalize(query);
  return rc;
}
/**
 * @brief Result the given metadata value for the given row and metadata column index.
 * Will traverse the metadatachunksNN table with BLOB I/O for the given rowid.
 *
 * Booleans are stored as one bit per row, integers and floats as 8 bytes per
 * row, and text as a fixed-size "view" per row. Text longer than
 * VEC0_METADATA_TEXT_VIEW_DATA_LENGTH bytes is fetched from the
 * _metadatatextNN shadow table instead of the view.
 *
 * @param p vec0 table
 * @param rowid rowid of the row to read
 * @param metadata_idx index of the metadata column
 * @param context SQLite context that receives the value as its result
 * @return int SQLITE_OK on success, error code otherwise
 */
int vec0_result_metadata_value_for_rowid(vec0_vtab *p, i64 rowid, int metadata_idx, sqlite3_context * context) {
  int rc;
  i64 chunk_id;
  i64 chunk_offset;
  rc = vec0_get_chunk_position(p, rowid, NULL, &chunk_id, &chunk_offset);
  if(rc != SQLITE_OK) {
    return rc;
  }
  sqlite3_blob * blobValue;
  rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowMetadataChunksNames[metadata_idx], "data", chunk_id, 0, &blobValue);
  if(rc != SQLITE_OK) {
    return rc;
  }

  switch(p->metadata_columns[metadata_idx].kind) {
    case VEC0_METADATA_COLUMN_KIND_BOOLEAN: {
      // one bit per row: read the containing byte, then pick the row's bit
      u8 block;
      rc = sqlite3_blob_read(blobValue, &block, sizeof(block), chunk_offset / CHAR_BIT);
      if(rc != SQLITE_OK) {
        goto done;
      }
      int value = block >> ((chunk_offset % CHAR_BIT)) & 1;
      sqlite3_result_int(context, value);
      break;
    }
    case VEC0_METADATA_COLUMN_KIND_INTEGER: {
      i64 value;
      rc = sqlite3_blob_read(blobValue, &value, sizeof(value), chunk_offset * sizeof(i64));
      if(rc != SQLITE_OK) {
        goto done;
      }
      sqlite3_result_int64(context, value);
      break;
    }
    case VEC0_METADATA_COLUMN_KIND_FLOAT: {
      double value;
      rc = sqlite3_blob_read(blobValue, &value, sizeof(value), chunk_offset * sizeof(double));
      if(rc != SQLITE_OK) {
        goto done;
      }
      sqlite3_result_double(context, value);
      break;
    }
    case VEC0_METADATA_COLUMN_KIND_TEXT: {
      u8 view[VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH];
      rc = sqlite3_blob_read(blobValue, &view, VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH, chunk_offset * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH);
      if(rc != SQLITE_OK) {
        goto done;
      }
      // the first 4 bytes of the view hold the text length
      int length = ((int *)view)[0];
      if(length < 0) {
        // A negative length can only come from a damaged shadow table.
        // Passing it on to sqlite3_result_text() would make SQLite scan for
        // a NUL terminator and possibly read past the end of `view`.
        rc = SQLITE_ERROR;
        goto done;
      }
      if(length <= VEC0_METADATA_TEXT_VIEW_DATA_LENGTH) {
        // short text is stored inline, right after the length
        sqlite3_result_text(context, (const char*) (view + 4), length, SQLITE_TRANSIENT);
      }
      else {
        sqlite3_stmt * stmt;
        const char * zSql = sqlite3_mprintf("SELECT data FROM " VEC0_SHADOW_METADATA_TEXT_DATA_NAME " WHERE rowid = ?", p->schemaName, p->tableName, metadata_idx);
        if(!zSql) {
          // sqlite3_mprintf() only fails on allocation failure
          rc = SQLITE_NOMEM;
          goto done;
        }
        rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL);
        sqlite3_free((void *) zSql);
        if(rc != SQLITE_OK) {
          goto done;
        }
        sqlite3_bind_int64(stmt, 1, rowid);
        rc = sqlite3_step(stmt);
        if(rc != SQLITE_ROW) {
          sqlite3_finalize(stmt);
          rc = SQLITE_ERROR;
          goto done;
        }
        sqlite3_result_value(context, sqlite3_column_value(stmt, 0));
        sqlite3_finalize(stmt);
        rc = SQLITE_OK;
      }
      break;
    }
  }
  done:
    // blobValue is read-only, will not fail on close
    sqlite3_blob_close(blobValue);
    return rc;
}
/**
 * @brief Find the rowid of the most recent chunk, optionally restricted to
 * the chunks of one partition.
 *
 * Lazily prepares and caches p->stmtLatestChunk; the statement is reset and
 * its bindings cleared before returning.
 *
 * @param p vec0 table
 * @param chunk_rowid output, the largest chunk rowid. Only written on
 *  success.
 * @param partitionKeyValues one value per partition key column, bound to the
 *  "partitionNN = ?" constraints. Not read when the table has no partition
 *  columns.
 * @return SQLITE_OK on success, SQLITE_EMPTY when no matching chunk exists,
 *  error code otherwise.
 */
int vec0_get_latest_chunk_rowid(vec0_vtab *p, i64 *chunk_rowid, sqlite3_value ** partitionKeyValues) {
  int rc;
  const char *zSql;
  // lazy initialize stmtLatestChunk when needed. May be cleared during xSync()
  if (!p->stmtLatestChunk) {
    if(p->numPartitionColumns > 0) {
      sqlite3_str * s = sqlite3_str_new(NULL);
      sqlite3_str_appendf(s, "SELECT max(rowid) FROM " VEC0_SHADOW_CHUNKS_NAME " WHERE ",
                            p->schemaName, p->tableName);

      for(int i = 0; i < p->numPartitionColumns; i++) {
        if(i != 0) {
          sqlite3_str_appendall(s, " AND ");
        }
        sqlite3_str_appendf(s, " partition%02d = ? ", i);
      }
      zSql = sqlite3_str_finish(s);
    }else {
      zSql = sqlite3_mprintf("SELECT max(rowid) FROM " VEC0_SHADOW_CHUNKS_NAME,
                          p->schemaName, p->tableName);
    }

    if (!zSql) {
      rc = SQLITE_NOMEM;
      goto cleanup;
    }
    rc = sqlite3_prepare_v2(p->db, zSql, -1, &p->stmtLatestChunk, 0);
    sqlite3_free((void *)zSql);
    if (rc != SQLITE_OK) {
      // IMP: V21406_05476
      vtab_set_error(&p->base, VEC_INTERAL_ERROR
                     "could not initialize 'latest chunk' statement");
      goto cleanup;
    }
  }

  for(int i = 0; i < p->numPartitionColumns; i++) {
    sqlite3_bind_value(p->stmtLatestChunk, i+1, (partitionKeyValues[i]));
  }

  rc = sqlite3_step(p->stmtLatestChunk);
  if (rc != SQLITE_ROW) {
    // IMP: V31559_15629
    vtab_set_error(&p->base, VEC_INTERAL_ERROR "Could not find latest chunk");
    rc = SQLITE_ERROR;
    goto cleanup;
  }
  // max() over zero rows yields a single NULL row: no chunk exists yet
  if(sqlite3_column_type(p->stmtLatestChunk, 0) == SQLITE_NULL){
    rc = SQLITE_EMPTY;
    goto cleanup;
  }
  *chunk_rowid = sqlite3_column_int64(p->stmtLatestChunk, 0);
  rc = sqlite3_step(p->stmtLatestChunk);
  if (rc != SQLITE_DONE) {
    // The message has no format specifiers, so no extra arguments are passed.
    vtab_set_error(&p->base,
                   VEC_INTERAL_ERROR
                   "unknown result code when closing out stmtLatestChunk. "
                   "Please file an issue: " REPORT_URL);
    // An unexpected extra row is still a failure; never hand SQLITE_ROW back
    // to the caller as if it were an error code.
    if (rc == SQLITE_ROW) {
      rc = SQLITE_ERROR;
    }
    goto cleanup;
  }
  rc = SQLITE_OK;

cleanup:
  if (p->stmtLatestChunk) {
    sqlite3_reset(p->stmtLatestChunk);
    sqlite3_clear_bindings(p->stmtLatestChunk);
  }
  return rc;
}
/**
 * @brief Insert a new row with an explicit rowid into the _rowids shadow
 * table.
 *
 * Lazily prepares and caches p->stmtRowidsInsertRowid. In threadsafe builds
 * the connection mutex is held while the statement runs and the error state
 * is inspected.
 *
 * @param p vec0 table
 * @param rowid rowid to insert
 * @return SQLITE_OK on success, SQLITE_NOMEM or the prepare error code if the
 *  statement could not be initialized, SQLITE_ERROR if the insert failed (the
 *  vtab error message is set).
 */
int vec0_rowids_insert_rowid(vec0_vtab *p, i64 rowid) {
  int rc = SQLITE_OK;
  int entered = 0;
  UNUSED_PARAMETER(entered); // temporary
  if (!p->stmtRowidsInsertRowid) {
    const char *zSql =
        sqlite3_mprintf("INSERT INTO " VEC0_SHADOW_ROWIDS_NAME "(rowid)"
                        "VALUES (?);",
                        p->schemaName, p->tableName);
    if (!zSql) {
      rc = SQLITE_NOMEM;
      goto cleanup;
    }
    rc = sqlite3_prepare_v2(p->db, zSql, -1, &p->stmtRowidsInsertRowid, 0);
    sqlite3_free((void *)zSql);
    if (rc != SQLITE_OK) {
      vtab_set_error(&p->base, VEC_INTERAL_ERROR
                     "could not initialize 'insert rowids' statement");
      goto cleanup;
    }
  }

#if SQLITE_THREADSAFE
  if (sqlite3_mutex_enter) {
    sqlite3_mutex_enter(sqlite3_db_mutex(p->db));
    entered = 1;
  }
#endif
  sqlite3_bind_int64(p->stmtRowidsInsertRowid, 1, rowid);
  rc = sqlite3_step(p->stmtRowidsInsertRowid);

  if (rc != SQLITE_DONE) {
    if (sqlite3_extended_errcode(p->db) == SQLITE_CONSTRAINT_PRIMARYKEY) {
      // IMP: V17090_01160
      vtab_set_error(&p->base, "UNIQUE constraint failed on %s primary key",
                     p->tableName);
    } else {
      // IMP: V04679_21517
      // Read the message from the connection this statement ran on. The
      // unrelated stmtRowidsInsertId statement may not even be prepared yet,
      // in which case its db handle is NULL and the message is meaningless.
      vtab_set_error(&p->base,
                     "Error inserting rowid into rowids shadow table: %s",
                     sqlite3_errmsg(p->db));
    }
    rc = SQLITE_ERROR;
    goto cleanup;
  }

  rc = SQLITE_OK;

cleanup:
  if (p->stmtRowidsInsertRowid) {
    sqlite3_reset(p->stmtRowidsInsertRowid);
    sqlite3_clear_bindings(p->stmtRowidsInsertRowid);
  }

#if SQLITE_THREADSAFE
  if (sqlite3_mutex_leave && entered) {
    sqlite3_mutex_leave(sqlite3_db_mutex(p->db));
  }
#endif
  return rc;
}
/**
 * @brief Insert a new row into the _rowids shadow table by id and report the
 * rowid SQLite assigned to it.
 *
 * Lazily prepares and caches p->stmtRowidsInsertId. In threadsafe builds the
 * connection mutex is held from the insert until the new rowid has been read.
 *
 * @param p vec0 table
 * @param idValue id to insert; when NULL nothing is bound to the parameter
 * @param rowid output, the rowid of the new row. Only written on success.
 * @return SQLITE_OK on success, SQLITE_NOMEM or the prepare error code if the
 *  statement could not be initialized, SQLITE_ERROR if the insert failed (the
 *  vtab error message is set).
 */
int vec0_rowids_insert_id(vec0_vtab *p, sqlite3_value *idValue, i64 *rowid) {
  int rc = SQLITE_OK;
  int entered = 0;
  UNUSED_PARAMETER(entered); // temporary

  if (p->stmtRowidsInsertId == NULL) {
    char *sql = sqlite3_mprintf("INSERT INTO " VEC0_SHADOW_ROWIDS_NAME "(id)"
                                "VALUES (?);",
                                p->schemaName, p->tableName);
    if (sql == NULL) {
      rc = SQLITE_NOMEM;
      goto complete;
    }
    rc = sqlite3_prepare_v2(p->db, sql, -1, &p->stmtRowidsInsertId, 0);
    sqlite3_free(sql);
    if (rc != SQLITE_OK) {
      vtab_set_error(&p->base, VEC_INTERAL_ERROR
                     "could not initialize 'insert rowids id' statement");
      goto complete;
    }
  }

#if SQLITE_THREADSAFE
  if (sqlite3_mutex_enter) {
    sqlite3_mutex_enter(sqlite3_db_mutex(p->db));
    entered = 1;
  }
#endif

  if (idValue) {
    sqlite3_bind_value(p->stmtRowidsInsertId, 1, idValue);
  }

  if (sqlite3_step(p->stmtRowidsInsertId) == SQLITE_DONE) {
    *rowid = sqlite3_last_insert_rowid(p->db);
    rc = SQLITE_OK;
  } else {
    if (sqlite3_extended_errcode(p->db) == SQLITE_CONSTRAINT_UNIQUE) {
      // IMP: V20497_04568
      vtab_set_error(&p->base, "UNIQUE constraint failed on %s primary key",
                     p->tableName);
    } else {
      // IMP: V24016_08086
      // IMP: V15177_32015
      vtab_set_error(&p->base,
                     "Error inserting id into rowids shadow table: %s",
                     sqlite3_errmsg(sqlite3_db_handle(p->stmtRowidsInsertId)));
    }
    rc = SQLITE_ERROR;
  }

complete:
  if (p->stmtRowidsInsertId) {
    sqlite3_reset(p->stmtRowidsInsertId);
    sqlite3_clear_bindings(p->stmtRowidsInsertId);
  }

#if SQLITE_THREADSAFE
  if (sqlite3_mutex_leave && entered) {
    sqlite3_mutex_leave(sqlite3_db_mutex(p->db));
  }
#endif
  return rc;
}
/**
 * @brief Size in bytes of one metadata chunk blob for a column of the given
 * kind, for a chunk that holds chunk_size rows.
 *
 * @param kind metadata column kind
 * @param chunk_size number of rows per chunk
 * @return blob size in bytes, or 0 for an unrecognized kind
 */
int vec0_metadata_chunk_size(vec0_metadata_column_kind kind, int chunk_size) {
  // booleans are packed one bit per row
  if (kind == VEC0_METADATA_COLUMN_KIND_BOOLEAN) {
    return chunk_size / 8;
  }
  if (kind == VEC0_METADATA_COLUMN_KIND_INTEGER) {
    return chunk_size * sizeof(i64);
  }
  if (kind == VEC0_METADATA_COLUMN_KIND_FLOAT) {
    return chunk_size * sizeof(double);
  }
  // text rows each occupy one fixed-size view
  if (kind == VEC0_METADATA_COLUMN_KIND_TEXT) {
    return chunk_size * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH;
  }
  return 0;
}
/**
 * @brief Record the chunk position (chunk_id + chunk_offset) of a row in the
 * _rowids shadow table.
 *
 * Lazily prepares and caches p->stmtRowidsUpdatePosition; the statement is
 * reset and its bindings cleared before returning.
 *
 * @param p vec0 table
 * @param rowid rowid of the _rowids row to update
 * @param chunk_rowid value stored in the chunk_id column
 * @param chunk_offset value stored in the chunk_offset column
 * @return SQLITE_OK on success, SQLITE_NOMEM or the prepare error code if the
 *  statement could not be initialized, SQLITE_ERROR if the update failed.
 */
int vec0_rowids_update_position(vec0_vtab *p, i64 rowid, i64 chunk_rowid,
                                i64 chunk_offset) {
  int rc = SQLITE_OK;

  if (p->stmtRowidsUpdatePosition == NULL) {
    char *sql = sqlite3_mprintf(" UPDATE " VEC0_SHADOW_ROWIDS_NAME
                                " SET chunk_id = ?, chunk_offset = ?"
                                " WHERE rowid = ?",
                                p->schemaName, p->tableName);
    if (sql == NULL) {
      rc = SQLITE_NOMEM;
      goto cleanup;
    }
    rc = sqlite3_prepare_v2(p->db, sql, -1, &p->stmtRowidsUpdatePosition, 0);
    sqlite3_free(sql);
    if (rc != SQLITE_OK) {
      vtab_set_error(&p->base, VEC_INTERAL_ERROR
                     "could not initialize 'update rowids position' statement");
      goto cleanup;
    }
  }

  sqlite3_bind_int64(p->stmtRowidsUpdatePosition, 1, chunk_rowid);
  sqlite3_bind_int64(p->stmtRowidsUpdatePosition, 2, chunk_offset);
  sqlite3_bind_int64(p->stmtRowidsUpdatePosition, 3, rowid);

  if (sqlite3_step(p->stmtRowidsUpdatePosition) == SQLITE_DONE) {
    rc = SQLITE_OK;
  } else {
    // IMP: V21925_05995
    vtab_set_error(&p->base,
                   VEC_INTERAL_ERROR
                   "could not update rowids position for rowid=%lld, "
                   "chunk_rowid=%lld, chunk_offset=%lld",
                   rowid, chunk_rowid, chunk_offset);
    rc = SQLITE_ERROR;
  }

cleanup:
  if (p->stmtRowidsUpdatePosition) {
    sqlite3_reset(p->stmtRowidsUpdatePosition);
    sqlite3_clear_bindings(p->stmtRowidsUpdatePosition);
  }
  return rc;
}
/**
 * Runs one "INSERT ... (_rowid_, rowid, <blob>) VALUES (?, ?, ?)" statement
 * that adds a zero-filled blob row for a new chunk to a shadow table.
 *
 * Takes ownership of zSql (from sqlite3_mprintf(), may be NULL) and frees it.
 * Binds chunkRowid to BOTH the internal _rowid_ and the user-defined rowid
 * column, and a zeroblob of blobSize bytes to the third parameter.
 *
 * Returns SQLITE_DONE on success, SQLITE_NOMEM when zSql is NULL, otherwise
 * the result code of the failing prepare/step call.
 */
static int vec0_new_chunk_insert_blank(vec0_vtab *p, char *zSql, i64 chunkRowid,
                                       i64 blobSize) {
  sqlite3_stmt *insert = NULL;
  int rc;

  if (!zSql) {
    return SQLITE_NOMEM;
  }
  rc = sqlite3_prepare_v2(p->db, zSql, -1, &insert, NULL);
  sqlite3_free(zSql);
  if (rc != SQLITE_OK) {
    sqlite3_finalize(insert);
    return rc;
  }

  sqlite3_bind_int64(insert, 1, chunkRowid); // _rowid_ (internal SQLite rowid)
  sqlite3_bind_int64(insert, 2, chunkRowid); // rowid (user-defined column)
  sqlite3_bind_zeroblob64(insert, 3, blobSize);

  rc = sqlite3_step(insert);
  sqlite3_finalize(insert);
  return rc;
}

/**
 * @brief Adds a new chunk for the vec0 table, and the corresponding vector
 * and metadata chunks.
 *
 * Inserts a new row into the _chunks table, with blank data, and uses that new
 * rowid to insert new blank rows into the _vector_chunksXX and
 * _metadatachunksXX tables.
 *
 * @param p: vec0 table to add new chunk
 * @param partitionKeyValues: Array of partition key values for the new chunk,
 * one per partition column. Not read when the table has no partition columns.
 * @param chunk_rowid: Output pointer, if not NULL, then will be filled with the
 * new chunk rowid.
 * @return int SQLITE_OK on success, error code otherwise.
 */
int vec0_new_chunk(vec0_vtab *p, sqlite3_value ** partitionKeyValues, i64 *chunk_rowid) {
  sqlite3_stmt *chunkInsert = NULL;
  char *zSql;
  i64 newChunkRowid;
  int numUserColumns;
  int rc;

  // Step 1: Insert a new row in _chunks, capture that new rowid
  if (p->numPartitionColumns <= 0) {
    zSql = sqlite3_mprintf("INSERT INTO " VEC0_SHADOW_CHUNKS_NAME
                           "(size, validity, rowids) "
                           "VALUES (?, ?, ?);",
                           p->schemaName, p->tableName);
  } else {
    // One extra column and one extra placeholder per partition key.
    sqlite3_str *builder = sqlite3_str_new(NULL);
    sqlite3_str_appendf(builder, "INSERT INTO " VEC0_SHADOW_CHUNKS_NAME, p->schemaName, p->tableName);
    sqlite3_str_appendall(builder, "(size, validity, rowids");
    for (int col = 0; col < p->numPartitionColumns; col++) {
      sqlite3_str_appendf(builder, ", partition%02d", col);
    }
    sqlite3_str_appendall(builder, ") VALUES (?, ?, ?");
    for (int col = 0; col < p->numPartitionColumns; col++) {
      sqlite3_str_appendall(builder, ", ?");
    }
    sqlite3_str_appendall(builder, ")");
    zSql = sqlite3_str_finish(builder);
  }
  if (!zSql) {
    return SQLITE_NOMEM;
  }

  rc = sqlite3_prepare_v2(p->db, zSql, -1, &chunkInsert, NULL);
  sqlite3_free(zSql);
  if (rc != SQLITE_OK) {
    sqlite3_finalize(chunkInsert);
    return rc;
  }

  // Hold the connection mutex from the insert until the new rowid is read.
#if SQLITE_THREADSAFE
  if (sqlite3_mutex_enter) {
    sqlite3_mutex_enter(sqlite3_db_mutex(p->db));
  }
#endif

  sqlite3_bind_int64(chunkInsert, 1, p->chunk_size);                  // size
  sqlite3_bind_zeroblob(chunkInsert, 2, p->chunk_size / CHAR_BIT);    // validity bitmap
  sqlite3_bind_zeroblob(chunkInsert, 3, p->chunk_size * sizeof(i64)); // rowids
  for (int col = 0; col < p->numPartitionColumns; col++) {
    sqlite3_bind_value(chunkInsert, 4 + col, partitionKeyValues[col]);
  }

  rc = sqlite3_step(chunkInsert);
  int chunkInsertFailed = rc != SQLITE_DONE;
  newChunkRowid = sqlite3_last_insert_rowid(p->db);

#if SQLITE_THREADSAFE
  if (sqlite3_mutex_leave) {
    sqlite3_mutex_leave(sqlite3_db_mutex(p->db));
  }
#endif

  sqlite3_finalize(chunkInsert);
  if (chunkInsertFailed) {
    return SQLITE_ERROR;
  }

  // Step 2: Create new vector chunks for each vector column, with
  // that new chunk_rowid.
  //
  // SHADOW_TABLE_ROWID_QUIRK: The _vector_chunksNN and _metadatachunksNN
  // shadow tables declare "rowid PRIMARY KEY" without the INTEGER type, so
  // the user-defined "rowid" column is NOT an alias for the internal SQLite
  // rowid (_rowid_). When only appending rows these two happen to stay in
  // sync, but after a chunk is deleted (vec0Update_Delete_DeleteChunkIfEmpty)
  // and a new one is created, the auto-assigned _rowid_ can diverge from the
  // user "rowid" value. Since sqlite3_blob_open() addresses rows by internal
  // _rowid_, we must explicitly set BOTH _rowid_ and "rowid" to the same
  // value so that later blob operations can find the row.
  //
  // The correct long-term fix is changing the schema to
  //   "rowid INTEGER PRIMARY KEY"
  // which makes it a true alias, but that would break existing databases.
  numUserColumns = vec0_num_defined_user_columns(p);
  for (int col = 0; col < numUserColumns; col++) {
    if (p->user_column_kinds[col] == SQLITE_VEC0_USER_COLUMN_KIND_VECTOR) {
      int vector_column_idx = p->user_column_idxs[col];
      i64 vectorsSize =
          p->chunk_size * vector_column_byte_size(p->vector_columns[vector_column_idx]);

      // See SHADOW_TABLE_ROWID_QUIRK above for why _rowid_ and rowid are both set.
      zSql = sqlite3_mprintf("INSERT INTO " VEC0_SHADOW_VECTOR_N_NAME
                             "(_rowid_, rowid, vectors)"
                             "VALUES (?, ?, ?)",
                             p->schemaName, p->tableName, vector_column_idx);
      rc = vec0_new_chunk_insert_blank(p, zSql, newChunkRowid, vectorsSize);
      if (rc != SQLITE_DONE) {
        return rc;
      }
    }
  }

  // Step 3: Create new metadata chunks for each metadata column
  for (int col = 0; col < numUserColumns; col++) {
    if (p->user_column_kinds[col] == SQLITE_VEC0_USER_COLUMN_KIND_METADATA) {
      int metadata_column_idx = p->user_column_idxs[col];

      // See SHADOW_TABLE_ROWID_QUIRK above for why _rowid_ and rowid are both set.
      zSql = sqlite3_mprintf("INSERT INTO " VEC0_SHADOW_METADATA_N_NAME
                             "(_rowid_, rowid, data)"
                             "VALUES (?, ?, ?)",
                             p->schemaName, p->tableName, metadata_column_idx);
      rc = vec0_new_chunk_insert_blank(
          p, zSql, newChunkRowid,
          vec0_metadata_chunk_size(p->metadata_columns[metadata_column_idx].kind, p->chunk_size));
      if (rc != SQLITE_DONE) {
        return rc;
      }
    }
  }

  if (chunk_rowid) {
    *chunk_rowid = newChunkRowid;
  }
  return SQLITE_OK;
}
// Cursor state for a full-scan query plan.
struct vec0_query_fullscan_data {
// Statement owned by this struct; finalized by
// vec0_query_fullscan_data_clear().
sqlite3_stmt *rowids_stmt;
// NOTE(review): presumably non-zero once the scan is exhausted — confirm
// against the cursor's next/eof callbacks.
i8 done;
};
/**
 * Releases the statement held by a full-scan cursor state. Accepts NULL.
 * The struct itself is not freed.
 */
void vec0_query_fullscan_data_clear(
    struct vec0_query_fullscan_data *fullscan_data) {
  if (fullscan_data && fullscan_data->rowids_stmt) {
    sqlite3_finalize(fullscan_data->rowids_stmt);
    fullscan_data->rowids_stmt = NULL;
  }
}
// State a vec0_cursor holds through its `knn_data` pointer (presumably used
// for the VEC0_QUERY_PLAN_KNN plan — confirm in vec0Filter).
// The two arrays are released by vec0_query_knn_data_clear().
struct vec0_query_knn_data {
// Size of the `rowids` and `distances` arrays.
i64 k;
// NOTE(review): presumably the number of array entries actually populated
// (<= k) — confirm where it is assigned.
i64 k_used;
// Array of rowids of size k. Must be freed with sqlite3_free().
i64 *rowids;
// Array of distances of size k. Must be freed with sqlite3_free().
f32 *distances;
// NOTE(review): presumably the position of the row the cursor currently
// points at within the arrays — confirm against the cursor's next/column
// handlers.
i64 current_idx;
};
/**
 * Releases the arrays owned by a KNN state struct.
 *
 * Frees `rowids` and `distances` and leaves both pointers NULL. The struct
 * itself is NOT freed; that is left to the caller. Passing NULL is a no-op.
 */
void vec0_query_knn_data_clear(struct vec0_query_knn_data *knn_data) {
  if (knn_data == NULL) {
    return;
  }
  // sqlite3_free() is a harmless no-op on NULL, so no per-field guard is
  // needed.
  sqlite3_free(knn_data->rowids);
  knn_data->rowids = NULL;
  sqlite3_free(knn_data->distances);
  knn_data->distances = NULL;
}
// State a vec0_cursor holds through its `point_data` pointer (presumably used
// for the VEC0_QUERY_PLAN_POINT plan — confirm in vec0Filter).
struct vec0_query_point_data {
// NOTE(review): presumably the rowid of the single row being looked up —
// confirm where it is assigned.
i64 rowid;
// One slot per possible vector column. Every slot is passed to
// sqlite3_free() and reset to NULL by vec0_query_point_data_clear(), so each
// entry must be NULL or memory obtained from the SQLite allocator.
void *vectors[VEC0_MAX_VECTOR_COLUMNS];
// NOTE(review): presumably non-zero once the single row has been consumed —
// confirm against the cursor's next/eof handlers.
int done;
};
/**
 * Releases every vector buffer held by a point-query state struct.
 *
 * Each of the VEC0_MAX_VECTOR_COLUMNS slots is freed with sqlite3_free() and
 * reset to NULL. The struct itself is NOT freed; that is left to the caller.
 * Passing NULL is a no-op.
 */
void vec0_query_point_data_clear(struct vec0_query_point_data *point_data) {
  if (point_data == NULL) {
    return;
  }
  void **slot = point_data->vectors;
  void **const end = slot + VEC0_MAX_VECTOR_COLUMNS;
  while (slot != end) {
    sqlite3_free(*slot);
    *slot = NULL;
    slot++;
  }
}
// The query plans vec0BestIndex() can choose. Each value is a printable
// character because vec0BestIndex() writes it as the first character of the
// idxStr it builds.
typedef enum {
// If any values are updated, please update the ARCHITECTURE.md docs accordingly!
// Chosen when neither a vector MATCH nor a rowid equality constraint is usable.
VEC0_QUERY_PLAN_FULLSCAN = '1',
// Chosen when there is a usable (non-IN) equality constraint on the id column.
VEC0_QUERY_PLAN_POINT = '2',
// Chosen when there is a usable MATCH constraint on a vector column.
VEC0_QUERY_PLAN_KNN = '3',
} vec0_query_plan;
// Cursor object for the vec0 virtual table. Allocated zero-filled by
// vec0Open() and released by vec0Close().
typedef struct vec0_cursor vec0_cursor;
struct vec0_cursor {
// Base class. Must be first so the struct can be cast to and from
// sqlite3_vtab_cursor.
sqlite3_vtab_cursor base;
// NOTE(review): presumably the plan selected for the current query — confirm
// where it is assigned.
vec0_query_plan query_plan;
// Plan-specific state. Each pointer is either NULL or a heap allocation that
// vec0_cursor_clear() clears, frees with sqlite3_free() and resets to NULL.
struct vec0_query_fullscan_data *fullscan_data;
struct vec0_query_knn_data *knn_data;
struct vec0_query_point_data *point_data;
};
/**
 * Drops all plan-specific state attached to a cursor.
 *
 * For each of the fullscan / knn / point state structs: release what the
 * struct owns, free the struct itself, and reset the cursor's pointer to NULL.
 * The cursor object is not freed. `pCur` must not be NULL.
 */
void vec0_cursor_clear(vec0_cursor *pCur) {
  // The per-plan clear helpers and sqlite3_free() all accept NULL, so absent
  // state needs no special casing.
  vec0_query_fullscan_data_clear(pCur->fullscan_data);
  sqlite3_free(pCur->fullscan_data);
  pCur->fullscan_data = NULL;

  vec0_query_knn_data_clear(pCur->knn_data);
  sqlite3_free(pCur->knn_data);
  pCur->knn_data = NULL;

  vec0_query_point_data_clear(pCur->point_data);
  sqlite3_free(pCur->point_data);
  pCur->point_data = NULL;
}
#define VEC_CONSTRUCTOR_ERROR "vec0 constructor error: "

/**
 * Shared implementation behind vec0Create() and vec0Connect().
 *
 * Parses the module arguments argv[3..argc-1] (vector columns, partition key
 * columns, an optional primary key column, auxiliary columns, metadata columns
 * and table options), declares the resulting schema with
 * sqlite3_declare_vtab(), fills in a freshly allocated vec0_vtab and, when
 * `isCreate` is true, creates and seeds the shadow tables.
 *
 * @param db       Database connection the table belongs to.
 * @param pAux     Unused.
 * @param argc     Number of entries in argv.
 * @param argv     argv[1] is used as the schema name, argv[2] as the table
 *                 name, argv[3..] are the column/option definitions.
 * @param ppVtab   Receives the new vtab on success.
 * @param pzErr    Receives an sqlite3_mprintf()'d message on most failures
 *                 (not on out-of-memory failures).
 * @param isCreate true for xCreate (create shadow tables), false for xConnect.
 * @return SQLITE_OK on success; SQLITE_NOMEM if the vtab struct itself cannot
 *         be allocated; SQLITE_ERROR for every other failure, including
 *         allocation failures later on.
 */
static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv,
                     sqlite3_vtab **ppVtab, char **pzErr, bool isCreate) {
  UNUSED_PARAMETER(pAux);
  vec0_vtab *pNew;
  int rc;
  const char *zSql;

  pNew = sqlite3_malloc(sizeof(*pNew));
  if (pNew == 0)
    return SQLITE_NOMEM;
  memset(pNew, 0, sizeof(*pNew));

  // Declared chunk_size=N for entire table.
  // -1 to use the default, otherwise will get re-assigned on `chunk_size=N`
  // option
  int chunk_size = -1;
  int numVectorColumns = 0;
  int numPartitionColumns = 0;
  int numAuxiliaryColumns = 0;
  int numMetadataColumns = 0;
  // Position of the next user-declared column (vector, partition, auxiliary or
  // metadata) in declaration order.
  int user_column_idx = 0;

  // track if a "primary key" column is defined
  // pkColumnName points into argv[] (it is not a copy) and is only meaningful
  // together with pkColumnNameLength.
  char *pkColumnName = NULL;
  int pkColumnNameLength = 0;
  int pkColumnType = SQLITE_INTEGER;

  for (int i = 3; i < argc; i++) {
    struct VectorColumnDefinition vecColumn;
    struct Vec0PartitionColumnDefinition partitionColumn;
    struct Vec0AuxiliaryColumnDefinition auxColumn;
    struct Vec0MetadataColumnDefinition metadataColumn;
    char *cName = NULL;
    int cNameLength;
    int cType;

    // Scenario #1: Constructor argument is a vector column definition, ie `foo float[1024]`
    rc = vec0_parse_vector_column(argv[i], strlen(argv[i]), &vecColumn);
    if (rc == SQLITE_ERROR) {
      *pzErr = sqlite3_mprintf(
          VEC_CONSTRUCTOR_ERROR "could not parse vector column '%s'", argv[i]);
      goto error;
    }
    if (rc == SQLITE_OK) {
      // vecColumn.name is not yet owned by pNew on these two error paths, so
      // it has to be released here.
      if (numVectorColumns >= VEC0_MAX_VECTOR_COLUMNS) {
        sqlite3_free(vecColumn.name);
        *pzErr = sqlite3_mprintf(VEC_CONSTRUCTOR_ERROR
                                 "Too many provided vector columns, maximum %d",
                                 VEC0_MAX_VECTOR_COLUMNS);
        goto error;
      }
      if (vecColumn.dimensions > SQLITE_VEC_VEC0_MAX_DIMENSIONS) {
        sqlite3_free(vecColumn.name);
        // Both %lld arguments are cast to i64 so the varargs always match the
        // format, whatever integer type the limit macro expands to.
        *pzErr = sqlite3_mprintf(
            VEC_CONSTRUCTOR_ERROR
            "Dimension on vector column too large, provided %lld, maximum %lld",
            (i64)vecColumn.dimensions, (i64)SQLITE_VEC_VEC0_MAX_DIMENSIONS);
        goto error;
      }
      pNew->user_column_kinds[user_column_idx] = SQLITE_VEC0_USER_COLUMN_KIND_VECTOR;
      pNew->user_column_idxs[user_column_idx] = numVectorColumns;
      memcpy(&pNew->vector_columns[numVectorColumns], &vecColumn, sizeof(vecColumn));
      numVectorColumns++;
      // Kept in sync inside the loop so the count stored in pNew already
      // covers this column if a later argument fails.
      pNew->numVectorColumns = numVectorColumns;
      user_column_idx++;
      continue;
    }

    // Scenario #2: Constructor argument is a partition key column definition, ie `user_id text partition key`
    rc = vec0_parse_partition_key_definition(argv[i], strlen(argv[i]), &cName,
                                             &cNameLength, &cType);
    if (rc == SQLITE_OK) {
      if (numPartitionColumns >= VEC0_MAX_PARTITION_COLUMNS) {
        *pzErr = sqlite3_mprintf(
            VEC_CONSTRUCTOR_ERROR
            "More than %d partition key columns were provided",
            VEC0_MAX_PARTITION_COLUMNS);
        goto error;
      }
      partitionColumn.type = cType;
      partitionColumn.name_length = cNameLength;
      partitionColumn.name = sqlite3_mprintf("%.*s", cNameLength, cName);
      if (!partitionColumn.name) {
        rc = SQLITE_NOMEM;
        goto error;
      }
      pNew->user_column_kinds[user_column_idx] = SQLITE_VEC0_USER_COLUMN_KIND_PARTITION;
      pNew->user_column_idxs[user_column_idx] = numPartitionColumns;
      memcpy(&pNew->paritition_columns[numPartitionColumns], &partitionColumn, sizeof(partitionColumn));
      numPartitionColumns++;
      pNew->numPartitionColumns = numPartitionColumns;
      user_column_idx++;
      continue;
    }

    // Scenario #3: Constructor argument is a primary key column definition, ie `article_id text primary key`
    rc = vec0_parse_primary_key_definition(argv[i], strlen(argv[i]), &cName,
                                           &cNameLength, &cType);
    if (rc == SQLITE_OK) {
      if (pkColumnName) {
        // The message text has no conversion specifier, so no extra argument
        // is passed.
        *pzErr = sqlite3_mprintf(
            VEC_CONSTRUCTOR_ERROR
            "More than one primary key definition was provided, vec0 only "
            "suports a single primary key column");
        goto error;
      }
      pkColumnName = cName;
      pkColumnNameLength = cNameLength;
      pkColumnType = cType;
      continue;
    }

    // Scenario #4: Constructor argument is a auxiliary column definition, ie `+contents text`
    rc = vec0_parse_auxiliary_column_definition(argv[i], strlen(argv[i]), &cName,
                                                &cNameLength, &cType);
    if (rc == SQLITE_OK) {
      if (numAuxiliaryColumns >= VEC0_MAX_AUXILIARY_COLUMNS) {
        *pzErr = sqlite3_mprintf(
            VEC_CONSTRUCTOR_ERROR
            "More than %d auxiliary columns were provided",
            VEC0_MAX_AUXILIARY_COLUMNS);
        goto error;
      }
      auxColumn.type = cType;
      auxColumn.name_length = cNameLength;
      auxColumn.name = sqlite3_mprintf("%.*s", cNameLength, cName);
      if (!auxColumn.name) {
        rc = SQLITE_NOMEM;
        goto error;
      }
      pNew->user_column_kinds[user_column_idx] = SQLITE_VEC0_USER_COLUMN_KIND_AUXILIARY;
      pNew->user_column_idxs[user_column_idx] = numAuxiliaryColumns;
      memcpy(&pNew->auxiliary_columns[numAuxiliaryColumns], &auxColumn, sizeof(auxColumn));
      numAuxiliaryColumns++;
      pNew->numAuxiliaryColumns = numAuxiliaryColumns;
      user_column_idx++;
      continue;
    }

    // Scenario #5: Constructor argument is a metadata column definition
    vec0_metadata_column_kind kind;
    rc = vec0_parse_metadata_column_definition(argv[i], strlen(argv[i]), &cName,
                                               &cNameLength, &kind);
    if (rc == SQLITE_OK) {
      if (numMetadataColumns >= VEC0_MAX_METADATA_COLUMNS) {
        *pzErr = sqlite3_mprintf(
            VEC_CONSTRUCTOR_ERROR
            "More than %d metadata columns were provided",
            VEC0_MAX_METADATA_COLUMNS);
        goto error;
      }
      metadataColumn.kind = kind;
      metadataColumn.name_length = cNameLength;
      metadataColumn.name = sqlite3_mprintf("%.*s", cNameLength, cName);
      if (!metadataColumn.name) {
        rc = SQLITE_NOMEM;
        goto error;
      }
      pNew->user_column_kinds[user_column_idx] = SQLITE_VEC0_USER_COLUMN_KIND_METADATA;
      pNew->user_column_idxs[user_column_idx] = numMetadataColumns;
      memcpy(&pNew->metadata_columns[numMetadataColumns], &metadataColumn, sizeof(metadataColumn));
      numMetadataColumns++;
      pNew->numMetadataColumns = numMetadataColumns;
      user_column_idx++;
      continue;
    }

    // Scenario #6: Constructor argument is a table-level option, ie `chunk_size`
    char *key;
    char *value;
    int keyLength, valueLength;
    rc = vec0_parse_table_option(argv[i], strlen(argv[i]), &key, &keyLength,
                                 &value, &valueLength);
    if (rc == SQLITE_ERROR) {
      *pzErr = sqlite3_mprintf(
          VEC_CONSTRUCTOR_ERROR "could not parse table option '%s'", argv[i]);
      goto error;
    }
    if (rc == SQLITE_OK) {
      // NOTE(review): sqlite3_strnicmp() compares only keyLength bytes, so any
      // case-insensitive prefix of "chunk_size" (e.g. `chunk=8`) is accepted
      // as this option. Left as-is because tightening the check could reject
      // declarations already stored in existing databases.
      if (sqlite3_strnicmp(key, "chunk_size", keyLength) == 0) {
        chunk_size = atoi(value);
        if (chunk_size <= 0) {
          // IMP: V01931_18769
          *pzErr =
              sqlite3_mprintf(VEC_CONSTRUCTOR_ERROR
                              "chunk_size must be a non-zero positive integer");
          goto error;
        }
        if ((chunk_size % 8) != 0) {
          // IMP: V14110_30948
          *pzErr = sqlite3_mprintf(VEC_CONSTRUCTOR_ERROR
                                   "chunk_size must be divisible by 8");
          goto error;
        }
#define SQLITE_VEC_CHUNK_SIZE_MAX 4096
        if (chunk_size > SQLITE_VEC_CHUNK_SIZE_MAX) {
          *pzErr =
              sqlite3_mprintf(VEC_CONSTRUCTOR_ERROR "chunk_size too large");
          goto error;
        }
      } else {
        // IMP: V27642_11712
        *pzErr = sqlite3_mprintf(
            VEC_CONSTRUCTOR_ERROR "Unknown table option: %.*s", keyLength, key);
        goto error;
      }
      continue;
    }

    // Scenario #7: Unknown constructor argument
    *pzErr =
        sqlite3_mprintf(VEC_CONSTRUCTOR_ERROR "Could not parse '%s'", argv[i]);
    goto error;
  }

  // No chunk_size option was given: fall back to the default.
  if (chunk_size < 0) {
    chunk_size = 1024;
  }

  if (numVectorColumns <= 0) {
    *pzErr = sqlite3_mprintf(VEC_CONSTRUCTOR_ERROR
                             "At least one vector column is required");
    goto error;
  }

  // Build the CREATE TABLE statement handed to sqlite3_declare_vtab(): the id
  // column first, then the user columns in declaration order, then the hidden
  // `distance` and `k` columns.
  sqlite3_str *createStr = sqlite3_str_new(NULL);
  sqlite3_str_appendall(createStr, "CREATE TABLE x(");
  if (pkColumnName) {
    sqlite3_str_appendf(createStr, "\"%.*w\" primary key, ", pkColumnNameLength,
                        pkColumnName);
  } else {
    sqlite3_str_appendall(createStr, "rowid, ");
  }
  for (int i = 0; i < numVectorColumns + numPartitionColumns + numAuxiliaryColumns + numMetadataColumns; i++) {
    switch (pNew->user_column_kinds[i]) {
    case SQLITE_VEC0_USER_COLUMN_KIND_VECTOR: {
      int vector_idx = pNew->user_column_idxs[i];
      sqlite3_str_appendf(createStr, "\"%.*w\", ",
                          pNew->vector_columns[vector_idx].name_length,
                          pNew->vector_columns[vector_idx].name);
      break;
    }
    case SQLITE_VEC0_USER_COLUMN_KIND_PARTITION: {
      int partition_idx = pNew->user_column_idxs[i];
      sqlite3_str_appendf(createStr, "\"%.*w\", ",
                          pNew->paritition_columns[partition_idx].name_length,
                          pNew->paritition_columns[partition_idx].name);
      break;
    }
    case SQLITE_VEC0_USER_COLUMN_KIND_AUXILIARY: {
      int auxiliary_idx = pNew->user_column_idxs[i];
      sqlite3_str_appendf(createStr, "\"%.*w\", ",
                          pNew->auxiliary_columns[auxiliary_idx].name_length,
                          pNew->auxiliary_columns[auxiliary_idx].name);
      break;
    }
    case SQLITE_VEC0_USER_COLUMN_KIND_METADATA: {
      int metadata_idx = pNew->user_column_idxs[i];
      sqlite3_str_appendf(createStr, "\"%.*w\", ",
                          pNew->metadata_columns[metadata_idx].name_length,
                          pNew->metadata_columns[metadata_idx].name);
      break;
    }
    }
  }
  sqlite3_str_appendall(createStr, " distance hidden, k hidden) ");
  if (pkColumnName) {
    sqlite3_str_appendall(createStr, "without rowid ");
  }
  zSql = sqlite3_str_finish(createStr);
  if (!zSql) {
    goto error;
  }
  rc = sqlite3_declare_vtab(db, zSql);
  sqlite3_free((void *)zSql);
  if (rc != SQLITE_OK) {
    *pzErr = sqlite3_mprintf(VEC_CONSTRUCTOR_ERROR
                             "could not declare virtual table, '%s'",
                             sqlite3_errmsg(db));
    goto error;
  }

  const char *schemaName = argv[1];
  const char *tableName = argv[2];

  pNew->db = db;
  pNew->pkIsText = pkColumnType == SQLITE_TEXT;
  pNew->schemaName = sqlite3_mprintf("%s", schemaName);
  if (!pNew->schemaName) {
    goto error;
  }
  pNew->tableName = sqlite3_mprintf("%s", tableName);
  if (!pNew->tableName) {
    goto error;
  }
  pNew->shadowRowidsName = sqlite3_mprintf("%s_rowids", tableName);
  if (!pNew->shadowRowidsName) {
    goto error;
  }
  pNew->shadowChunksName = sqlite3_mprintf("%s_chunks", tableName);
  if (!pNew->shadowChunksName) {
    goto error;
  }
  pNew->numVectorColumns = numVectorColumns;
  pNew->numPartitionColumns = numPartitionColumns;
  pNew->numAuxiliaryColumns = numAuxiliaryColumns;
  pNew->numMetadataColumns = numMetadataColumns;

  for (int i = 0; i < pNew->numVectorColumns; i++) {
    pNew->shadowVectorChunksNames[i] =
        sqlite3_mprintf("%s_vector_chunks%02d", tableName, i);
    if (!pNew->shadowVectorChunksNames[i]) {
      goto error;
    }
  }
  for (int i = 0; i < pNew->numMetadataColumns; i++) {
    pNew->shadowMetadataChunksNames[i] =
        sqlite3_mprintf("%s_metadatachunks%02d", tableName, i);
    if (!pNew->shadowMetadataChunksNames[i]) {
      goto error;
    }
  }
  pNew->chunk_size = chunk_size;

  // if xCreate, then create the necessary shadow tables
  if (isCreate) {
    // sqlite3_prepare_v2() sets stmt to NULL when it fails, so the
    // sqlite3_finalize(stmt) calls on the error paths below are safe no-ops in
    // that case.
    sqlite3_stmt *stmt;

    // create the _info shadow table
    char *zCreateInfo = sqlite3_mprintf("CREATE TABLE "VEC0_SHADOW_INFO_NAME " (key text primary key, value any)", pNew->schemaName, pNew->tableName);
    if (!zCreateInfo) {
      goto error;
    }
    rc = sqlite3_prepare_v2(db, zCreateInfo, -1, &stmt, NULL);
    sqlite3_free((void *) zCreateInfo);
    if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) {
      // TODO(IMP)
      sqlite3_finalize(stmt);
      *pzErr = sqlite3_mprintf("Could not create '_info' shadow table: %s",
                               sqlite3_errmsg(db));
      goto error;
    }
    sqlite3_finalize(stmt);

    // seed the _info shadow table with the version that created this table
    char *zSeedInfo = sqlite3_mprintf(
        "INSERT INTO "VEC0_SHADOW_INFO_NAME "(key, value) VALUES "
        "(?1, ?2), (?3, ?4), (?5, ?6), (?7, ?8) ",
        pNew->schemaName, pNew->tableName
    );
    if (!zSeedInfo) {
      goto error;
    }
    rc = sqlite3_prepare_v2(db, zSeedInfo, -1, &stmt, NULL);
    sqlite3_free((void *) zSeedInfo);
    if (rc != SQLITE_OK) {
      // TODO(IMP)
      sqlite3_finalize(stmt);
      *pzErr = sqlite3_mprintf("Could not seed '_info' shadow table: %s",
                               sqlite3_errmsg(db));
      goto error;
    }
    sqlite3_bind_text(stmt, 1, "CREATE_VERSION", -1, SQLITE_STATIC);
    sqlite3_bind_text(stmt, 2, SQLITE_VEC_VERSION, -1, SQLITE_STATIC);
    sqlite3_bind_text(stmt, 3, "CREATE_VERSION_MAJOR", -1, SQLITE_STATIC);
    sqlite3_bind_int(stmt, 4, SQLITE_VEC_VERSION_MAJOR);
    sqlite3_bind_text(stmt, 5, "CREATE_VERSION_MINOR", -1, SQLITE_STATIC);
    sqlite3_bind_int(stmt, 6, SQLITE_VEC_VERSION_MINOR);
    sqlite3_bind_text(stmt, 7, "CREATE_VERSION_PATCH", -1, SQLITE_STATIC);
    sqlite3_bind_int(stmt, 8, SQLITE_VEC_VERSION_PATCH);
    if (sqlite3_step(stmt) != SQLITE_DONE) {
      // TODO(IMP)
      sqlite3_finalize(stmt);
      *pzErr = sqlite3_mprintf("Could not seed '_info' shadow table: %s",
                               sqlite3_errmsg(db));
      goto error;
    }
    sqlite3_finalize(stmt);

    // create the _chunks shadow table
    char *zCreateShadowChunks = NULL;
    if (pNew->numPartitionColumns) {
      // With partition keys the table gets one extra `partitionNN` column per
      // partition key column.
      sqlite3_str *s = sqlite3_str_new(NULL);
      sqlite3_str_appendf(s, "CREATE TABLE " VEC0_SHADOW_CHUNKS_NAME "(", pNew->schemaName, pNew->tableName);
      sqlite3_str_appendall(s, "chunk_id INTEGER PRIMARY KEY AUTOINCREMENT," "size INTEGER NOT NULL,");
      sqlite3_str_appendall(s, "sequence_id integer,");
      for (int i = 0; i < pNew->numPartitionColumns; i++) {
        sqlite3_str_appendf(s, "partition%02d,", i);
      }
      sqlite3_str_appendall(s, "validity BLOB NOT NULL, rowids BLOB NOT NULL);");
      zCreateShadowChunks = sqlite3_str_finish(s);
    } else {
      zCreateShadowChunks = sqlite3_mprintf(VEC0_SHADOW_CHUNKS_CREATE,
                                            pNew->schemaName, pNew->tableName);
    }
    if (!zCreateShadowChunks) {
      goto error;
    }
    rc = sqlite3_prepare_v2(db, zCreateShadowChunks, -1, &stmt, 0);
    sqlite3_free((void *)zCreateShadowChunks);
    if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) {
      // IMP: V17740_01811
      sqlite3_finalize(stmt);
      *pzErr = sqlite3_mprintf("Could not create '_chunks' shadow table: %s",
                               sqlite3_errmsg(db));
      goto error;
    }
    sqlite3_finalize(stmt);

    // create the _rowids shadow table
    char *zCreateShadowRowids;
    if (pNew->pkIsText) {
      // adds a "text unique not null" constraint to the id column
      zCreateShadowRowids = sqlite3_mprintf(VEC0_SHADOW_ROWIDS_CREATE_PK_TEXT,
                                            pNew->schemaName, pNew->tableName);
    } else {
      zCreateShadowRowids = sqlite3_mprintf(VEC0_SHADOW_ROWIDS_CREATE_BASIC,
                                            pNew->schemaName, pNew->tableName);
    }
    if (!zCreateShadowRowids) {
      goto error;
    }
    rc = sqlite3_prepare_v2(db, zCreateShadowRowids, -1, &stmt, 0);
    sqlite3_free((void *)zCreateShadowRowids);
    if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) {
      // IMP: V11631_28470
      sqlite3_finalize(stmt);
      *pzErr = sqlite3_mprintf("Could not create '_rowids' shadow table: %s",
                               sqlite3_errmsg(db));
      goto error;
    }
    sqlite3_finalize(stmt);

    // create one vector chunks shadow table per vector column
    for (int i = 0; i < pNew->numVectorColumns; i++) {
      char *zCreateVectorChunks = sqlite3_mprintf(VEC0_SHADOW_VECTOR_N_CREATE,
                                                  pNew->schemaName, pNew->tableName, i);
      if (!zCreateVectorChunks) {
        goto error;
      }
      rc = sqlite3_prepare_v2(db, zCreateVectorChunks, -1, &stmt, 0);
      sqlite3_free((void *)zCreateVectorChunks);
      if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) {
        // IMP: V25919_09989
        sqlite3_finalize(stmt);
        *pzErr = sqlite3_mprintf(
            "Could not create '_vector_chunks%02d' shadow table: %s", i,
            sqlite3_errmsg(db));
        goto error;
      }
      sqlite3_finalize(stmt);
    }

    // See SHADOW_TABLE_ROWID_QUIRK in vec0_new_chunk() — same "rowid PRIMARY KEY"
    // without INTEGER type issue applies here.
    for (int i = 0; i < pNew->numMetadataColumns; i++) {
      char *zCreateMetadataChunks = sqlite3_mprintf("CREATE TABLE " VEC0_SHADOW_METADATA_N_NAME "(rowid PRIMARY KEY, data BLOB NOT NULL);",
                                                    pNew->schemaName, pNew->tableName, i);
      if (!zCreateMetadataChunks) {
        goto error;
      }
      rc = sqlite3_prepare_v2(db, zCreateMetadataChunks, -1, &stmt, 0);
      sqlite3_free((void *)zCreateMetadataChunks);
      if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) {
        sqlite3_finalize(stmt);
        // Uses the same `_metadatachunks%02d` suffix as
        // pNew->shadowMetadataChunksNames so the message names the real table.
        *pzErr = sqlite3_mprintf(
            "Could not create '_metadatachunks%02d' shadow table: %s", i,
            sqlite3_errmsg(db));
        goto error;
      }
      sqlite3_finalize(stmt);

      // TEXT metadata columns get an additional shadow table.
      if (pNew->metadata_columns[i].kind == VEC0_METADATA_COLUMN_KIND_TEXT) {
        char *zCreateMetadataText = sqlite3_mprintf("CREATE TABLE " VEC0_SHADOW_METADATA_TEXT_DATA_NAME "(rowid PRIMARY KEY, data TEXT);",
                                                    pNew->schemaName, pNew->tableName, i);
        if (!zCreateMetadataText) {
          goto error;
        }
        rc = sqlite3_prepare_v2(db, zCreateMetadataText, -1, &stmt, 0);
        sqlite3_free((void *)zCreateMetadataText);
        if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) {
          sqlite3_finalize(stmt);
          *pzErr = sqlite3_mprintf(
              "Could not create '_metadatatext%02d' shadow table: %s", i,
              sqlite3_errmsg(db));
          goto error;
        }
        sqlite3_finalize(stmt);
      }
    }

    // create the auxiliary shadow table, with one `valueNN` column per
    // auxiliary column
    if (pNew->numAuxiliaryColumns > 0) {
      sqlite3_str *s = sqlite3_str_new(NULL);
      sqlite3_str_appendf(s, "CREATE TABLE " VEC0_SHADOW_AUXILIARY_NAME "( rowid integer PRIMARY KEY ", pNew->schemaName, pNew->tableName);
      for (int i = 0; i < pNew->numAuxiliaryColumns; i++) {
        sqlite3_str_appendf(s, ", value%02d", i);
      }
      sqlite3_str_appendall(s, ")");
      char *zCreateAuxiliary = sqlite3_str_finish(s);
      if (!zCreateAuxiliary) {
        goto error;
      }
      rc = sqlite3_prepare_v2(db, zCreateAuxiliary, -1, &stmt, NULL);
      // The string returned by sqlite3_str_finish() is owned by the caller;
      // release it as soon as the statement is prepared so it cannot leak.
      sqlite3_free(zCreateAuxiliary);
      if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) {
        sqlite3_finalize(stmt);
        *pzErr = sqlite3_mprintf(
            "Could not create auxiliary shadow table: %s",
            sqlite3_errmsg(db));
        goto error;
      }
      sqlite3_finalize(stmt);
    }
  }

  *ppVtab = (sqlite3_vtab *)pNew;
  return SQLITE_OK;

error:
  // Release whatever pNew already owns, then pNew itself.
  vec0_free(pNew);
  sqlite3_free(pNew);
  return SQLITE_ERROR;
}
/**
 * xCreate handler for vec0: builds the vtab object AND creates its shadow
 * tables. All arguments are forwarded unchanged to vec0_init().
 */
static int vec0Create(sqlite3 *db, void *pAux, int argc,
                      const char *const *argv, sqlite3_vtab **ppVtab,
                      char **pzErr) {
  const bool createShadowTables = true;
  return vec0_init(db, pAux, argc, argv, ppVtab, pzErr, createShadowTables);
}
/**
 * xConnect handler for vec0: builds the vtab object for an already existing
 * table, without creating shadow tables. All arguments are forwarded unchanged
 * to vec0_init().
 */
static int vec0Connect(sqlite3 *db, void *pAux, int argc,
                       const char *const *argv, sqlite3_vtab **ppVtab,
                       char **pzErr) {
  const bool createShadowTables = false;
  return vec0_init(db, pAux, argc, argv, ppVtab, pzErr, createShadowTables);
}
/**
 * xDisconnect handler for vec0: releases everything the vtab object owns via
 * vec0_free(), then the object itself. Always returns SQLITE_OK.
 */
static int vec0Disconnect(sqlite3_vtab *pVtab) {
  vec0_vtab *vtab = (vec0_vtab *)pVtab;

  vec0_free(vtab);
  sqlite3_free(vtab);

  return SQLITE_OK;
}
static int vec0Destroy(sqlite3_vtab *pVtab) {
vec0_vtab *p = (vec0_vtab *)pVtab;
sqlite3_stmt *stmt;
int rc;
const char *zSql;
// Free up any sqlite3_stmt, otherwise DROPs on those tables will fail
vec0_free_resources(p);
// TODO(test) later: can't evidence-of here, bc always gives "SQL logic error" instead of
// provided error
zSql = sqlite3_mprintf("DROP TABLE " VEC0_SHADOW_CHUNKS_NAME, p->schemaName,
p->tableName);
rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0);
sqlite3_free((void *)zSql);
if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) {
rc = SQLITE_ERROR;
vtab_set_error(pVtab, "could not drop chunks shadow table");
goto done;
}
sqlite3_finalize(stmt);
zSql = sqlite3_mprintf("DROP TABLE " VEC0_SHADOW_INFO_NAME, p->schemaName,
p->tableName);
rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0);
sqlite3_free((void *)zSql);
if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) {
rc = SQLITE_ERROR;
vtab_set_error(pVtab, "could not drop info shadow table");
goto done;
}
sqlite3_finalize(stmt);
zSql = sqlite3_mprintf("DROP TABLE " VEC0_SHADOW_ROWIDS_NAME, p->schemaName,
p->tableName);
rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0);
sqlite3_free((void *)zSql);
if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) {
rc = SQLITE_ERROR;
goto done;
}
sqlite3_finalize(stmt);
for (int i = 0; i < p->numVectorColumns; i++) {
zSql = sqlite3_mprintf("DROP TABLE \"%w\".\"%w\"", p->schemaName,
p->shadowVectorChunksNames[i]);
rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0);
sqlite3_free((void *)zSql);
if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) {
rc = SQLITE_ERROR;
goto done;
}
sqlite3_finalize(stmt);
}
if(p->numAuxiliaryColumns > 0) {
zSql = sqlite3_mprintf("DROP TABLE " VEC0_SHADOW_AUXILIARY_NAME, p->schemaName, p->tableName);
rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0);
sqlite3_free((void *)zSql);
if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) {
rc = SQLITE_ERROR;
goto done;
}
sqlite3_finalize(stmt);
}
for (int i = 0; i < p->numMetadataColumns; i++) {
zSql = sqlite3_mprintf("DROP TABLE " VEC0_SHADOW_METADATA_N_NAME, p->schemaName,p->tableName, i);
rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0);
sqlite3_free((void *)zSql);
if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) {
rc = SQLITE_ERROR;
goto done;
}
sqlite3_finalize(stmt);
if(p->metadata_columns[i].kind == VEC0_METADATA_COLUMN_KIND_TEXT) {
zSql = sqlite3_mprintf("DROP TABLE " VEC0_SHADOW_METADATA_TEXT_DATA_NAME, p->schemaName,p->tableName, i);
rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, 0);
sqlite3_free((void *)zSql);
if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) {
rc = SQLITE_ERROR;
goto done;
}
sqlite3_finalize(stmt);
}
}
stmt = NULL;
rc = SQLITE_OK;
done:
sqlite3_finalize(stmt);
vec0_free(p);
// If there was an error
if (rc == SQLITE_OK) {
sqlite3_free(p);
}
return rc;
}
/**
 * xOpen handler for vec0: allocates a zero-initialised vec0_cursor and hands
 * its base member back through `ppCursor`.
 *
 * @return SQLITE_OK, or SQLITE_NOMEM when the cursor cannot be allocated (in
 *         which case `*ppCursor` is left untouched).
 */
static int vec0Open(sqlite3_vtab *p, sqlite3_vtab_cursor **ppCursor) {
  UNUSED_PARAMETER(p);
  vec0_cursor *cursor = sqlite3_malloc(sizeof(*cursor));
  if (!cursor) {
    return SQLITE_NOMEM;
  }
  // All plan-specific pointers start out NULL.
  memset(cursor, 0, sizeof(*cursor));
  *ppCursor = &cursor->base;
  return SQLITE_OK;
}
/**
 * xClose handler for vec0: releases all plan-specific state attached to the
 * cursor, then the cursor itself. Always returns SQLITE_OK.
 */
static int vec0Close(sqlite3_vtab_cursor *cur) {
  vec0_cursor *cursor = (vec0_cursor *)cur;

  vec0_cursor_clear(cursor);
  sqlite3_free(cursor);

  return SQLITE_OK;
}
// All the different types of "values" provided to argv/argc in vec0Filter.
// These enums denote the use and purpose of each of them.
//
// vec0BestIndex() appends one 4-character group to idxStr for every value it
// requests: the kind character below, followed by kind-specific characters
// and '_' padding.
typedef enum {
// If any values are updated, please update the ARCHITECTURE.md docs accordingly!
// ~~~ KNN QUERIES ~~~ //
// argv[i] is the right-hand side of the MATCH constraint on a vector column
VEC0_IDXSTR_KIND_KNN_MATCH = '{',
// argv[i] is the LIMIT value or the value of the `k = ?` constraint
VEC0_IDXSTR_KIND_KNN_K = '}',
// argv[i] is the `rowid in (...)` constraint of a KNN query
VEC0_IDXSTR_KIND_KNN_ROWID_IN = '[',
// argv[i] is a constraint on a PARTITION KEY column in a KNN query.
// Followed in idxStr by 'A' + partition column index, then a
// vec0_partition_operator character.
VEC0_IDXSTR_KIND_KNN_PARTITON_CONSTRAINT = ']',
// argv[i] is a constraint on the distance column in a KNN query.
// Followed in idxStr by a vec0_distance_constraint_operator character.
VEC0_IDXSTR_KIND_KNN_DISTANCE_CONSTRAINT = '*',
// ~~~ POINT QUERIES ~~~ //
// argv[i] is the value of the equality constraint on the id column
VEC0_IDXSTR_KIND_POINT_ID = '!',
// ~~~ METADATA CONSTRAINTS ~~~ //
// argv[i] is a constraint on a metadata column; vec0BestIndex() only emits
// this kind for KNN queries. Followed in idxStr by 'A' + metadata column
// index, then a vec0_metadata_operator character.
VEC0_IDXSTR_KIND_METADATA_CONSTRAINT = '&',
} vec0_idxstr_kind;
// The different SQLITE_INDEX_CONSTRAINT values that vec0 partition key columns
// support, but as characters that fit nicely in idxstr.
// vec0BestIndex() writes one of these after a
// VEC0_IDXSTR_KIND_KNN_PARTITON_CONSTRAINT kind character.
typedef enum {
// If any values are updated, please update the ARCHITECTURE.md docs accordingly!
// Equality constraint on a PARTITION KEY column, ex `user_id = 123`
VEC0_PARTITION_OPERATOR_EQ = 'a',
// "Greater than" constraint on a PARTITION KEY column, ex `year > 2024`
VEC0_PARTITION_OPERATOR_GT = 'b',
// "Less than or equal to" constraint on a PARTITION KEY column, ex `year <= 2024`
VEC0_PARTITION_OPERATOR_LE = 'c',
// "Less than" constraint on a PARTITION KEY column, ex `year < 2024`
VEC0_PARTITION_OPERATOR_LT = 'd',
// "Greater than or equal to" constraint on a PARTITION KEY column, ex `year >= 2024`
VEC0_PARTITION_OPERATOR_GE = 'e',
// "Not equal to" constraint on a PARTITION KEY column, ex `year != 2024`
VEC0_PARTITION_OPERATOR_NE = 'f',
} vec0_partition_operator;
// The constraint operators vec0 metadata columns support in KNN queries,
// encoded as characters for idxStr. vec0BestIndex() writes one of these after
// a VEC0_IDXSTR_KIND_METADATA_CONSTRAINT kind character.
// The first six values use the same characters as vec0_partition_operator.
typedef enum {
// SQLITE_INDEX_CONSTRAINT_EQ (when not handled as an IN constraint)
VEC0_METADATA_OPERATOR_EQ = 'a',
// SQLITE_INDEX_CONSTRAINT_GT
VEC0_METADATA_OPERATOR_GT = 'b',
// SQLITE_INDEX_CONSTRAINT_LE
VEC0_METADATA_OPERATOR_LE = 'c',
// SQLITE_INDEX_CONSTRAINT_LT
VEC0_METADATA_OPERATOR_LT = 'd',
// SQLITE_INDEX_CONSTRAINT_GE
VEC0_METADATA_OPERATOR_GE = 'e',
// SQLITE_INDEX_CONSTRAINT_NE
VEC0_METADATA_OPERATOR_NE = 'f',
// An EQ constraint that sqlite3_vtab_in() reports as `xxx in (...)`.
// vec0BestIndex() rejects it on FLOAT and BOOLEAN metadata columns.
VEC0_METADATA_OPERATOR_IN = 'g',
} vec0_metadata_operator;
// The constraint operators allowed on the distance column in KNN queries,
// encoded as characters for idxStr. vec0BestIndex() writes one of these after
// a VEC0_IDXSTR_KIND_KNN_DISTANCE_CONSTRAINT kind character.
// Note the character assignment differs from vec0_partition_operator and
// vec0_metadata_operator (e.g. 'b' is GE here but GT there).
typedef enum {
// SQLITE_INDEX_CONSTRAINT_GT
VEC0_DISTANCE_CONSTRAINT_GT = 'a',
// SQLITE_INDEX_CONSTRAINT_GE
VEC0_DISTANCE_CONSTRAINT_GE = 'b',
// SQLITE_INDEX_CONSTRAINT_LT
VEC0_DISTANCE_CONSTRAINT_LT = 'c',
// SQLITE_INDEX_CONSTRAINT_LE
VEC0_DISTANCE_CONSTRAINT_LE = 'd',
} vec0_distance_constraint_operator;
/**
 * xBestIndex handler for vec0: picks a query plan and encodes it in idxStr.
 *
 * idxStr starts with one vec0_query_plan character. For KNN and point plans
 * it is followed by one 4-character group per value requested for xFilter
 * (a vec0_idxstr_kind character plus kind-specific characters and '_'
 * padding), in argvIndex order.
 *
 * idxNum is the index of the matched vector column for KNN plans and
 * `colUsed` for point plans.
 *
 * @param pVTab    The vec0 virtual table.
 * @param pIdxInfo Constraint/order-by description from SQLite; its
 *                 aConstraintUsage, idxNum, idxStr and estimates are filled
 *                 in.
 * @return SQLITE_OK on success; SQLITE_ERROR (with an error message set on
 *         the vtab) for an unsupported combination of constraints;
 *         SQLITE_NOMEM if idxStr could not be built.
 */
static int vec0BestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pIdxInfo) {
  vec0_vtab *p = (vec0_vtab *)pVTab;
  /**
   * Possible query plans are:
   * 1. KNN when:
   *    a) An `MATCH` op on vector column
   *    b) ORDER BY on distance column
   *    c) LIMIT
   *    d) rowid in (...) OPTIONAL
   * 2. Point when:
   *    a) An `EQ` op on rowid column
   * 3. else: fullscan
   *
   */
  int iMatchTerm = -1;
  int iMatchVectorTerm = -1;
  int iLimitTerm = -1;
  int iRowidTerm = -1;
  int iKTerm = -1;
  int iRowidInTerm = -1;
  int hasAuxConstraint = 0;

#ifdef SQLITE_VEC_DEBUG
  printf("pIdxInfo->nOrderBy=%d, pIdxInfo->nConstraint=%d\n", pIdxInfo->nOrderBy, pIdxInfo->nConstraint);
#endif

  // First pass: locate the constraints that decide which plan is used.
  for (int i = 0; i < pIdxInfo->nConstraint; i++) {
    u8 vtabIn = 0;

#if COMPILER_SUPPORTS_VTAB_IN
    if (sqlite3_libversion_number() >= 3038000) {
      vtabIn = sqlite3_vtab_in(pIdxInfo, i, -1);
    }
#endif

#ifdef SQLITE_VEC_DEBUG
    printf("xBestIndex [%d] usable=%d iColumn=%d op=%d vtabin=%d\n", i,
           pIdxInfo->aConstraint[i].usable, pIdxInfo->aConstraint[i].iColumn,
           pIdxInfo->aConstraint[i].op, vtabIn);
#endif
    if (!pIdxInfo->aConstraint[i].usable)
      continue;

    int iColumn = pIdxInfo->aConstraint[i].iColumn;
    int op = pIdxInfo->aConstraint[i].op;

    if (op == SQLITE_INDEX_CONSTRAINT_LIMIT) {
      iLimitTerm = i;
    }
    if (op == SQLITE_INDEX_CONSTRAINT_MATCH &&
        vec0_column_idx_is_vector(p, iColumn)) {
      if (iMatchTerm > -1) {
        vtab_set_error(
            pVTab, "only 1 MATCH operator is allowed in a single vec0 query");
        return SQLITE_ERROR;
      }
      iMatchTerm = i;
      iMatchVectorTerm = vec0_column_idx_to_vector_idx(p, iColumn);
    }
    if (op == SQLITE_INDEX_CONSTRAINT_EQ && iColumn == VEC0_COLUMN_ID) {
      if (vtabIn) {
        if (iRowidInTerm != -1) {
          vtab_set_error(pVTab, "only 1 'rowid in (..)' operator is allowed in "
                                "a single vec0 query");
          return SQLITE_ERROR;
        }
        iRowidInTerm = i;
      } else {
        iRowidTerm = i;
      }
    }
    if (op == SQLITE_INDEX_CONSTRAINT_EQ && iColumn == vec0_column_k_idx(p)) {
      iKTerm = i;
    }
    if (
        (op != SQLITE_INDEX_CONSTRAINT_LIMIT && op != SQLITE_INDEX_CONSTRAINT_OFFSET)
        && vec0_column_idx_is_auxiliary(p, iColumn)) {
      hasAuxConstraint = 1;
    }
  }

  // From here on every exit must go through `done` so idxStr is released.
  sqlite3_str *idxStr = sqlite3_str_new(NULL);
  int rc;

  if (iMatchTerm >= 0) {
    if (iLimitTerm < 0 && iKTerm < 0) {
      vtab_set_error(
          pVTab,
          "A LIMIT or 'k = ?' constraint is required on vec0 knn queries.");
      rc = SQLITE_ERROR;
      goto done;
    }
    if (iLimitTerm >= 0 && iKTerm >= 0) {
      vtab_set_error(pVTab, "Only LIMIT or 'k =?' can be provided, not both");
      rc = SQLITE_ERROR;
      goto done;
    }

    if (pIdxInfo->nOrderBy) {
      if (pIdxInfo->nOrderBy > 1) {
        vtab_set_error(pVTab, "Only a single 'ORDER BY distance' clause is "
                              "allowed on vec0 KNN queries");
        rc = SQLITE_ERROR;
        goto done;
      }
      if (pIdxInfo->aOrderBy[0].iColumn != vec0_column_distance_idx(p)) {
        vtab_set_error(pVTab,
                       "Only a single 'ORDER BY distance' clause is allowed on "
                       "vec0 KNN queries, not on other columns");
        rc = SQLITE_ERROR;
        goto done;
      }
      if (pIdxInfo->aOrderBy[0].desc) {
        vtab_set_error(
            pVTab, "Only ascending in ORDER BY distance clause is supported, "
                   "DESC is not supported yet.");
        rc = SQLITE_ERROR;
        goto done;
      }
    }

    if (hasAuxConstraint) {
      // IMP: V25623_09693
      vtab_set_error(pVTab, "An illegal WHERE constraint was provided on a vec0 auxiliary column in a KNN query.");
      rc = SQLITE_ERROR;
      goto done;
    }

    sqlite3_str_appendchar(idxStr, 1, VEC0_QUERY_PLAN_KNN);

    // argvIndex is 1-based and assigned in the same order the groups are
    // appended to idxStr.
    int argvIndex = 1;
    pIdxInfo->aConstraintUsage[iMatchTerm].argvIndex = argvIndex++;
    pIdxInfo->aConstraintUsage[iMatchTerm].omit = 1;
    sqlite3_str_appendchar(idxStr, 1, VEC0_IDXSTR_KIND_KNN_MATCH);
    sqlite3_str_appendchar(idxStr, 3, '_');

    if (iLimitTerm >= 0) {
      pIdxInfo->aConstraintUsage[iLimitTerm].argvIndex = argvIndex++;
      pIdxInfo->aConstraintUsage[iLimitTerm].omit = 1;
    } else {
      pIdxInfo->aConstraintUsage[iKTerm].argvIndex = argvIndex++;
      pIdxInfo->aConstraintUsage[iKTerm].omit = 1;
    }
    sqlite3_str_appendchar(idxStr, 1, VEC0_IDXSTR_KIND_KNN_K);
    sqlite3_str_appendchar(idxStr, 3, '_');

#if COMPILER_SUPPORTS_VTAB_IN
    if (iRowidInTerm >= 0) {
      // already validated as >= SQLite 3.38 bc iRowidInTerm is only >= 0 when
      // vtabIn == 1
      sqlite3_vtab_in(pIdxInfo, iRowidInTerm, 1);
      pIdxInfo->aConstraintUsage[iRowidInTerm].argvIndex = argvIndex++;
      pIdxInfo->aConstraintUsage[iRowidInTerm].omit = 1;
      sqlite3_str_appendchar(idxStr, 1, VEC0_IDXSTR_KIND_KNN_ROWID_IN);
      sqlite3_str_appendchar(idxStr, 3, '_');
    }
#endif

    // find any PARTITION KEY column constraints
    for (int i = 0; i < pIdxInfo->nConstraint; i++) {
      if (!pIdxInfo->aConstraint[i].usable)
        continue;

      int iColumn = pIdxInfo->aConstraint[i].iColumn;
      int op = pIdxInfo->aConstraint[i].op;
      if (op == SQLITE_INDEX_CONSTRAINT_LIMIT || op == SQLITE_INDEX_CONSTRAINT_OFFSET) {
        continue;
      }
      if (!vec0_column_idx_is_partition(p, iColumn)) {
        continue;
      }

      int partition_idx = vec0_column_idx_to_partition_idx(p, iColumn);
      char value = 0;

      switch (op) {
      case SQLITE_INDEX_CONSTRAINT_EQ: {
        value = VEC0_PARTITION_OPERATOR_EQ;
        break;
      }
      case SQLITE_INDEX_CONSTRAINT_GT: {
        value = VEC0_PARTITION_OPERATOR_GT;
        break;
      }
      case SQLITE_INDEX_CONSTRAINT_LE: {
        value = VEC0_PARTITION_OPERATOR_LE;
        break;
      }
      case SQLITE_INDEX_CONSTRAINT_LT: {
        value = VEC0_PARTITION_OPERATOR_LT;
        break;
      }
      case SQLITE_INDEX_CONSTRAINT_GE: {
        value = VEC0_PARTITION_OPERATOR_GE;
        break;
      }
      case SQLITE_INDEX_CONSTRAINT_NE: {
        value = VEC0_PARTITION_OPERATOR_NE;
        break;
      }
      }

      // Any other operator is not consumed here (no argvIndex, no omit).
      if (value) {
        pIdxInfo->aConstraintUsage[i].argvIndex = argvIndex++;
        pIdxInfo->aConstraintUsage[i].omit = 1;
        sqlite3_str_appendchar(idxStr, 1, VEC0_IDXSTR_KIND_KNN_PARTITON_CONSTRAINT);
        sqlite3_str_appendchar(idxStr, 1, 'A' + partition_idx);
        sqlite3_str_appendchar(idxStr, 1, value);
        sqlite3_str_appendchar(idxStr, 1, '_');
      }
    }

    // find any metadata column constraints
    for (int i = 0; i < pIdxInfo->nConstraint; i++) {
      if (!pIdxInfo->aConstraint[i].usable)
        continue;

      int iColumn = pIdxInfo->aConstraint[i].iColumn;
      int op = pIdxInfo->aConstraint[i].op;
      if (op == SQLITE_INDEX_CONSTRAINT_LIMIT || op == SQLITE_INDEX_CONSTRAINT_OFFSET) {
        continue;
      }
      if (!vec0_column_idx_is_metadata(p, iColumn)) {
        continue;
      }

      int metadata_idx = vec0_column_idx_to_metadata_idx(p, iColumn);
      char value = 0;

      switch (op) {
      case SQLITE_INDEX_CONSTRAINT_EQ: {
        int vtabIn = 0;
#if COMPILER_SUPPORTS_VTAB_IN
        if (sqlite3_libversion_number() >= 3038000) {
          vtabIn = sqlite3_vtab_in(pIdxInfo, i, -1);
        }
        if (vtabIn) {
          switch (p->metadata_columns[metadata_idx].kind) {
          case VEC0_METADATA_COLUMN_KIND_FLOAT:
          case VEC0_METADATA_COLUMN_KIND_BOOLEAN: {
            // IMP: V15248_32086
            rc = SQLITE_ERROR;
            vtab_set_error(pVTab, "'xxx in (...)' is only available on INTEGER or TEXT metadata columns.");
            goto done;
            break;
          }
          case VEC0_METADATA_COLUMN_KIND_INTEGER:
          case VEC0_METADATA_COLUMN_KIND_TEXT: {
            break;
          }
          }
          value = VEC0_METADATA_OPERATOR_IN;
          sqlite3_vtab_in(pIdxInfo, i, 1);
        } else
#endif
        {
          // Use the metadata operator enum, not the partition one. Both
          // encode EQ as the same character, so the emitted idxStr is
          // unchanged.
          value = VEC0_METADATA_OPERATOR_EQ;
        }
        break;
      }
      case SQLITE_INDEX_CONSTRAINT_GT: {
        value = VEC0_METADATA_OPERATOR_GT;
        break;
      }
      case SQLITE_INDEX_CONSTRAINT_LE: {
        value = VEC0_METADATA_OPERATOR_LE;
        break;
      }
      case SQLITE_INDEX_CONSTRAINT_LT: {
        value = VEC0_METADATA_OPERATOR_LT;
        break;
      }
      case SQLITE_INDEX_CONSTRAINT_GE: {
        value = VEC0_METADATA_OPERATOR_GE;
        break;
      }
      case SQLITE_INDEX_CONSTRAINT_NE: {
        value = VEC0_METADATA_OPERATOR_NE;
        break;
      }
      default: {
        // IMP: V16511_00582
        rc = SQLITE_ERROR;
        vtab_set_error(pVTab,
                       "An illegal WHERE constraint was provided on a vec0 metadata column in a KNN query. "
                       "Only one of EQUALS, GREATER_THAN, LESS_THAN_OR_EQUAL, LESS_THAN, GREATER_THAN_OR_EQUAL, NOT_EQUALS is allowed."
        );
        goto done;
      }
      }

      if (p->metadata_columns[metadata_idx].kind == VEC0_METADATA_COLUMN_KIND_BOOLEAN) {
        if (!(value == VEC0_METADATA_OPERATOR_EQ || value == VEC0_METADATA_OPERATOR_NE)) {
          // IMP: V10145_26984
          rc = SQLITE_ERROR;
          vtab_set_error(pVTab, "ONLY EQUALS (=) or NOT_EQUALS (!=) operators are allowed on boolean metadata columns.");
          goto done;
        }
      }

      pIdxInfo->aConstraintUsage[i].argvIndex = argvIndex++;
      pIdxInfo->aConstraintUsage[i].omit = 1;
      sqlite3_str_appendchar(idxStr, 1, VEC0_IDXSTR_KIND_METADATA_CONSTRAINT);
      sqlite3_str_appendchar(idxStr, 1, 'A' + metadata_idx);
      sqlite3_str_appendchar(idxStr, 1, value);
      sqlite3_str_appendchar(idxStr, 1, '_');
    }

    // find any distance column constraints
    for (int i = 0; i < pIdxInfo->nConstraint; i++) {
      if (!pIdxInfo->aConstraint[i].usable)
        continue;

      int iColumn = pIdxInfo->aConstraint[i].iColumn;
      int op = pIdxInfo->aConstraint[i].op;
      if (op == SQLITE_INDEX_CONSTRAINT_LIMIT || op == SQLITE_INDEX_CONSTRAINT_OFFSET) {
        continue;
      }
      if (vec0_column_distance_idx(p) != iColumn) {
        continue;
      }

      char value = 0;
      switch (op) {
      case SQLITE_INDEX_CONSTRAINT_GT: {
        value = VEC0_DISTANCE_CONSTRAINT_GT;
        break;
      }
      case SQLITE_INDEX_CONSTRAINT_GE: {
        value = VEC0_DISTANCE_CONSTRAINT_GE;
        break;
      }
      case SQLITE_INDEX_CONSTRAINT_LT: {
        value = VEC0_DISTANCE_CONSTRAINT_LT;
        break;
      }
      case SQLITE_INDEX_CONSTRAINT_LE: {
        value = VEC0_DISTANCE_CONSTRAINT_LE;
        break;
      }
      default: {
        // IMP TODO
        rc = SQLITE_ERROR;
        vtab_set_error(
            pVTab,
            "Illegal WHERE constraint on distance column in a KNN query. "
            "Only one of GT, GE, LT, LE constraints are allowed."
        );
        goto done;
      }
      }

      pIdxInfo->aConstraintUsage[i].argvIndex = argvIndex++;
      pIdxInfo->aConstraintUsage[i].omit = 1;
      sqlite3_str_appendchar(idxStr, 1, VEC0_IDXSTR_KIND_KNN_DISTANCE_CONSTRAINT);
      sqlite3_str_appendchar(idxStr, 1, value);
      sqlite3_str_appendchar(idxStr, 1, '_');
      sqlite3_str_appendchar(idxStr, 1, '_');
    }

    pIdxInfo->idxNum = iMatchVectorTerm;
    pIdxInfo->estimatedCost = 30.0;
    pIdxInfo->estimatedRows = 10;
  } else if (iRowidTerm >= 0) {
    sqlite3_str_appendchar(idxStr, 1, VEC0_QUERY_PLAN_POINT);
    pIdxInfo->aConstraintUsage[iRowidTerm].argvIndex = 1;
    pIdxInfo->aConstraintUsage[iRowidTerm].omit = 1;
    sqlite3_str_appendchar(idxStr, 1, VEC0_IDXSTR_KIND_POINT_ID);
    sqlite3_str_appendchar(idxStr, 3, '_');
    pIdxInfo->idxNum = pIdxInfo->colUsed;
    pIdxInfo->estimatedCost = 10.0;
    pIdxInfo->estimatedRows = 1;
  } else {
    sqlite3_str_appendchar(idxStr, 1, VEC0_QUERY_PLAN_FULLSCAN);
    pIdxInfo->estimatedCost = 3000000.0;
    pIdxInfo->estimatedRows = 100000;
  }

  pIdxInfo->idxStr = sqlite3_str_finish(idxStr);
  idxStr = NULL;
  if (!pIdxInfo->idxStr) {
    // At least the plan character was appended, so a NULL result means the
    // string could not be built. Report it instead of returning SQLITE_OK with
    // a NULL idxStr.
    rc = SQLITE_NOMEM;
    goto done;
  }
  pIdxInfo->needToFreeIdxStr = 1;
  rc = SQLITE_OK;

done:
  if (idxStr) {
    // Error path: sqlite3_str_finish() destroys the builder and returns the
    // accumulated buffer, which the caller owns, so free it rather than drop
    // it. sqlite3_free(NULL) is a no-op.
    sqlite3_free(sqlite3_str_finish(idxStr));
  }
  return rc;
}
// forward declaration because vec0Filter uses it
static int vec0Next(sqlite3_vtab_cursor *cur);
/**
 * @brief Merge two distance lists into the (at most) `out_length` smallest
 * entries, writing distances and their rowids in ascending order.
 *
 * List `a` is read directly (a[0], a[1], ...). List `b` is read indirectly:
 * the j-th entry consumed is b[b_top_idxs[j]]. Both sequences are assumed to
 * already be in ascending distance order - this function does not sort.
 * When two distances are equal, the entry from `a` is emitted first.
 *
 * @param a           distances of the first list
 * @param a_rowids    rowids parallel to `a`
 * @param a_length    number of usable entries in `a`
 * @param b           distances of the second list, indexed via `b_top_idxs`
 * @param b_rowids    rowids parallel to `b`, indexed via `b_top_idxs`
 * @param b_top_idxs  indexes into `b`/`b_rowids`, in the order to consume them
 * @param b_length    number of usable entries in `b_top_idxs`
 * @param out         output distances, room for `out_length` entries
 * @param out_rowids  output rowids, room for `out_length` entries
 * @param out_length  maximum number of entries to write
 * @param out_used    set to the number of entries actually written, which is
 *                    less than `out_length` when both inputs run out first
 */
void merge_sorted_lists(f32 *a, i64 *a_rowids, i64 a_length, f32 *b,
                        i64 *b_rowids, i32 *b_top_idxs, i64 b_length, f32 *out,
                        i64 *out_rowids, i64 out_length, i64 *out_used) {
  i64 ptrA = 0;
  i64 ptrB = 0;

  // The index is i64 (not int) so it has the same width as out_length.
  for (i64 i = 0; i < out_length; i++) {
    int hasA = ptrA < a_length;
    int hasB = ptrB < b_length;

    if (!hasA && !hasB) {
      // both inputs exhausted before the output was filled
      *out_used = i;
      return;
    }

    // Take from `a` when `b` is exhausted, or when a's head is <= b's head.
    if (hasA && (!hasB || a[ptrA] <= b[b_top_idxs[ptrB]])) {
      out[i] = a[ptrA];
      out_rowids[i] = a_rowids[ptrA];
      ptrA++;
    } else {
      i32 bIdx = b_top_idxs[ptrB];
      out[i] = b[bIdx];
      out_rowids[i] = b_rowids[bIdx];
      ptrB++;
    }
  }

  *out_used = out_length;
}
/**
 * Allocate a bitmap with room for `n` bits, all cleared.
 *
 * `n` must be a multiple of 8. Returns NULL if the allocation fails. The
 * memory comes from sqlite3_malloc(), so release it with sqlite3_free().
 */
u8 *bitmap_new(i32 n) {
  assert(n % 8 == 0);

  size_t nBytes = n * sizeof(u8) / CHAR_BIT;
  u8 *bitmap = sqlite3_malloc(nBytes);
  if (!bitmap) {
    return NULL;
  }

  memset(bitmap, 0, nBytes);
  return bitmap;
}
/**
 * Allocate a new `n`-bit bitmap initialized as a copy of `from`.
 *
 * `n` must be a multiple of 8 and `from` must hold at least n bits.
 * Returns NULL if the allocation fails. Release with sqlite3_free().
 */
u8 *bitmap_new_from(i32 n, u8 *from) {
  assert(n % 8 == 0);

  u8 *copy = sqlite3_malloc(n * sizeof(u8) / CHAR_BIT);
  if (copy == NULL) {
    return NULL;
  }

  memcpy(copy, from, n / CHAR_BIT);
  return copy;
}
/**
 * Overwrite the first `n` bits of `base` with the first `n` bits of `from`.
 * `n` must be a multiple of 8.
 */
void bitmap_copy(u8 *base, u8 *from, i32 n) {
  assert(n % 8 == 0);

  const i32 nBytes = n / CHAR_BIT;
  memcpy(base, from, nBytes);
}
/**
 * Bitwise-AND `other` into `base`, storing the result in `base`.
 * Both bitmaps hold `n` bits; `n` must be a multiple of 8.
 */
void bitmap_and_inplace(u8 *base, u8 *other, i32 n) {
  assert((n % 8) == 0);

  const int nBytes = n / CHAR_BIT;
  for (int byteIdx = 0; byteIdx < nBytes; byteIdx++) {
    base[byteIdx] &= other[byteIdx];
  }
}
/**
 * Set (value != 0) or clear (value == 0) the bit at `position`.
 * Bits are numbered from the least-significant bit of each byte.
 */
void bitmap_set(u8 *bitmap, i32 position, int value) {
  u8 *byte = &bitmap[position / CHAR_BIT];
  const int mask = 1 << (position % CHAR_BIT);

  if (value) {
    *byte |= mask;
    return;
  }
  *byte &= ~mask;
}
/**
 * Return 1 if the bit at `position` is set, 0 otherwise.
 * Bits are numbered from the least-significant bit of each byte.
 */
int bitmap_get(u8 *bitmap, i32 position) {
  const u8 byte = bitmap[position / CHAR_BIT];
  const int shift = position % CHAR_BIT;
  return (byte >> shift) & 1;
}
/**
 * Clear (zero) the first `n` bits of `bitmap`. `n` must be a multiple of 8.
 */
void bitmap_clear(u8 *bitmap, i32 n) {
  assert((n % 8) == 0);

  const i32 nBytes = n / CHAR_BIT;
  memset(bitmap, 0, nBytes);
}
/**
 * Set the first `n` bits of `bitmap` to 1. `n` must be a multiple of 8.
 */
void bitmap_fill(u8 *bitmap, i32 n) {
  assert((n % 8) == 0);

  const i32 nBytes = n / CHAR_BIT;
  memset(bitmap, 0xFF, nBytes);
}
/**
 * @brief Select the indices of the (up to) k smallest distances among the
 * entries flagged in `candidates`, in ascending distance order.
 *
 * This is a repeated linear scan: each round picks the smallest eligible
 * distance that has not been picked yet. When several eligible entries share
 * the smallest distance, the one with the highest index is picked first.
 *
 * @param distances  input f32 array of size n, the items to consider
 * @param n          number of entries in `distances`; also the bit length of
 *                   `candidates` and `bTaken`
 * @param candidates bitmap of n bits; only entries whose bit is set are
 *                   eligible
 * @param out        output array with room for k indices into `distances`
 * @param k          maximum number of indices to produce (0 < k <= n)
 * @param bTaken     scratch bitmap of n bits; cleared on entry and used to
 *                   mark entries that were already picked
 * @param k_used     set to the number of indices written to `out`; less than
 *                   k when fewer than k candidates exist
 * @return int always SQLITE_OK
 */
int min_idx(const f32 *distances, i32 n, u8 *candidates, i32 *out, i32 k,
            u8 *bTaken, i32 *k_used) {
  assert(k > 0);
  assert(k <= n);

  bitmap_clear(bTaken, n);

  for (int nPicked = 0; nPicked < k; nPicked++) {
    // Seed the search with the first candidate that is still available.
    int best = -1;
    for (int i = 0; i < n; i++) {
      if (bitmap_get(candidates, i) && !bitmap_get(bTaken, i)) {
        best = i;
        break;
      }
    }
    if (best < 0) {
      // ran out of candidates before reaching k
      *k_used = nPicked;
      return SQLITE_OK;
    }

    // `<=` means later entries win ties against the current best.
    for (int i = 0; i < n; i++) {
      if (distances[i] <= distances[best] && !bitmap_get(bTaken, i) &&
          bitmap_get(candidates, i)) {
        best = i;
      }
    }

    out[nPicked] = best;
    bitmap_set(bTaken, best, 1);
  }

  *k_used = k;
  return SQLITE_OK;
}
/**
 * @brief Look up the full TEXT value of a metadata column for one row, from
 * that column's text-data shadow table.
 *
 * The statement is prepared lazily: if `*stmt` is NULL it is prepared here
 * and stored in `*stmt` so later calls can re-use it. The caller is
 * responsible for finalizing `*stmt`.
 *
 * @param p            vec0_vtab
 * @param stmt         in/out cached statement; prepared on first use
 * @param metadata_idx index of the metadata column
 * @param rowid        rowid to look up
 * @param n            output: size in bytes of the text value
 * @param s            output: the text value. The pointer comes from
 *                     sqlite3_column_text() on `*stmt`, so it is only valid
 *                     until `*stmt` is stepped, reset or finalized again.
 * @return int SQLITE_OK on success, SQLITE_NOMEM if the SQL could not be
 *         allocated, the sqlite3_prepare_v2() error code if preparing fails,
 *         or SQLITE_ERROR if no row was returned.
 */
int vec0_get_metadata_text_long_value(
  vec0_vtab * p,
  sqlite3_stmt ** stmt,
  int metadata_idx,
  i64 rowid,
  int *n,
  char ** s) {
  if(*stmt == NULL) {
    char * zSql = sqlite3_mprintf("select data from " VEC0_SHADOW_METADATA_TEXT_DATA_NAME " where rowid = ?", p->schemaName, p->tableName, metadata_idx);
    if(zSql == NULL) {
      return SQLITE_NOMEM;
    }
    int rcPrepare = sqlite3_prepare_v2(p->db, zSql, -1, stmt, NULL);
    sqlite3_free(zSql);
    if(rcPrepare != SQLITE_OK) {
      return rcPrepare;
    }
  }

  // re-arm the (possibly re-used) statement for this rowid
  sqlite3_reset(*stmt);
  sqlite3_bind_int64(*stmt, 1, rowid);

  if(sqlite3_step(*stmt) != SQLITE_ROW) {
    return SQLITE_ERROR;
  }

  *s = (char *) sqlite3_column_text(*stmt, 0);
  *n = sqlite3_column_bytes(*stmt, 0);
  return SQLITE_OK;
}
/**
 * @brief Create an "iterator" (sqlite3_stmt) of chunks with the given constraints
 *
 * Any VEC0_IDXSTR_KIND_KNN_PARTITON_CONSTRAINT values in idxStr/argv will be applied
 * as WHERE constraints in the underlying stmt SQL, and any consumer of the stmt
 * can freely step through the stmt with all constraints satisfied.
 *
 * Each row of the statement has three columns: chunk_id, validity, rowids.
 *
 * @param p - vec0_vtab
 * @param idxStr - the xBestIndex/xFilter idxstr containing VEC0_IDXSTR values
 * @param argc - number of argv values from xFilter
 * @param argv - array of sqlite3_value from xFilter
 * @param outStmt - output sqlite3_stmt of chunks with all filters applied.
 *                  Left as NULL by sqlite3_prepare_v2() when preparing fails,
 *                  and finalized + reset to NULL here when binding fails.
 * @return int SQLITE_OK on success, error code otherwise
 */
int vec0_chunks_iter(vec0_vtab * p, const char * idxStr, int argc, sqlite3_value ** argv, sqlite3_stmt** outStmt) {
  // always null terminated, enforced by SQLite
  int idxStrLength = strlen(idxStr);
  // "1" refers to the initial vec0_query_plan char, 4 is the number of chars per "element"
  int numValueEntries = (idxStrLength-1) / 4;
  assert(argc == numValueEntries);

  int rc;
  sqlite3_str * s = sqlite3_str_new(NULL);
  sqlite3_str_appendf(s, "select chunk_id, validity, rowids "
                         " from " VEC0_SHADOW_CHUNKS_NAME,
                         p->schemaName, p->tableName);

  int appendedWhere = 0;
  for(int i = 0; i < numValueEntries; i++) {
    int idx = 1 + (i * 4);
    char kind = idxStr[idx + 0];
    if(kind != VEC0_IDXSTR_KIND_KNN_PARTITON_CONSTRAINT) {
      continue;
    }

    int partition_idx = idxStr[idx + 1] - 'A';
    int operator = idxStr[idx + 2];
    // idxStr[idx + 3] is unused here, it is just a '_' placeholder

    const char * zOperator;
    switch(operator) {
      case VEC0_PARTITION_OPERATOR_EQ: zOperator = "=";  break;
      case VEC0_PARTITION_OPERATOR_GT: zOperator = ">";  break;
      case VEC0_PARTITION_OPERATOR_LE: zOperator = "<="; break;
      case VEC0_PARTITION_OPERATOR_LT: zOperator = "<";  break;
      case VEC0_PARTITION_OPERATOR_GE: zOperator = ">="; break;
      case VEC0_PARTITION_OPERATOR_NE: zOperator = "!="; break;
      default: {
        // unknown operator char in idxStr: discard the partial SQL and bail
        char * zSql = sqlite3_str_finish(s);
        sqlite3_free(zSql);
        return SQLITE_ERROR;
      }
    }

    sqlite3_str_appendall(s, appendedWhere ? " AND " : " WHERE ");
    appendedWhere = 1;
    sqlite3_str_appendf(s, " partition%02d %s ? ", partition_idx, zOperator);
  }

  char *zSql = sqlite3_str_finish(s);
  if (!zSql) {
    return SQLITE_NOMEM;
  }

  rc = sqlite3_prepare_v2(p->db, zSql, -1, outStmt, NULL);
  sqlite3_free(zSql);
  if(rc != SQLITE_OK) {
    return rc;
  }

  // Bind the partition constraint values, in the same order their "?"
  // placeholders were appended above.
  int n = 1;
  for(int i = 0; i < numValueEntries; i++) {
    int idx = 1 + (i * 4);
    char kind = idxStr[idx + 0];
    if(kind != VEC0_IDXSTR_KIND_KNN_PARTITON_CONSTRAINT) {
      continue;
    }
    rc = sqlite3_bind_value(*outStmt, n++, argv[i]);
    if(rc != SQLITE_OK) {
      // An unbound parameter would silently behave like NULL and return the
      // wrong set of chunks, so treat a failed bind as a hard error.
      sqlite3_finalize(*outStmt);
      *outStmt = NULL;
      return rc;
    }
  }

  return rc;
}
// A single `xxx in (...)` constraint on a metadata column. TEXT or INTEGER only for now.
// Instances are stored in a `struct Array` and located by matching `argv_idx`.
struct Vec0MetadataIn{
// index `i` of the xFilter `argv[i]` value that this constraint is on
int argv_idx;
// metadata column index of the constraint, derived from idxStr + argv_idx
int metadata_idx;
// array of the copied `(...)` values from sqlite3_vtab_in_first()/sqlite3_vtab_in_next().
// Elements are read as `i64` for INTEGER columns and as
// `struct Vec0MetadataInTextEntry` for TEXT columns.
struct Array array;
};
// Array element for the `xxx in (...)` values of a TEXT column: basically just
// a length-delimited string.
struct Vec0MetadataInTextEntry {
// length of zString in bytes; compared against the stored text's length
int n;
// the string bytes of the IN value
// NOTE(review): presumably a copy owned by the containing array - confirm where it is freed
char * zString;
};
int vec0_metadata_filter_text(vec0_vtab * p, sqlite3_value * value, const void * buffer, int size, vec0_metadata_operator op, u8* b, int metadata_idx, int chunk_rowid, struct Array * aMetadataIn, int argv_idx) {
int rc;
sqlite3_stmt * stmt = NULL;
i64 * rowids = NULL;
sqlite3_blob * rowidsBlob;
const char * sTarget = (const char *) sqlite3_value_text(value);
int nTarget = sqlite3_value_bytes(value);
// TODO(perf): only text metadata news the rowids BLOB. Make it so that
// rowids BLOB is re-used when multiple fitlers on text columns,
// ex "name BETWEEN 'a' and 'b'""
rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowChunksName, "rowids", chunk_rowid, 0, &rowidsBlob);
if(rc != SQLITE_OK) {
return rc;
}
assert(sqlite3_blob_bytes(rowidsBlob) % sizeof(i64) == 0);
assert((sqlite3_blob_bytes(rowidsBlob) / sizeof(i64)) == size);
rowids = sqlite3_malloc(sqlite3_blob_bytes(rowidsBlob));
if(!rowids) {
sqlite3_blob_close(rowidsBlob);
return SQLITE_NOMEM;
}
rc = sqlite3_blob_read(rowidsBlob, rowids, sqlite3_blob_bytes(rowidsBlob), 0);
if(rc != SQLITE_OK) {
sqlite3_blob_close(rowidsBlob);
return rc;
}
sqlite3_blob_close(rowidsBlob);
switch(op) {
int nPrefix;
char * sPrefix;
char *sFull;
int nFull;
u8 * view;
case VEC0_METADATA_OPERATOR_EQ: {
for(int i = 0; i < size; i++) {
view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH];
nPrefix = ((int*) view)[0];
sPrefix = (char *) &view[4];
// for EQ the text lengths must match
if(nPrefix != nTarget) {
bitmap_set(b, i, 0);
continue;
}
int cmpPrefix = strncmp(sPrefix, sTarget, min(nPrefix, VEC0_METADATA_TEXT_VIEW_DATA_LENGTH));
// for short strings, use the prefix comparison direclty
if(nPrefix <= VEC0_METADATA_TEXT_VIEW_DATA_LENGTH) {
bitmap_set(b, i, cmpPrefix == 0);
continue;
}
// for EQ on longs strings, the prefix must match
if(cmpPrefix) {
bitmap_set(b, i, 0);
continue;
}
// consult the full string
rc = vec0_get_metadata_text_long_value(p, &stmt, metadata_idx, rowids[i], &nFull, &sFull);
if(rc != SQLITE_OK) {
goto done;
}
if(nPrefix != nFull) {
rc = SQLITE_ERROR;
goto done;
}
bitmap_set(b, i, strncmp(sFull, sTarget, nFull) == 0);
}
break;
}
case VEC0_METADATA_OPERATOR_NE: {
for(int i = 0; i < size; i++) {
view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH];
nPrefix = ((int*) view)[0];
sPrefix = (char *) &view[4];
// for NE if text lengths dont match, it never will
if(nPrefix != nTarget) {
bitmap_set(b, i, 1);
continue;
}
int cmpPrefix = strncmp(sPrefix, sTarget, min(nPrefix, VEC0_METADATA_TEXT_VIEW_DATA_LENGTH));
// for short strings, use the prefix comparison direclty
if(nPrefix <= VEC0_METADATA_TEXT_VIEW_DATA_LENGTH) {
bitmap_set(b, i, cmpPrefix != 0);
continue;
}
// for NE on longs strings, if prefixes dont match, then long string wont
if(cmpPrefix) {
bitmap_set(b, i, 1);
continue;
}
// consult the full string
rc = vec0_get_metadata_text_long_value(p, &stmt, metadata_idx, rowids[i], &nFull, &sFull);
if(rc != SQLITE_OK) {
goto done;
}
if(nPrefix != nFull) {
rc = SQLITE_ERROR;
goto done;
}
bitmap_set(b, i, strncmp(sFull, sTarget, nFull) != 0);
}
break;
}
case VEC0_METADATA_OPERATOR_GT: {
for(int i = 0; i < size; i++) {
view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH];
nPrefix = ((int*) view)[0];
sPrefix = (char *) &view[4];
int cmpPrefix = strncmp(sPrefix, sTarget, min(min(nPrefix, VEC0_METADATA_TEXT_VIEW_DATA_LENGTH), nTarget));
if(nPrefix < VEC0_METADATA_TEXT_VIEW_DATA_LENGTH) {
// if prefix match, check which is longer
if(cmpPrefix == 0) {
bitmap_set(b, i, nPrefix > nTarget);
}
else {
bitmap_set(b, i, cmpPrefix > 0);
}
continue;
}
// TODO(perf): may not need to compare full text in some cases
rc = vec0_get_metadata_text_long_value(p, &stmt, metadata_idx, rowids[i], &nFull, &sFull);
if(rc != SQLITE_OK) {
goto done;
}
if(nPrefix != nFull) {
rc = SQLITE_ERROR;
goto done;
}
bitmap_set(b, i, strncmp(sFull, sTarget, nFull) > 0);
}
break;
}
case VEC0_METADATA_OPERATOR_GE: {
for(int i = 0; i < size; i++) {
view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH];
nPrefix = ((int*) view)[0];
sPrefix = (char *) &view[4];
int cmpPrefix = strncmp(sPrefix, sTarget, min(min(nPrefix, VEC0_METADATA_TEXT_VIEW_DATA_LENGTH), nTarget));
if(nPrefix < VEC0_METADATA_TEXT_VIEW_DATA_LENGTH) {
// if prefix match, check which is longer
if(cmpPrefix == 0) {
bitmap_set(b, i, nPrefix >= nTarget);
}
else {
bitmap_set(b, i, cmpPrefix >= 0);
}
continue;
}
// TODO(perf): may not need to compare full text in some cases
rc = vec0_get_metadata_text_long_value(p, &stmt, metadata_idx, rowids[i], &nFull, &sFull);
if(rc != SQLITE_OK) {
goto done;
}
if(nPrefix != nFull) {
rc = SQLITE_ERROR;
goto done;
}
bitmap_set(b, i, strncmp(sFull, sTarget, nFull) >= 0);
}
break;
}
case VEC0_METADATA_OPERATOR_LE: {
for(int i = 0; i < size; i++) {
view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH];
nPrefix = ((int*) view)[0];
sPrefix = (char *) &view[4];
int cmpPrefix = strncmp(sPrefix, sTarget, min(min(nPrefix, VEC0_METADATA_TEXT_VIEW_DATA_LENGTH), nTarget));
if(nPrefix < VEC0_METADATA_TEXT_VIEW_DATA_LENGTH) {
// if prefix match, check which is longer
if(cmpPrefix == 0) {
bitmap_set(b, i, nPrefix <= nTarget);
}
else {
bitmap_set(b, i, cmpPrefix <= 0);
}
continue;
}
// TODO(perf): may not need to compare full text in some cases
rc = vec0_get_metadata_text_long_value(p, &stmt, metadata_idx, rowids[i], &nFull, &sFull);
if(rc != SQLITE_OK) {
goto done;
}
if(nPrefix != nFull) {
rc = SQLITE_ERROR;
goto done;
}
bitmap_set(b, i, strncmp(sFull, sTarget, nFull) <= 0);
}
break;
}
case VEC0_METADATA_OPERATOR_LT: {
for(int i = 0; i < size; i++) {
view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH];
nPrefix = ((int*) view)[0];
sPrefix = (char *) &view[4];
int cmpPrefix = strncmp(sPrefix, sTarget, min(min(nPrefix, VEC0_METADATA_TEXT_VIEW_DATA_LENGTH), nTarget));
if(nPrefix < VEC0_METADATA_TEXT_VIEW_DATA_LENGTH) {
// if prefix match, check which is longer
if(cmpPrefix == 0) {
bitmap_set(b, i, nPrefix < nTarget);
}
else {
bitmap_set(b, i, cmpPrefix < 0);
}
continue;
}
// TODO(perf): may not need to compare full text in some cases
rc = vec0_get_metadata_text_long_value(p, &stmt, metadata_idx, rowids[i], &nFull, &sFull);
if(rc != SQLITE_OK) {
goto done;
}
if(nPrefix != nFull) {
rc = SQLITE_ERROR;
goto done;
}
bitmap_set(b, i, strncmp(sFull, sTarget, nFull) < 0);
}
break;
}
case VEC0_METADATA_OPERATOR_IN: {
size_t metadataInIdx = -1;
for(size_t i = 0; i < aMetadataIn->length; i++) {
struct Vec0MetadataIn * metadataIn = &(((struct Vec0MetadataIn *) aMetadataIn->z)[i]);
if(metadataIn->argv_idx == argv_idx) {
metadataInIdx = i;
break;
}
}
if(metadataInIdx < 0) {
rc = SQLITE_ERROR;
goto done;
}
struct Vec0MetadataIn * metadataIn = &((struct Vec0MetadataIn *) aMetadataIn->z)[metadataInIdx];
struct Array * aTarget = &(metadataIn->array);
int nPrefix;
char * sPrefix;
char *sFull;
int nFull;
u8 * view;
for(int i = 0; i < size; i++) {
view = &((u8*) buffer)[i * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH];
nPrefix = ((int*) view)[0];
sPrefix = (char *) &view[4];
for(size_t target_idx = 0; target_idx < aTarget->length; target_idx++) {
struct Vec0MetadataInTextEntry * entry = &(((struct Vec0MetadataInTextEntry*)aTarget->z)[target_idx]);
if(entry->n != nPrefix) {
continue;
}
int cmpPrefix = strncmp(sPrefix, entry->zString, min(nPrefix, VEC0_METADATA_TEXT_VIEW_DATA_LENGTH));
if(nPrefix <= VEC0_METADATA_TEXT_VIEW_DATA_LENGTH) {
if(cmpPrefix == 0) {
bitmap_set(b, i, 1);
break;
}
continue;
}
if(cmpPrefix) {
continue;
}
rc = vec0_get_metadata_text_long_value(p, &stmt, metadata_idx, rowids[i], &nFull, &sFull);
if(rc != SQLITE_OK) {
goto done;
}
if(nPrefix != nFull) {
rc = SQLITE_ERROR;
goto done;
}
if(strncmp(sFull, entry->zString, nFull) == 0) {
bitmap_set(b, i, 1);
break;
}
}
}
break;
}
}
rc = SQLITE_OK;
done:
sqlite3_finalize(stmt);
sqlite3_free(rowids);
return rc;
}
/**
 * @brief Evaluate one metadata constraint against every slot of a chunk and
 * record, per slot, whether the stored value satisfies it.
 *
 * @param p vec0_vtab
 * @param metadata_idx index of the metadata column to perform constraints on
 * @param op the constraint operator
 * @param value sqlite3_value of the constraint's value
 * @param blob sqlite3_blob that is already opened on the metadata column's
 *             shadow chunk table; it is re-pointed at chunk_rowid here
 * @param chunk_rowid rowid of the chunk to calculate on
 * @param b pre-allocated and zero'd out bitmap to write results to. `IN`
 *          constraints only ever set bits, they never clear them.
 * @param size size of the chunk
 * @param aMetadataIn array of `struct Vec0MetadataIn`, consulted for `IN`
 *                    constraints
 * @param argv_idx argv index of this constraint, used to find its entry in
 *                 aMetadataIn
 * @return int SQLITE_OK on success, error code otherwise. SQLITE_ERROR is
 *         returned when the blob's size does not match the column kind and
 *         chunk size, or when an `IN` constraint has no aMetadataIn entry.
 */
int vec0_set_metadata_filter_bitmap(
  vec0_vtab *p,
  int metadata_idx,
  vec0_metadata_operator op,
  sqlite3_value * value,
  sqlite3_blob * blob,
  i64 chunk_rowid,
  u8* b,
  int size,
  struct Array * aMetadataIn, int argv_idx) {
  // TODO: shouldn't this skip in-valid entries from the chunk's validity bitmap?
  int rc = sqlite3_blob_reopen(blob, chunk_rowid);
  if(rc != SQLITE_OK) {
    return rc;
  }

  const vec0_metadata_column_kind kind = p->metadata_columns[metadata_idx].kind;
  const int blobSize = sqlite3_blob_bytes(blob);

  // the blob must hold exactly one entry per chunk slot for this column kind
  int sizeOk;
  if(kind == VEC0_METADATA_COLUMN_KIND_BOOLEAN) {
    sizeOk = blobSize == size / CHAR_BIT;
  } else if(kind == VEC0_METADATA_COLUMN_KIND_INTEGER) {
    sizeOk = blobSize == size * sizeof(i64);
  } else if(kind == VEC0_METADATA_COLUMN_KIND_FLOAT) {
    sizeOk = blobSize == size * sizeof(double);
  } else if(kind == VEC0_METADATA_COLUMN_KIND_TEXT) {
    sizeOk = blobSize == size * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH;
  } else {
    sizeOk = 0;
  }
  if(!sizeOk) {
    return SQLITE_ERROR;
  }

  void * buffer = sqlite3_malloc(blobSize);
  if(!buffer) {
    return SQLITE_NOMEM;
  }
  rc = sqlite3_blob_read(blob, buffer, blobSize, 0);
  if(rc != SQLITE_OK) {
    goto done;
  }

  switch(kind) {
    case VEC0_METADATA_COLUMN_KIND_BOOLEAN: {
      // booleans are stored as a bitmap: the result is either that bitmap
      // as-is, or its inverse
      const int target = sqlite3_value_int(value);
      const int wantSetBits = (target && op == VEC0_METADATA_OPERATOR_EQ)
                           || (!target && op == VEC0_METADATA_OPERATOR_NE);
      for(int slot = 0; slot < size; slot++) {
        const int stored = bitmap_get((u8*) buffer, slot);
        bitmap_set(b, slot, wantSetBits ? stored : !stored);
      }
      break;
    }

    case VEC0_METADATA_COLUMN_KIND_INTEGER: {
      const i64 * values = (const i64 *) buffer;
      const i64 target = sqlite3_value_int64(value);
      struct Array * aTarget = NULL;

      if(op == VEC0_METADATA_OPERATOR_IN) {
        // locate the copied `(...)` values for this constraint
        for(size_t j = 0; j < aMetadataIn->length; j++) {
          struct Vec0MetadataIn * candidate = &((struct Vec0MetadataIn *) aMetadataIn->z)[j];
          if(candidate->argv_idx == argv_idx) {
            aTarget = &(candidate->array);
            break;
          }
        }
        if(!aTarget) {
          rc = SQLITE_ERROR;
          goto done;
        }
      }

      for(int slot = 0; slot < size; slot++) {
        int matches;
        switch(op) {
          case VEC0_METADATA_OPERATOR_EQ: matches = values[slot] == target; break;
          case VEC0_METADATA_OPERATOR_GT: matches = values[slot] >  target; break;
          case VEC0_METADATA_OPERATOR_LE: matches = values[slot] <= target; break;
          case VEC0_METADATA_OPERATOR_LT: matches = values[slot] <  target; break;
          case VEC0_METADATA_OPERATOR_GE: matches = values[slot] >= target; break;
          case VEC0_METADATA_OPERATOR_NE: matches = values[slot] != target; break;
          case VEC0_METADATA_OPERATOR_IN: {
            matches = 0;
            for(size_t t = 0; t < aTarget->length; t++) {
              if(((i64*) aTarget->z)[t] == values[slot]) {
                matches = 1;
                break;
              }
            }
            if(!matches) {
              // IN never clears a bit, it only sets bits on a match
              continue;
            }
            break;
          }
          default: continue;
        }
        bitmap_set(b, slot, matches);
      }
      break;
    }

    case VEC0_METADATA_COLUMN_KIND_FLOAT: {
      const double * values = (const double *) buffer;
      const double target = sqlite3_value_double(value);

      for(int slot = 0; slot < size; slot++) {
        int matches;
        switch(op) {
          case VEC0_METADATA_OPERATOR_EQ: matches = values[slot] == target; break;
          case VEC0_METADATA_OPERATOR_GT: matches = values[slot] >  target; break;
          case VEC0_METADATA_OPERATOR_LE: matches = values[slot] <= target; break;
          case VEC0_METADATA_OPERATOR_LT: matches = values[slot] <  target; break;
          case VEC0_METADATA_OPERATOR_GE: matches = values[slot] >= target; break;
          case VEC0_METADATA_OPERATOR_NE: matches = values[slot] != target; break;
          // IN should never be reached for FLOAT columns; like any other
          // unhandled operator it leaves the bitmap untouched
          case VEC0_METADATA_OPERATOR_IN: continue;
          default: continue;
        }
        bitmap_set(b, slot, matches);
      }
      break;
    }

    case VEC0_METADATA_COLUMN_KIND_TEXT: {
      rc = vec0_metadata_filter_text(p, value, buffer, size, op, b, metadata_idx, chunk_rowid, aMetadataIn, argv_idx);
      break;
    }
  }

  done:
    sqlite3_free(buffer);
    return rc;
}
/**
 * @brief Run a brute-force KNN scan over every chunk yielded by `stmtChunks`.
 *
 * For each chunk: read its vectors, build a bitmap of eligible slots
 * (validity, optional `rowid in (...)` filter, metadata constraints), compute
 * the distance from every eligible slot to the query vector, apply any
 * constraints on the distance column, take the chunk's own top
 * min(k, chunk_size) entries and merge them into the running top-k.
 *
 * @param p                  vec0_vtab
 * @param stmtChunks         statement yielding (chunk_id, validity, rowids) rows
 * @param vector_column      definition of the vector column being searched
 * @param vectorColumnIdx    index of that vector column
 * @param arrayRowidsIn      optional array of i64 rowids to restrict the
 *                           search to, looked up with bsearch() (so assumed
 *                           sorted for `_cmp`); NULL for no restriction
 * @param aMetadataIn        array of `struct Vec0MetadataIn` for `IN`
 *                           metadata constraints
 * @param idxStr             the xBestIndex/xFilter idxStr
 * @param argc               number of argv values from xFilter
 * @param argv               array of sqlite3_value from xFilter
 * @param queryVector        the query vector, in the column's element type
 * @param k                  number of nearest neighbors requested
 * @param out_topk_rowids    output: rowids of the results, ascending distance
 * @param out_topk_distances output: distances parallel to out_topk_rowids
 * @param out_used           output: number of results actually found (<= k)
 * @return int SQLITE_OK on success, error code otherwise. On success the two
 *         output arrays are owned by the caller and must be released with
 *         sqlite3_free(); on failure the outputs are not written.
 */
int vec0Filter_knn_chunks_iter(vec0_vtab *p, sqlite3_stmt *stmtChunks,
                               struct VectorColumnDefinition *vector_column,
                               int vectorColumnIdx, struct Array *arrayRowidsIn,
                               struct Array * aMetadataIn,
                               const char * idxStr, int argc, sqlite3_value ** argv,
                               void *queryVector, i64 k, i64 **out_topk_rowids,
                               f32 **out_topk_distances, i64 *out_used) {
  // for each chunk, get top min(k, chunk_size) rowid + distances to query vec.
  // then reconcile all topk_chunks for a true top k.
  // output only rowids + distances for now

  int rc = SQLITE_OK;
  sqlite3_blob *blobVectors = NULL;

  // Zeroed before the first `goto cleanup`: the cleanup code closes every
  // entry, so none of them may ever be uninitialized.
  sqlite3_blob * metadataBlobs[VEC0_MAX_METADATA_COLUMNS];
  memset(metadataBlobs, 0, sizeof(sqlite3_blob*) * VEC0_MAX_METADATA_COLUMNS);

  void *baseVectors = NULL;       // memory: chunk_size * vector byte size
  // OWNED BY CALLER ON SUCCESS
  i64 *topk_rowids = NULL;        // memory: k * sizeof(i64)
  // OWNED BY CALLER ON SUCCESS
  f32 *topk_distances = NULL;     // memory: k * sizeof(f32)
  i64 *tmp_topk_rowids = NULL;    // memory: k * sizeof(i64)
  f32 *tmp_topk_distances = NULL; // memory: k * sizeof(f32)
  f32 *chunk_distances = NULL;    // memory: chunk_size * sizeof(f32)
  u8 *b = NULL;                   // memory: chunk_size / 8
  u8 *bTaken = NULL;              // memory: chunk_size / 8
  i32 *chunk_topk_idxs = NULL;    // memory: k * sizeof(i32)
  u8 *bmRowids = NULL;            // memory: chunk_size / 8
  u8 *bmMetadata = NULL;          // memory: chunk_size / 8

  i64 k_used = 0;
  i64 baseVectorsSize;
  int hasMetadataFilters = 0;
  int hasDistanceConstraints = 0;
  int idxStrLength = strlen(idxStr);
  // "1" refers to the initial query plan char, 4 is the number of chars per "element"
  int numValueEntries = (idxStrLength-1) / 4;
  assert(numValueEntries == argc);

  topk_rowids = sqlite3_malloc(k * sizeof(i64));
  if (!topk_rowids) {
    rc = SQLITE_NOMEM;
    goto cleanup;
  }
  memset(topk_rowids, 0, k * sizeof(i64));

  topk_distances = sqlite3_malloc(k * sizeof(f32));
  if (!topk_distances) {
    rc = SQLITE_NOMEM;
    goto cleanup;
  }
  memset(topk_distances, 0, k * sizeof(f32));

  tmp_topk_rowids = sqlite3_malloc(k * sizeof(i64));
  if (!tmp_topk_rowids) {
    rc = SQLITE_NOMEM;
    goto cleanup;
  }
  memset(tmp_topk_rowids, 0, k * sizeof(i64));

  tmp_topk_distances = sqlite3_malloc(k * sizeof(f32));
  if (!tmp_topk_distances) {
    rc = SQLITE_NOMEM;
    goto cleanup;
  }
  memset(tmp_topk_distances, 0, k * sizeof(f32));

  baseVectorsSize = p->chunk_size * vector_column_byte_size(*vector_column);
  baseVectors = sqlite3_malloc(baseVectorsSize);
  if (!baseVectors) {
    rc = SQLITE_NOMEM;
    goto cleanup;
  }

  chunk_distances = sqlite3_malloc(p->chunk_size * sizeof(f32));
  if (!chunk_distances) {
    rc = SQLITE_NOMEM;
    goto cleanup;
  }

  b = bitmap_new(p->chunk_size);
  if (!b) {
    rc = SQLITE_NOMEM;
    goto cleanup;
  }

  bTaken = bitmap_new(p->chunk_size);
  if (!bTaken) {
    rc = SQLITE_NOMEM;
    goto cleanup;
  }

  chunk_topk_idxs = sqlite3_malloc(k * sizeof(i32));
  if (!chunk_topk_idxs) {
    rc = SQLITE_NOMEM;
    goto cleanup;
  }

  bmRowids = arrayRowidsIn ? bitmap_new(p->chunk_size) : NULL;
  if (arrayRowidsIn && !bmRowids) {
    rc = SQLITE_NOMEM;
    goto cleanup;
  }

  bmMetadata = bitmap_new(p->chunk_size);
  if(!bmMetadata) {
    rc = SQLITE_NOMEM;
    goto cleanup;
  }

  // figure out up-front which kinds of per-chunk filtering are needed
  for(int i = 0; i < argc; i++) {
    int idx = 1 + (i * 4);
    char kind = idxStr[idx + 0];
    if(kind == VEC0_IDXSTR_KIND_METADATA_CONSTRAINT) {
      hasMetadataFilters = 1;
    }
    else if(kind == VEC0_IDXSTR_KIND_KNN_DISTANCE_CONSTRAINT) {
      hasDistanceConstraints = 1;
    }
  }

  while (true) {
    rc = sqlite3_step(stmtChunks);
    if (rc == SQLITE_DONE) {
      break;
    }
    if (rc != SQLITE_ROW) {
      vtab_set_error(&p->base, "chunks iter error");
      rc = SQLITE_ERROR;
      goto cleanup;
    }
    memset(chunk_distances, 0, p->chunk_size * sizeof(f32));
    memset(chunk_topk_idxs, 0, k * sizeof(i32));
    bitmap_clear(b, p->chunk_size);

    i64 chunk_id = sqlite3_column_int64(stmtChunks, 0);
    unsigned char *chunkValidity =
        (unsigned char *)sqlite3_column_blob(stmtChunks, 1);
    i64 validitySize = sqlite3_column_bytes(stmtChunks, 1);
    if (validitySize != p->chunk_size / CHAR_BIT) {
      // IMP: V05271_22109
      // explicit casts: every %lld argument must really be 64 bits wide
      vtab_set_error(
          &p->base,
          "chunk validity size doesn't match - expected %lld, found %lld",
          (i64)(p->chunk_size / CHAR_BIT), validitySize);
      rc = SQLITE_ERROR;
      goto cleanup;
    }

    i64 *chunkRowids = (i64 *)sqlite3_column_blob(stmtChunks, 2);
    i64 rowidsSize = sqlite3_column_bytes(stmtChunks, 2);
    if (rowidsSize != p->chunk_size * sizeof(i64)) {
      // IMP: V02796_19635
      vtab_set_error(
          &p->base,
          "chunk rowids size doesn't match - expected %lld, found %lld",
          (i64)(p->chunk_size * sizeof(i64)), rowidsSize);
      rc = SQLITE_ERROR;
      goto cleanup;
    }

    // open the vector chunk blob for the current chunk
    rc = sqlite3_blob_open(p->db, p->schemaName,
                           p->shadowVectorChunksNames[vectorColumnIdx],
                           "vectors", chunk_id, 0, &blobVectors);
    if (rc != SQLITE_OK) {
      vtab_set_error(&p->base, "could not open vectors blob for chunk %lld",
                     chunk_id);
      rc = SQLITE_ERROR;
      goto cleanup;
    }

    i64 currentBaseVectorsSize = sqlite3_blob_bytes(blobVectors);
    i64 expectedBaseVectorsSize =
        p->chunk_size * vector_column_byte_size(*vector_column);
    if (currentBaseVectorsSize != expectedBaseVectorsSize) {
      // IMP: V16465_00535
      vtab_set_error(
          &p->base,
          "vectors blob size doesn't match - expected %lld, found %lld",
          expectedBaseVectorsSize, currentBaseVectorsSize);
      rc = SQLITE_ERROR;
      goto cleanup;
    }
    rc = sqlite3_blob_read(blobVectors, baseVectors, currentBaseVectorsSize, 0);
    if (rc != SQLITE_OK) {
      vtab_set_error(&p->base, "vectors blob read error for %lld", chunk_id);
      rc = SQLITE_ERROR;
      goto cleanup;
    }

    // `b` is the bitmap of slots still eligible: start from the chunk's
    // validity bitmap and AND every filter into it.
    bitmap_copy(b, chunkValidity, p->chunk_size);

    if (arrayRowidsIn) {
      bitmap_clear(bmRowids, p->chunk_size);
      for (int i = 0; i < p->chunk_size; i++) {
        if (!bitmap_get(chunkValidity, i)) {
          continue;
        }
        i64 rowid = chunkRowids[i];
        void *in = bsearch(&rowid, arrayRowidsIn->z, arrayRowidsIn->length,
                           sizeof(i64), _cmp);
        bitmap_set(bmRowids, i, in ? 1 : 0);
      }
      bitmap_and_inplace(b, bmRowids, p->chunk_size);
    }

    if(hasMetadataFilters) {
      for(int i = 0; i < argc; i++) {
        int idx = 1 + (i * 4);
        char kind = idxStr[idx + 0];
        if(kind != VEC0_IDXSTR_KIND_METADATA_CONSTRAINT) {
          continue;
        }
        int metadata_idx = idxStr[idx + 1] - 'A';
        int operator = idxStr[idx + 2];

        // Opened once per metadata column; vec0_set_metadata_filter_bitmap()
        // re-points the handle at each chunk.
        if(!metadataBlobs[metadata_idx]) {
          rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowMetadataChunksNames[metadata_idx], "data", chunk_id, 0, &metadataBlobs[metadata_idx]);
          if(rc != SQLITE_OK) {
            vtab_set_error(&p->base, "Could not open metadata blob");
            goto cleanup;
          }
        }

        bitmap_clear(bmMetadata, p->chunk_size);
        rc = vec0_set_metadata_filter_bitmap(p, metadata_idx, operator, argv[i], metadataBlobs[metadata_idx], chunk_id, bmMetadata, p->chunk_size, aMetadataIn, i);
        if(rc != SQLITE_OK) {
          vtab_set_error(&p->base, "Could not filter metadata fields");
          goto cleanup;
        }
        bitmap_and_inplace(b, bmMetadata, p->chunk_size);
      }
    }

    // distance from the query vector to every eligible slot
    for (int i = 0; i < p->chunk_size; i++) {
      if (!bitmap_get(b, i)) {
        continue;
      }

      f32 result = 0;
      switch (vector_column->element_type) {
      case SQLITE_VEC_ELEMENT_TYPE_FLOAT32: {
        const f32 *base_i =
            ((f32 *)baseVectors) + (i * vector_column->dimensions);
        switch (vector_column->distance_metric) {
        case VEC0_DISTANCE_METRIC_L2: {
          result = distance_l2_sqr_float(base_i, (f32 *)queryVector,
                                         &vector_column->dimensions);
          break;
        }
        case VEC0_DISTANCE_METRIC_L1: {
          result = distance_l1_f32(base_i, (f32 *)queryVector,
                                   &vector_column->dimensions);
          break;
        }
        case VEC0_DISTANCE_METRIC_COSINE: {
          result = distance_cosine_float(base_i, (f32 *)queryVector,
                                         &vector_column->dimensions);
          break;
        }
        }
        break;
      }
      case SQLITE_VEC_ELEMENT_TYPE_INT8: {
        const i8 *base_i =
            ((i8 *)baseVectors) + (i * vector_column->dimensions);
        switch (vector_column->distance_metric) {
        case VEC0_DISTANCE_METRIC_L2: {
          result = distance_l2_sqr_int8(base_i, (i8 *)queryVector,
                                        &vector_column->dimensions);
          break;
        }
        case VEC0_DISTANCE_METRIC_L1: {
          result = distance_l1_int8(base_i, (i8 *)queryVector,
                                    &vector_column->dimensions);
          break;
        }
        case VEC0_DISTANCE_METRIC_COSINE: {
          result = distance_cosine_int8(base_i, (i8 *)queryVector,
                                        &vector_column->dimensions);
          break;
        }
        }
        break;
      }
      case SQLITE_VEC_ELEMENT_TYPE_BIT: {
        const u8 *base_i =
            ((u8 *)baseVectors) + (i * (vector_column->dimensions / CHAR_BIT));
        result = distance_hamming(base_i, (u8 *)queryVector,
                                  &vector_column->dimensions);
        break;
      }
      }

      chunk_distances[i] = result;
    }

    if(hasDistanceConstraints) {
      for(int i = 0; i < argc; i++) {
        int idx = 1 + (i * 4);
        char kind = idxStr[idx + 0];
        if(kind != VEC0_IDXSTR_KIND_KNN_DISTANCE_CONSTRAINT) {
          continue;
        }
        // Only read the value once it is known to be a distance constraint:
        // asking for a double from e.g. the query vector argument is at best
        // wasted work.
        // TODO casts f64 to f32, is that a problem?
        f32 target = (f32) sqlite3_value_double(argv[i]);
        vec0_distance_constraint_operator op = idxStr[idx + 1];

        for(int slot = 0; slot < p->chunk_size; slot++) {
          if(!bitmap_get(b, slot)) {
            continue;
          }
          int keep = 1;
          switch(op) {
            case VEC0_DISTANCE_CONSTRAINT_GE: keep = chunk_distances[slot] >= target; break;
            case VEC0_DISTANCE_CONSTRAINT_GT: keep = chunk_distances[slot] >  target; break;
            case VEC0_DISTANCE_CONSTRAINT_LE: keep = chunk_distances[slot] <= target; break;
            case VEC0_DISTANCE_CONSTRAINT_LT: keep = chunk_distances[slot] <  target; break;
          }
          if(!keep) {
            bitmap_set(b, slot, 0);
          }
        }
      }
    }

    // this chunk's own top entries, then fold them into the running top-k
    int used1;
    min_idx(chunk_distances, p->chunk_size, b, chunk_topk_idxs,
            min(k, p->chunk_size), bTaken, &used1);

    i64 used;
    merge_sorted_lists(topk_distances, topk_rowids, k_used, chunk_distances,
                       chunkRowids, chunk_topk_idxs,
                       min(min(k, p->chunk_size), used1), tmp_topk_distances,
                       tmp_topk_rowids, k, &used);

    for (i64 i = 0; i < used; i++) {
      topk_rowids[i] = tmp_topk_rowids[i];
      topk_distances[i] = tmp_topk_distances[i];
    }
    k_used = used;

    // blobVectors is always opened with read-only permissions, so this never
    // fails.
    sqlite3_blob_close(blobVectors);
    blobVectors = NULL;
  }

  *out_topk_rowids = topk_rowids;
  *out_topk_distances = topk_distances;
  *out_used = k_used;
  rc = SQLITE_OK;

cleanup:
  if (rc != SQLITE_OK) {
    // on success these two are handed over to the caller instead
    sqlite3_free(topk_rowids);
    sqlite3_free(topk_distances);
  }
  sqlite3_free(chunk_topk_idxs);
  sqlite3_free(tmp_topk_rowids);
  sqlite3_free(tmp_topk_distances);
  sqlite3_free(b);
  sqlite3_free(bTaken);
  sqlite3_free(bmRowids);
  sqlite3_free(baseVectors);
  sqlite3_free(chunk_distances);
  sqlite3_free(bmMetadata);
  for(int i = 0; i < VEC0_MAX_METADATA_COLUMNS; i++) {
    sqlite3_blob_close(metadataBlobs[i]);
  }
  // blobVectors is always opened with read-only permissions, so this never
  // fails.
  sqlite3_blob_close(blobVectors);
  return rc;
}
/**
 * @brief Release everything owned by one `xxx in (...)` metadata constraint
 * entry: the strings duplicated for a TEXT metadata column, then the backing
 * array itself.
 *
 * @param p: virtual table, used to look up the kind of the metadata column
 * @param item: entry to release. The struct itself is not freed.
 */
static void vec0Filter_knn_metadata_in_free(vec0_vtab *p,
                                            struct Vec0MetadataIn *item) {
  if (p->metadata_columns[item->metadata_idx].kind ==
      VEC0_METADATA_COLUMN_KIND_TEXT) {
    struct Vec0MetadataInTextEntry *entries =
        (struct Vec0MetadataInTextEntry *)item->array.z;
    for (size_t j = 0; j < item->array.length; j++) {
      sqlite3_free(entries[j].zString);
    }
  }
  array_cleanup(&item->array);
}

/**
 * @brief Prepares a cursor for a KNN query plan.
 *
 * Validates the query vector and the `k` argument, collects any
 * `rowid in (...)` and metadata `xxx in (...)` constraint values, runs the
 * chunk scan and stores the resulting rowids/distances on the cursor.
 *
 * @param pCur: cursor that receives the `knn_data` on success
 * @param p: virtual table
 * @param idxNum: used as the index of the vector column being queried
 * @param idxStr: index string. Every argv entry `i` is described by the 4
 *  characters starting at `idxStr[1 + i*4]`; the first of those is the kind.
 * @param argc: number of entries in argv
 * @param argv: constraint values, in the order described by idxStr
 * @return int SQLITE_OK on success, error code on failure. On failure nothing
 *  is attached to the cursor.
 */
int vec0Filter_knn(vec0_cursor *pCur, vec0_vtab *p, int idxNum,
                   const char *idxStr, int argc, sqlite3_value **argv) {
  assert(argc == (strlen(idxStr)-1) / 4);
  int rc;
  struct vec0_query_knn_data *knn_data;
  int vectorColumnIdx = idxNum;
  struct VectorColumnDefinition *vector_column =
      &p->vector_columns[vectorColumnIdx];
  struct Array *arrayRowidsIn = NULL;
  sqlite3_stmt *stmtChunks = NULL;
  // Initialized so the cleanup section never sees an indeterminate pointer.
  void *queryVector = NULL;
  size_t dimensions;
  enum VectorElementType elementType;
  vector_cleanup queryVectorCleanup = vector_cleanup_noop;
  char *pzError = NULL;

  // A metadata `in (...)` entry that is being built but is not yet owned by
  // aMetadataIn. Tracked at function scope so error paths can release it.
  struct Vec0MetadataIn pendingItem;
  int hasPendingItem = 0;

  knn_data = sqlite3_malloc(sizeof(*knn_data));
  if (!knn_data) {
    return SQLITE_NOMEM;
  }
  memset(knn_data, 0, sizeof(*knn_data));

  // array of `struct Vec0MetadataIn`, IF there are any `xxx in (...)` metadata constraints
  struct Array * aMetadataIn = NULL;

  int query_idx =-1;
  int k_idx = -1;
  int rowid_in_idx = -1;
  for(int i = 0; i < argc; i++) {
    if(idxStr[1 + (i*4)] == VEC0_IDXSTR_KIND_KNN_MATCH) {
      query_idx = i;
    }
    if(idxStr[1 + (i*4)] == VEC0_IDXSTR_KIND_KNN_K) {
      k_idx = i;
    }
    if(idxStr[1 + (i*4)] == VEC0_IDXSTR_KIND_KNN_ROWID_IN) {
      rowid_in_idx = i;
    }
  }
  assert(query_idx >= 0);
  assert(k_idx >= 0);

  // make sure the query vector matches the vector column (type dimensions etc.)
  rc = vector_from_value(argv[query_idx], &queryVector, &dimensions, &elementType,
                         &queryVectorCleanup, &pzError);

  if (rc != SQLITE_OK) {
    vtab_set_error(&p->base,
                   "Query vector on the \"%.*s\" column is invalid: %z",
                   vector_column->name_length, vector_column->name, pzError);
    rc = SQLITE_ERROR;
    goto cleanup;
  }
  if (elementType != vector_column->element_type) {
    vtab_set_error(
        &p->base,
        "Query vector for the \"%.*s\" column is expected to be of type "
        "%s, but a %s vector was provided.",
        vector_column->name_length, vector_column->name,
        vector_subtype_name(vector_column->element_type),
        vector_subtype_name(elementType));
    rc = SQLITE_ERROR;
    goto cleanup;
  }
  if (dimensions != vector_column->dimensions) {
    // Both values are widened to i64 so they match the %lld specifiers;
    // `dimensions` is a size_t and must not be passed for a plain %d.
    vtab_set_error(
        &p->base,
        "Dimension mismatch for query vector for the \"%.*s\" column. "
        "Expected %lld dimensions but received %lld.",
        vector_column->name_length, vector_column->name,
        (i64)vector_column->dimensions, (i64)dimensions);
    rc = SQLITE_ERROR;
    goto cleanup;
  }

  i64 k = sqlite3_value_int64(argv[k_idx]);
  if (k < 0) {
    vtab_set_error(
        &p->base, "k value in knn queries must be greater than or equal to 0.");
    rc = SQLITE_ERROR;
    goto cleanup;
  }
#define SQLITE_VEC_VEC0_K_MAX 4096
  if (k > SQLITE_VEC_VEC0_K_MAX) {
    // The limit is an int literal, so widen it to match %lld.
    vtab_set_error(
        &p->base,
        "k value in knn query too large, provided %lld and the limit is %lld",
        k, (i64)SQLITE_VEC_VEC0_K_MAX);
    rc = SQLITE_ERROR;
    goto cleanup;
  }

  // k == 0: nothing to scan, hand back an empty result set.
  if (k == 0) {
    knn_data->k = 0;
    pCur->knn_data = knn_data;
    pCur->query_plan = VEC0_QUERY_PLAN_KNN;
    rc = SQLITE_OK;
    goto cleanup;
  }

// handle when a `rowid in (...)` operation was provided
// Array of all the rowids that appear in any `rowid in (...)` constraint.
// NULL if none were provided, which means a "full" scan.
#if COMPILER_SUPPORTS_VTAB_IN
  if (rowid_in_idx >= 0) {
    sqlite3_value *item;
    // NOTE: deliberately uses the function-level `rc`, so that every
    // `goto cleanup` below reports the failure to the caller.
    arrayRowidsIn = sqlite3_malloc(sizeof(*arrayRowidsIn));
    if (!arrayRowidsIn) {
      rc = SQLITE_NOMEM;
      goto cleanup;
    }
    memset(arrayRowidsIn, 0, sizeof(*arrayRowidsIn));

    rc = array_init(arrayRowidsIn, sizeof(i64), 32);
    if (rc != SQLITE_OK) {
      goto cleanup;
    }
    for (rc = sqlite3_vtab_in_first(argv[rowid_in_idx], &item); rc == SQLITE_OK && item;
         rc = sqlite3_vtab_in_next(argv[rowid_in_idx], &item)) {
      i64 rowid;
      if (p->pkIsText) {
        rc = vec0_rowid_from_id(p, item, &rowid);
        if (rc != SQLITE_OK) {
          goto cleanup;
        }
      } else {
        rowid = sqlite3_value_int64(item);
      }
      rc = array_append(arrayRowidsIn, &rowid);
      if (rc != SQLITE_OK) {
        goto cleanup;
      }
    }
    if (rc != SQLITE_DONE) {
      vtab_set_error(&p->base, "error processing rowid in (...) array");
      if (rc == SQLITE_OK) {
        // never report success after an iteration that did not finish
        rc = SQLITE_ERROR;
      }
      goto cleanup;
    }
    qsort(arrayRowidsIn->z, arrayRowidsIn->length, arrayRowidsIn->element_size,
          _cmp);
  }
#endif

  #if COMPILER_SUPPORTS_VTAB_IN
  for(int i = 0; i < argc; i++) {
    if(!(idxStr[1 + (i*4)] == VEC0_IDXSTR_KIND_METADATA_CONSTRAINT && idxStr[1 + (i*4) + 2] == VEC0_METADATA_OPERATOR_IN)) {
      continue;
    }
    int metadata_idx = idxStr[1 + (i*4) + 1] - 'A';
    if(!aMetadataIn) {
      aMetadataIn = sqlite3_malloc(sizeof(*aMetadataIn));
      if(!aMetadataIn) {
        rc = SQLITE_NOMEM;
        goto cleanup;
      }
      memset(aMetadataIn, 0, sizeof(*aMetadataIn));
      rc = array_init(aMetadataIn, sizeof(struct Vec0MetadataIn), 8);
      if(rc != SQLITE_OK) {
        goto cleanup;
      }
    }

    memset(&pendingItem, 0, sizeof(pendingItem));
    pendingItem.metadata_idx = metadata_idx;
    pendingItem.argv_idx = i;
    // From here until it is appended to aMetadataIn, the cleanup section is
    // responsible for releasing pendingItem.
    hasPendingItem = 1;

    switch(p->metadata_columns[metadata_idx].kind) {
      case VEC0_METADATA_COLUMN_KIND_INTEGER: {
        rc = array_init(&pendingItem.array, sizeof(i64), 16);
        if(rc != SQLITE_OK) {
          goto cleanup;
        }
        sqlite3_value *entry;
        for (rc = sqlite3_vtab_in_first(argv[i], &entry); rc == SQLITE_OK && entry; rc = sqlite3_vtab_in_next(argv[i], &entry)) {
          i64 v = sqlite3_value_int64(entry);
          rc = array_append(&pendingItem.array, &v);
          if (rc != SQLITE_OK) {
            goto cleanup;
          }
        }

        if (rc != SQLITE_DONE) {
          vtab_set_error(&p->base, "Error fetching next value in `x in (...)` integer expression");
          if (rc == SQLITE_OK) {
            rc = SQLITE_ERROR;
          }
          goto cleanup;
        }

        break;
      }
      case VEC0_METADATA_COLUMN_KIND_TEXT: {
        rc = array_init(&pendingItem.array, sizeof(struct Vec0MetadataInTextEntry), 16);
        if(rc != SQLITE_OK) {
          goto cleanup;
        }
        sqlite3_value *entry;
        for (rc = sqlite3_vtab_in_first(argv[i], &entry); rc == SQLITE_OK && entry; rc = sqlite3_vtab_in_next(argv[i], &entry)) {
          const char * s = (const char *) sqlite3_value_text(entry);
          int n = sqlite3_value_bytes(entry);

          struct Vec0MetadataInTextEntry textEntry;
          textEntry.zString = sqlite3_mprintf("%.*s", n, s);
          if(!textEntry.zString) {
            rc = SQLITE_NOMEM;
            goto cleanup;
          }
          textEntry.n = n;
          rc = array_append(&pendingItem.array, &textEntry);
          if (rc != SQLITE_OK) {
            // the string never made it into the array, so free it here
            sqlite3_free(textEntry.zString);
            goto cleanup;
          }
        }

        if (rc != SQLITE_DONE) {
          vtab_set_error(&p->base, "Error fetching next value in `x in (...)` text expression");
          if (rc == SQLITE_OK) {
            rc = SQLITE_ERROR;
          }
          goto cleanup;
        }

        break;
      }
      default: {
        vtab_set_error(&p->base, "Internal sqlite-vec error");
        rc = SQLITE_ERROR;
        goto cleanup;
      }
    }

    rc = array_append(aMetadataIn, &pendingItem);
    if(rc != SQLITE_OK) {
      goto cleanup;
    }
    // ownership moved into aMetadataIn
    hasPendingItem = 0;
  }
  #endif

  rc = vec0_chunks_iter(p, idxStr, argc, argv, &stmtChunks);
  if (rc != SQLITE_OK) {
    // IMP: V06942_23781
    vtab_set_error(&p->base, "Error preparing stmtChunk: %s",
                   sqlite3_errmsg(p->db));
    goto cleanup;
  }

  i64 *topk_rowids = NULL;
  f32 *topk_distances = NULL;
  i64 k_used = 0;
  rc = vec0Filter_knn_chunks_iter(p, stmtChunks, vector_column, vectorColumnIdx,
                                  arrayRowidsIn, aMetadataIn, idxStr, argc, argv, queryVector, k, &topk_rowids,
                                  &topk_distances, &k_used);
  if (rc != SQLITE_OK) {
    goto cleanup;
  }

  knn_data->current_idx = 0;
  knn_data->k = k;
  knn_data->rowids = topk_rowids;
  knn_data->distances = topk_distances;
  knn_data->k_used = k_used;

  pCur->knn_data = knn_data;
  pCur->query_plan = VEC0_QUERY_PLAN_KNN;
  rc = SQLITE_OK;

cleanup:
  sqlite3_finalize(stmtChunks);
  array_cleanup(arrayRowidsIn);
  sqlite3_free(arrayRowidsIn);
  queryVectorCleanup(queryVector);
  if (hasPendingItem) {
    vec0Filter_knn_metadata_in_free(p, &pendingItem);
  }
  if(aMetadataIn) {
    for(size_t i = 0; i < aMetadataIn->length; i++) {
      vec0Filter_knn_metadata_in_free(
          p, &((struct Vec0MetadataIn *) aMetadataIn->z)[i]);
    }
    array_cleanup(aMetadataIn);
  }

  sqlite3_free(aMetadataIn);

  // knn_data is only handed to the cursor on success.
  if (rc != SQLITE_OK) {
    sqlite3_free(knn_data);
  }
  return rc;
}
/**
 * @brief Prepares a cursor for a full table scan.
 *
 * Prepares a statement selecting every rowid of the rowids shadow table
 * ordered by chunk position, steps it once, and attaches the scan state to
 * the cursor.
 *
 * @param p: virtual table
 * @param pCur: cursor that receives the `fullscan_data` on success
 * @return int SQLITE_OK on success, error code on failure
 */
int vec0Filter_fullscan(vec0_vtab *p, vec0_cursor *pCur) {
  struct vec0_query_fullscan_data *scan = sqlite3_malloc(sizeof(*scan));
  if (scan == NULL) {
    return SQLITE_NOMEM;
  }
  memset(scan, 0, sizeof(*scan));

  // Stays SQLITE_NOMEM if the SQL text itself cannot be allocated.
  int rc = SQLITE_NOMEM;
  char *zQuery = sqlite3_mprintf(" SELECT rowid "
                                 " FROM " VEC0_SHADOW_ROWIDS_NAME
                                 " ORDER by chunk_id, chunk_offset ",
                                 p->schemaName, p->tableName);
  if (zQuery != NULL) {
    rc = sqlite3_prepare_v2(p->db, zQuery, -1, &scan->rowids_stmt, NULL);
    sqlite3_free(zQuery);

    if (rc != SQLITE_OK) {
      // IMP: V09901_26739
      vtab_set_error(&p->base, "Error preparing rowid scan: %s",
                     sqlite3_errmsg(p->db));
    } else {
      rc = sqlite3_step(scan->rowids_stmt);
      // ROW: positioned on the first rowid. DONE: the table is empty.
      // Either way the scan was set up successfully.
      if (rc == SQLITE_ROW || rc == SQLITE_DONE) {
        scan->done = rc == SQLITE_DONE;
        pCur->query_plan = VEC0_QUERY_PLAN_FULLSCAN;
        pCur->fullscan_data = scan;
        return SQLITE_OK;
      }
    }
  }

  // Failure: release the partially initialized scan state.
  vec0_query_fullscan_data_clear(scan);
  sqlite3_free(scan);
  return rc;
}
/**
 * @brief Prepares a cursor for a point query, ie a lookup of a single row by
 * its rowid (or by its text id when the table has a TEXT primary key).
 *
 * When the id or any of its vectors cannot be found (SQLITE_EMPTY), the
 * cursor is still set up successfully but is marked as done, so the query
 * yields no rows.
 *
 * @param pCur: cursor that receives the `point_data` on success
 * @param p: virtual table
 * @param argc: number of entries in argv, expected to be 1
 * @param argv: argv[0] holds the rowid/id to look up
 * @return int SQLITE_OK on success, error code on failure
 */
int vec0Filter_point(vec0_cursor *pCur, vec0_vtab *p, int argc,
                     sqlite3_value **argv) {
  int rc;
  assert(argc == 1);
  // Initialized because the "not found" path stores it on the cursor even
  // when no rowid could be resolved for a text id.
  i64 rowid = 0;
  int done = 0;
  struct vec0_query_point_data *point_data = NULL;

  point_data = sqlite3_malloc(sizeof(*point_data));
  if (!point_data) {
    // nothing allocated yet, so there is nothing to clean up
    return SQLITE_NOMEM;
  }
  memset(point_data, 0, sizeof(*point_data));

  if (p->pkIsText) {
    rc = vec0_rowid_from_id(p, argv[0], &rowid);
    if (rc == SQLITE_EMPTY) {
      done = 1;
      goto finish;
    }
    if (rc != SQLITE_OK) {
      goto error;
    }
  } else {
    rowid = sqlite3_value_int64(argv[0]);
  }

  for (int i = 0; i < p->numVectorColumns; i++) {
    rc = vec0_get_vector_data(p, rowid, i, &point_data->vectors[i], NULL);
    if (rc == SQLITE_EMPTY) {
      done = 1;
      goto finish;
    }
    if (rc != SQLITE_OK) {
      goto error;
    }
  }

finish:
  point_data->rowid = rowid;
  point_data->done = done;
  pCur->point_data = point_data;
  pCur->query_plan = VEC0_QUERY_PLAN_POINT;
  return SQLITE_OK;

error:
  vec0_query_point_data_clear(point_data);
  sqlite3_free(point_data);
  return rc;
}
/**
 * @brief xFilter entry point: resets the cursor, sanity checks idxStr against
 * argc, then dispatches on the query plan stored in the first idxStr
 * character.
 *
 * idxStr is expected to be one plan character followed by exactly 4
 * characters per argv entry.
 *
 * @return int SQLITE_OK on success, error code on failure
 */
static int vec0Filter(sqlite3_vtab_cursor *pVtabCursor, int idxNum,
                      const char *idxStr, int argc, sqlite3_value **argv) {
  vec0_cursor *cursor = (vec0_cursor *)pVtabCursor;
  vec0_vtab *vtab = (vec0_vtab *)pVtabCursor->pVtab;
  vec0_cursor_clear(cursor);

  int len = strlen(idxStr);
  // Reject anything that is empty, not a whole number of 4-character
  // entries after the plan character, or out of step with argc.
  if (len <= 0 || (len - 1) % 4 != 0 || (len - 1) / 4 != argc) {
    return SQLITE_ERROR;
  }

  char plan = idxStr[0];
  if (plan == VEC0_QUERY_PLAN_FULLSCAN) {
    return vec0Filter_fullscan(vtab, cursor);
  }
  if (plan == VEC0_QUERY_PLAN_KNN) {
    return vec0Filter_knn(cursor, vtab, idxNum, idxStr, argc, argv);
  }
  if (plan == VEC0_QUERY_PLAN_POINT) {
    return vec0Filter_point(cursor, vtab, argc, argv);
  }

  vtab_set_error(pVtabCursor->pVtab, "unknown idxStr '%s'", idxStr);
  return SQLITE_ERROR;
}
/**
 * @brief xRowid implementation: reports the rowid of the cursor's current
 * row for fullscan and point plans. KNN plans are rejected with an error.
 *
 * @return int SQLITE_OK on success, SQLITE_ERROR otherwise
 */
static int vec0Rowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid) {
  vec0_cursor *cursor = (vec0_cursor *)cur;

  if (cursor->query_plan == VEC0_QUERY_PLAN_FULLSCAN) {
    // the scan statement is positioned on the current row
    *pRowid = sqlite3_column_int64(cursor->fullscan_data->rowids_stmt, 0);
    return SQLITE_OK;
  }

  if (cursor->query_plan == VEC0_QUERY_PLAN_POINT) {
    *pRowid = cursor->point_data->rowid;
    return SQLITE_OK;
  }

  if (cursor->query_plan == VEC0_QUERY_PLAN_KNN) {
    vtab_set_error(cur->pVtab,
                   "Internal sqlite-vec error: expected point query plan in "
                   "vec0Rowid, found %d",
                   cursor->query_plan);
  }

  // KNN plans and any unrecognized plan both end up here.
  return SQLITE_ERROR;
}
/**
 * @brief xNext implementation: advances the cursor by one row according to
 * its query plan.
 *
 * @return int SQLITE_OK on success, SQLITE_ERROR when the plan's state is
 *  missing, the scan statement fails, or the plan is not recognized.
 */
static int vec0Next(sqlite3_vtab_cursor *cur) {
  vec0_cursor *cursor = (vec0_cursor *)cur;

  if (cursor->query_plan == VEC0_QUERY_PLAN_FULLSCAN) {
    struct vec0_query_fullscan_data *scan = cursor->fullscan_data;
    if (scan == NULL) {
      return SQLITE_ERROR;
    }
    switch (sqlite3_step(scan->rowids_stmt)) {
    case SQLITE_ROW:
      return SQLITE_OK;
    case SQLITE_DONE:
      // ran out of rowids
      scan->done = 1;
      return SQLITE_OK;
    default:
      return SQLITE_ERROR;
    }
  }

  if (cursor->query_plan == VEC0_QUERY_PLAN_KNN) {
    struct vec0_query_knn_data *knn = cursor->knn_data;
    if (knn == NULL) {
      return SQLITE_ERROR;
    }
    // results were computed up front, just move to the next one
    knn->current_idx++;
    return SQLITE_OK;
  }

  if (cursor->query_plan == VEC0_QUERY_PLAN_POINT) {
    struct vec0_query_point_data *point = cursor->point_data;
    if (point == NULL) {
      return SQLITE_ERROR;
    }
    // a point query yields at most one row
    point->done = 1;
    return SQLITE_OK;
  }

  return SQLITE_ERROR;
}
/**
 * @brief xEof implementation: reports whether the cursor has moved past its
 * last row.
 *
 * @return int non-zero when there are no more rows. A cursor whose plan
 *  state is missing, or whose plan is not recognized, counts as exhausted.
 */
static int vec0Eof(sqlite3_vtab_cursor *cur) {
  vec0_cursor *cursor = (vec0_cursor *)cur;

  if (cursor->query_plan == VEC0_QUERY_PLAN_FULLSCAN) {
    return cursor->fullscan_data ? cursor->fullscan_data->done : 1;
  }

  if (cursor->query_plan == VEC0_QUERY_PLAN_KNN) {
    struct vec0_query_knn_data *knn = cursor->knn_data;
    if (knn == NULL) {
      return 1;
    }
    // only the first k_used result slots are filled in
    return (knn->current_idx >= knn->k_used);
  }

  if (cursor->query_plan == VEC0_QUERY_PLAN_POINT) {
    return cursor->point_data ? cursor->point_data->done : 1;
  }

  return 1;
}
/**
 * @brief Produces the value of column `i` for the current row of a fullscan
 * cursor.
 *
 * The rowid is read from the scan statement, then the value is fetched
 * depending on which kind of column `i` refers to. The distance column is
 * always NULL in a full scan.
 *
 * @return int SQLITE_OK in most cases (lookup failures on partition,
 *  auxiliary and metadata columns are reported through `context`), otherwise
 *  an error code.
 */
static int vec0Column_fullscan(vec0_vtab *pVtab, vec0_cursor *pCur,
                               sqlite3_context *context, int i) {
  struct vec0_query_fullscan_data *scan = pCur->fullscan_data;
  if (scan == NULL) {
    sqlite3_result_error(
        context, "Internal sqlite-vec error: fullscan_data is NULL.", -1);
    return SQLITE_ERROR;
  }

  i64 rowid = sqlite3_column_int64(scan->rowids_stmt, 0);

  if (i == VEC0_COLUMN_ID) {
    return vec0_result_id(pVtab, context, rowid);
  }

  if (vec0_column_idx_is_vector(pVtab, i)) {
    void *vectorData;
    int vectorSize;
    int vector_idx = vec0_column_idx_to_vector_idx(pVtab, i);
    int rc =
        vec0_get_vector_data(pVtab, rowid, vector_idx, &vectorData, &vectorSize);
    if (rc != SQLITE_OK) {
      return rc;
    }
    // sqlite3_free is passed as the destructor for the fetched buffer
    sqlite3_result_blob(context, vectorData, vectorSize, sqlite3_free);
    sqlite3_result_subtype(context,
                           pVtab->vector_columns[vector_idx].element_type);
    return SQLITE_OK;
  }

  if (i == vec0_column_distance_idx(pVtab)) {
    sqlite3_result_null(context);
    return SQLITE_OK;
  }

  if (vec0_column_idx_is_partition(pVtab, i)) {
    sqlite3_value *value;
    int partition_idx = vec0_column_idx_to_partition_idx(pVtab, i);
    int rc = vec0_get_partition_value_for_rowid(pVtab, rowid, partition_idx,
                                                &value);
    if (rc != SQLITE_OK) {
      sqlite3_result_error_code(context, rc);
      return SQLITE_OK;
    }
    sqlite3_result_value(context, value);
    sqlite3_value_free(value);
    return SQLITE_OK;
  }

  if (vec0_column_idx_is_auxiliary(pVtab, i)) {
    sqlite3_value *value;
    int auxiliary_idx = vec0_column_idx_to_auxiliary_idx(pVtab, i);
    int rc = vec0_get_auxiliary_value_for_rowid(pVtab, rowid, auxiliary_idx,
                                                &value);
    if (rc != SQLITE_OK) {
      sqlite3_result_error_code(context, rc);
      return SQLITE_OK;
    }
    sqlite3_result_value(context, value);
    sqlite3_value_free(value);
    return SQLITE_OK;
  }

  if (vec0_column_idx_is_metadata(pVtab, i)) {
    if (sqlite3_vtab_nochange(context)) {
      return SQLITE_OK;
    }
    int metadata_idx = vec0_column_idx_to_metadata_idx(pVtab, i);
    int rc = vec0_result_metadata_value_for_rowid(pVtab, rowid, metadata_idx,
                                                  context);
    if (rc == SQLITE_OK) {
      return SQLITE_OK;
    }
    // IMP: V15466_32305
    char *zMessage = sqlite3_mprintf(
        "Could not extract metadata value for column %.*s at rowid %lld",
        pVtab->metadata_columns[metadata_idx].name_length,
        pVtab->metadata_columns[metadata_idx].name, rowid);
    if (zMessage == NULL) {
      sqlite3_result_error_nomem(context);
    } else {
      sqlite3_result_error(context, zMessage, -1);
      sqlite3_free(zMessage);
    }
    return SQLITE_OK;
  }

  return SQLITE_OK;
}
/**
 * @brief Produces the value of column `i` for the single row of a point
 * query cursor.
 *
 * Vectors come from the copies stored on the cursor; partition, auxiliary
 * and metadata values are looked up by rowid. Columns reported as unchanged
 * by sqlite3_vtab_nochange() are not fetched. The distance column is always
 * NULL for a point query.
 *
 * @return int SQLITE_OK in most cases (lookup failures on partition,
 *  auxiliary and metadata columns are reported through `context`), otherwise
 *  an error code.
 */
static int vec0Column_point(vec0_vtab *pVtab, vec0_cursor *pCur,
                            sqlite3_context *context, int i) {
  struct vec0_query_point_data *point = pCur->point_data;
  if (point == NULL) {
    sqlite3_result_error(context,
                         "Internal sqlite-vec error: point_data is NULL.", -1);
    return SQLITE_ERROR;
  }

  if (i == VEC0_COLUMN_ID) {
    return vec0_result_id(pVtab, context, point->rowid);
  }

  if (i == vec0_column_distance_idx(pVtab)) {
    sqlite3_result_null(context);
    return SQLITE_OK;
  }

  if (vec0_column_idx_is_vector(pVtab, i)) {
    if (sqlite3_vtab_nochange(context)) {
      sqlite3_result_null(context);
      return SQLITE_OK;
    }
    int vector_idx = vec0_column_idx_to_vector_idx(pVtab, i);
    // SQLITE_TRANSIENT: SQLite makes its own copy, the cursor keeps its buffer
    sqlite3_result_blob(
        context, point->vectors[vector_idx],
        vector_column_byte_size(pVtab->vector_columns[vector_idx]),
        SQLITE_TRANSIENT);
    sqlite3_result_subtype(context,
                           pVtab->vector_columns[vector_idx].element_type);
    return SQLITE_OK;
  }

  if (vec0_column_idx_is_partition(pVtab, i)) {
    if (sqlite3_vtab_nochange(context)) {
      return SQLITE_OK;
    }
    sqlite3_value *value;
    int partition_idx = vec0_column_idx_to_partition_idx(pVtab, i);
    int rc = vec0_get_partition_value_for_rowid(pVtab, point->rowid,
                                                partition_idx, &value);
    if (rc != SQLITE_OK) {
      sqlite3_result_error_code(context, rc);
      return SQLITE_OK;
    }
    sqlite3_result_value(context, value);
    sqlite3_value_free(value);
    return SQLITE_OK;
  }

  if (vec0_column_idx_is_auxiliary(pVtab, i)) {
    if (sqlite3_vtab_nochange(context)) {
      return SQLITE_OK;
    }
    sqlite3_value *value;
    int auxiliary_idx = vec0_column_idx_to_auxiliary_idx(pVtab, i);
    int rc = vec0_get_auxiliary_value_for_rowid(pVtab, point->rowid,
                                                auxiliary_idx, &value);
    if (rc != SQLITE_OK) {
      sqlite3_result_error_code(context, rc);
      return SQLITE_OK;
    }
    sqlite3_result_value(context, value);
    sqlite3_value_free(value);
    return SQLITE_OK;
  }

  if (vec0_column_idx_is_metadata(pVtab, i)) {
    if (sqlite3_vtab_nochange(context)) {
      return SQLITE_OK;
    }
    i64 rowid = point->rowid;
    int metadata_idx = vec0_column_idx_to_metadata_idx(pVtab, i);
    int rc = vec0_result_metadata_value_for_rowid(pVtab, rowid, metadata_idx,
                                                  context);
    if (rc == SQLITE_OK) {
      return SQLITE_OK;
    }
    char *zMessage = sqlite3_mprintf(
        "Could not extract metadata value for column %.*s at rowid %lld",
        pVtab->metadata_columns[metadata_idx].name_length,
        pVtab->metadata_columns[metadata_idx].name, rowid);
    if (zMessage == NULL) {
      sqlite3_result_error_nomem(context);
    } else {
      sqlite3_result_error(context, zMessage, -1);
      sqlite3_free(zMessage);
    }
    return SQLITE_OK;
  }

  return SQLITE_OK;
}
/**
 * @brief Produces the value of column `i` for the current result of a KNN
 * cursor.
 *
 * The rowid and distance come from the result arrays stored on the cursor at
 * `current_idx`; every other column is looked up by that rowid.
 *
 * @return int SQLITE_OK in most cases (lookup failures on partition,
 *  auxiliary and metadata columns are reported through `context`), otherwise
 *  an error code.
 */
static int vec0Column_knn(vec0_vtab *pVtab, vec0_cursor *pCur,
                          sqlite3_context *context, int i) {
  struct vec0_query_knn_data *knn = pCur->knn_data;
  if (knn == NULL) {
    sqlite3_result_error(context,
                         "Internal sqlite-vec error: knn_data is NULL.", -1);
    return SQLITE_ERROR;
  }

  if (i == VEC0_COLUMN_ID) {
    return vec0_result_id(pVtab, context, knn->rowids[knn->current_idx]);
  }

  if (i == vec0_column_distance_idx(pVtab)) {
    sqlite3_result_double(context, knn->distances[knn->current_idx]);
    return SQLITE_OK;
  }

  if (vec0_column_idx_is_vector(pVtab, i)) {
    void *vectorData;
    int vectorSize;
    int vector_idx = vec0_column_idx_to_vector_idx(pVtab, i);
    int rc = vec0_get_vector_data(pVtab, knn->rowids[knn->current_idx],
                                  vector_idx, &vectorData, &vectorSize);
    if (rc != SQLITE_OK) {
      return rc;
    }
    // sqlite3_free is passed as the destructor for the fetched buffer
    sqlite3_result_blob(context, vectorData, vectorSize, sqlite3_free);
    sqlite3_result_subtype(context,
                           pVtab->vector_columns[vector_idx].element_type);
    return SQLITE_OK;
  }

  if (vec0_column_idx_is_partition(pVtab, i)) {
    sqlite3_value *value;
    int partition_idx = vec0_column_idx_to_partition_idx(pVtab, i);
    i64 rowid = knn->rowids[knn->current_idx];
    int rc = vec0_get_partition_value_for_rowid(pVtab, rowid, partition_idx,
                                                &value);
    if (rc != SQLITE_OK) {
      sqlite3_result_error_code(context, rc);
      return SQLITE_OK;
    }
    sqlite3_result_value(context, value);
    sqlite3_value_free(value);
    return SQLITE_OK;
  }

  if (vec0_column_idx_is_auxiliary(pVtab, i)) {
    sqlite3_value *value;
    int auxiliary_idx = vec0_column_idx_to_auxiliary_idx(pVtab, i);
    i64 rowid = knn->rowids[knn->current_idx];
    int rc = vec0_get_auxiliary_value_for_rowid(pVtab, rowid, auxiliary_idx,
                                                &value);
    if (rc != SQLITE_OK) {
      sqlite3_result_error_code(context, rc);
      return SQLITE_OK;
    }
    sqlite3_result_value(context, value);
    sqlite3_value_free(value);
    return SQLITE_OK;
  }

  if (vec0_column_idx_is_metadata(pVtab, i)) {
    int metadata_idx = vec0_column_idx_to_metadata_idx(pVtab, i);
    i64 rowid = knn->rowids[knn->current_idx];
    int rc = vec0_result_metadata_value_for_rowid(pVtab, rowid, metadata_idx,
                                                  context);
    if (rc == SQLITE_OK) {
      return SQLITE_OK;
    }
    char *zMessage = sqlite3_mprintf(
        "Could not extract metadata value for column %.*s at rowid %lld",
        pVtab->metadata_columns[metadata_idx].name_length,
        pVtab->metadata_columns[metadata_idx].name, rowid);
    if (zMessage == NULL) {
      sqlite3_result_error_nomem(context);
    } else {
      sqlite3_result_error(context, zMessage, -1);
      sqlite3_free(zMessage);
    }
    return SQLITE_OK;
  }

  return SQLITE_OK;
}
/**
 * @brief xColumn entry point: forwards to the column handler that matches
 * the cursor's query plan.
 *
 * @return int result of the plan specific handler, or SQLITE_OK when the
 *  plan is not one of the known plans.
 */
static int vec0Column(sqlite3_vtab_cursor *cur, sqlite3_context *context,
                      int i) {
  vec0_vtab *vtab = (vec0_vtab *)cur->pVtab;
  vec0_cursor *cursor = (vec0_cursor *)cur;

  if (cursor->query_plan == VEC0_QUERY_PLAN_FULLSCAN) {
    return vec0Column_fullscan(vtab, cursor, context, i);
  }
  if (cursor->query_plan == VEC0_QUERY_PLAN_KNN) {
    return vec0Column_knn(vtab, cursor, context, i);
  }
  if (cursor->query_plan == VEC0_QUERY_PLAN_POINT) {
    return vec0Column_point(vtab, cursor, context, i);
  }
  return SQLITE_OK;
}
/**
 * @brief Handles the "insert rowid" step of a row insert operation of a vec0
 * table.
 *
 * This function will insert a new row into the _rowids vec0 shadow table.
 *
 * An insert into a vec0 table can happen a few different ways:
 *  1) With default INTEGER primary key: with a supplied i64 rowid
 *  2) With default INTEGER primary key: WITHOUT a supplied rowid (NULL)
 *  3) With TEXT primary key: supplied text id
 *
 * @param p: virtual table
 * @param idValue: Value containing the inserted rowid/id value.
 * @param rowid: Output rowid, will point to the "real" i64 rowid
 * value that was inserted
 * @return int SQLITE_OK on success, error code on failure
 */
int vec0Update_InsertRowidStep(vec0_vtab *p, sqlite3_value *idValue,
                               i64 *rowid) {
  const int idType = sqlite3_value_type(idValue);

  // Case 3: the table was declared with a TEXT primary key, so only text
  // ids are accepted.
  if (p->pkIsText) {
    if (idType == SQLITE_TEXT) {
      return vec0_rowids_insert_id(p, idValue, rowid);
    }
    // IMP: V04200_21039
    vtab_set_error(&p->base,
                   "The %s virtual table was declared with a TEXT primary "
                   "key, but a non-TEXT value was provided in an INSERT.",
                   p->tableName);
    return SQLITE_ERROR;
  }

  switch (idType) {
  case SQLITE_INTEGER: {
    // Case 1: the user supplied an i64 rowid
    i64 requested = sqlite3_value_int64(idValue);
    int rc = vec0_rowids_insert_rowid(p, requested);
    if (rc == SQLITE_OK) {
      *rowid = requested;
    }
    return rc;
  }
  case SQLITE_NULL:
    // Case 2: no rowid supplied, pass NULL to get the next
    // auto-incremented value
    return vec0_rowids_insert_id(p, NULL, rowid);
  default:
    // IMP: V30855_14925
    vtab_set_error(&p->base,
                   "Only integers are allows for primary key values on %s",
                   p->tableName);
    return SQLITE_ERROR;
  }
}
/**
 * @brief Determines the "next available" chunk position for a newly inserted
 * vec0 row.
 *
 * This operation may insert a new "blank" chunk into the _chunks table, if
 * there is no more space in the latest chunk (or there is no chunk yet).
 *
 * @param p: virtual table
 * @param partitionKeyValues: array of partition key column values, to constrain
 *  against any partition key columns.
 * @param chunk_rowid: Output rowid of the chunk in the _chunks table that has
 *  the availability.
 * @param chunk_offset: Output index of the available slot inside the chunk,
 *  based on the index of the first unset validity bit.
 * @param blobChunksValidity: Output blob of the validity column of the
 *  available chunk. Opened with read/write permissions. It is left open for
 *  the caller, including on most error paths.
 *  NOTE(review): when no chunk exists yet this function closes whatever the
 *  pointer already holds, so callers presumably initialize it to NULL —
 *  confirm against the caller.
 * @param bufferChunksValidity: Output buffer holding a copy of the chunk's
 *  validity column. Needs to be cleaned up with sqlite3_free().
 * @return int SQLITE_OK on success, error code on failure
 */
int vec0Update_InsertNextAvailableStep(
    vec0_vtab *p,
    sqlite3_value ** partitionKeyValues,
    i64 *chunk_rowid, i64 *chunk_offset,
    sqlite3_blob **blobChunksValidity,
    const unsigned char **bufferChunksValidity) {

  int rc;
  i64 validitySize;
  // -1 means "no free slot found yet"
  *chunk_offset = -1;

  rc = vec0_get_latest_chunk_rowid(p, chunk_rowid, partitionKeyValues);
  if(rc == SQLITE_EMPTY) {
    // no chunk to inspect, go straight to creating one
    goto done;
  }
  if (rc != SQLITE_OK) {
    goto cleanup;
  }

  rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowChunksName, "validity",
                         *chunk_rowid, 1, blobChunksValidity);
  if (rc != SQLITE_OK) {
    // IMP: V22053_06123
    vtab_set_error(&p->base,
                   VEC_INTERAL_ERROR
                   "could not open validity blob on %s.%s.%lld",
                   p->schemaName, p->shadowChunksName, *chunk_rowid);
    goto cleanup;
  }

  validitySize = sqlite3_blob_bytes(*blobChunksValidity);
  if (validitySize != p->chunk_size / CHAR_BIT) {
    // IMP: V29362_13432
    vtab_set_error(&p->base,
                   VEC_INTERAL_ERROR
                   "validity blob size mismatch on "
                   "%s.%s.%lld, expected %lld but received %lld.",
                   p->schemaName, p->shadowChunksName, *chunk_rowid,
                   (i64)(p->chunk_size / CHAR_BIT), validitySize);
    rc = SQLITE_ERROR;
    goto cleanup;
  }

  *bufferChunksValidity = sqlite3_malloc(validitySize);
  if (!(*bufferChunksValidity)) {
    vtab_set_error(&p->base, VEC_INTERAL_ERROR
                   "Could not allocate memory for validity bitmap");
    rc = SQLITE_NOMEM;
    goto cleanup;
  }

  rc = sqlite3_blob_read(*blobChunksValidity, (void *)*bufferChunksValidity,
                         validitySize, 0);

  if (rc != SQLITE_OK) {
    vtab_set_error(&p->base,
                   VEC_INTERAL_ERROR
                   "Could not read validity bitmap for %s.%s.%lld",
                   p->schemaName, p->shadowChunksName, *chunk_rowid);
    goto cleanup;
  }

  // find the next available offset, ie first `0` in the bitmap.
  for (i64 i = 0; i < validitySize; i++) {
    // every bit of this byte is taken, skip it
    if ((*bufferChunksValidity)[i] == 0xFF)
      continue;
    for (int j = 0; j < CHAR_BIT; j++) {
      if (((((*bufferChunksValidity)[i] >> j) & 1) == 0)) {
        *chunk_offset = (i * CHAR_BIT) + j;
        goto done;
      }
    }
  }

done:
  // latest chunk was full (or missing), so need to create a new one
  if (*chunk_offset == -1) {
    rc = vec0_new_chunk(p, partitionKeyValues, chunk_rowid);
    if (rc != SQLITE_OK) {
      // IMP: V08441_25279
      vtab_set_error(&p->base,
                     VEC_INTERAL_ERROR "Could not insert a new vector chunk");
      rc = SQLITE_ERROR; // otherwise raises a DatabaseError and not operational
                         // error?
      goto cleanup;
    }
    *chunk_offset = 0;

    // blobChunksValidity and bufferChunksValidity are stale, pointing to the
    // previous (full) chunk. Release them, then re-assign them below.
    rc = sqlite3_blob_close(*blobChunksValidity);
    sqlite3_free((void *)*bufferChunksValidity);
    *blobChunksValidity = NULL;
    *bufferChunksValidity = NULL;
    if (rc != SQLITE_OK) {
      vtab_set_error(&p->base, VEC_INTERAL_ERROR
                     "unknown error, blobChunksValidity could not be closed, "
                     "please file an issue.");
      rc = SQLITE_ERROR;
      goto cleanup;
    }

    rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowChunksName,
                           "validity", *chunk_rowid, 1, blobChunksValidity);
    if (rc != SQLITE_OK) {
      vtab_set_error(
          &p->base,
          VEC_INTERAL_ERROR
          "Could not open validity blob for newly created chunk %s.%s.%lld",
          p->schemaName, p->shadowChunksName, *chunk_rowid);
      goto cleanup;
    }
    validitySize = sqlite3_blob_bytes(*blobChunksValidity);
    if (validitySize != p->chunk_size / CHAR_BIT) {
      vtab_set_error(&p->base,
                     VEC_INTERAL_ERROR
                     "validity blob size mismatch for newly created chunk "
                     "%s.%s.%lld. Exepcted %lld, got %lld",
                     p->schemaName, p->shadowChunksName, *chunk_rowid,
                     (i64)(p->chunk_size / CHAR_BIT), validitySize);
      // rc is still SQLITE_OK from the successful open, so flag the failure
      rc = SQLITE_ERROR;
      goto cleanup;
    }
    *bufferChunksValidity = sqlite3_malloc(validitySize);
    if (!(*bufferChunksValidity)) {
      vtab_set_error(&p->base, VEC_INTERAL_ERROR
                     "Could not allocate memory for validity bitmap");
      rc = SQLITE_NOMEM;
      goto cleanup;
    }
    rc = sqlite3_blob_read(*blobChunksValidity, (void *)*bufferChunksValidity,
                           validitySize, 0);
    if (rc != SQLITE_OK) {
      vtab_set_error(&p->base,
                     VEC_INTERAL_ERROR
                     "could not read validity blob newly created chunk "
                     "%s.%s.%lld",
                     p->schemaName, p->shadowChunksName, *chunk_rowid);
      goto cleanup;
    }
  }

  rc = SQLITE_OK;

cleanup:
  return rc;
}
/**
 * @brief Write the vector data into the provided vector blob at the given
 * offset
 *
 * @param blobVectors SQLite BLOB to write to
 * @param chunk_offset the "offset" (ie validity bitmap position) to write the
 * vector to
 * @param bVector pointer to the vector containing data
 * @param dimensions how many dimensions the vector has
 * @param element_type the vector type
 * @return result of sqlite3_blob_write, SQLITE_OK on success, otherwise
 * failure. SQLITE_ERROR when element_type is not a known vector type.
 */
static int
vec0_write_vector_to_vector_blob(sqlite3_blob *blobVectors, i64 chunk_offset,
                                 const void *bVector, size_t dimensions,
                                 enum VectorElementType element_type) {
  int n;      // number of bytes occupied by a single vector
  int offset; // byte position of this vector's slot inside the blob

  switch (element_type) {
  case SQLITE_VEC_ELEMENT_TYPE_FLOAT32:
    n = dimensions * sizeof(f32);
    offset = chunk_offset * dimensions * sizeof(f32);
    break;
  case SQLITE_VEC_ELEMENT_TYPE_INT8:
    n = dimensions * sizeof(i8);
    offset = chunk_offset * dimensions * sizeof(i8);
    break;
  case SQLITE_VEC_ELEMENT_TYPE_BIT:
    // bit vectors pack CHAR_BIT dimensions into every byte
    n = dimensions / CHAR_BIT;
    offset = chunk_offset * dimensions / CHAR_BIT;
    break;
  default:
    // Unknown element type: refuse to write rather than use an
    // uninitialized size and offset.
    return SQLITE_ERROR;
  }

  return sqlite3_blob_write(blobVectors, bVector, n, offset);
}
/**
 * @brief Final step of a row insert: marks the row's slot as valid, writes
 * every vector into its vector chunk, records the rowid in the chunk's
 * rowids column and finally updates the row's position in the _rowids table.
 *
 * @param p vec0 virtual table
 * @param chunk_rowid: which chunk to write to
 * @param chunk_offset: the offset inside the chunk to write the vector to.
 * @param rowid: the rowid of the inserting row
 * @param vectorDatas: array of the vector data to insert, one entry per
 *  vector column
 * @param blobChunksValidity: writeable validity blob of the row's assigned
 *  chunk.
 * @param bufferChunksValidity: snapshot buffer of the validity column from
 *  the row's assigned chunk.
 * @return int SQLITE_OK on success, error code on failure
 */
int vec0Update_InsertWriteFinalStep(vec0_vtab *p, i64 chunk_rowid,
                                    i64 chunk_offset, i64 rowid,
                                    void *vectorDatas[],
                                    sqlite3_blob *blobChunksValidity,
                                    const unsigned char *bufferChunksValidity) {
  int rc;
  int closeRc;
  sqlite3_blob *rowidsBlob = NULL;

  // Mark the validity bit for this row: take the snapshot of the byte that
  // holds the bit, set the bit, and write that single byte back.
  unsigned char validityByte = bufferChunksValidity[chunk_offset / CHAR_BIT];
  validityByte |= (1 << (chunk_offset % CHAR_BIT));
  rc = sqlite3_blob_write(blobChunksValidity, &validityByte, 1,
                          chunk_offset / CHAR_BIT);
  if (rc != SQLITE_OK) {
    vtab_set_error(&p->base, VEC_INTERAL_ERROR "could not mark validity bit ");
    return rc;
  }

  // Go insert the vector data into the vector chunk shadow tables
  for (int i = 0; i < p->numVectorColumns; i++) {
    sqlite3_blob *vectorBlob;
    rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowVectorChunksNames[i],
                           "vectors", chunk_rowid, 1, &vectorBlob);
    if (rc != SQLITE_OK) {
      vtab_set_error(&p->base, "Error opening vector blob at %s.%s.%lld",
                     p->schemaName, p->shadowVectorChunksNames[i], chunk_rowid);
      goto cleanup;
    }

    i64 wantBytes =
        p->chunk_size * vector_column_byte_size(p->vector_columns[i]);
    i64 haveBytes = sqlite3_blob_bytes(vectorBlob);

    if (haveBytes != wantBytes) {
      // IMP: V16386_00456
      vtab_set_error(
          &p->base,
          VEC_INTERAL_ERROR
          "vector blob size mismatch on %s.%s.%lld. Expected %lld, actual %lld",
          p->schemaName, p->shadowVectorChunksNames[i], chunk_rowid, wantBytes,
          haveBytes);
      rc = SQLITE_ERROR;
    } else {
      rc = vec0_write_vector_to_vector_blob(
          vectorBlob, chunk_offset, vectorDatas[i],
          p->vector_columns[i].dimensions, p->vector_columns[i].element_type);
      if (rc != SQLITE_OK) {
        vtab_set_error(&p->base,
                       VEC_INTERAL_ERROR
                       "could not write vector blob on %s.%s.%lld",
                       p->schemaName, p->shadowVectorChunksNames[i], chunk_rowid);
        rc = SQLITE_ERROR;
      }
    }

    if (rc != SQLITE_OK) {
      // already error, can ignore result code
      sqlite3_blob_close(vectorBlob);
      goto cleanup;
    }

    rc = sqlite3_blob_close(vectorBlob);
    if (rc != SQLITE_OK) {
      vtab_set_error(&p->base,
                     VEC_INTERAL_ERROR
                     "could not close vector blob on %s.%s.%lld",
                     p->schemaName, p->shadowVectorChunksNames[i], chunk_rowid);
      rc = SQLITE_ERROR;
      goto cleanup;
    }
  }

  // write the new rowid to the rowids column of the _chunks table
  rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowChunksName, "rowids",
                         chunk_rowid, 1, &rowidsBlob);
  if (rc != SQLITE_OK) {
    // IMP: V09221_26060
    vtab_set_error(&p->base,
                   VEC_INTERAL_ERROR "could not open rowids blob on %s.%s.%lld",
                   p->schemaName, p->shadowChunksName, chunk_rowid);
    goto cleanup;
  }

  i64 wantRowidsBytes = p->chunk_size * sizeof(i64);
  i64 haveRowidsBytes = sqlite3_blob_bytes(rowidsBlob);
  if (wantRowidsBytes != haveRowidsBytes) {
    // IMP: V12779_29618
    vtab_set_error(
        &p->base,
        VEC_INTERAL_ERROR
        "rowids blob size mismatch on %s.%s.%lld. Expected %lld, actual %lld",
        p->schemaName, p->shadowChunksName, chunk_rowid, wantRowidsBytes,
        haveRowidsBytes);
    rc = SQLITE_ERROR;
    goto cleanup;
  }

  rc = sqlite3_blob_write(rowidsBlob, &rowid, sizeof(i64),
                          chunk_offset * sizeof(i64));
  if (rc != SQLITE_OK) {
    vtab_set_error(
        &p->base, VEC_INTERAL_ERROR "could not write rowids blob on %s.%s.%lld",
        p->schemaName, p->shadowChunksName, chunk_rowid);
    rc = SQLITE_ERROR;
    goto cleanup;
  }

  // Now with all the vectors inserted, go back and update the _rowids table
  // with the new chunk_rowid/chunk_offset values
  rc = vec0_rowids_update_position(p, rowid, chunk_rowid, chunk_offset);

cleanup:
  closeRc = sqlite3_blob_close(rowidsBlob);
  // A close failure is only reported when nothing else went wrong before.
  if ((rc == SQLITE_OK) && (closeRc != SQLITE_OK)) {
    vtab_set_error(
        &p->base, VEC_INTERAL_ERROR "could not close rowids blob on %s.%s.%lld",
        p->schemaName, p->shadowChunksName, chunk_rowid);
    return closeRc;
  }
  return rc;
}
/**
 * @brief Validates a metadata value and writes it into the metadata chunk
 * blob of the given metadata column.
 *
 * The value is stored in the "data" column of the column's metadata chunks
 * shadow table, in the row identified by chunk_id, at the slot chunk_offset.
 * TEXT values longer than VEC0_METADATA_TEXT_VIEW_DATA_LENGTH bytes are
 * additionally written in full to the column's metadata text shadow table,
 * keyed by rowid; a previously stored long value is deleted from that table
 * when the new value is short enough to not need it.
 *
 * @param p vec0 virtual table
 * @param metadata_column_idx index into p->metadata_columns
 * @param rowid rowid of the row being written (key of the text shadow table)
 * @param chunk_id rowid of the metadata chunk row holding the slot
 * @param chunk_offset slot index within the chunk
 * @param v value to write; its SQLite type must match the column kind
 * @param isupdate non-zero when overwriting an existing row's value
 * @return int SQLITE_OK on success, otherwise an error code
 */
int vec0_write_metadata_value(vec0_vtab *p, int metadata_column_idx, i64 rowid, i64 chunk_id, i64 chunk_offset, sqlite3_value * v, int isupdate) {
  int rc;
  int brc;
  // Everything below is released in one place, at `done`. All three release
  // functions accept NULL, so early exits need no special casing.
  sqlite3_blob *blobValue = NULL;
  sqlite3_stmt *stmt = NULL;
  char *zSql = NULL;
  struct Vec0MetadataColumnDefinition * metadata_column = &p->metadata_columns[metadata_column_idx];
  vec0_metadata_column_kind kind = metadata_column->kind;

  // verify input value matches column type
  switch(kind) {
    case VEC0_METADATA_COLUMN_KIND_BOOLEAN: {
      if(sqlite3_value_type(v) != SQLITE_INTEGER || ((sqlite3_value_int(v) != 0) && (sqlite3_value_int(v) != 1))) {
        rc = SQLITE_ERROR;
        vtab_set_error(&p->base, "Expected 0 or 1 for BOOLEAN metadata column %.*s", metadata_column->name_length, metadata_column->name);
        goto done;
      }
      break;
    }
    case VEC0_METADATA_COLUMN_KIND_INTEGER: {
      if(sqlite3_value_type(v) != SQLITE_INTEGER) {
        rc = SQLITE_ERROR;
        vtab_set_error(&p->base, "Expected integer for INTEGER metadata column %.*s, received %s", metadata_column->name_length, metadata_column->name, type_name(sqlite3_value_type(v)));
        goto done;
      }
      break;
    }
    case VEC0_METADATA_COLUMN_KIND_FLOAT: {
      if(sqlite3_value_type(v) != SQLITE_FLOAT) {
        rc = SQLITE_ERROR;
        vtab_set_error(&p->base, "Expected float for FLOAT metadata column %.*s, received %s", metadata_column->name_length, metadata_column->name, type_name(sqlite3_value_type(v)));
        goto done;
      }
      break;
    }
    case VEC0_METADATA_COLUMN_KIND_TEXT: {
      if(sqlite3_value_type(v) != SQLITE_TEXT) {
        rc = SQLITE_ERROR;
        vtab_set_error(&p->base, "Expected text for TEXT metadata column %.*s, received %s", metadata_column->name_length, metadata_column->name, type_name(sqlite3_value_type(v)));
        goto done;
      }
      break;
    }
  }

  rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowMetadataChunksNames[metadata_column_idx], "data", chunk_id, 1, &blobValue);
  if(rc != SQLITE_OK) {
    goto done;
  }

  switch(kind) {
    case VEC0_METADATA_COLUMN_KIND_BOOLEAN: {
      // Booleans are bit-packed: read the containing byte, flip one bit,
      // write the byte back.
      u8 block;
      int value = sqlite3_value_int(v);
      rc = sqlite3_blob_read(blobValue, &block, sizeof(u8), (int) (chunk_offset / CHAR_BIT));
      if(rc != SQLITE_OK) {
        goto done;
      }
      if (value) {
        block |= 1 << (chunk_offset % CHAR_BIT);
      } else {
        block &= ~(1 << (chunk_offset % CHAR_BIT));
      }
      rc = sqlite3_blob_write(blobValue, &block, sizeof(u8), chunk_offset / CHAR_BIT);
      break;
    }
    case VEC0_METADATA_COLUMN_KIND_INTEGER: {
      i64 value = sqlite3_value_int64(v);
      rc = sqlite3_blob_write(blobValue, &value, sizeof(value), chunk_offset * sizeof(i64));
      break;
    }
    case VEC0_METADATA_COLUMN_KIND_FLOAT: {
      double value = sqlite3_value_double(v);
      rc = sqlite3_blob_write(blobValue, &value, sizeof(value), chunk_offset * sizeof(double));
      break;
    }
    case VEC0_METADATA_COLUMN_KIND_TEXT: {
      // Length currently stored in the slot; tells us whether a row already
      // exists in the text shadow table for this rowid.
      int prev_n;
      rc = sqlite3_blob_read(blobValue, &prev_n, sizeof(int), chunk_offset * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH);
      if(rc != SQLITE_OK) {
        goto done;
      }

      const char * s = (const char *) sqlite3_value_text(v);
      int n = sqlite3_value_bytes(v);

      // The in-chunk "view" is the length followed by the leading bytes of
      // the string, zero padded.
      u8 view[VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH];
      memset(view, 0, VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH);
      memcpy(view, &n, sizeof(int));
      if(s != NULL && n > 0) {
        memcpy(view+4, s, min(n, VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH-4));
      }
      rc = sqlite3_blob_write(blobValue, &view, VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH, chunk_offset * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH);
      if(rc != SQLITE_OK) {
        goto done;
      }

      if(n > VEC0_METADATA_TEXT_VIEW_DATA_LENGTH) {
        // Long value: the full text also goes in the text shadow table.
        if(isupdate && (prev_n > VEC0_METADATA_TEXT_VIEW_DATA_LENGTH)) {
          zSql = sqlite3_mprintf("UPDATE " VEC0_SHADOW_METADATA_TEXT_DATA_NAME " SET data = ?2 WHERE rowid = ?1", p->schemaName, p->tableName, metadata_column_idx);
        }else {
          zSql = sqlite3_mprintf("INSERT INTO " VEC0_SHADOW_METADATA_TEXT_DATA_NAME " (rowid, data) VALUES (?1, ?2)", p->schemaName, p->tableName, metadata_column_idx);
        }
        if(!zSql) {
          rc = SQLITE_NOMEM;
          goto done;
        }
        rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL);
        if(rc != SQLITE_OK) {
          goto done;
        }
        sqlite3_bind_int64(stmt, 1, rowid);
        sqlite3_bind_text(stmt, 2, s, n, SQLITE_STATIC);
        rc = sqlite3_step(stmt);
        if(rc != SQLITE_DONE) {
          rc = SQLITE_ERROR;
          goto done;
        }
        rc = SQLITE_OK;
      }
      else if(prev_n > VEC0_METADATA_TEXT_VIEW_DATA_LENGTH) {
        // The old value was long but the new one is not: drop the stale row
        // from the text shadow table.
        zSql = sqlite3_mprintf("DELETE FROM " VEC0_SHADOW_METADATA_TEXT_DATA_NAME " WHERE rowid = ?", p->schemaName, p->tableName, metadata_column_idx);
        if(!zSql) {
          rc = SQLITE_NOMEM;
          goto done;
        }
        rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL);
        if(rc != SQLITE_OK) {
          goto done;
        }
        sqlite3_bind_int64(stmt, 1, rowid);
        rc = sqlite3_step(stmt);
        if(rc != SQLITE_DONE) {
          rc = SQLITE_ERROR;
          goto done;
        }
        rc = SQLITE_OK;
      }
      break;
    }
  }

done:
  sqlite3_finalize(stmt);
  sqlite3_free(zSql);
  // Always close the blob, but never let a successful close hide an earlier
  // failure.
  brc = sqlite3_blob_close(blobValue);
  if(rc == SQLITE_OK) {
    rc = brc;
  }
  return rc;
}
/**
 * @brief Handles INSERT INTO operations on a vec0 table.
 *
 * Validates the partition key and vector arguments, rejects values for the
 * hidden "distance" and "k" columns, then: (1) obtains a rowid, (2) finds the
 * next available chunk position, (3) writes the vectors, (4) inserts any
 * auxiliary column values and (5) writes any metadata column values.
 *
 * @param pVTab the vec0 virtual table
 * @param argc number of entries in argv (unused)
 * @param argv xUpdate arguments; user column i is at
 *        argv[2 + VEC0_COLUMN_USERN_START + i]
 * @param pRowid receives the rowid of the inserted row on success
 * @return int SQLITE_OK on success, otherwise error code on failure
 */
int vec0Update_Insert(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv,
                      sqlite_int64 *pRowid) {
  UNUSED_PARAMETER(argc);
  vec0_vtab *p = (vec0_vtab *)pVTab;
  int rc;
  // Rowid for the inserted row, determined by the inserted ID + _rowids shadow
  // table
  i64 rowid;
  // Array to hold the vector data of the inserted row. Individual elements will
  // have a lifetime bound to the argv[..] values.
  void *vectorDatas[VEC0_MAX_VECTOR_COLUMNS];
  // Array to hold cleanup functions for vectorDatas[]
  vector_cleanup cleanups[VEC0_MAX_VECTOR_COLUMNS];
  sqlite3_value * partitionKeyValues[VEC0_MAX_PARTITION_COLUMNS];
  // Rowid of the chunk in the _chunks shadow table that the row will be a part
  // of.
  i64 chunk_rowid;
  // offset within the chunk where the rowid belongs
  i64 chunk_offset;
  // a write-able blob of the validity column for the given chunk. Used to mark
  // validity bit
  sqlite3_blob *blobChunksValidity = NULL;
  // buffer for the validity column for the given chunk. Maybe not needed here?
  const unsigned char *bufferChunksValidity = NULL;
  int numReadVectors = 0;

  // Read all provided partition key values into partitionKeyValues
  for (int i = 0; i < vec0_num_defined_user_columns(p); i++) {
    if(p->user_column_kinds[i] != SQLITE_VEC0_USER_COLUMN_KIND_PARTITION) {
      continue;
    }
    int partition_key_idx = p->user_column_idxs[i];
    partitionKeyValues[partition_key_idx] = argv[2+VEC0_COLUMN_USERN_START + i];
    int new_value_type = sqlite3_value_type(partitionKeyValues[partition_key_idx]);
    if((new_value_type != SQLITE_NULL) && (new_value_type != p->paritition_columns[partition_key_idx].type)) {
      // IMP: V11454_28292
      vtab_set_error(
        pVTab,
        "Parition key type mismatch: The partition key column %.*s has type %s, but %s was provided.",
        p->paritition_columns[partition_key_idx].name_length,
        p->paritition_columns[partition_key_idx].name,
        type_name(p->paritition_columns[partition_key_idx].type),
        type_name(new_value_type)
      );
      rc = SQLITE_ERROR;
      goto cleanup;
    }
  }

  // read all the inserted vectors into vectorDatas, validate their lengths.
  for (int i = 0; i < vec0_num_defined_user_columns(p); i++) {
    if(p->user_column_kinds[i] != SQLITE_VEC0_USER_COLUMN_KIND_VECTOR) {
      continue;
    }
    int vector_column_idx = p->user_column_idxs[i];
    sqlite3_value *valueVector = argv[2 + VEC0_COLUMN_USERN_START + i];
    size_t dimensions;
    char *pzError;
    enum VectorElementType elementType;
    rc = vector_from_value(valueVector, &vectorDatas[vector_column_idx], &dimensions,
                           &elementType, &cleanups[vector_column_idx], &pzError);
    if (rc != SQLITE_OK) {
      // IMP: V06519_23358
      vtab_set_error(
          pVTab, "Inserted vector for the \"%.*s\" column is invalid: %z",
          p->vector_columns[vector_column_idx].name_length, p->vector_columns[vector_column_idx].name, pzError);
      rc = SQLITE_ERROR;
      goto cleanup;
    }
    numReadVectors++;

    if (elementType != p->vector_columns[vector_column_idx].element_type) {
      // IMP: V08221_25059
      // Index by vector_column_idx, not i: i counts every user column
      // (partition/auxiliary/metadata included), so it can name the wrong
      // vector column or read past the defined ones.
      vtab_set_error(
          pVTab,
          "Inserted vector for the \"%.*s\" column is expected to be of type "
          "%s, but a %s vector was provided.",
          p->vector_columns[vector_column_idx].name_length,
          p->vector_columns[vector_column_idx].name,
          vector_subtype_name(p->vector_columns[vector_column_idx].element_type),
          vector_subtype_name(elementType));
      rc = SQLITE_ERROR;
      goto cleanup;
    }

    if (dimensions != p->vector_columns[vector_column_idx].dimensions) {
      // IMP: V01145_17984
      // %d consumes an int, so narrow explicitly instead of passing a wider
      // integer through varargs.
      vtab_set_error(
          pVTab,
          "Dimension mismatch for inserted vector for the \"%.*s\" column. "
          "Expected %d dimensions but received %d.",
          p->vector_columns[vector_column_idx].name_length, p->vector_columns[vector_column_idx].name,
          (int)p->vector_columns[vector_column_idx].dimensions, (int)dimensions);
      rc = SQLITE_ERROR;
      goto cleanup;
    }
  }

  // Cannot insert a value in the hidden "distance" column
  if (sqlite3_value_type(argv[2 + vec0_column_distance_idx(p)]) !=
      SQLITE_NULL) {
    // IMP: V24228_08298
    vtab_set_error(pVTab,
                   "A value was provided for the hidden \"distance\" column.");
    rc = SQLITE_ERROR;
    goto cleanup;
  }
  // Cannot insert a value in the hidden "k" column
  if (sqlite3_value_type(argv[2 + vec0_column_k_idx(p)]) != SQLITE_NULL) {
    // IMP: V11875_28713
    vtab_set_error(pVTab, "A value was provided for the hidden \"k\" column.");
    rc = SQLITE_ERROR;
    goto cleanup;
  }

  // Step #1: Insert/get a rowid for this row, from the _rowids table.
  rc = vec0Update_InsertRowidStep(p, argv[2 + VEC0_COLUMN_ID], &rowid);
  if (rc != SQLITE_OK) {
    goto cleanup;
  }

  // Step #2: Find the next "available" position in the _chunks table for this
  // row.
  rc = vec0Update_InsertNextAvailableStep(p, partitionKeyValues,
                                          &chunk_rowid, &chunk_offset,
                                          &blobChunksValidity,
                                          &bufferChunksValidity);
  if (rc != SQLITE_OK) {
    goto cleanup;
  }

  // Step #3: With the next available chunk position, write out all the vectors
  // to their specified location.
  rc = vec0Update_InsertWriteFinalStep(p, chunk_rowid, chunk_offset, rowid,
                                       vectorDatas, blobChunksValidity,
                                       bufferChunksValidity);
  if (rc != SQLITE_OK) {
    goto cleanup;
  }

  // Step #4: insert one row into the auxiliary shadow table, if any auxiliary
  // columns are defined.
  if(p->numAuxiliaryColumns > 0) {
    sqlite3_stmt *stmt = NULL;
    sqlite3_str * s = sqlite3_str_new(NULL);
    sqlite3_str_appendf(s, "INSERT INTO " VEC0_SHADOW_AUXILIARY_NAME "(rowid ", p->schemaName, p->tableName);
    for(int i = 0; i < p->numAuxiliaryColumns; i++) {
      sqlite3_str_appendf(s, ", value%02d", i);
    }
    sqlite3_str_appendall(s, ") VALUES (? ");
    for(int i = 0; i < p->numAuxiliaryColumns; i++) {
      sqlite3_str_appendall(s, ", ?");
    }
    sqlite3_str_appendall(s, ")");
    char * zSql = sqlite3_str_finish(s);
    if(!zSql) {
      rc = SQLITE_NOMEM;
      goto cleanup;
    }
    rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL);
    // The SQL text is not needed once the statement is compiled; release it
    // here so no later exit path can leak it.
    sqlite3_free(zSql);
    if(rc != SQLITE_OK) {
      goto cleanup;
    }
    sqlite3_bind_int64(stmt, 1, rowid);

    for (int i = 0; i < vec0_num_defined_user_columns(p); i++) {
      if(p->user_column_kinds[i] != SQLITE_VEC0_USER_COLUMN_KIND_AUXILIARY) {
        continue;
      }
      int auxiliary_key_idx = p->user_column_idxs[i];
      sqlite3_value * v = argv[2+VEC0_COLUMN_USERN_START + i];
      int v_type = sqlite3_value_type(v);
      if(v_type != SQLITE_NULL && (v_type != p->auxiliary_columns[auxiliary_key_idx].type)) {
        sqlite3_finalize(stmt);
        rc = SQLITE_CONSTRAINT;
        vtab_set_error(
          pVTab,
          "Auxiliary column type mismatch: The auxiliary column %.*s has type %s, but %s was provided.",
          p->auxiliary_columns[auxiliary_key_idx].name_length,
          p->auxiliary_columns[auxiliary_key_idx].name,
          type_name(p->auxiliary_columns[auxiliary_key_idx].type),
          type_name(v_type)
        );
        goto cleanup;
      }
      // first 1 is for 1-based indexing on sqlite3_bind_*, second 1 is to account for initial rowid parameter
      sqlite3_bind_value(stmt, 1 + 1 + auxiliary_key_idx, v);
    }

    rc = sqlite3_step(stmt);
    sqlite3_finalize(stmt);
    if(rc != SQLITE_DONE) {
      rc = SQLITE_ERROR;
      goto cleanup;
    }
  }

  // Step #5: write every metadata column value into its chunk slot.
  for(int i = 0; i < vec0_num_defined_user_columns(p); i++) {
    if(p->user_column_kinds[i] != SQLITE_VEC0_USER_COLUMN_KIND_METADATA) {
      continue;
    }
    int metadata_idx = p->user_column_idxs[i];
    sqlite3_value *v = argv[2 + VEC0_COLUMN_USERN_START + i];
    rc = vec0_write_metadata_value(p, metadata_idx, rowid, chunk_rowid, chunk_offset, v, 0);
    if(rc != SQLITE_OK) {
      goto cleanup;
    }
  }

  *pRowid = rowid;
  rc = SQLITE_OK;

cleanup:
  // NOTE(review): this releases slots 0..numReadVectors-1, which presumably
  // relies on vector column indexes being assigned in user-column order;
  // confirm against where user_column_idxs is populated.
  for (int i = 0; i < numReadVectors; i++) {
    cleanups[i](vectorDatas[i]);
  }
  sqlite3_free((void *)bufferChunksValidity);
  int brc = sqlite3_blob_close(blobChunksValidity);
  if ((rc == SQLITE_OK) && (brc != SQLITE_OK)) {
    vtab_set_error(&p->base,
                   VEC_INTERAL_ERROR "unknown error, blobChunksValidity could "
                                     "not be closed, please file an issue");
    return brc;
  }
  return rc;
}
/**
 * @brief Clears the validity bit of one chunk slot as part of a DELETE.
 *
 * Opens the "validity" blob of the chunk row, verifies that the bit for
 * chunk_offset is currently set, then writes the containing byte back with
 * that bit cleared.
 *
 * @param p vec0 virtual table
 * @param chunk_id rowid of the row in the chunks shadow table
 * @param chunk_offset slot index within the chunk
 * @return int SQLITE_OK on success, otherwise an error code (the vtab error
 *         message is set)
 */
int vec0Update_Delete_ClearValidity(vec0_vtab *p, i64 chunk_id,
                                    u64 chunk_offset) {
  int rc, brc;
  sqlite3_blob *blobChunksValidity = NULL;
  char unsigned bx;
  // Byte that holds the slot's bit, and the bit's position inside that byte.
  int validityOffset = chunk_offset / CHAR_BIT;
  int validityBit = (int)(chunk_offset % CHAR_BIT);

  // 2. ensure chunks.validity bit is 1, then set to 0
  rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowChunksName, "validity",
                         chunk_id, 1, &blobChunksValidity);
  if (rc != SQLITE_OK) {
    // IMP: V26002_10073
    vtab_set_error(&p->base, "could not open validity blob for %s.%s.%lld",
                   p->schemaName, p->shadowChunksName, chunk_id);
    return SQLITE_ERROR;
  }
  // will skip the sqlite3_blob_bytes(blobChunksValidity) check for now,
  // the read below would catch it
  rc = sqlite3_blob_read(blobChunksValidity, &bx, sizeof(bx), validityOffset);
  if (rc != SQLITE_OK) {
    // IMP: V21193_05263
    vtab_set_error(
        &p->base, "could not read validity blob for %s.%s.%lld at %d",
        p->schemaName, p->shadowChunksName, chunk_id, validityOffset);
    goto cleanup;
  }
  // Test exactly this slot's bit. A bare shift without the mask is also
  // non-zero whenever any higher bit of the byte is set, which would let a
  // slot that is already invalid pass the check.
  if (((bx >> validityBit) & 1) == 0) {
    // IMP: V21193_05263
    rc = SQLITE_ERROR;
    vtab_set_error(
        &p->base,
        "vec0 deletion error: validity bit is not set for %s.%s.%lld at %d",
        p->schemaName, p->shadowChunksName, chunk_id, validityOffset);
    goto cleanup;
  }
  char unsigned mask = ~(1 << validityBit);
  char unsigned result = bx & mask;
  rc = sqlite3_blob_write(blobChunksValidity, &result, sizeof(bx),
                          validityOffset);
  if (rc != SQLITE_OK) {
    vtab_set_error(
        &p->base, "could not write to validity blob for %s.%s.%lld at %d",
        p->schemaName, p->shadowChunksName, chunk_id, validityOffset);
    goto cleanup;
  }

cleanup:
  // The blob is closed on every path; an earlier failure takes precedence
  // over a close failure.
  brc = sqlite3_blob_close(blobChunksValidity);
  if (rc != SQLITE_OK)
    return rc;
  if (brc != SQLITE_OK) {
    vtab_set_error(&p->base,
                   "vec0 deletion error: Error commiting validity blob "
                   "transaction on %s.%s.%lld at %d",
                   p->schemaName, p->shadowChunksName, chunk_id,
                   validityOffset);
    return brc;
  }
  return SQLITE_OK;
}
/**
 * @brief Overwrites one slot of a chunk's "rowids" blob with zero.
 *
 * @param p vec0 virtual table
 * @param chunk_id rowid of the row in the chunks shadow table
 * @param chunk_offset slot index within the chunk
 * @return int SQLITE_OK on success, otherwise an error code (the vtab error
 *         message is set)
 */
int vec0Update_Delete_ClearRowid(vec0_vtab *p, i64 chunk_id,
                                 u64 chunk_offset) {
  sqlite3_blob *rowidsBlob = NULL;
  i64 emptyRowid = 0;
  int writeRc;
  int closeRc;

  if (sqlite3_blob_open(p->db, p->schemaName, p->shadowChunksName, "rowids",
                        chunk_id, 1, &rowidsBlob) != SQLITE_OK) {
    vtab_set_error(&p->base, "could not open rowids blob for %s.%s.%lld",
                   p->schemaName, p->shadowChunksName, chunk_id);
    return SQLITE_ERROR;
  }

  writeRc = sqlite3_blob_write(rowidsBlob, &emptyRowid, sizeof(emptyRowid),
                               chunk_offset * sizeof(i64));
  if (writeRc != SQLITE_OK) {
    vtab_set_error(&p->base,
                   "could not write to rowids blob for %s.%s.%lld at %llu",
                   p->schemaName, p->shadowChunksName, chunk_id, chunk_offset);
  }

  // Close regardless of the write outcome; a write failure is reported in
  // preference to a close failure.
  closeRc = sqlite3_blob_close(rowidsBlob);
  if (writeRc != SQLITE_OK) {
    return writeRc;
  }
  if (closeRc == SQLITE_OK) {
    return SQLITE_OK;
  }
  vtab_set_error(&p->base,
                 "vec0 deletion error: Error commiting rowids blob "
                 "transaction on %s.%s.%lld at %llu",
                 p->schemaName, p->shadowChunksName, chunk_id, chunk_offset);
  return closeRc;
}
/**
 * @brief Zeroes the deleted row's vector in every vector column chunk.
 *
 * For each vector column, opens the "vectors" blob of the chunk row and
 * overwrites the slot at chunk_offset with zero bytes. Stops at the first
 * failure.
 *
 * @param p vec0 virtual table
 * @param chunk_id rowid of the row in each vector chunks shadow table
 * @param chunk_offset slot index within the chunk
 * @return int SQLITE_OK on success, otherwise an error code
 */
int vec0Update_Delete_ClearVectors(vec0_vtab *p, i64 chunk_id,
                                   u64 chunk_offset) {
  int col = 0;

  while (col < p->numVectorColumns) {
    sqlite3_blob *vectorsBlob = NULL;
    size_t vectorBytes = vector_column_byte_size(p->vector_columns[col]);
    void *zeros;
    int writeRc;
    int closeRc;

    if (sqlite3_blob_open(p->db, p->schemaName,
                          p->shadowVectorChunksNames[col], "vectors",
                          chunk_id, 1, &vectorsBlob) != SQLITE_OK) {
      vtab_set_error(&p->base,
                     "could not open vector blob for %s.%s.%lld column %d",
                     p->schemaName, p->shadowVectorChunksNames[col], chunk_id, col);
      return SQLITE_ERROR;
    }

    zeros = sqlite3_malloc(vectorBytes);
    if (zeros == NULL) {
      sqlite3_blob_close(vectorsBlob);
      return SQLITE_NOMEM;
    }
    memset(zeros, 0, vectorBytes);

    writeRc = sqlite3_blob_write(vectorsBlob, zeros, vectorBytes,
                                 chunk_offset * vectorBytes);
    sqlite3_free(zeros);
    if (writeRc != SQLITE_OK) {
      vtab_set_error(
          &p->base,
          "could not write to vector blob for %s.%s.%lld at %llu column %d",
          p->schemaName, p->shadowVectorChunksNames[col], chunk_id,
          chunk_offset, col);
    }

    // The blob is closed even when the write failed; the write error wins.
    closeRc = sqlite3_blob_close(vectorsBlob);
    if (writeRc != SQLITE_OK) {
      return writeRc;
    }
    if (closeRc != SQLITE_OK) {
      vtab_set_error(&p->base,
                     "vec0 deletion error: Error commiting vector blob "
                     "transaction on %s.%s.%lld column %d",
                     p->schemaName, p->shadowVectorChunksNames[col], chunk_id, col);
      return closeRc;
    }

    col++;
  }

  return SQLITE_OK;
}
/*
 * Prepares and runs a "DELETE ... WHERE rowid = ?" statement with chunk_id
 * bound to the single parameter. Takes ownership of zSql and frees it.
 * Returns SQLITE_NOMEM when zSql is NULL, the prepare error when preparing
 * fails, SQLITE_ERROR when the step does not finish, SQLITE_OK otherwise.
 */
static int vec0Update_Delete_RunChunkDelete(vec0_vtab *p, char *zSql,
                                            i64 chunk_id) {
  sqlite3_stmt *stmt;
  int rc;

  if (zSql == NULL) {
    return SQLITE_NOMEM;
  }
  rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL);
  sqlite3_free(zSql);
  if (rc != SQLITE_OK) {
    return rc;
  }
  sqlite3_bind_int64(stmt, 1, chunk_id);
  rc = sqlite3_step(stmt);
  sqlite3_finalize(stmt);
  return (rc == SQLITE_DONE) ? SQLITE_OK : SQLITE_ERROR;
}

/**
 * @brief Deletes a chunk and its per-column chunk rows when the chunk no
 * longer holds any valid slot.
 *
 * Reads the chunk's whole "validity" blob. If any byte is non-zero the chunk
 * is left alone. Otherwise the chunk row is deleted from the chunks shadow
 * table, from every vector chunks shadow table and from every metadata
 * chunks shadow table, and the cached stmtLatestChunk statement is
 * finalized.
 *
 * @param p vec0 virtual table
 * @param chunk_id rowid of the chunk to inspect
 * @param deleted set to 1 when the chunk was deleted, 0 otherwise
 * @return int SQLITE_OK on success, otherwise an error code
 */
int vec0Update_Delete_DeleteChunkIfEmpty(vec0_vtab *p, i64 chunk_id,
                                         int *deleted) {
  sqlite3_blob *validityBlob = NULL;
  unsigned char *bits;
  int nBytes;
  int readRc;
  int closeRc;
  int idx;
  int status;

  *deleted = 0;

  if (sqlite3_blob_open(p->db, p->schemaName, p->shadowChunksName, "validity",
                        chunk_id, 0, &validityBlob) != SQLITE_OK) {
    vtab_set_error(&p->base,
                   "could not open validity blob for chunk %lld", chunk_id);
    return SQLITE_ERROR;
  }

  nBytes = sqlite3_blob_bytes(validityBlob);
  bits = sqlite3_malloc(nBytes);
  if (bits == NULL) {
    sqlite3_blob_close(validityBlob);
    return SQLITE_NOMEM;
  }

  readRc = sqlite3_blob_read(validityBlob, bits, nBytes, 0);
  closeRc = sqlite3_blob_close(validityBlob);
  if (readRc != SQLITE_OK || closeRc != SQLITE_OK) {
    sqlite3_free(bits);
    // A read failure is reported in preference to a close failure.
    return (readRc != SQLITE_OK) ? readRc : closeRc;
  }

  // Advance past zero bytes; stopping early means some slot is still valid.
  idx = 0;
  while (idx < nBytes && bits[idx] == 0) {
    idx++;
  }
  sqlite3_free(bits);
  if (idx < nBytes) {
    return SQLITE_OK;
  }

  // All validity bits are zero: delete this chunk and its associated data.

  // Delete from _chunks
  status = vec0Update_Delete_RunChunkDelete(
      p,
      sqlite3_mprintf("DELETE FROM " VEC0_SHADOW_CHUNKS_NAME " WHERE rowid = ?",
                      p->schemaName, p->tableName),
      chunk_id);
  if (status != SQLITE_OK) {
    return status;
  }

  // Delete from each _vector_chunksNN
  for (int col = 0; col < p->numVectorColumns; col++) {
    status = vec0Update_Delete_RunChunkDelete(
        p,
        sqlite3_mprintf(
            "DELETE FROM " VEC0_SHADOW_VECTOR_N_NAME " WHERE rowid = ?",
            p->schemaName, p->tableName, col),
        chunk_id);
    if (status != SQLITE_OK) {
      return status;
    }
  }

  // Delete from each _metadatachunksNN
  for (int col = 0; col < p->numMetadataColumns; col++) {
    status = vec0Update_Delete_RunChunkDelete(
        p,
        sqlite3_mprintf(
            "DELETE FROM " VEC0_SHADOW_METADATA_N_NAME " WHERE rowid = ?",
            p->schemaName, p->tableName, col),
        chunk_id);
    if (status != SQLITE_OK) {
      return status;
    }
  }

  // Invalidate cached stmtLatestChunk so it gets re-prepared on next insert
  if (p->stmtLatestChunk != NULL) {
    sqlite3_finalize(p->stmtLatestChunk);
    p->stmtLatestChunk = NULL;
  }

  *deleted = 1;
  return SQLITE_OK;
}
/**
 * @brief Deletes the row with the given rowid from the rowids shadow table.
 *
 * @param p vec0 virtual table
 * @param rowid rowid of the row to delete
 * @return int SQLITE_OK on success, SQLITE_NOMEM if the SQL text could not be
 *         allocated, otherwise the failing prepare/step result code
 */
int vec0Update_Delete_DeleteRowids(vec0_vtab *p, i64 rowid) {
  sqlite3_stmt *deleteStmt = NULL;
  int result;
  char *sql =
      sqlite3_mprintf("DELETE FROM " VEC0_SHADOW_ROWIDS_NAME " WHERE rowid = ?",
                      p->schemaName, p->tableName);

  if (sql == NULL) {
    return SQLITE_NOMEM;
  }

  result = sqlite3_prepare_v2(p->db, sql, -1, &deleteStmt, NULL);
  sqlite3_free(sql);

  if (result == SQLITE_OK) {
    sqlite3_bind_int64(deleteStmt, 1, rowid);
    result = sqlite3_step(deleteStmt);
    // A finished step is success; any other step result is passed through.
    if (result == SQLITE_DONE) {
      result = SQLITE_OK;
    }
  }

  sqlite3_finalize(deleteStmt);
  return result;
}
/**
 * @brief Deletes the row with the given rowid from the auxiliary shadow
 * table.
 *
 * @param p vec0 virtual table
 * @param rowid rowid of the row to delete
 * @return int SQLITE_OK on success, SQLITE_NOMEM if the SQL text could not be
 *         allocated, otherwise the failing prepare/step result code
 */
int vec0Update_Delete_DeleteAux(vec0_vtab *p, i64 rowid) {
  sqlite3_stmt *deleteStmt = NULL;
  int result;
  char *sql =
      sqlite3_mprintf("DELETE FROM " VEC0_SHADOW_AUXILIARY_NAME " WHERE rowid = ?",
                      p->schemaName, p->tableName);

  if (sql == NULL) {
    return SQLITE_NOMEM;
  }

  result = sqlite3_prepare_v2(p->db, sql, -1, &deleteStmt, NULL);
  sqlite3_free(sql);

  if (result == SQLITE_OK) {
    sqlite3_bind_int64(deleteStmt, 1, rowid);
    result = sqlite3_step(deleteStmt);
    // A finished step is success; any other step result is passed through.
    if (result == SQLITE_DONE) {
      result = SQLITE_OK;
    }
  }

  sqlite3_finalize(deleteStmt);
  return result;
}
/**
 * @brief Resets one metadata column's slot for a deleted row.
 *
 * BOOLEAN slots have their bit cleared, INTEGER and FLOAT slots are
 * overwritten with zero, and TEXT slots have their view buffer zeroed. When
 * the stored TEXT length exceeded VEC0_METADATA_TEXT_VIEW_DATA_LENGTH, the
 * row holding the full text is also deleted from the column's metadata text
 * shadow table.
 *
 * @param p vec0 virtual table
 * @param metadata_idx index into p->metadata_columns
 * @param rowid rowid of the deleted row (key of the text shadow table)
 * @param chunk_id rowid of the metadata chunk row holding the slot
 * @param chunk_offset slot index within the chunk
 * @return int SQLITE_OK on success, otherwise an error code
 */
int vec0Update_Delete_ClearMetadata(vec0_vtab *p, int metadata_idx, i64 rowid, i64 chunk_id,
                                    u64 chunk_offset) {
  int rc;
  int rc2;
  // Released together at `done`; the release functions all accept NULL.
  sqlite3_blob * blobValue = NULL;
  sqlite3_stmt * stmt = NULL;
  char * zSql = NULL;
  vec0_metadata_column_kind kind = p->metadata_columns[metadata_idx].kind;

  rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowMetadataChunksNames[metadata_idx], "data", chunk_id, 1, &blobValue);
  if(rc != SQLITE_OK) {
    return rc;
  }

  switch(kind) {
    case VEC0_METADATA_COLUMN_KIND_BOOLEAN: {
      u8 block;
      rc = sqlite3_blob_read(blobValue, &block, sizeof(u8), (int) (chunk_offset / CHAR_BIT));
      if(rc != SQLITE_OK) {
        goto done;
      }
      block &= ~(1 << (chunk_offset % CHAR_BIT));
      rc = sqlite3_blob_write(blobValue, &block, sizeof(u8), chunk_offset / CHAR_BIT);
      break;
    }
    case VEC0_METADATA_COLUMN_KIND_INTEGER: {
      i64 v = 0;
      rc = sqlite3_blob_write(blobValue, &v, sizeof(v), chunk_offset * sizeof(i64));
      break;
    }
    case VEC0_METADATA_COLUMN_KIND_FLOAT: {
      double v = 0;
      rc = sqlite3_blob_write(blobValue, &v, sizeof(v), chunk_offset * sizeof(double));
      break;
    }
    case VEC0_METADATA_COLUMN_KIND_TEXT: {
      // Stored length: decides whether a text shadow row exists for rowid.
      int n;
      rc = sqlite3_blob_read(blobValue, &n, sizeof(int), chunk_offset * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH);
      if(rc != SQLITE_OK) {
        goto done;
      }

      u8 view[VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH];
      memset(view, 0, VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH);
      rc = sqlite3_blob_write(blobValue, &view, sizeof(view), chunk_offset * VEC0_METADATA_TEXT_VIEW_BUFFER_LENGTH);
      if(rc != SQLITE_OK) {
        goto done;
      }

      if(n > VEC0_METADATA_TEXT_VIEW_DATA_LENGTH) {
        zSql = sqlite3_mprintf("DELETE FROM " VEC0_SHADOW_METADATA_TEXT_DATA_NAME " WHERE rowid = ?", p->schemaName, p->tableName, metadata_idx);
        if(!zSql) {
          rc = SQLITE_NOMEM;
          goto done;
        }
        rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL);
        if(rc != SQLITE_OK) {
          goto done;
        }
        sqlite3_bind_int64(stmt, 1, rowid);
        rc = sqlite3_step(stmt);
        if(rc != SQLITE_DONE) {
          rc = SQLITE_ERROR;
          goto done;
        }
        // SQLITE_DONE means the DELETE succeeded. It must be mapped to
        // SQLITE_OK, otherwise this function returns SQLITE_DONE and the
        // caller treats the successful delete as a failure.
        rc = SQLITE_OK;
      }
      break;
    }
  }

done:
  sqlite3_finalize(stmt);
  sqlite3_free(zSql);
  rc2 = sqlite3_blob_close(blobValue);
  if(rc == SQLITE_OK) {
    return rc2;
  }
  return rc;
}
/**
 * @brief Handles DELETE operations on a vec0 table.
 *
 * Steps, each attempted only if the previous one succeeded:
 *   1. Find the chunk position for the row in _rowids
 *   2. Ensure the validity bit for the position is 1, then set it to 0
 *   3. Zero out the rowid in chunks.rowids
 *   4. Zero out vector data in all vector column chunks
 *   5. Delete the row from the _rowids table
 *   6. Delete the auxiliary row, if auxiliary columns exist
 *   7. Clear every metadata column slot
 *   8. Delete the chunk if it is now fully empty
 *
 * @param pVTab the vec0 virtual table
 * @param idValue id of the row to delete; resolved to a rowid through
 *        vec0_rowid_from_id() when the primary key is text
 * @return int SQLITE_OK on success, otherwise the first failing step's code
 */
int vec0Update_Delete(sqlite3_vtab *pVTab, sqlite3_value *idValue) {
  vec0_vtab *p = (vec0_vtab *)pVTab;
  int rc;
  int chunkDeleted;
  i64 rowid;
  i64 chunk_id;
  i64 chunk_offset;

  if (!p->pkIsText) {
    rowid = sqlite3_value_int64(idValue);
  } else {
    rc = vec0_rowid_from_id(p, idValue, &rowid);
    if (rc != SQLITE_OK) {
      return rc;
    }
  }

  // 1. get chunk_id and chunk_offset from _rowids
  rc = vec0_get_chunk_position(p, rowid, NULL, &chunk_id, &chunk_offset);

  // 2. clear validity bit
  if (rc == SQLITE_OK) {
    rc = vec0Update_Delete_ClearValidity(p, chunk_id, chunk_offset);
  }

  // 3. zero out rowid in chunks.rowids
  if (rc == SQLITE_OK) {
    rc = vec0Update_Delete_ClearRowid(p, chunk_id, chunk_offset);
  }

  // 4. zero out any data in vector chunks tables
  if (rc == SQLITE_OK) {
    rc = vec0Update_Delete_ClearVectors(p, chunk_id, chunk_offset);
  }

  // 5. delete from _rowids table
  if (rc == SQLITE_OK) {
    rc = vec0Update_Delete_DeleteRowids(p, rowid);
  }

  // 6. delete any auxiliary rows
  if (rc == SQLITE_OK && p->numAuxiliaryColumns > 0) {
    rc = vec0Update_Delete_DeleteAux(p, rowid);
  }

  // 7. delete metadata
  for (int col = 0; rc == SQLITE_OK && col < p->numMetadataColumns; col++) {
    rc = vec0Update_Delete_ClearMetadata(p, col, rowid, chunk_id, chunk_offset);
  }

  // 8. reclaim chunk if fully empty
  if (rc == SQLITE_OK) {
    rc = vec0Update_Delete_DeleteChunkIfEmpty(p, chunk_id, &chunkDeleted);
  }

  return rc;
}
/**
 * @brief Updates one auxiliary column value of a row in the auxiliary shadow
 * table.
 *
 * @param p vec0 virtual table
 * @param auxiliary_column_idx index of the auxiliary column; selects the
 *        valueNN column to update
 * @param value new value, bound as-is
 * @param rowid rowid of the row to update
 * @return int SQLITE_OK on success, SQLITE_NOMEM if the SQL text could not be
 *         allocated, the prepare error code if preparing failed, or
 *         SQLITE_ERROR if the statement did not run to completion
 */
int vec0Update_UpdateAuxColumn(vec0_vtab *p, int auxiliary_column_idx, sqlite3_value * value, i64 rowid) {
  int rc;
  sqlite3_stmt *stmt = NULL;
  char * zSql = sqlite3_mprintf("UPDATE " VEC0_SHADOW_AUXILIARY_NAME " SET value%02d = ? WHERE rowid = ?", p->schemaName, p->tableName, auxiliary_column_idx);
  if(!zSql) {
    return SQLITE_NOMEM;
  }
  rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmt, NULL);
  // The SQL text is no longer needed once the statement is compiled; free it
  // here so that neither the success nor the failure paths leak it.
  sqlite3_free(zSql);
  if(rc != SQLITE_OK) {
    return rc;
  }
  sqlite3_bind_value(stmt, 1, value);
  sqlite3_bind_int64(stmt, 2, rowid);
  rc = sqlite3_step(stmt);
  sqlite3_finalize(stmt);
  if(rc != SQLITE_DONE) {
    return SQLITE_ERROR;
  }
  return SQLITE_OK;
}
/**
 * @brief Replaces the stored vector of one vector column for an existing row.
 *
 * Parses valueVector, checks that its element type and dimensions match the
 * column definition, then writes it into the "vectors" blob of the column's
 * chunk row at slot chunk_offset.
 *
 * @param p vec0 virtual table
 * @param chunk_id rowid of the row in the vector chunks shadow table
 * @param chunk_offset slot index within the chunk
 * @param i index into p->vector_columns
 * @param valueVector the new vector value
 * @return int SQLITE_OK on success, otherwise an error code (the vtab error
 *         message is set)
 */
int vec0Update_UpdateVectorColumn(vec0_vtab *p, i64 chunk_id, i64 chunk_offset,
                                  int i, sqlite3_value *valueVector) {
  int rc;

  sqlite3_blob *blobVectors = NULL;

  char *pzError;
  size_t dimensions;
  enum VectorElementType elementType;
  // Start as NULL so the cleanup callback below never receives an
  // indeterminate pointer when vector_from_value() fails before setting it.
  void *vector = NULL;
  vector_cleanup cleanup = vector_cleanup_noop;
  // https://github.com/asg017/sqlite-vec/issues/53
  rc = vector_from_value(valueVector, &vector, &dimensions, &elementType,
                         &cleanup, &pzError);
  if (rc != SQLITE_OK) {
    // IMP: V15203_32042
    vtab_set_error(
        &p->base, "Updated vector for the \"%.*s\" column is invalid: %z",
        p->vector_columns[i].name_length, p->vector_columns[i].name, pzError);
    rc = SQLITE_ERROR;
    goto cleanup;
  }
  if (elementType != p->vector_columns[i].element_type) {
    // IMP: V03643_20481
    vtab_set_error(
        &p->base,
        "Updated vector for the \"%.*s\" column is expected to be of type "
        "%s, but a %s vector was provided.",
        p->vector_columns[i].name_length, p->vector_columns[i].name,
        vector_subtype_name(p->vector_columns[i].element_type),
        vector_subtype_name(elementType));
    rc = SQLITE_ERROR;
    goto cleanup;
  }
  if (dimensions != p->vector_columns[i].dimensions) {
    // IMP: V25739_09810
    // %d consumes an int, so narrow explicitly instead of passing a wider
    // integer through varargs.
    vtab_set_error(
        &p->base,
        "Dimension mismatch for new updated vector for the \"%.*s\" column. "
        "Expected %d dimensions but received %d.",
        p->vector_columns[i].name_length, p->vector_columns[i].name,
        (int)p->vector_columns[i].dimensions, (int)dimensions);
    rc = SQLITE_ERROR;
    goto cleanup;
  }

  rc = sqlite3_blob_open(p->db, p->schemaName, p->shadowVectorChunksNames[i],
                         "vectors", chunk_id, 1, &blobVectors);
  if (rc != SQLITE_OK) {
    vtab_set_error(&p->base, "Could not open vectors blob for %s.%s.%lld",
                   p->schemaName, p->shadowVectorChunksNames[i], chunk_id);
    goto cleanup;
  }
  rc = vec0_write_vector_to_vector_blob(blobVectors, chunk_offset, vector,
                                        p->vector_columns[i].dimensions,
                                        p->vector_columns[i].element_type);
  if (rc != SQLITE_OK) {
    vtab_set_error(&p->base, "Could not write to vectors blob for %s.%s.%lld",
                   p->schemaName, p->shadowVectorChunksNames[i], chunk_id);
    goto cleanup;
  }

cleanup:
  // `cleanup` names both this label and the callback variable; labels live in
  // their own namespace in C, so the call below invokes the callback.
  cleanup(vector);
  int brc = sqlite3_blob_close(blobVectors);
  if (rc != SQLITE_OK) {
    return rc;
  }
  if (brc != SQLITE_OK) {
    vtab_set_error(
        &p->base,
        "Could not commit blob transaction for vectors blob for %s.%s.%lld",
        p->schemaName, p->shadowVectorChunksNames[i], chunk_id);
    return brc;
  }
  return SQLITE_OK;
}
/**
 * @brief Handles UPDATE operations on a vec0 table.
 *
 * Rejects changes to a text primary key and to partition key columns, then
 * applies changed auxiliary columns, changed metadata columns and non-NULL
 * vector columns, in that order.
 *
 * @param pVTab the vec0 virtual table
 * @param argc number of entries in argv (unused)
 * @param argv xUpdate arguments: argv[0] is the existing id, argv[1] the new
 *        id, and user column i is at argv[2 + VEC0_COLUMN_USERN_START + i]
 * @return int SQLITE_OK on success, otherwise an error code
 */
int vec0Update_Update(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv) {
  UNUSED_PARAMETER(argc);
  vec0_vtab *p = (vec0_vtab *)pVTab;
  // First user-column argument within argv.
  sqlite3_value **userArgs = &argv[2 + VEC0_COLUMN_USERN_START];
  int rc;
  i64 chunk_id;
  i64 chunk_offset;
  i64 rowid;

  if (!p->pkIsText) {
    rowid = sqlite3_value_int64(argv[0]);
  } else {
    const char *oldId = (const char *)sqlite3_value_text(argv[0]);
    const char *newId = (const char *)sqlite3_value_text(argv[1]);
    int oldLen = sqlite3_value_bytes(argv[0]);
    int unchanged = (oldLen == sqlite3_value_bytes(argv[1])) &&
                    (strncmp(oldId, newId, oldLen) == 0);
    // IMP: V08886_25725
    if (!unchanged) {
      vtab_set_error(pVTab,
                     "UPDATEs on vec0 primary key values are not allowed.");
      return SQLITE_ERROR;
    }
    rc = vec0_rowid_from_id(p, argv[0], &rowid);
    if (rc != SQLITE_OK) {
      return rc;
    }
  }

  // 1) get chunk_id and chunk_offset from _rowids
  rc = vec0_get_chunk_position(p, rowid, NULL, &chunk_id, &chunk_offset);
  if (rc != SQLITE_OK) {
    return rc;
  }

  // 2) update any partition key values
  for (int col = 0; col < vec0_num_defined_user_columns(p); col++) {
    if (p->user_column_kinds[col] == SQLITE_VEC0_USER_COLUMN_KIND_PARTITION &&
        !sqlite3_value_nochange(userArgs[col])) {
      vtab_set_error(pVTab, "UPDATE on partition key columns are not supported yet. ");
      return SQLITE_ERROR;
    }
  }

  // 3) handle auxiliary column updates
  for (int col = 0; col < vec0_num_defined_user_columns(p); col++) {
    if (p->user_column_kinds[col] == SQLITE_VEC0_USER_COLUMN_KIND_AUXILIARY) {
      int auxIdx = p->user_column_idxs[col];
      sqlite3_value *newValue = userArgs[col];
      if (!sqlite3_value_nochange(newValue)) {
        rc = vec0Update_UpdateAuxColumn(p, auxIdx, newValue, rowid);
        if (rc != SQLITE_OK) {
          return SQLITE_ERROR;
        }
      }
    }
  }

  // 4) handle metadata column updates
  for (int col = 0; col < vec0_num_defined_user_columns(p); col++) {
    if (p->user_column_kinds[col] == SQLITE_VEC0_USER_COLUMN_KIND_METADATA) {
      int metaIdx = p->user_column_idxs[col];
      sqlite3_value *newValue = userArgs[col];
      if (!sqlite3_value_nochange(newValue)) {
        rc = vec0_write_metadata_value(p, metaIdx, rowid, chunk_id, chunk_offset, newValue, 1);
        if (rc != SQLITE_OK) {
          return rc;
        }
      }
    }
  }

  // 5) iterate over all new vectors, update the vectors
  for (int col = 0; col < vec0_num_defined_user_columns(p); col++) {
    if (p->user_column_kinds[col] == SQLITE_VEC0_USER_COLUMN_KIND_VECTOR) {
      int vecIdx = p->user_column_idxs[col];
      sqlite3_value *newVector = userArgs[col];
      // A NULL here stands for "vector not changed": vec0Column returns NULL
      // for vector columns that sqlite3_vtab_nochange() reports as untouched.
      // The consequence is that a real `SET X = NULL` cannot be told apart
      // from "no change", so vector columns can never be nullable and such an
      // UPDATE is silently ignored rather than rejected. A dedicated subtype
      // would be cleaner, but subtypes do not appear to survive
      // xColumn -> xUpdate (always 0), so NULL it is, with a docs warning.
      if (sqlite3_value_type(newVector) != SQLITE_NULL) {
        rc = vec0Update_UpdateVectorColumn(p, chunk_id, chunk_offset, vecIdx,
                                           newVector);
        if (rc != SQLITE_OK) {
          return SQLITE_ERROR;
        }
      }
    }
  }

  return SQLITE_OK;
}
/**
 * @brief xUpdate entry point of the vec0 module; dispatches to the DELETE,
 * INSERT or UPDATE handler.
 *
 *   argc == 1, argv[0] not NULL  -> DELETE
 *   argc  > 1, argv[0] NULL      -> INSERT
 *   argc  > 1, argv[0] not NULL  -> UPDATE
 *
 * Any other combination sets an error message and returns SQLITE_ERROR.
 */
static int vec0Update(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv,
                      sqlite_int64 *pRowid) {
  if (argc >= 1) {
    int oldIdIsNull = (sqlite3_value_type(argv[0]) == SQLITE_NULL);

    if (argc == 1) {
      // DELETE operation
      if (!oldIdIsNull) {
        return vec0Update_Delete(pVTab, argv[0]);
      }
    } else if (oldIdIsNull) {
      // INSERT operation
      return vec0Update_Insert(pVTab, argc, argv, pRowid);
    } else {
      // UPDATE operation
      return vec0Update_Update(pVTab, argc, argv);
    }
  }

  vtab_set_error(pVTab, "Unrecognized xUpdate operation provided for vec0.");
  return SQLITE_ERROR;
}
/**
 * @brief xShadowName callback of the vec0 module.
 *
 * @param zName candidate shadow table name suffix
 * @return int 1 if zName matches, ignoring case, one of the names listed
 *         below, otherwise 0
 */
static int vec0ShadowName(const char *zName) {
  // NULL-terminated so the scan below needs no element count.
  static const char *const kShadowSuffixes[] = {
      "rowids", "chunks", "auxiliary", "info",
      // One per metadata column, up to VEC0_MAX_METADATA_COLUMNS
      // TODO be smarter about this man
      "metadatachunks00", "metadatachunks01", "metadatachunks02",
      "metadatachunks03", "metadatachunks04", "metadatachunks05",
      "metadatachunks06", "metadatachunks07", "metadatachunks08",
      "metadatachunks09", "metadatachunks10", "metadatachunks11",
      "metadatachunks12", "metadatachunks13", "metadatachunks14",
      "metadatachunks15",
      // Overflow text tables, one per metadata column
      "metadatatext00", "metadatatext01", "metadatatext02",
      "metadatatext03", "metadatatext04", "metadatatext05",
      "metadatatext06", "metadatatext07", "metadatatext08",
      "metadatatext09", "metadatatext10", "metadatatext11",
      "metadatatext12", "metadatatext13", "metadatatext14",
      "metadatatext15",
      NULL,
  };
  const char *const *suffix = kShadowSuffixes;

  while (*suffix != NULL) {
    if (sqlite3_stricmp(zName, *suffix) == 0) {
      return 1;
    }
    suffix++;
  }
  // NOTE(review): the per-vector-column chunk tables are not listed here.
  return 0;
}
// xBegin callback of the vec0 module: does no work and always reports
// success.
static int vec0Begin(sqlite3_vtab *pVTab) {
UNUSED_PARAMETER(pVTab);
return SQLITE_OK;
}
/**
 * @brief xSync callback of the vec0 module: finalizes every cached prepared
 * statement held by the table and resets the cache slots to NULL.
 *
 * @param pVTab the vec0 virtual table
 * @return int always SQLITE_OK
 */
static int vec0Sync(sqlite3_vtab *pVTab) {
  vec0_vtab *p = (vec0_vtab *)pVTab;
  // Addresses of the cache slots, so they can be handled uniformly.
  sqlite3_stmt **cachedStmts[] = {
      &p->stmtLatestChunk,
      &p->stmtRowidsInsertRowid,
      &p->stmtRowidsInsertId,
      &p->stmtRowidsUpdatePosition,
      &p->stmtRowidsGetChunkPosition,
  };
  size_t slot;

  for (slot = 0; slot < sizeof(cachedStmts) / sizeof(cachedStmts[0]); slot++) {
    sqlite3_stmt **cached = cachedStmts[slot];
    if (*cached != NULL) {
      sqlite3_finalize(*cached);
      *cached = NULL;
    }
  }
  return SQLITE_OK;
}
// xCommit callback for vec0. Nothing is done at commit time in this callback;
// the vtab argument is ignored and SQLITE_OK is always returned.
static int vec0Commit(sqlite3_vtab *pVTab) {
UNUSED_PARAMETER(pVTab);
return SQLITE_OK;
}
// xRollback callback for vec0. Nothing is done at rollback time in this
// callback; the vtab argument is ignored and SQLITE_OK is always returned.
static int vec0Rollback(sqlite3_vtab *pVTab) {
UNUSED_PARAMETER(pVTab);
return SQLITE_OK;
}
// Method table for the vec0 virtual table module. Slots set to 0 are not
// implemented: xFindFunction, xRename, xSavepoint, xRelease, xRollbackTo and
// (when the field exists) xIntegrity. The xIntegrity slot is only emitted when
// compiling against SQLite 3.44.0 or newer, per the version guard below.
static sqlite3_module vec0Module = {
/* iVersion */ 3,
/* xCreate */ vec0Create,
/* xConnect */ vec0Connect,
/* xBestIndex */ vec0BestIndex,
/* xDisconnect */ vec0Disconnect,
/* xDestroy */ vec0Destroy,
/* xOpen */ vec0Open,
/* xClose */ vec0Close,
/* xFilter */ vec0Filter,
/* xNext */ vec0Next,
/* xEof */ vec0Eof,
/* xColumn */ vec0Column,
/* xRowid */ vec0Rowid,
/* xUpdate */ vec0Update,
/* xBegin */ vec0Begin,
/* xSync */ vec0Sync,
/* xCommit */ vec0Commit,
/* xRollback */ vec0Rollback,
/* xFindFunction */ 0,
/* xRename */ 0, // https://github.com/asg017/sqlite-vec/issues/43
/* xSavepoint */ 0,
/* xRelease */ 0,
/* xRollbackTo */ 0,
/* xShadowName */ vec0ShadowName,
#if SQLITE_VERSION_NUMBER >= 3044000
/* xIntegrity */ 0, // https://github.com/asg017/sqlite-vec/issues/44
#endif
};
#pragma endregion
// Type tag used with sqlite3_result_pointer()/sqlite3_value_pointer() to pass
// a `struct static_blob_definition` from vec_static_blob_from_raw() into the
// vec_static_blobs table.
static char *POINTER_NAME_STATIC_BLOB_DEF = "vec0-static_blob_def";
// Description of a caller-owned block of vectors, as produced by
// vec_static_blob_from_raw().
struct static_blob_definition {
// Address of the first vector; taken verbatim from an integer SQL argument.
void *p;
// Number of elements in each vector.
size_t dimensions;
// Number of vectors stored at `p`.
size_t nvectors;
// Element type of the vectors (vec_static_blob_from_raw() always sets
// SQLITE_VEC_ELEMENT_TYPE_FLOAT32).
enum VectorElementType element_type;
};
/**
 * SQL function vec_static_blob_from_raw(address, type, dimensions, count).
 *
 * Wraps a raw memory address in a `struct static_blob_definition` and returns
 * it as a SQLite pointer value tagged with POINTER_NAME_STATIC_BLOB_DEF. The
 * definition is owned by SQLite and released with sqlite3_free().
 *
 * argv[0] is the address as an integer, argv[2] the number of dimensions per
 * vector and argv[3] the number of vectors. argv[1] is currently not
 * inspected: the element type is always float32.
 *
 * Raises a SQL error when the address is 0, when `dimensions` is not
 * positive, or when `count` is negative. Previously such values were stored
 * as-is; negative numbers wrapped around to enormous size_t values and led to
 * out-of-bounds reads once the blob was queried.
 */
static void vec_static_blob_from_raw(sqlite3_context *context, int argc,
                                     sqlite3_value **argv) {
  assert(argc == 4);
  UNUSED_PARAMETER(argc);

  sqlite3_int64 address = sqlite3_value_int64(argv[0]);
  sqlite3_int64 dimensions = sqlite3_value_int64(argv[2]);
  sqlite3_int64 nvectors = sqlite3_value_int64(argv[3]);

  if (address == 0) {
    sqlite3_result_error(
        context, "vec_static_blob_from_raw: address must not be NULL", -1);
    return;
  }
  if (dimensions <= 0) {
    sqlite3_result_error(
        context, "vec_static_blob_from_raw: dimensions must be positive", -1);
    return;
  }
  if (nvectors < 0) {
    sqlite3_result_error(
        context, "vec_static_blob_from_raw: count must not be negative", -1);
    return;
  }

  struct static_blob_definition *p = sqlite3_malloc(sizeof(*p));
  if (!p) {
    sqlite3_result_error_nomem(context);
    return;
  }
  memset(p, 0, sizeof(*p));
  // Go through size_t so the integer-to-pointer cast has matching width.
  p->p = (void *)(size_t)address;
  p->element_type = SQLITE_VEC_ELEMENT_TYPE_FLOAT32;
  p->dimensions = (size_t)dimensions;
  p->nvectors = (size_t)nvectors;
  sqlite3_result_pointer(context, p, POINTER_NAME_STATIC_BLOB_DEF,
                         sqlite3_free);
}
#pragma region vec_static_blobs() table function
// Maximum number of static blobs that can be registered per registry.
#define MAX_STATIC_BLOBS 16
typedef struct static_blob static_blob;
// One registered static blob. A slot is considered free while `name` is NULL
// (see vec_static_blobsUpdate and vec_static_blobsNext).
struct static_blob {
// Registration name, allocated with sqlite3_mprintf().
char *name;
// Address of the first vector, copied from the static_blob_definition.
void *p;
// Number of elements in each vector.
size_t dimensions;
// Number of vectors stored at `p`.
size_t nvectors;
// Element type of the stored vectors.
enum VectorElementType element_type;
};
typedef struct vec_static_blob_data vec_static_blob_data;
// Registry of static blobs. One instance is allocated in
// sqlite3_vec_static_blobs_init() and handed to both static-blob modules as
// their client data.
struct vec_static_blob_data {
static_blob static_blobs[MAX_STATIC_BLOBS];
};
typedef struct vec_static_blobs_vtab vec_static_blobs_vtab;
// Virtual table object for the vec_static_blobs module.
struct vec_static_blobs_vtab {
sqlite3_vtab base;
// Shared registry (the module's client data); not owned by the vtab.
vec_static_blob_data *data;
};
typedef struct vec_static_blobs_cursor vec_static_blobs_cursor;
// Cursor over the registry slots of a vec_static_blobs table.
struct vec_static_blobs_cursor {
sqlite3_vtab_cursor base;
// Index of the current slot in static_blobs[]; >= MAX_STATIC_BLOBS means EOF.
sqlite3_int64 iRowid;
};
/**
 * xConnect callback for the vec_static_blobs module.
 *
 * Declares the table schema, then allocates a zeroed vtab object whose `data`
 * field points at the registry passed in as `pAux`.
 *
 * @return The sqlite3_declare_vtab() error if declaration fails, SQLITE_NOMEM
 *         if the vtab cannot be allocated, SQLITE_OK otherwise.
 */
static int vec_static_blobsConnect(sqlite3 *db, void *pAux, int argc,
                                   const char *const *argv,
                                   sqlite3_vtab **ppVtab, char **pzErr) {
  UNUSED_PARAMETER(argc);
  UNUSED_PARAMETER(argv);
  UNUSED_PARAMETER(pzErr);
// Column indexes; these must match the order in the declared schema below.
#define VEC_STATIC_BLOBS_NAME 0
#define VEC_STATIC_BLOBS_DATA 1
#define VEC_STATIC_BLOBS_DIMENSIONS 2
#define VEC_STATIC_BLOBS_COUNT 3
  int rc = sqlite3_declare_vtab(
      db, "CREATE TABLE x(name, data, dimensions hidden, count hidden)");
  if (rc != SQLITE_OK) {
    return rc;
  }

  vec_static_blobs_vtab *vtab = sqlite3_malloc(sizeof(*vtab));
  *ppVtab = (sqlite3_vtab *)vtab;
  if (!vtab) {
    return SQLITE_NOMEM;
  }
  memset(vtab, 0, sizeof(*vtab));
  vtab->data = pAux;
  return rc;
}
/**
 * xDisconnect callback for vec_static_blobs: releases the vtab object.
 * The shared registry it points at is left untouched.
 */
static int vec_static_blobsDisconnect(sqlite3_vtab *pVtab) {
  // `base` is the first member of vec_static_blobs_vtab, so this is the
  // address that was returned by sqlite3_malloc() in xConnect.
  sqlite3_free(pVtab);
  return SQLITE_OK;
}
/**
 * xUpdate callback for the vec_static_blobs module.
 *
 * Only INSERT is supported: it registers a new static blob under the given
 * name, taking the address, dimensions, vector count and element type from
 * the `data` column, which must be a value produced by
 * vec_static_blob_from_raw(). DELETE and UPDATE are rejected.
 *
 * Differences from the previous implementation:
 *  - a `data` value that is not a static-blob pointer is reported as an
 *    error instead of dereferencing NULL;
 *  - a full registry is reported as an error instead of calling abort();
 *  - a failed name allocation returns SQLITE_NOMEM;
 *  - a NULL name is rejected;
 *  - the slot is only claimed (its `name` set) after everything else
 *    succeeded, so a failed INSERT no longer consumes a slot.
 *
 * @return SQLITE_OK on success, SQLITE_NOMEM on allocation failure,
 *         SQLITE_ERROR otherwise (with the vtab error message set).
 */
static int vec_static_blobsUpdate(sqlite3_vtab *pVTab, int argc,
                                  sqlite3_value **argv, sqlite_int64 *pRowid) {
  UNUSED_PARAMETER(pRowid);
  vec_static_blobs_vtab *p = (vec_static_blobs_vtab *)pVTab;

  // INSERT is the only operation with argc > 1 and a NULL argv[0].
  int isInsert = argc > 1 && sqlite3_value_type(argv[0]) == SQLITE_NULL;
  if (!isInsert) {
    vtab_set_error(pVTab,
                   "vec_static_blobs only supports INSERT operations");
    return SQLITE_ERROR;
  }

  struct static_blob_definition *def = sqlite3_value_pointer(
      argv[2 + VEC_STATIC_BLOBS_DATA], POINTER_NAME_STATIC_BLOB_DEF);
  if (!def) {
    vtab_set_error(pVTab, "the data column must be set to the result of "
                          "vec_static_blob_from_raw()");
    return SQLITE_ERROR;
  }

  const char *key =
      (const char *)sqlite3_value_text(argv[2 + VEC_STATIC_BLOBS_NAME]);
  if (!key) {
    vtab_set_error(pVTab, "the name column must not be NULL");
    return SQLITE_ERROR;
  }

  // Find the first free slot; a slot is free while its name is NULL.
  int idx = -1;
  for (int i = 0; i < MAX_STATIC_BLOBS; i++) {
    if (!p->data->static_blobs[i].name) {
      idx = i;
      break;
    }
  }
  if (idx < 0) {
    vtab_set_error(pVTab, "cannot register more than %d static blobs",
                   MAX_STATIC_BLOBS);
    return SQLITE_ERROR;
  }

  char *name = sqlite3_mprintf("%s", key);
  if (!name) {
    return SQLITE_NOMEM;
  }

  static_blob *slot = &p->data->static_blobs[idx];
  slot->p = def->p;
  slot->dimensions = def->dimensions;
  slot->nvectors = def->nvectors;
  slot->element_type = def->element_type;
  // Publishing the name last is what marks the slot as occupied.
  slot->name = name;
  return SQLITE_OK;
}
/**
 * xOpen callback for vec_static_blobs: allocates a zero-initialized cursor.
 *
 * @return SQLITE_NOMEM if the cursor cannot be allocated, else SQLITE_OK.
 */
static int vec_static_blobsOpen(sqlite3_vtab *p,
                                sqlite3_vtab_cursor **ppCursor) {
  UNUSED_PARAMETER(p);
  vec_static_blobs_cursor *cursor = sqlite3_malloc(sizeof(*cursor));
  if (!cursor) {
    return SQLITE_NOMEM;
  }
  memset(cursor, 0, sizeof(*cursor));
  *ppCursor = &cursor->base;
  return SQLITE_OK;
}
/**
 * xClose callback for vec_static_blobs: frees the cursor allocated by xOpen.
 */
static int vec_static_blobsClose(sqlite3_vtab_cursor *cur) {
  // `base` is the cursor's first member, so `cur` is the allocation itself.
  sqlite3_free(cur);
  return SQLITE_OK;
}
/**
 * xBestIndex callback for vec_static_blobs.
 *
 * No constraints are inspected; every query gets the same plan (idxNum 1)
 * with a fixed cost and row estimate of 10.
 */
static int vec_static_blobsBestIndex(sqlite3_vtab *pVTab,
                                     sqlite3_index_info *pIdxInfo) {
  UNUSED_PARAMETER(pVTab);
  pIdxInfo->estimatedRows = 10;
  pIdxInfo->estimatedCost = 10.0;
  pIdxInfo->idxNum = 1;
  return SQLITE_OK;
}
static int vec_static_blobsNext(sqlite3_vtab_cursor *cur);

/**
 * xFilter callback for vec_static_blobs.
 *
 * Rewinds the cursor to just before the first registry slot and then advances
 * it with vec_static_blobsNext(), so it ends up on the first occupied slot (or
 * at EOF). All planner arguments are ignored. Always returns SQLITE_OK.
 */
static int vec_static_blobsFilter(sqlite3_vtab_cursor *pVtabCursor, int idxNum,
                                  const char *idxStr, int argc,
                                  sqlite3_value **argv) {
  UNUSED_PARAMETER(idxNum);
  UNUSED_PARAMETER(idxStr);
  UNUSED_PARAMETER(argc);
  UNUSED_PARAMETER(argv);
  ((vec_static_blobs_cursor *)pVtabCursor)->iRowid = -1;
  vec_static_blobsNext(pVtabCursor);
  return SQLITE_OK;
}
/**
 * xRowid callback for vec_static_blobs: the rowid is the index of the current
 * registry slot.
 */
static int vec_static_blobsRowid(sqlite3_vtab_cursor *cur,
                                 sqlite_int64 *pRowid) {
  const vec_static_blobs_cursor *cursor = (const vec_static_blobs_cursor *)cur;
  *pRowid = cursor->iRowid;
  return SQLITE_OK;
}
/**
 * xNext callback for vec_static_blobs.
 *
 * Moves the cursor forward to the next registry slot that has a name, or to
 * MAX_STATIC_BLOBS (EOF) when no occupied slot remains. Always returns
 * SQLITE_OK.
 */
static int vec_static_blobsNext(sqlite3_vtab_cursor *cur) {
  vec_static_blobs_cursor *cursor = (vec_static_blobs_cursor *)cur;
  vec_static_blobs_vtab *vtab = (vec_static_blobs_vtab *)cursor->base.pVtab;
  for (cursor->iRowid++; cursor->iRowid < MAX_STATIC_BLOBS;
       cursor->iRowid++) {
    if (vtab->data->static_blobs[cursor->iRowid].name) {
      break;
    }
  }
  return SQLITE_OK;
}
/**
 * xEof callback for vec_static_blobs: true once the cursor index has moved
 * past the last registry slot.
 */
static int vec_static_blobsEof(sqlite3_vtab_cursor *cur) {
  const vec_static_blobs_cursor *cursor = (const vec_static_blobs_cursor *)cur;
  return cursor->iRowid >= MAX_STATIC_BLOBS;
}
/**
 * xColumn callback for vec_static_blobs.
 *
 * Reports the requested column of the registry slot the cursor is on:
 * the name as text, NULL for the data column, and the dimensions / vector
 * count as integers. Unknown column indexes produce no result. Always returns
 * SQLITE_OK.
 */
static int vec_static_blobsColumn(sqlite3_vtab_cursor *cur,
                                  sqlite3_context *context, int i) {
  vec_static_blobs_cursor *cursor = (vec_static_blobs_cursor *)cur;
  vec_static_blobs_vtab *vtab = (vec_static_blobs_vtab *)cur->pVtab;
  const static_blob *entry = &vtab->data->static_blobs[cursor->iRowid];

  if (i == VEC_STATIC_BLOBS_NAME) {
    sqlite3_result_text(context, entry->name, -1, SQLITE_TRANSIENT);
  } else if (i == VEC_STATIC_BLOBS_DATA) {
    sqlite3_result_null(context);
  } else if (i == VEC_STATIC_BLOBS_DIMENSIONS) {
    sqlite3_result_int64(context, entry->dimensions);
  } else if (i == VEC_STATIC_BLOBS_COUNT) {
    sqlite3_result_int64(context, entry->nvectors);
  }
  return SQLITE_OK;
}
// Method table for the vec_static_blobs module. xCreate and xDestroy are 0,
// and no transaction, rename, savepoint or shadow-name hooks are provided.
// The xIntegrity slot is only emitted for SQLite 3.44.0 or newer.
static sqlite3_module vec_static_blobsModule = {
/* iVersion */ 3,
/* xCreate */ 0,
/* xConnect */ vec_static_blobsConnect,
/* xBestIndex */ vec_static_blobsBestIndex,
/* xDisconnect */ vec_static_blobsDisconnect,
/* xDestroy */ 0,
/* xOpen */ vec_static_blobsOpen,
/* xClose */ vec_static_blobsClose,
/* xFilter */ vec_static_blobsFilter,
/* xNext */ vec_static_blobsNext,
/* xEof */ vec_static_blobsEof,
/* xColumn */ vec_static_blobsColumn,
/* xRowid */ vec_static_blobsRowid,
/* xUpdate */ vec_static_blobsUpdate,
/* xBegin */ 0,
/* xSync */ 0,
/* xCommit */ 0,
/* xRollback */ 0,
/* xFindFunction */ 0,
/* xRename */ 0,
/* xSavepoint */ 0,
/* xRelease */ 0,
/* xRollbackTo */ 0,
/* xShadowName */ 0,
#if SQLITE_VERSION_NUMBER >= 3044000
/* xIntegrity */ 0
#endif
};
#pragma endregion
#pragma region vec_static_blob_entries() table function
typedef struct vec_static_blob_entries_vtab vec_static_blob_entries_vtab;
// Virtual table object for the vec_static_blob_entries module.
struct vec_static_blob_entries_vtab {
sqlite3_vtab base;
// The registry slot this table reads from; points into the shared
// vec_static_blob_data and is not owned by the vtab.
static_blob *blob;
};
// Query plans chosen by xBestIndex (stored in idxNum) and executed by xFilter.
typedef enum {
// Walk every vector of the blob in storage order.
VEC_SBE__QUERYPLAN_FULLSCAN = 1,
// `vector MATCH ?` with a LIMIT or `k = ?` constraint.
VEC_SBE__QUERYPLAN_KNN = 2
} vec_sbe_query_plan;
// Result state of a KNN query on a static blob.
struct sbe_query_knn_data {
// Number of results the cursor iterates over.
i64 k;
// NOTE(review): not assigned anywhere in the visible code.
i64 k_used;
// Array of rowids of size k. Must be freed with sqlite3_free().
i32 *rowids;
// Array of distances. Must be freed with sqlite3_free().
// As filled in by vec_static_blob_entriesFilter this holds one distance per
// stored vector (rounded up to a multiple of 8), not just k entries.
f32 *distances;
// Position of the cursor within `rowids`.
i64 current_idx;
};
/**
 * Releases the `rowids` and `distances` arrays of a KNN result and resets both
 * pointers to NULL. The struct itself is not freed. A NULL argument is
 * accepted and ignored.
 */
void sbe_query_knn_data_clear(struct sbe_query_knn_data *knn_data) {
  if (knn_data == NULL) {
    return;
  }
  if (knn_data->rowids != NULL) {
    sqlite3_free(knn_data->rowids);
    knn_data->rowids = NULL;
  }
  if (knn_data->distances != NULL) {
    sqlite3_free(knn_data->distances);
    knn_data->distances = NULL;
  }
}
typedef struct vec_static_blob_entries_cursor vec_static_blob_entries_cursor;
// Cursor for the vec_static_blob_entries module.
struct vec_static_blob_entries_cursor {
sqlite3_vtab_cursor base;
// Index of the current vector; used by the full-scan plan.
sqlite3_int64 iRowid;
// Plan selected by the most recent xFilter call.
vec_sbe_query_plan query_plan;
// Results of the most recent KNN query, allocated in xFilter; NULL otherwise.
struct sbe_query_knn_data *knn_data;
};
/**
 * xConnect callback for the vec_static_blob_entries module.
 *
 * The first module argument (argv[3]) names a blob previously registered in
 * the shared registry passed as `pAux`. On success the new vtab points at
 * that registry slot.
 *
 * Differences from the previous implementation:
 *  - a missing module argument is reported as an error instead of reading
 *    past the end of argv;
 *  - names are compared exactly; the earlier prefix comparison made a blob
 *    named "a" match an argument of "abc";
 *  - an unknown name is reported through `pzErr` instead of calling abort().
 *
 * @return SQLITE_OK on success, SQLITE_NOMEM if the vtab cannot be allocated,
 *         SQLITE_ERROR for a missing/unknown blob name, or the error returned
 *         by sqlite3_declare_vtab().
 */
static int vec_static_blob_entriesConnect(sqlite3 *db, void *pAux, int argc,
                                          const char *const *argv,
                                          sqlite3_vtab **ppVtab, char **pzErr) {
  vec_static_blob_data *blob_data = pAux;

  // argv[0..2] are the module, database and table names; argv[3] is the first
  // user-supplied argument.
  if (argc < 4 || !argv[3]) {
    if (pzErr) {
      *pzErr = sqlite3_mprintf("vec_static_blob_entries requires the name of "
                               "a registered static blob as its argument");
    }
    return SQLITE_ERROR;
  }
  const char *requested = argv[3];

  int idx = -1;
  for (int i = 0; i < MAX_STATIC_BLOBS; i++) {
    const char *candidate = blob_data->static_blobs[i].name;
    if (candidate && strcmp(candidate, requested) == 0) {
      idx = i;
      break;
    }
  }
  if (idx < 0) {
    if (pzErr) {
      *pzErr = sqlite3_mprintf("no static blob named \"%s\" is registered",
                               requested);
    }
    return SQLITE_ERROR;
  }

  vec_static_blob_entries_vtab *pNew;
// Column indexes; these must match the order in the declared schema below.
#define VEC_STATIC_BLOB_ENTRIES_VECTOR 0
#define VEC_STATIC_BLOB_ENTRIES_DISTANCE 1
#define VEC_STATIC_BLOB_ENTRIES_K 2
  int rc = sqlite3_declare_vtab(
      db, "CREATE TABLE x(vector, distance hidden, k hidden)");
  if (rc == SQLITE_OK) {
    pNew = sqlite3_malloc(sizeof(*pNew));
    *ppVtab = (sqlite3_vtab *)pNew;
    if (pNew == 0)
      return SQLITE_NOMEM;
    memset(pNew, 0, sizeof(*pNew));
    pNew->blob = &blob_data->static_blobs[idx];
  }
  return rc;
}
// xCreate callback for vec_static_blob_entries. Creating and connecting are
// the same operation for this module, so this forwards every argument to
// vec_static_blob_entriesConnect() and returns its result.
static int vec_static_blob_entriesCreate(sqlite3 *db, void *pAux, int argc,
const char *const *argv,
sqlite3_vtab **ppVtab, char **pzErr) {
return vec_static_blob_entriesConnect(db, pAux, argc, argv, ppVtab, pzErr);
}
/**
 * xDisconnect (and xDestroy) callback for vec_static_blob_entries: releases
 * the vtab object. The registry slot it referenced is left untouched.
 */
static int vec_static_blob_entriesDisconnect(sqlite3_vtab *pVtab) {
  // `base` is the first member of the vtab struct, so this is the address
  // that was returned by sqlite3_malloc() in xConnect.
  sqlite3_free(pVtab);
  return SQLITE_OK;
}
/**
 * xOpen callback for vec_static_blob_entries: allocates a zero-initialized
 * cursor (no query plan selected, no KNN data).
 *
 * @return SQLITE_NOMEM if the cursor cannot be allocated, else SQLITE_OK.
 */
static int vec_static_blob_entriesOpen(sqlite3_vtab *p,
                                       sqlite3_vtab_cursor **ppCursor) {
  UNUSED_PARAMETER(p);
  vec_static_blob_entries_cursor *cursor = sqlite3_malloc(sizeof(*cursor));
  if (!cursor) {
    return SQLITE_NOMEM;
  }
  memset(cursor, 0, sizeof(*cursor));
  *ppCursor = &cursor->base;
  return SQLITE_OK;
}
/**
 * xClose callback for vec_static_blob_entries.
 *
 * Frees the cursor together with any KNN result it still holds. The rowid and
 * distance arrays inside `knn_data` are released first; previously only the
 * outer struct was freed, which leaked both arrays after every KNN query.
 */
static int vec_static_blob_entriesClose(sqlite3_vtab_cursor *cur) {
  vec_static_blob_entries_cursor *pCur = (vec_static_blob_entries_cursor *)cur;
  // sbe_query_knn_data_clear() accepts NULL, as does sqlite3_free().
  sbe_query_knn_data_clear(pCur->knn_data);
  sqlite3_free(pCur->knn_data);
  sqlite3_free(pCur);
  return SQLITE_OK;
}
/**
 * xBestIndex callback for vec_static_blob_entries.
 *
 * Without a usable `vector MATCH ?` constraint the full-scan plan is chosen,
 * costed by the number of vectors in the blob. With one, the KNN plan is
 * chosen, which additionally requires:
 *  - exactly one of a LIMIT constraint or a `k = ?` constraint,
 *  - exactly one ORDER BY term, on the distance column, ascending.
 * The MATCH value is passed to xFilter as argv[0] and the LIMIT/k value as
 * argv[1].
 *
 * @return SQLITE_OK when a plan was selected; SQLITE_ERROR for a duplicate
 *         MATCH or a bad LIMIT/k combination; SQLITE_CONSTRAINT (with an
 *         error message) for an unsupported ORDER BY.
 */
static int vec_static_blob_entriesBestIndex(sqlite3_vtab *pVTab,
                                            sqlite3_index_info *pIdxInfo) {
  vec_static_blob_entries_vtab *vtab = (vec_static_blob_entries_vtab *)pVTab;
  int matchTerm = -1;
  int limitTerm = -1;
  // rowid constraints: https://github.com/asg017/sqlite-vec/issues/47
  int kTerm = -1;

  for (int i = 0; i < pIdxInfo->nConstraint; i++) {
    if (!pIdxInfo->aConstraint[i].usable) {
      continue;
    }
    const int column = pIdxInfo->aConstraint[i].iColumn;
    const int op = pIdxInfo->aConstraint[i].op;
    switch (op) {
    case SQLITE_INDEX_CONSTRAINT_MATCH:
      if (column == VEC_STATIC_BLOB_ENTRIES_VECTOR) {
        if (matchTerm > -1) {
          // https://github.com/asg017/sqlite-vec/issues/51
          return SQLITE_ERROR;
        }
        matchTerm = i;
      }
      break;
    case SQLITE_INDEX_CONSTRAINT_LIMIT:
      limitTerm = i;
      break;
    case SQLITE_INDEX_CONSTRAINT_EQ:
      if (column == VEC_STATIC_BLOB_ENTRIES_K) {
        kTerm = i;
      }
      break;
    default:
      break;
    }
  }

  // No MATCH constraint: plain scan over every vector.
  if (matchTerm < 0) {
    pIdxInfo->idxNum = VEC_SBE__QUERYPLAN_FULLSCAN;
    pIdxInfo->estimatedCost = (double)vtab->blob->nvectors;
    pIdxInfo->estimatedRows = vtab->blob->nvectors;
    return SQLITE_OK;
  }

  const int hasLimit = limitTerm >= 0;
  const int hasK = kTerm >= 0;
  if (!hasLimit && !hasK) {
    // https://github.com/asg017/sqlite-vec/issues/51
    return SQLITE_ERROR;
  }
  if (hasLimit && hasK) {
    return SQLITE_ERROR; // limit or k, not both
  }
  if (pIdxInfo->nOrderBy < 1) {
    vtab_set_error(pVTab, "ORDER BY distance required");
    return SQLITE_CONSTRAINT;
  }
  if (pIdxInfo->nOrderBy > 1) {
    // https://github.com/asg017/sqlite-vec/issues/51
    vtab_set_error(pVTab, "more than 1 ORDER BY clause provided");
    return SQLITE_CONSTRAINT;
  }
  if (pIdxInfo->aOrderBy[0].iColumn != VEC_STATIC_BLOB_ENTRIES_DISTANCE) {
    vtab_set_error(pVTab, "ORDER BY must be on the distance column");
    return SQLITE_CONSTRAINT;
  }
  if (pIdxInfo->aOrderBy[0].desc) {
    vtab_set_error(pVTab,
                   "Only ascending in ORDER BY distance clause is supported, "
                   "DESC is not supported yet.");
    return SQLITE_CONSTRAINT;
  }

  pIdxInfo->idxNum = VEC_SBE__QUERYPLAN_KNN;
  pIdxInfo->estimatedCost = (double)10;
  pIdxInfo->estimatedRows = 10;
  pIdxInfo->orderByConsumed = 1;

  // argv[0] of xFilter: the query vector.
  pIdxInfo->aConstraintUsage[matchTerm].argvIndex = 1;
  pIdxInfo->aConstraintUsage[matchTerm].omit = 1;

  // argv[1] of xFilter: whichever of LIMIT / k was supplied.
  const int countTerm = hasLimit ? limitTerm : kTerm;
  pIdxInfo->aConstraintUsage[countTerm].argvIndex = 2;
  pIdxInfo->aConstraintUsage[countTerm].omit = 1;
  return SQLITE_OK;
}
/**
 * xFilter callback for vec_static_blob_entries.
 *
 * Full-scan plan: rewinds the cursor to the first vector.
 *
 * KNN plan: argv[0] is the query vector and argv[1] the requested number of
 * neighbours (clamped to the number of stored vectors). The squared L2
 * distance from the query to every stored vector is computed, the k closest
 * rowids are selected with min_idx(), and the result is stored on the cursor.
 *
 * Fixes relative to the previous implementation:
 *  - the padding bits of the candidate bitmap are now cleared over
 *    [nvectors, bsize); the old loop started at bit `bsize`, one past the end
 *    of the bitmap, and wrote out of bounds;
 *  - every error path releases what was allocated so far (knn_data, rowid and
 *    distance arrays, bitmaps, the query vector and the error string);
 *  - both bitmaps and the query vector are released on success too;
 *  - KNN state left over from an earlier xFilter call on the same cursor is
 *    released instead of being overwritten;
 *  - allocation failures return SQLITE_NOMEM instead of SQLITE_ERROR or an
 *    assert.
 *
 * @return SQLITE_OK on success, SQLITE_NOMEM on allocation failure,
 *         SQLITE_ERROR for an invalid query vector, a type/dimension mismatch
 *         or a negative k.
 */
static int vec_static_blob_entriesFilter(sqlite3_vtab_cursor *pVtabCursor,
                                         int idxNum, const char *idxStr,
                                         int argc, sqlite3_value **argv) {
  UNUSED_PARAMETER(idxStr);
  assert(argc >= 0 && argc <= 3);
  vec_static_blob_entries_cursor *pCur =
      (vec_static_blob_entries_cursor *)pVtabCursor;
  vec_static_blob_entries_vtab *p =
      (vec_static_blob_entries_vtab *)pCur->base.pVtab;

  // xFilter may run more than once per cursor; drop any earlier KNN result.
  if (pCur->knn_data) {
    sbe_query_knn_data_clear(pCur->knn_data);
    sqlite3_free(pCur->knn_data);
    pCur->knn_data = NULL;
  }

  if (idxNum != VEC_SBE__QUERYPLAN_KNN) {
    pCur->query_plan = VEC_SBE__QUERYPLAN_FULLSCAN;
    pCur->iRowid = 0;
    return SQLITE_OK;
  }

  assert(argc == 2);
  pCur->query_plan = VEC_SBE__QUERYPLAN_KNN;

  // Everything that may need releasing is declared up front so that the
  // single exit path below can free it unconditionally.
  int rc = SQLITE_ERROR;
  struct sbe_query_knn_data *knn_data = NULL;
  void *queryVector = NULL;
  size_t dimensions = 0;
  enum VectorElementType elementType;
  vector_cleanup cleanup = NULL;
  int queryLoaded = 0;
  char *err = NULL;
  i32 *topk_rowids = NULL;
  f32 *distances = NULL;
  u8 *candidates = NULL;
  u8 *taken = NULL;
  i32 k_used = 0;
  i64 k = 0;

  knn_data = sqlite3_malloc(sizeof(*knn_data));
  if (!knn_data) {
    rc = SQLITE_NOMEM;
    goto done;
  }
  memset(knn_data, 0, sizeof(*knn_data));

  if (vector_from_value(argv[0], &queryVector, &dimensions, &elementType,
                        &cleanup, &err) != SQLITE_OK) {
    if (err) {
      vtab_set_error(&p->base, "%s", err);
    }
    goto done;
  }
  queryLoaded = 1;

  if (elementType != p->blob->element_type) {
    vtab_set_error(&p->base, "query vector element type does not match the "
                             "static blob");
    goto done;
  }
  if (dimensions != p->blob->dimensions) {
    vtab_set_error(&p->base, "query vector dimensions do not match the "
                             "static blob");
    goto done;
  }

  k = min(sqlite3_value_int64(argv[1]), (i64)p->blob->nvectors);
  if (k < 0) {
    // HANDLE https://github.com/asg017/sqlite-vec/issues/55
    goto done;
  }

  if (k > 0) {
    // Bitmaps work on whole bytes, so round the vector count up to a
    // multiple of 8.
    size_t bsize = (p->blob->nvectors + 7) & ~(size_t)7;

    topk_rowids = sqlite3_malloc(k * sizeof(i32));
    distances = sqlite3_malloc(bsize * sizeof(f32));
    candidates = bitmap_new(bsize);
    taken = bitmap_new(bsize);
    if (!topk_rowids || !distances || !candidates || !taken) {
      rc = SQLITE_NOMEM;
      goto done;
    }

    for (size_t i = 0; i < p->blob->nvectors; i++) {
      // https://github.com/asg017/sqlite-vec/issues/52
      float *v = ((float *)p->blob->p) + (i * p->blob->dimensions);
      distances[i] =
          distance_l2_sqr_float(v, (float *)queryVector, &p->blob->dimensions);
    }

    // Every real vector is a candidate; the padding slots in
    // [nvectors, bsize) are not, and their distances stay uninitialized.
    bitmap_fill(candidates, bsize);
    for (size_t i = p->blob->nvectors; i < bsize; i++) {
      bitmap_set(candidates, i, 0);
    }

    min_idx(distances, bsize, candidates, topk_rowids, k, taken, &k_used);

    // Ownership of both arrays moves to knn_data.
    knn_data->distances = distances;
    distances = NULL;
    knn_data->rowids = topk_rowids;
    topk_rowids = NULL;
  }

  knn_data->current_idx = 0;
  knn_data->k = k;
  pCur->knn_data = knn_data;
  knn_data = NULL;
  rc = SQLITE_OK;

done:
  // NOTE(review): assumes bitmap_new() allocates with sqlite3_malloc() and
  // that vector_from_value() reports errors in a sqlite3_malloc()ed string.
  sqlite3_free(candidates);
  sqlite3_free(taken);
  sqlite3_free(topk_rowids);
  sqlite3_free(distances);
  sqlite3_free(knn_data);
  sqlite3_free(err);
  if (queryLoaded) {
    cleanup(queryVector);
  }
  return rc;
}
/**
 * xRowid callback for vec_static_blob_entries.
 *
 * Full scan: the rowid is the cursor's vector index. KNN: the rowid is the
 * entry of the result array the cursor is currently on.
 *
 * @return SQLITE_OK, or SQLITE_ERROR if the cursor has no known query plan.
 */
static int vec_static_blob_entriesRowid(sqlite3_vtab_cursor *cur,
                                        sqlite_int64 *pRowid) {
  vec_static_blob_entries_cursor *cursor =
      (vec_static_blob_entries_cursor *)cur;

  if (cursor->query_plan == VEC_SBE__QUERYPLAN_FULLSCAN) {
    *pRowid = cursor->iRowid;
    return SQLITE_OK;
  }
  if (cursor->query_plan == VEC_SBE__QUERYPLAN_KNN) {
    const struct sbe_query_knn_data *knn = cursor->knn_data;
    i32 rowid = ((i32 *)knn->rowids)[knn->current_idx];
    *pRowid = (sqlite3_int64)rowid;
    return SQLITE_OK;
  }
  return SQLITE_ERROR;
}
/**
 * xNext callback for vec_static_blob_entries: advances the vector index (full
 * scan) or the position within the KNN result array.
 *
 * @return SQLITE_OK, or SQLITE_ERROR if the cursor has no known query plan.
 */
static int vec_static_blob_entriesNext(sqlite3_vtab_cursor *cur) {
  vec_static_blob_entries_cursor *cursor =
      (vec_static_blob_entries_cursor *)cur;

  if (cursor->query_plan == VEC_SBE__QUERYPLAN_FULLSCAN) {
    cursor->iRowid += 1;
    return SQLITE_OK;
  }
  if (cursor->query_plan == VEC_SBE__QUERYPLAN_KNN) {
    cursor->knn_data->current_idx += 1;
    return SQLITE_OK;
  }
  return SQLITE_ERROR;
}
/**
 * xEof callback for vec_static_blob_entries.
 *
 * Full scan: true once the vector index reaches the blob's vector count.
 * KNN: true once the position reaches k. For a cursor without a known plan
 * the value SQLITE_ERROR (non-zero, i.e. "at EOF") is returned.
 */
static int vec_static_blob_entriesEof(sqlite3_vtab_cursor *cur) {
  vec_static_blob_entries_cursor *cursor =
      (vec_static_blob_entries_cursor *)cur;
  vec_static_blob_entries_vtab *vtab =
      (vec_static_blob_entries_vtab *)cursor->base.pVtab;

  if (cursor->query_plan == VEC_SBE__QUERYPLAN_FULLSCAN) {
    return (size_t)cursor->iRowid >= vtab->blob->nvectors;
  }
  if (cursor->query_plan == VEC_SBE__QUERYPLAN_KNN) {
    return cursor->knn_data->current_idx >= cursor->knn_data->k;
  }
  return SQLITE_ERROR;
}
/**
 * xColumn callback for vec_static_blob_entries.
 *
 * Only the vector column produces a value: a copy of the current vector's
 * bytes, tagged with the blob's element type as subtype. The current vector
 * is the cursor's index (full scan) or the current KNN result rowid. Other
 * columns yield no result.
 *
 * @return SQLITE_OK, or SQLITE_ERROR if the cursor has no known query plan.
 */
static int vec_static_blob_entriesColumn(sqlite3_vtab_cursor *cur,
                                         sqlite3_context *context, int i) {
  vec_static_blob_entries_cursor *cursor =
      (vec_static_blob_entries_cursor *)cur;
  vec_static_blob_entries_vtab *vtab =
      (vec_static_blob_entries_vtab *)cur->pVtab;
  const int isFullScan = cursor->query_plan == VEC_SBE__QUERYPLAN_FULLSCAN;
  const int isKnn = cursor->query_plan == VEC_SBE__QUERYPLAN_KNN;

  if (!isFullScan && !isKnn) {
    return SQLITE_ERROR;
  }
  if (i != VEC_STATIC_BLOB_ENTRIES_VECTOR) {
    return SQLITE_OK;
  }

  const unsigned char *base = (unsigned char *)vtab->blob->p;
  if (isFullScan) {
    sqlite3_result_blob(
        context,
        base + (cursor->iRowid * vtab->blob->dimensions * sizeof(float)),
        vtab->blob->dimensions * sizeof(float), SQLITE_TRANSIENT);
  } else {
    i32 rowid =
        ((i32 *)cursor->knn_data->rowids)[cursor->knn_data->current_idx];
    sqlite3_result_blob(context,
                        base + (rowid * vtab->blob->dimensions * sizeof(float)),
                        vtab->blob->dimensions * sizeof(float),
                        SQLITE_TRANSIENT);
  }
  sqlite3_result_subtype(context, vtab->blob->element_type);
  return SQLITE_OK;
}
// Method table for the vec_static_blob_entries module. The table is
// read-only (xUpdate is 0). xDestroy reuses the xDisconnect implementation.
// The xIntegrity slot is only emitted for SQLite 3.44.0 or newer.
static sqlite3_module vec_static_blob_entriesModule = {
/* iVersion */ 3,
/* xCreate */
vec_static_blob_entriesCreate, // handle rm?
// https://github.com/asg017/sqlite-vec/issues/55
/* xConnect */ vec_static_blob_entriesConnect,
/* xBestIndex */ vec_static_blob_entriesBestIndex,
/* xDisconnect */ vec_static_blob_entriesDisconnect,
/* xDestroy */ vec_static_blob_entriesDisconnect,
/* xOpen */ vec_static_blob_entriesOpen,
/* xClose */ vec_static_blob_entriesClose,
/* xFilter */ vec_static_blob_entriesFilter,
/* xNext */ vec_static_blob_entriesNext,
/* xEof */ vec_static_blob_entriesEof,
/* xColumn */ vec_static_blob_entriesColumn,
/* xRowid */ vec_static_blob_entriesRowid,
/* xUpdate */ 0,
/* xBegin */ 0,
/* xSync */ 0,
/* xCommit */ 0,
/* xRollback */ 0,
/* xFindFunction */ 0,
/* xRename */ 0,
/* xSavepoint */ 0,
/* xRelease */ 0,
/* xRollbackTo */ 0,
/* xShadowName */ 0,
#if SQLITE_VERSION_NUMBER >= 3044000
/* xIntegrity */ 0
#endif
};
#pragma endregion
// Compile-time pieces of the text returned by the vec_debug() SQL function.
// Each SIMD flag contributes its name when enabled and an empty string
// otherwise.
#ifdef SQLITE_VEC_ENABLE_AVX
#define SQLITE_VEC_DEBUG_BUILD_AVX "avx"
#else
#define SQLITE_VEC_DEBUG_BUILD_AVX ""
#endif
#ifdef SQLITE_VEC_ENABLE_NEON
#define SQLITE_VEC_DEBUG_BUILD_NEON "neon"
#else
#define SQLITE_VEC_DEBUG_BUILD_NEON ""
#endif
// Space-separated list of the enabled build flags (a single space when none
// are enabled).
#define SQLITE_VEC_DEBUG_BUILD \
SQLITE_VEC_DEBUG_BUILD_AVX " " SQLITE_VEC_DEBUG_BUILD_NEON
// Multi-line report: version, build date, source commit and build flags.
#define SQLITE_VEC_DEBUG_STRING \
"Version: " SQLITE_VEC_VERSION "\n" \
"Date: " SQLITE_VEC_DATE "\n" \
"Commit: " SQLITE_VEC_SOURCE "\n" \
"Build flags: " SQLITE_VEC_DEBUG_BUILD
/**
 * Extension entry point: registers the sqlite-vec scalar functions and the
 * vec0 / vec_each virtual table modules on `db`.
 *
 * @param db        Connection to register on.
 * @param pzErrMsg  Optional out-parameter for an error message allocated with
 *                  sqlite3_mprintf(). May be NULL, which is common when the
 *                  function is called directly instead of through SQLite's
 *                  extension loader; previously a NULL pointer was
 *                  dereferenced on failure.
 * @param pApi      SQLite API routines (used unless built with SQLITE_CORE).
 * @return SQLITE_OK on success, otherwise the first registration error.
 */
SQLITE_VEC_API int sqlite3_vec_init(sqlite3 *db, char **pzErrMsg,
                                    const sqlite3_api_routines *pApi) {
#ifndef SQLITE_CORE
  SQLITE_EXTENSION_INIT2(pApi);
#endif
  int rc = SQLITE_OK;

// NOTE: this macro is also used by sqlite3_vec_static_blobs_init() below.
#define DEFAULT_FLAGS (SQLITE_UTF8 | SQLITE_INNOCUOUS | SQLITE_DETERMINISTIC)

  // vec_version() and vec_debug() are registered separately from the table
  // below because they pass their static text as the function's user data.
  rc = sqlite3_create_function_v2(db, "vec_version", 0, DEFAULT_FLAGS,
                                  SQLITE_VEC_VERSION, _static_text_func, NULL,
                                  NULL, NULL);
  if (rc != SQLITE_OK) {
    if (pzErrMsg) {
      *pzErrMsg = sqlite3_mprintf("Error creating function %s: %s",
                                  "vec_version", sqlite3_errmsg(db));
    }
    return rc;
  }
  rc = sqlite3_create_function_v2(db, "vec_debug", 0, DEFAULT_FLAGS,
                                  SQLITE_VEC_DEBUG_STRING, _static_text_func,
                                  NULL, NULL, NULL);
  if (rc != SQLITE_OK) {
    if (pzErrMsg) {
      *pzErrMsg = sqlite3_mprintf("Error creating function %s: %s",
                                  "vec_debug", sqlite3_errmsg(db));
    }
    return rc;
  }

  static struct {
    const char *zFName;
    void (*xFunc)(sqlite3_context *, int, sqlite3_value **);
    int nArg;
    int flags;
  } aFunc[] = {
      // clang-format off
    {"vec_distance_l2",     vec_distance_l2,     2, DEFAULT_FLAGS | SQLITE_SUBTYPE, },
    {"vec_distance_l1",     vec_distance_l1,     2, DEFAULT_FLAGS | SQLITE_SUBTYPE, },
    {"vec_distance_hamming",vec_distance_hamming, 2, DEFAULT_FLAGS | SQLITE_SUBTYPE, },
    {"vec_distance_cosine", vec_distance_cosine, 2, DEFAULT_FLAGS | SQLITE_SUBTYPE, },
    {"vec_length",          vec_length,          1, DEFAULT_FLAGS | SQLITE_SUBTYPE, },
    {"vec_type",            vec_type,            1, DEFAULT_FLAGS, },
    {"vec_to_json",         vec_to_json,         1, DEFAULT_FLAGS | SQLITE_SUBTYPE | SQLITE_RESULT_SUBTYPE, },
    {"vec_add",             vec_add,             2, DEFAULT_FLAGS | SQLITE_SUBTYPE | SQLITE_RESULT_SUBTYPE, },
    {"vec_sub",             vec_sub,             2, DEFAULT_FLAGS | SQLITE_SUBTYPE | SQLITE_RESULT_SUBTYPE, },
    {"vec_slice",           vec_slice,           3, DEFAULT_FLAGS | SQLITE_SUBTYPE | SQLITE_RESULT_SUBTYPE, },
    {"vec_normalize",       vec_normalize,       1, DEFAULT_FLAGS | SQLITE_SUBTYPE | SQLITE_RESULT_SUBTYPE, },
    {"vec_f32",             vec_f32,             1, DEFAULT_FLAGS | SQLITE_SUBTYPE | SQLITE_RESULT_SUBTYPE, },
    {"vec_bit",             vec_bit,             1, DEFAULT_FLAGS | SQLITE_SUBTYPE | SQLITE_RESULT_SUBTYPE, },
    {"vec_int8",            vec_int8,            1, DEFAULT_FLAGS | SQLITE_SUBTYPE | SQLITE_RESULT_SUBTYPE, },
    {"vec_quantize_int8",   vec_quantize_int8,   2, DEFAULT_FLAGS | SQLITE_SUBTYPE | SQLITE_RESULT_SUBTYPE, },
    {"vec_quantize_binary", vec_quantize_binary, 1, DEFAULT_FLAGS | SQLITE_SUBTYPE | SQLITE_RESULT_SUBTYPE, },
      // clang-format on
  };

  static struct {
    char *name;
    const sqlite3_module *module;
    void *p;
    void (*xDestroy)(void *);
  } aMod[] = {
      // clang-format off
    {"vec0",     &vec0Module,     NULL, NULL},
    {"vec_each", &vec_eachModule, NULL, NULL},
      // clang-format on
  };

  for (unsigned long i = 0; i < countof(aFunc); i++) {
    rc = sqlite3_create_function_v2(db, aFunc[i].zFName, aFunc[i].nArg,
                                    aFunc[i].flags, NULL, aFunc[i].xFunc, NULL,
                                    NULL, NULL);
    if (rc != SQLITE_OK) {
      if (pzErrMsg) {
        *pzErrMsg = sqlite3_mprintf("Error creating function %s: %s",
                                    aFunc[i].zFName, sqlite3_errmsg(db));
      }
      return rc;
    }
  }

  for (unsigned long i = 0; i < countof(aMod); i++) {
    // Forward the table's client data and destructor (both NULL today) so
    // that filling them in for a module actually takes effect.
    rc = sqlite3_create_module_v2(db, aMod[i].name, aMod[i].module, aMod[i].p,
                                  aMod[i].xDestroy);
    if (rc != SQLITE_OK) {
      if (pzErrMsg) {
        *pzErrMsg = sqlite3_mprintf("Error creating module %s: %s",
                                    aMod[i].name, sqlite3_errmsg(db));
      }
      return rc;
    }
  }

  return SQLITE_OK;
}
#ifndef SQLITE_VEC_OMIT_FS
/**
 * Optional entry point: registers the vec_npy_file() function and the
 * vec_npy_each module on `db`.
 *
 * @return SQLITE_OK on success, otherwise the first registration error.
 *         `pzErrMsg` is never written.
 */
SQLITE_VEC_API int sqlite3_vec_numpy_init(sqlite3 *db, char **pzErrMsg,
                                          const sqlite3_api_routines *pApi) {
  UNUSED_PARAMETER(pzErrMsg);
#ifndef SQLITE_CORE
  SQLITE_EXTENSION_INIT2(pApi);
#endif
  int rc = sqlite3_create_function_v2(db, "vec_npy_file", 1,
                                      SQLITE_RESULT_SUBTYPE, NULL, vec_npy_file,
                                      NULL, NULL, NULL);
  if (rc == SQLITE_OK) {
    rc = sqlite3_create_module_v2(db, "vec_npy_each", &vec_npy_eachModule,
                                  NULL, NULL);
  }
  return rc;
}
#endif
SQLITE_VEC_API int
sqlite3_vec_static_blobs_init(sqlite3 *db, char **pzErrMsg,
const sqlite3_api_routines *pApi) {
UNUSED_PARAMETER(pzErrMsg);
#ifndef SQLITE_CORE
SQLITE_EXTENSION_INIT2(pApi);
#endif
int rc = SQLITE_OK;
vec_static_blob_data *static_blob_data;
static_blob_data = sqlite3_malloc(sizeof(*static_blob_data));
if (!static_blob_data) {
return SQLITE_NOMEM;
}
memset(static_blob_data, 0, sizeof(*static_blob_data));
rc = sqlite3_create_function_v2(
db, "vec_static_blob_from_raw", 4,
DEFAULT_FLAGS | SQLITE_SUBTYPE | SQLITE_RESULT_SUBTYPE, NULL,
vec_static_blob_from_raw, NULL, NULL, NULL);
if (rc != SQLITE_OK)
return rc;
rc = sqlite3_create_module_v2(db, "vec_static_blobs", &vec_static_blobsModule,
static_blob_data, sqlite3_free);
if (rc != SQLITE_OK)
return rc;
rc = sqlite3_create_module_v2(db, "vec_static_blob_entries",
&vec_static_blob_entriesModule,
static_blob_data, NULL);
if (rc != SQLITE_OK)
return rc;
return rc;
}
================================================
FILE: sqlite-vec.h.tmpl
================================================
/*
 * Public header template for sqlite-vec. The placeholders in the version
 * macros below are substituted when the header is generated.
 */
#ifndef SQLITE_VEC_H
#define SQLITE_VEC_H
/* Loadable-extension builds go through sqlite3ext.h; builds compiled into
 * SQLite itself (SQLITE_CORE) use sqlite3.h directly. */
#ifndef SQLITE_CORE
#include "sqlite3ext.h"
#else
#include "sqlite3.h"
#endif
/* SQLITE_VEC_API marks exported entry points. It expands to a dllexport
 * attribute only for non-static builds on Windows and to nothing otherwise. */
#ifdef SQLITE_VEC_STATIC
#define SQLITE_VEC_API
#else
#ifdef _WIN32
#define SQLITE_VEC_API __declspec(dllexport)
#else
#define SQLITE_VEC_API
#endif
#endif
/* Version string, as returned by the vec_version() SQL function. */
#define SQLITE_VEC_VERSION "v${VERSION}"
// TODO rm
/* Build date and source commit, reported by the vec_debug() SQL function. */
#define SQLITE_VEC_DATE "${DATE}"
#define SQLITE_VEC_SOURCE "${SOURCE}"
/* Numeric components of the version. */
#define SQLITE_VEC_VERSION_MAJOR ${VERSION_MAJOR}
#define SQLITE_VEC_VERSION_MINOR ${VERSION_MINOR}
#define SQLITE_VEC_VERSION_PATCH ${VERSION_PATCH}
#ifdef __cplusplus
extern "C" {
#endif
/* Extension entry point: registers the sqlite-vec functions and virtual table
 * modules on `db`. Returns SQLITE_OK on success or an SQLite error code. */
SQLITE_VEC_API int sqlite3_vec_init(sqlite3 *db, char **pzErrMsg,
const sqlite3_api_routines *pApi);
#ifdef __cplusplus
} /* end of the 'extern "C"' block */
#endif
#endif /* ifndef SQLITE_VEC_H */
================================================
FILE: test.sql
================================================
.load dist/vec0main
.bail on
.mode qbox
.load ./memstat
.echo on
select name, value from sqlite_memstat where name = 'MEMORY_USED';
create virtual table v using vec0(
vector float[1],
name1 text,
name2 text,
age int,
chunk_size=8
);
select name, value from sqlite_memstat where name = 'MEMORY_USED';
insert into v(vector, name1, name2, age) values
('[1]', 'alex', 'xxxx', 1),
('[2]', 'alex', 'aaaa', 2),
('[3]', 'alex', 'aaaa', 3),
('[4]', 'brian', 'aaaa', 1),
('[5]', 'brian', 'aaaa', 2),
('[6]', 'brian', 'aaaa', 3),
('[7]', 'craig', 'aaaa', 1),
('[8]', 'craig', 'xxxx', 2),
('[9]', 'craig', 'xxxx', 3),
('[10]', '123456789012345', 'xxxx', 3);
select name, value from sqlite_memstat where name = 'MEMORY_USED';
select rowid, name1, name2, age, vec_to_json(vector)
from v
where vector match '[0]'
and k = 5
and name1 in ('alex', 'brian', 'craig')
--and name2 in ('aaaa', 'xxxx')
and age in (1, 2, 3, 2222,3333,4444);
select name, value from sqlite_memstat where name = 'MEMORY_USED';
select rowid, name1, name2, age, vec_to_json(vector)
from v
where vector match '[0]'
and k = 5
and name1 in ('123456789012345', 'superfluous');
.exit
create virtual table v using vec0(
vector float[1],
+description text
);
insert into v(rowid, vector, description) values (1, '[1]', 'aaa');
select * from v;
.exit
create virtual table vec_articles using vec0(
article_id integer primary key,
year integer partition key,
headline_embedding float[1],
+headline text,
+url text,
word_count integer,
print_section text,
print_page integer,
pub_date text,
);
insert into vec_articles values (1111, 2020, '[1]', 'headline', 'https://...', 200, 'A', 1, '2020-01-01');
select * from vec_articles;
.exit
create table movies(movie_id integer primary key, synopsis text);
INSERT INTO movies(movie_id, synopsis)
VALUES
(1, 'A family is haunted by demonic spirits after moving into a new house, requiring the help of paranormal investigators.'),
(2, 'Two dim-witted friends embark on a cross-country road trip to return a briefcase full of money to its owner.'),
(3, 'A team of explorers travels through a wormhole in space in an attempt to ensure humanity’s survival.'),
(4, 'A young hobbit embarks on a journey with a fellowship to destroy a powerful ring and save Middle-earth from darkness.'),
(5, 'A documentary about the dangers of global warming, featuring former U.S. Vice President Al Gore.'),
(6, 'After the death of her secretive mother, a woman discovers terrifying secrets about her family lineage.'),
(7, 'A clueless but charismatic TV anchorman struggles to stay relevant in the world of broadcast journalism.'),
(8, 'A young blade runner uncovers a long-buried secret that leads him to track down former blade runner Rick Deckard.'),
(9, 'A young boy discovers he is a wizard and attends a magical school, where he learns about his destiny.'),
(10, 'A rock climber attempts to scale El Capitan in Yosemite National Park without the use of ropes or safety gear.'),
(11, 'A young African-American man uncovers a disturbing secret when he visits his white girlfriend''s family estate.'),
(12, 'Three friends wake up from a bachelor party in Las Vegas with no memory of the previous night and must retrace their steps.'),
(13, 'A computer hacker learns about the true nature of his reality and his role in the war against its controllers.'),
(14, 'In post-Civil War Spain, a young girl escapes into an eerie but captivating fantasy world.'),
(15, 'A documentary that explores racial inequality in the United States, focusing on the prison system and mass incarceration.'),
(16, 'A young woman is followed by an unknown supernatural force after a sexual encounter.'),
(17, 'Two immature but well-meaning stepbrothers become instant rivals when their single parents marry.'),
(18, 'A thief with the ability to enter people''s dreams is tasked with planting an idea into a target''s subconscious.'),
(19, 'A mute woman forms a unique relationship with a mysterious aquatic creature being held in a secret research facility.'),
(20, 'A documentary about the life and legacy of Fred Rogers, the beloved host of the children''s TV show "Mister Rogers'' Neighborhood."');
create virtual table vec_movies using vec0(
movie_id integer primary key,
synopsis_embedding float[1],
+title text,
genre text,
num_reviews int,
mean_rating float,
chunk_size=8
);
.schema
/*
insert into vec_movies(movie_id, synopsis_embedding, num_reviews, mean_rating) values
(1, '[1]', 153, 4.6),
(2, '[2]', 382, 2.6),
(3, '[3]', 53, 5.0),
(4, '[4]', 210, 4.2),
(5, '[5]', 93, 3.4),
(6, '[6]', 167, 4.7),
(7, '[7]', 482, 2.9),
(8, '[8]', 301, 5.0),
(9, '[9]', 134, 4.1),
(10, '[10]', 66, 3.2),
(11, '[11]', 88, 4.9),
(12, '[12]', 59, 2.8),
(13, '[13]', 423, 4.5),
(14, '[14]', 275, 3.6),
(15, '[15]', 191, 4.4),
(16, '[16]', 314, 4.3),
(17, '[17]', 74, 3.0),
(18, '[18]', 201, 5.0),
(19, '[19]', 399, 2.7),
(20, '[20]', 186, 4.8);
*/
/*
INSERT INTO vec_movies(movie_id, synopsis_embedding, genre, num_reviews, mean_rating)
VALUES
(1, '[1]', 'horror', 153, 4.6),
(2, '[2]', 'comedy', 382, 2.6),
(3, '[3]', 'scifi', 53, 5.0),
(4, '[4]', 'fantasy', 210, 4.2),
(5, '[5]', 'documentary', 93, 3.4),
(6, '[6]', 'horror', 167, 4.7),
(7, '[7]', 'comedy', 482, 2.9),
(8, '[8]', 'scifi', 301, 5.0),
(9, '[9]', 'fantasy', 134, 4.1),
(10, '[10]', 'documentary', 66, 3.2),
(11, '[11]', 'horror', 88, 4.9),
(12, '[12]', 'comedy', 59, 2.8),
(13, '[13]', 'scifi', 423, 4.5),
(14, '[14]', 'fantasy', 275, 3.6),
(15, '[15]', 'documentary', 191, 4.4),
(16, '[16]', 'horror', 314, 4.3),
(17, '[17]', 'comedy', 74, 3.0),
(18, '[18]', 'scifi', 201, 5.0),
(19, '[19]', 'fantasy', 399, 2.7),
(20, '[20]', 'documentary', 186, 4.8);
*/
INSERT INTO vec_movies(movie_id, synopsis_embedding, genre, title, num_reviews, mean_rating)
VALUES
(1, '[1]', 'horror', 'The Conjuring', 153, 4.6),
(2, '[2]', 'comedy', 'Dumb and Dumber', 382, 2.6),
(3, '[3]', 'scifi', 'Interstellar', 53, 5.0),
(4, '[4]', 'fantasy', 'The Lord of the Rings: The Fellowship of the Ring', 210, 4.2),
(5, '[5]', 'documentary', 'An Inconvenient Truth', 93, 3.4),
(6, '[6]', 'horror', 'Hereditary', 167, 4.7),
(7, '[7]', 'comedy', 'Anchorman: The Legend of Ron Burgundy', 482, 2.9),
(8, '[8]', 'scifi', 'Blade Runner 2049', 301, 5.0),
(9, '[9]', 'fantasy', 'Harry Potter and the Sorcerer''s Stone', 134, 4.1),
(10, '[10]', 'documentary', 'Free Solo', 66, 3.2),
(11, '[11]', 'horror', 'Get Out', 88, 4.9),
(12, '[12]', 'comedy', 'The Hangover', 59, 2.8),
(13, '[13]', 'scifi', 'The Matrix', 423, 4.5),
(14, '[14]', 'fantasy', 'Pan''s Labyrinth', 275, 3.6),
(15, '[15]', 'documentary', '13th', 191, 4.4),
(16, '[16]', 'horror', 'It Follows', 314, 4.3),
(17, '[17]', 'comedy', 'Step Brothers', 74, 3.0),
(18, '[18]', 'scifi', 'Inception', 201, 5.0),
(19, '[19]', 'fantasy', 'The Shape of Water', 399, 2.7),
(20, '[20]', 'documentary', 'Won''t You Be My Neighbor?', 186, 4.8),
(21, '[21]', 'scifi', 'Gravity', 342, 4.0),
(22, '[22]', 'scifi', 'Dune', 451, 4.4),
(23, '[23]', 'scifi', 'The Martian', 522, 4.6),
(24, '[24]', 'horror', 'A Quiet Place', 271, 4.3),
(25, '[25]', 'fantasy', 'The Chronicles of Narnia: The Lion, the Witch and the Wardrobe', 310, 3.9);
--select * from vec_movies;
--select * from vec_movies_metadata_chunks00;
create virtual table vec_chunks using vec0(
user_id integer partition key,
+contents text,
contents_embedding float[1],
);
INSERT INTO vec_chunks (rowid, user_id, contents, contents_embedding) VALUES
(1, 123, 'Our PTO policy allows employees to take both vacation and sick leave as needed.', '[1]'),
(2, 123, 'Employees must provide notice at least two weeks in advance for planned vacations.', '[2]'),
(3, 123, 'Sick leave can be taken without advance notice, but employees must inform their manager.', '[3]'),
(4, 123, 'Unused PTO can be carried over to the following year, up to a maximum of 40 hours.', '[4]'),
(5, 123, 'PTO must be used in increments of at least 4 hours.', '[5]'),
(6, 456, 'New employees are granted 10 days of PTO during their first year of employment.', '[6]'),
(7, 456, 'After the first year, employees earn an additional day of PTO for each year of service.', '[7]'),
(8, 789, 'PTO requests will be reviewed by the HR department and are subject to approval.', '[8]'),
(9, 789, 'The company reserves the right to deny PTO requests during peak operational periods.', '[9]'),
(10, 456, 'If PTO is denied, the employee will be given an alternative time to take leave.', '[10]'),
(11, 789, 'Employees who are out of PTO must request unpaid leave for any additional time off.', '[11]'),
(12, 789, 'In case of a family emergency, employees can request emergency leave.', '[12]'),
(13, 456, 'Emergency leave may be granted for personal or family illness, or other critical situations.', '[13]'),
(14, 789, 'The maximum length of emergency leave is subject to company discretion.', '[14]'),
(15, 123, 'All PTO balances will be displayed on the employee self-service portal.', '[15]'),
(16, 456, 'Employees who are terminated will be paid for unused PTO, as per state law.', '[16]'),
(17, 123, 'Part-time employees are eligible for PTO on a pro-rata basis.', '[17]'),
(18, 789, 'The company encourages employees to use their PTO to maintain work-life balance.', '[18]'),
(19, 456, 'Employees should not book travel plans until their PTO request has been approved.', '[19]'),
(20, 123, 'Managers are responsible for tracking their team members'' PTO usage.', '[20]');
-- KNN lookup against vec_chunks: the 5 nearest rows to the query vector '[19]',
-- limited to rows whose user_id is 123.
SELECT
  rowid,
  user_id,
  contents,
  distance
FROM vec_chunks
WHERE contents_embedding MATCH '[19]'
  AND user_id = 123
  AND k = 5;
-- Quit the sqlite3 shell at this point.
.exit
-- PARTITION KEY and auxiliary columns!
-- Chunk store with a partition key, an auxiliary text column and one
-- 1024-dimensional float vector column.
-- The vector column is named contents_embedding (singular) because the KNN
-- queries against vec_chunks in this script MATCH on contents_embedding; the
-- previous spelling, contents_embeddings, left those queries without a
-- matching column.
create virtual table vec_chunks using vec0(
  -- internally shard the vector index by user
  user_id integer partition key,
  -- store the chunk text pre-embedding as an "auxiliary column"
  +contents text,
  contents_embedding float[1024]
);
-- KNN search over vec_chunks limited to user_id = 123, returning 5 rows.
-- '[...]' is a placeholder, presumably for a real query vector.
SELECT
  rowid,
  user_id,
  contents,
  distance
FROM vec_chunks
WHERE contents_embedding MATCH '[...]'
  AND user_id = 123
  AND k = 5;
/*
┌───────┬─────────┬──────────────────────────────────────────────────────────────┬──────────┐
│ rowid │ user_id │ contents │ distance │
├───────┼─────────┼──────────────────────────────────────────────────────────────┼──────────┤
│ 20 │ 123 │ 'Managers are responsible for tracking their team members'' │ 1.0 │
│ │ │ PTO usage.' │ │
├───────┼─────────┼──────────────────────────────────────────────────────────────┼──────────┤
│ 17 │ 123 │ 'Part-time employees are eligible for PTO on a pro-rata basi │ 2.0 │
│ │ │ s.' │ │
├───────┼─────────┼──────────────────────────────────────────────────────────────┼──────────┤
│ 15 │ 123 │ 'All PTO balances will be displayed on the employee self-ser │ 4.0 │
│ │ │ vice portal.' │ │
├───────┼─────────┼──────────────────────────────────────────────────────────────┼──────────┤
│ 5 │ 123 │ 'PTO must be used in increments of at least 4 hours.' │ 14.0 │
├───────┼─────────┼──────────────────────────────────────────────────────────────┼──────────┤
│ 4 │ 123 │ 'Unused PTO can be carried over to the following year, up to │ 15.0 │
│ │ │ a maximum of 40 hours.' │ │
└───────┴─────────┴──────────────────────────────────────────────────────────────┴──────────┘
*/
-- metadata filters!
-- `title` is selected by the query below and appears in its expected output,
-- so the table has to declare it. It is declared as an auxiliary column
-- (leading "+") because the query only returns it and never filters on it.
create virtual table vec_movies using vec0(
  movie_id integer primary key,
  synopsis_embedding float[1024],
  genre text,
  num_reviews int,
  mean_rating float,
  +title text
);

-- KNN search for the 5 nearest synopses to '[15.5]', narrowed by constraints
-- on the genre, num_reviews and mean_rating columns.
select
  movie_id,
  title,
  genre,
  num_reviews,
  mean_rating,
  distance
from vec_movies
where synopsis_embedding match '[15.5]'
  and genre = 'scifi'
  and num_reviews between 100 and 500
  and mean_rating > 3.5
  and k = 5;
/*
┌──────────┬─────────────────────┬─────────┬─────────────┬──────────────────┬──────────┐
│ movie_id │ title │ genre │ num_reviews │ mean_rating │ distance │
├──────────┼─────────────────────┼─────────┼─────────────┼──────────────────┼──────────┤
│ 13 │ 'The Matrix' │ 'scifi' │ 423 │ 4.5 │ 2.5 │
│ 18 │ 'Inception' │ 'scifi' │ 201 │ 5.0 │ 2.5 │
│ 21 │ 'Gravity' │ 'scifi' │ 342 │ 4.0 │ 5.5 │
│ 22 │ 'Dune' │ 'scifi' │ 451 │ 4.40000009536743 │ 6.5 │
│ 8 │ 'Blade Runner 2049' │ 'scifi' │ 301 │ 5.0 │ 7.5 │
└──────────┴─────────────────────┴─────────┴─────────────┴──────────────────┴──────────┘
*/
.exit
-- vec_movies variant whose synopsis embedding has 768 dimensions.
-- NOTE(review): the trailing comma after the last column is kept exactly as
-- written; confirm it is intentional.
CREATE VIRTUAL TABLE vec_movies USING vec0(
  movie_id integer primary key,
  synopsis_embedding float[768],
  genre text,
  num_reviews int,
  mean_rating float,
);
-- Quit the sqlite3 shell at this point.
.exit
-- Small vec_chunks table: integer primary key, a 1-dimensional vector and an
-- auxiliary text column.
CREATE VIRTUAL TABLE vec_chunks USING vec0(
  chunk_id integer primary key,
  contents_embedding float[1],
  +contents text
);

-- Four rows with vectors [1] through [4].
INSERT INTO vec_chunks(chunk_id, contents_embedding, contents)
VALUES
  (1, '[1]', 'alex'),
  (2, '[2]', 'brian'),
  (3, '[3]', 'craig'),
  (4, '[4]', 'dylan');

-- Dump every row.
SELECT * FROM vec_chunks;

-- The 3 nearest rows to the query vector '[5]'.
SELECT
  chunk_id,
  contents,
  distance
FROM vec_chunks
WHERE contents_embedding MATCH '[5]'
  AND k = 3;
-- Quit the sqlite3 shell at this point.
.exit
-- Minimal table with a single 1-dimensional vector column.
CREATE VIRTUAL TABLE v USING vec0(a float[1]);

-- Row count of the v_chunks table before any insert.
SELECT count(*) FROM v_chunks;

-- Insert one vector and read it back.
INSERT INTO v(a) VALUES ('[1.11]');
SELECT * FROM v;

-- Remove the table so the name v can be reused.
DROP TABLE v;
-- Table with two vector columns and two partition-key columns; the chunk_size
-- option is set to 32.
CREATE VIRTUAL TABLE v USING vec0(
  v_aaa float[1],
  partk_xxx int partition key,
  v_bbb float[2],
  partk_yyy text partition key,
  chunk_size=32
);

-- Three rows: rowids 1 and 2 carry (999, 'alex'), rowid 3 carries (999, 'brian').
INSERT INTO v(rowid, v_aaa, partk_xxx, v_bbb, partk_yyy)
VALUES
  (1, '[.1]', 999, '[.11, .11]', 'alex'),
  (2, '[.2]', 999, '[.22, .22]', 'alex'),
  (3, '[.3]', 999, '[.33, .33]', 'brian');

-- Read the rows back: vectors rendered as JSON, then raw, then a single rowid.
SELECT rowid, vec_to_json(v_aaa), partk_xxx, vec_to_json(v_bbb), partk_yyy FROM v;
SELECT * FROM v;
SELECT * FROM v WHERE rowid = 2;

-- Overwrite both vectors of rowid 2.
UPDATE v
SET
  v_aaa = '[.222]',
  v_bbb = '[.222, .222]'
WHERE rowid = 2;

-- Check the update through the table itself and through the v_chunks table.
SELECT rowid, vec_to_json(v_aaa), partk_xxx, vec_to_json(v_bbb), partk_yyy FROM v;
SELECT chunk_id, size, sequence_id, partition00, partition01, (validity), length(rowids) FROM v_chunks;

-- KNN search constrained on both partition-key columns.
--explain query plan
SELECT *, distance
FROM v
WHERE v_aaa MATCH '[.5]'
  AND partk_xxx = 999
  AND partk_yyy = 'alex'
  --and partk_xxx != 20
  AND k = 5;
================================================
FILE: tests/.gitignore
================================================
target/
================================================
FILE: tests/.python-version
================================================
3.12
================================================
FILE: tests/Cargo.toml
================================================
[package]
name = "tests"
version = "0.1.0"
edition = "2021"
[dependencies]
[build-dependencies]
cc = "1.0"
[[bin]]
name = "unittest"
path = "unittest.rs"
================================================
FILE: tests/__snapshots__/test-auxiliary.ambr
================================================
# serializer version: 1
# name: test_constructor_limit[max 16 auxiliary columns]
dict({
'error': 'OperationalError',
'message': 'vec0 constructor error: More than 16 auxiliary columns were provided',
})
# ---
# name: test_deletes
OrderedDict({
'sql': 'select rowid, * from v',
'rows': list([
OrderedDict({
'rowid': 1,
'vector': b'\x00\x00\x80?',
'name': 'alex',
}),
OrderedDict({
'rowid': 2,
'vector': b'\x00\x00\x00@',
'name': 'brian',
}),
OrderedDict({
'rowid': 3,
'vector': b'\x00\x00@@',
'name': 'craig',
}),
]),
})
# ---
# name: test_deletes.1
dict({
'v_auxiliary': OrderedDict({
'sql': 'select * from v_auxiliary',
'rows': list([
OrderedDict({
'rowid': 1,
'value00': 'alex',
}),
OrderedDict({
'rowid': 2,
'value00': 'brian',
}),
OrderedDict({
'rowid': 3,
'value00': 'craig',
}),
]),
}),
'v_chunks': OrderedDict({
'sql': 'select * from v_chunks',
'rows': list([
OrderedDict({
'chunk_id': 1,
'size': 8,
'validity': b'\x07',
'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_rowids': OrderedDict({
'sql': 'select * from v_rowids',
'rows': list([
OrderedDict({
'rowid': 1,
'id': None,
'chunk_id': 1,
'chunk_offset': 0,
}),
OrderedDict({
'rowid': 2,
'id': None,
'chunk_id': 1,
'chunk_offset': 1,
}),
OrderedDict({
'rowid': 3,
'id': None,
'chunk_id': 1,
'chunk_offset': 2,
}),
]),
}),
'v_vector_chunks00': OrderedDict({
'sql': 'select * from v_vector_chunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'vectors': b'\x00\x00\x80?\x00\x00\x00@\x00\x00@@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
})
# ---
# name: test_deletes.2
OrderedDict({
'sql': 'delete from v where rowid = 1',
'rows': list([
]),
})
# ---
# name: test_deletes.3
OrderedDict({
'sql': 'select rowid, * from v',
'rows': list([
OrderedDict({
'rowid': 2,
'vector': b'\x00\x00\x00@',
'name': 'brian',
}),
OrderedDict({
'rowid': 3,
'vector': b'\x00\x00@@',
'name': 'craig',
}),
]),
})
# ---
# name: test_deletes.4
dict({
'v_auxiliary': OrderedDict({
'sql': 'select * from v_auxiliary',
'rows': list([
OrderedDict({
'rowid': 2,
'value00': 'brian',
}),
OrderedDict({
'rowid': 3,
'value00': 'craig',
}),
]),
}),
'v_chunks': OrderedDict({
'sql': 'select * from v_chunks',
'rows': list([
OrderedDict({
'chunk_id': 1,
'size': 8,
'validity': b'\x06',
'rowids': b'\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_rowids': OrderedDict({
'sql': 'select * from v_rowids',
'rows': list([
OrderedDict({
'rowid': 2,
'id': None,
'chunk_id': 1,
'chunk_offset': 1,
}),
OrderedDict({
'rowid': 3,
'id': None,
'chunk_id': 1,
'chunk_offset': 2,
}),
]),
}),
'v_vector_chunks00': OrderedDict({
'sql': 'select * from v_vector_chunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'vectors': b'\x00\x00\x00\x00\x00\x00\x00@\x00\x00@@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
})
# ---
# name: test_knn
OrderedDict({
'sql': 'select * from v',
'rows': list([
OrderedDict({
'rowid': 1,
'vector': b'\x00\x00\x80?',
'name': 'alex',
}),
OrderedDict({
'rowid': 2,
'vector': b'\x00\x00\x00@',
'name': 'brian',
}),
OrderedDict({
'rowid': 3,
'vector': b'\x00\x00@@',
'name': 'craig',
}),
]),
})
# ---
# name: test_knn[illegal KNN w/ aux]
dict({
'error': 'OperationalError',
'message': 'An illegal WHERE constraint was provided on a vec0 auxiliary column in a KNN query.',
})
# ---
# name: test_knn[legal KNN w/ aux]
OrderedDict({
'sql': "select *, distance from v where vector match '[5]' and k = 10",
'rows': list([
OrderedDict({
'rowid': 3,
'vector': b'\x00\x00@@',
'name': 'craig',
'distance': 2.0,
}),
OrderedDict({
'rowid': 2,
'vector': b'\x00\x00\x00@',
'name': 'brian',
'distance': 3.0,
}),
OrderedDict({
'rowid': 1,
'vector': b'\x00\x00\x80?',
'name': 'alex',
'distance': 4.0,
}),
]),
})
# ---
# name: test_normal.1
OrderedDict({
'sql': 'select * from v',
'rows': list([
OrderedDict({
'rowid': 1,
'a': b'\x11\x11\x11\x11',
'name': 'alex',
}),
OrderedDict({
'rowid': 2,
'a': b'""""',
'name': 'brian',
}),
OrderedDict({
'rowid': 3,
'a': b'3333',
'name': 'craig',
}),
]),
})
# ---
# name: test_normal.2
dict({
'v_auxiliary': OrderedDict({
'sql': 'select * from v_auxiliary',
'rows': list([
OrderedDict({
'rowid': 1,
'value00': 'alex',
}),
OrderedDict({
'rowid': 2,
'value00': 'brian',
}),
OrderedDict({
'rowid': 3,
'value00': 'craig',
}),
]),
}),
'v_chunks': OrderedDict({
'sql': 'select * from v_chunks',
'rows': list([
OrderedDict({
'chunk_id': 1,
'size': 8,
'validity': b'\x07',
'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_rowids': OrderedDict({
'sql': 'select * from v_rowids',
'rows': list([
OrderedDict({
'rowid': 1,
'id': None,
'chunk_id': 1,
'chunk_offset': 0,
}),
OrderedDict({
'rowid': 2,
'id': None,
'chunk_id': 1,
'chunk_offset': 1,
}),
OrderedDict({
'rowid': 3,
'id': None,
'chunk_id': 1,
'chunk_offset': 2,
}),
]),
}),
'v_vector_chunks00': OrderedDict({
'sql': 'select * from v_vector_chunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'vectors': b'\x11\x11\x11\x11""""3333\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
})
# ---
# name: test_normal[sqlite_master post drop]
OrderedDict({
'sql': 'select * from sqlite_master order by name',
'rows': list([
OrderedDict({
'type': 'table',
'name': 'sqlite_sequence',
'tbl_name': 'sqlite_sequence',
'rootpage': 5,
'sql': 'CREATE TABLE sqlite_sequence(name,seq)',
}),
]),
})
# ---
# name: test_normal[sqlite_master]
OrderedDict({
'sql': 'select * from sqlite_master order by name',
'rows': list([
OrderedDict({
'type': 'index',
'name': 'sqlite_autoindex_v_info_1',
'tbl_name': 'v_info',
'rootpage': 3,
'sql': None,
}),
OrderedDict({
'type': 'index',
'name': 'sqlite_autoindex_v_vector_chunks00_1',
'tbl_name': 'v_vector_chunks00',
'rootpage': 8,
'sql': None,
}),
OrderedDict({
'type': 'table',
'name': 'sqlite_sequence',
'tbl_name': 'sqlite_sequence',
'rootpage': 5,
'sql': 'CREATE TABLE sqlite_sequence(name,seq)',
}),
OrderedDict({
'type': 'table',
'name': 'v',
'tbl_name': 'v',
'rootpage': 0,
'sql': 'CREATE VIRTUAL TABLE v using vec0(a float[1], +name text, chunk_size=8)',
}),
OrderedDict({
'type': 'table',
'name': 'v_auxiliary',
'tbl_name': 'v_auxiliary',
'rootpage': 9,
'sql': 'CREATE TABLE "v_auxiliary"( rowid integer PRIMARY KEY , value00)',
}),
OrderedDict({
'type': 'table',
'name': 'v_chunks',
'tbl_name': 'v_chunks',
'rootpage': 4,
'sql': 'CREATE TABLE "v_chunks"(chunk_id INTEGER PRIMARY KEY AUTOINCREMENT,size INTEGER NOT NULL,validity BLOB NOT NULL,rowids BLOB NOT NULL)',
}),
OrderedDict({
'type': 'table',
'name': 'v_info',
'tbl_name': 'v_info',
'rootpage': 2,
'sql': 'CREATE TABLE "v_info" (key text primary key, value any)',
}),
OrderedDict({
'type': 'table',
'name': 'v_rowids',
'tbl_name': 'v_rowids',
'rootpage': 6,
'sql': 'CREATE TABLE "v_rowids"(rowid INTEGER PRIMARY KEY AUTOINCREMENT,id,chunk_id INTEGER,chunk_offset INTEGER)',
}),
OrderedDict({
'type': 'table',
'name': 'v_vector_chunks00',
'tbl_name': 'v_vector_chunks00',
'rootpage': 7,
'sql': 'CREATE TABLE "v_vector_chunks00"(rowid PRIMARY KEY,vectors BLOB NOT NULL)',
}),
]),
})
# ---
# name: test_types
OrderedDict({
'sql': 'select * from v',
'rows': list([
]),
})
# ---
# name: test_types.1
OrderedDict({
'sql': 'insert into v(vector, aux_int, aux_float, aux_text, aux_blob) values (?, ?, ?, ?, ?)',
'rows': list([
]),
})
# ---
# name: test_types.2
OrderedDict({
'sql': 'select * from v',
'rows': list([
OrderedDict({
'rowid': 1,
'vector': b'\x11\x11\x11\x11',
'aux_int': 1,
'aux_float': 1.22,
'aux_text': 'text',
'aux_blob': b'blob',
}),
]),
})
# ---
# name: test_types.3
dict({
'error': 'IntegrityError',
'message': 'Auxiliary column type mismatch: The auxiliary column aux_int has type INTEGER, but TEXT was provided.',
})
# ---
# name: test_types.4
dict({
'error': 'IntegrityError',
'message': 'Auxiliary column type mismatch: The auxiliary column aux_float has type FLOAT, but TEXT was provided.',
})
# ---
# name: test_types.5
dict({
'error': 'IntegrityError',
'message': 'Auxiliary column type mismatch: The auxiliary column aux_text has type TEXT, but INTEGER was provided.',
})
# ---
# name: test_types.6
dict({
'error': 'IntegrityError',
'message': 'Auxiliary column type mismatch: The auxiliary column aux_blob has type BLOB, but INTEGER was provided.',
})
# ---
# name: test_types.7
OrderedDict({
'sql': 'insert into v(vector, aux_int, aux_float, aux_text, aux_blob) values (?, ?, ?, ?, ?)',
'rows': list([
]),
})
# ---
# name: test_types.8
OrderedDict({
'sql': 'select * from v',
'rows': list([
OrderedDict({
'rowid': 1,
'vector': b'\x11\x11\x11\x11',
'aux_int': 1,
'aux_float': 1.22,
'aux_text': 'text',
'aux_blob': b'blob',
}),
OrderedDict({
'rowid': 2,
'vector': b'\x11\x11\x11\x11',
'aux_int': None,
'aux_float': None,
'aux_text': None,
'aux_blob': None,
}),
]),
})
# ---
# name: test_updates
OrderedDict({
'sql': 'select rowid, * from v',
'rows': list([
OrderedDict({
'rowid': 1,
'vector': b'\x00\x00\x80?',
'name': 'alex',
}),
OrderedDict({
'rowid': 2,
'vector': b'\x00\x00\x00@',
'name': 'brian',
}),
OrderedDict({
'rowid': 3,
'vector': b'\x00\x00@@',
'name': 'craig',
}),
]),
})
# ---
# name: test_updates.1
dict({
'v_auxiliary': OrderedDict({
'sql': 'select * from v_auxiliary',
'rows': list([
OrderedDict({
'rowid': 1,
'value00': 'alex',
}),
OrderedDict({
'rowid': 2,
'value00': 'brian',
}),
OrderedDict({
'rowid': 3,
'value00': 'craig',
}),
]),
}),
'v_chunks': OrderedDict({
'sql': 'select * from v_chunks',
'rows': list([
OrderedDict({
'chunk_id': 1,
'size': 8,
'validity': b'\x07',
'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_rowids': OrderedDict({
'sql': 'select * from v_rowids',
'rows': list([
OrderedDict({
'rowid': 1,
'id': None,
'chunk_id': 1,
'chunk_offset': 0,
}),
OrderedDict({
'rowid': 2,
'id': None,
'chunk_id': 1,
'chunk_offset': 1,
}),
OrderedDict({
'rowid': 3,
'id': None,
'chunk_id': 1,
'chunk_offset': 2,
}),
]),
}),
'v_vector_chunks00': OrderedDict({
'sql': 'select * from v_vector_chunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'vectors': b'\x00\x00\x80?\x00\x00\x00@\x00\x00@@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
})
# ---
# name: test_updates.2
OrderedDict({
'sql': "update v set name = 'ALEX' where rowid = 1",
'rows': list([
]),
})
# ---
# name: test_updates.3
OrderedDict({
'sql': 'select rowid, * from v',
'rows': list([
OrderedDict({
'rowid': 1,
'vector': b'\x00\x00\x80?',
'name': 'ALEX',
}),
OrderedDict({
'rowid': 2,
'vector': b'\x00\x00\x00@',
'name': 'brian',
}),
OrderedDict({
'rowid': 3,
'vector': b'\x00\x00@@',
'name': 'craig',
}),
]),
})
# ---
# name: test_updates.4
dict({
'v_auxiliary': OrderedDict({
'sql': 'select * from v_auxiliary',
'rows': list([
OrderedDict({
'rowid': 1,
'value00': 'ALEX',
}),
OrderedDict({
'rowid': 2,
'value00': 'brian',
}),
OrderedDict({
'rowid': 3,
'value00': 'craig',
}),
]),
}),
'v_chunks': OrderedDict({
'sql': 'select * from v_chunks',
'rows': list([
OrderedDict({
'chunk_id': 1,
'size': 8,
'validity': b'\x07',
'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_rowids': OrderedDict({
'sql': 'select * from v_rowids',
'rows': list([
OrderedDict({
'rowid': 1,
'id': None,
'chunk_id': 1,
'chunk_offset': 0,
}),
OrderedDict({
'rowid': 2,
'id': None,
'chunk_id': 1,
'chunk_offset': 1,
}),
OrderedDict({
'rowid': 3,
'id': None,
'chunk_id': 1,
'chunk_offset': 2,
}),
]),
}),
'v_vector_chunks00': OrderedDict({
'sql': 'select * from v_vector_chunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'vectors': b'\x00\x00\x80?\x00\x00\x00@\x00\x00@@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
})
# ---
================================================
FILE: tests/__snapshots__/test-general.ambr
================================================
# serializer version: 1
# name: test_info
OrderedDict({
'sql': 'select key, typeof(value) from v_info order by 1',
'rows': list([
OrderedDict({
'key': 'CREATE_VERSION',
'typeof(value)': 'text',
}),
OrderedDict({
'key': 'CREATE_VERSION_MAJOR',
'typeof(value)': 'integer',
}),
OrderedDict({
'key': 'CREATE_VERSION_MINOR',
'typeof(value)': 'integer',
}),
OrderedDict({
'key': 'CREATE_VERSION_PATCH',
'typeof(value)': 'integer',
}),
]),
})
# ---
# name: test_shadow
OrderedDict({
'sql': 'select * from sqlite_master order by name',
'rows': list([
OrderedDict({
'type': 'index',
'name': 'sqlite_autoindex_v_info_1',
'tbl_name': 'v_info',
'rootpage': 3,
'sql': None,
}),
OrderedDict({
'type': 'index',
'name': 'sqlite_autoindex_v_metadatachunks00_1',
'tbl_name': 'v_metadatachunks00',
'rootpage': 10,
'sql': None,
}),
OrderedDict({
'type': 'index',
'name': 'sqlite_autoindex_v_metadatatext00_1',
'tbl_name': 'v_metadatatext00',
'rootpage': 12,
'sql': None,
}),
OrderedDict({
'type': 'index',
'name': 'sqlite_autoindex_v_vector_chunks00_1',
'tbl_name': 'v_vector_chunks00',
'rootpage': 8,
'sql': None,
}),
OrderedDict({
'type': 'table',
'name': 'sqlite_sequence',
'tbl_name': 'sqlite_sequence',
'rootpage': 5,
'sql': 'CREATE TABLE sqlite_sequence(name,seq)',
}),
OrderedDict({
'type': 'table',
'name': 'v',
'tbl_name': 'v',
'rootpage': 0,
'sql': 'CREATE VIRTUAL TABLE v using vec0(a float[1], partition text partition key, metadata text, +name text, chunk_size=8)',
}),
OrderedDict({
'type': 'table',
'name': 'v_auxiliary',
'tbl_name': 'v_auxiliary',
'rootpage': 13,
'sql': 'CREATE TABLE "v_auxiliary"( rowid integer PRIMARY KEY , value00)',
}),
OrderedDict({
'type': 'table',
'name': 'v_chunks',
'tbl_name': 'v_chunks',
'rootpage': 4,
'sql': 'CREATE TABLE "v_chunks"(chunk_id INTEGER PRIMARY KEY AUTOINCREMENT,size INTEGER NOT NULL,sequence_id integer,partition00,validity BLOB NOT NULL, rowids BLOB NOT NULL)',
}),
OrderedDict({
'type': 'table',
'name': 'v_info',
'tbl_name': 'v_info',
'rootpage': 2,
'sql': 'CREATE TABLE "v_info" (key text primary key, value any)',
}),
OrderedDict({
'type': 'table',
'name': 'v_metadatachunks00',
'tbl_name': 'v_metadatachunks00',
'rootpage': 9,
'sql': 'CREATE TABLE "v_metadatachunks00"(rowid PRIMARY KEY, data BLOB NOT NULL)',
}),
OrderedDict({
'type': 'table',
'name': 'v_metadatatext00',
'tbl_name': 'v_metadatatext00',
'rootpage': 11,
'sql': 'CREATE TABLE "v_metadatatext00"(rowid PRIMARY KEY, data TEXT)',
}),
OrderedDict({
'type': 'table',
'name': 'v_rowids',
'tbl_name': 'v_rowids',
'rootpage': 6,
'sql': 'CREATE TABLE "v_rowids"(rowid INTEGER PRIMARY KEY AUTOINCREMENT,id,chunk_id INTEGER,chunk_offset INTEGER)',
}),
OrderedDict({
'type': 'table',
'name': 'v_vector_chunks00',
'tbl_name': 'v_vector_chunks00',
'rootpage': 7,
'sql': 'CREATE TABLE "v_vector_chunks00"(rowid PRIMARY KEY,vectors BLOB NOT NULL)',
}),
]),
})
# ---
# name: test_shadow.1
OrderedDict({
'sql': "select * from pragma_table_list where type = 'shadow' order by name",
'rows': list([
OrderedDict({
'schema': 'main',
'name': 'v_auxiliary',
'type': 'shadow',
'ncol': 2,
'wr': 0,
'strict': 0,
}),
OrderedDict({
'schema': 'main',
'name': 'v_chunks',
'type': 'shadow',
'ncol': 6,
'wr': 0,
'strict': 0,
}),
OrderedDict({
'schema': 'main',
'name': 'v_info',
'type': 'shadow',
'ncol': 2,
'wr': 0,
'strict': 0,
}),
OrderedDict({
'schema': 'main',
'name': 'v_metadatachunks00',
'type': 'shadow',
'ncol': 2,
'wr': 0,
'strict': 0,
}),
OrderedDict({
'schema': 'main',
'name': 'v_metadatatext00',
'type': 'shadow',
'ncol': 2,
'wr': 0,
'strict': 0,
}),
OrderedDict({
'schema': 'main',
'name': 'v_rowids',
'type': 'shadow',
'ncol': 4,
'wr': 0,
'strict': 0,
}),
]),
})
# ---
# name: test_shadow.2
OrderedDict({
'sql': "select * from pragma_table_list where type = 'shadow' order by name",
'rows': list([
]),
})
# ---
================================================
FILE: tests/__snapshots__/test-insert-delete.ambr
================================================
# serializer version: 1
# name: test_info_table_contents
OrderedDict({
'sql': "select key, value from v_info where key not like 'CREATE_VERSION%' order by key",
'rows': list([
]),
})
# ---
# name: test_insert_creates_chunks_and_vectors[rowids_count]
OrderedDict({
'sql': 'select count(*) as cnt from v_rowids',
'rows': list([
OrderedDict({
'cnt': 5,
}),
]),
})
# ---
# name: test_insert_creates_chunks_and_vectors[vector_chunks_count]
OrderedDict({
'sql': 'select count(*) as cnt from v_vector_chunks00',
'rows': list([
OrderedDict({
'cnt': 1,
}),
]),
})
# ---
# name: test_insert_text_primary_key[rowids]
OrderedDict({
'sql': 'select rowid, id, chunk_id, chunk_offset from v_rowids order by rowid',
'rows': list([
OrderedDict({
'rowid': 1,
'id': 'doc_a',
'chunk_id': 1,
'chunk_offset': 0,
}),
OrderedDict({
'rowid': 2,
'id': 'doc_b',
'chunk_id': 1,
'chunk_offset': 1,
}),
]),
})
# ---
================================================
FILE: tests/__snapshots__/test-knn-distance-constraints.ambr
================================================
# serializer version: 1
# name: test_normal
OrderedDict({
'sql': 'SELECT * FROM v',
'rows': list([
OrderedDict({
'rowid': 1,
'embedding': b'\x00\x00\x80?',
'is_odd': 1,
}),
OrderedDict({
'rowid': 2,
'embedding': b'\x00\x00\x00@',
'is_odd': 0,
}),
OrderedDict({
'rowid': 3,
'embedding': b'\x00\x00@@',
'is_odd': 1,
}),
OrderedDict({
'rowid': 4,
'embedding': b'\x00\x00\x80@',
'is_odd': 0,
}),
OrderedDict({
'rowid': 5,
'embedding': b'\x00\x00\xa0@',
'is_odd': 1,
}),
OrderedDict({
'rowid': 6,
'embedding': b'\x00\x00\xc0@',
'is_odd': 0,
}),
OrderedDict({
'rowid': 7,
'embedding': b'\x00\x00\xe0@',
'is_odd': 1,
}),
OrderedDict({
'rowid': 8,
'embedding': b'\x00\x00\x00A',
'is_odd': 0,
}),
OrderedDict({
'rowid': 9,
'embedding': b'\x00\x00\x10A',
'is_odd': 1,
}),
OrderedDict({
'rowid': 10,
'embedding': b'\x00\x00 A',
'is_odd': 0,
}),
OrderedDict({
'rowid': 11,
'embedding': b'\x00\x000A',
'is_odd': 1,
}),
OrderedDict({
'rowid': 12,
'embedding': b'\x00\x00@A',
'is_odd': 0,
}),
OrderedDict({
'rowid': 13,
'embedding': b'\x00\x00PA',
'is_odd': 1,
}),
OrderedDict({
'rowid': 14,
'embedding': b'\x00\x00`A',
'is_odd': 0,
}),
OrderedDict({
'rowid': 15,
'embedding': b'\x00\x00pA',
'is_odd': 1,
}),
OrderedDict({
'rowid': 16,
'embedding': b'\x00\x00\x80A',
'is_odd': 0,
}),
OrderedDict({
'rowid': 17,
'embedding': b'\x00\x00\x88A',
'is_odd': 1,
}),
]),
})
# ---
# name: test_normal.1
OrderedDict({
'sql': 'select rowid, distance from v where embedding match ? and k = ? ',
'rows': list([
OrderedDict({
'rowid': 1,
'distance': 0.0,
}),
OrderedDict({
'rowid': 2,
'distance': 1.0,
}),
OrderedDict({
'rowid': 3,
'distance': 2.0,
}),
OrderedDict({
'rowid': 4,
'distance': 3.0,
}),
OrderedDict({
'rowid': 5,
'distance': 4.0,
}),
]),
})
# ---
# name: test_normal.2
OrderedDict({
'sql': 'select rowid, distance from v where embedding match ? and k = ? AND distance > 5',
'rows': list([
OrderedDict({
'rowid': 7,
'distance': 6.0,
}),
OrderedDict({
'rowid': 8,
'distance': 7.0,
}),
OrderedDict({
'rowid': 9,
'distance': 8.0,
}),
OrderedDict({
'rowid': 10,
'distance': 9.0,
}),
OrderedDict({
'rowid': 11,
'distance': 10.0,
}),
]),
})
# ---
# name: test_normal.3
OrderedDict({
'sql': 'select rowid, distance from v where embedding match ? and k = ? AND distance >= 5',
'rows': list([
OrderedDict({
'rowid': 6,
'distance': 5.0,
}),
OrderedDict({
'rowid': 7,
'distance': 6.0,
}),
OrderedDict({
'rowid': 8,
'distance': 7.0,
}),
OrderedDict({
'rowid': 9,
'distance': 8.0,
}),
OrderedDict({
'rowid': 10,
'distance': 9.0,
}),
]),
})
# ---
# name: test_normal.4
OrderedDict({
'sql': 'select rowid, distance from v where embedding match ? and k = ? AND distance < 3',
'rows': list([
OrderedDict({
'rowid': 1,
'distance': 0.0,
}),
OrderedDict({
'rowid': 2,
'distance': 1.0,
}),
OrderedDict({
'rowid': 3,
'distance': 2.0,
}),
]),
})
# ---
# name: test_normal.5
OrderedDict({
'sql': 'select rowid, distance from v where embedding match ? and k = ? AND distance <= 3',
'rows': list([
OrderedDict({
'rowid': 1,
'distance': 0.0,
}),
OrderedDict({
'rowid': 2,
'distance': 1.0,
}),
OrderedDict({
'rowid': 3,
'distance': 2.0,
}),
OrderedDict({
'rowid': 4,
'distance': 3.0,
}),
]),
})
# ---
# name: test_normal.6
OrderedDict({
'sql': 'select rowid, distance from v where embedding match ? and k = ? AND distance > 7 AND distance <= 10',
'rows': list([
OrderedDict({
'rowid': 9,
'distance': 8.0,
}),
OrderedDict({
'rowid': 10,
'distance': 9.0,
}),
OrderedDict({
'rowid': 11,
'distance': 10.0,
}),
]),
})
# ---
# name: test_normal.7
OrderedDict({
'sql': 'select rowid, distance from v where embedding match ? and k = ? AND distance BETWEEN 7 AND 10',
'rows': list([
OrderedDict({
'rowid': 8,
'distance': 7.0,
}),
OrderedDict({
'rowid': 9,
'distance': 8.0,
}),
OrderedDict({
'rowid': 10,
'distance': 9.0,
}),
OrderedDict({
'rowid': 11,
'distance': 10.0,
}),
]),
})
# ---
# name: test_normal.8
OrderedDict({
'sql': 'select rowid, distance from v where embedding match ? and k = ? AND is_odd == TRUE AND distance BETWEEN 7 AND 10',
'rows': list([
OrderedDict({
'rowid': 9,
'distance': 8.0,
}),
OrderedDict({
'rowid': 11,
'distance': 10.0,
}),
]),
})
# ---
================================================
FILE: tests/__snapshots__/test-metadata.ambr
================================================
# serializer version: 1
# name: test_constructor_limit[max 16 metadata columns]
dict({
'error': 'OperationalError',
'message': 'vec0 constructor error: More than 16 metadata columns were provided',
})
# ---
# name: test_deletes
OrderedDict({
'sql': 'insert into v(rowid, vector, b, n, f, t) values (?, ?, ?, ?, ?, ?)',
'rows': list([
]),
})
# ---
# name: test_deletes.1
OrderedDict({
'sql': 'insert into v(rowid, vector, b, n, f, t) values (?, ?, ?, ?, ?, ?)',
'rows': list([
]),
})
# ---
# name: test_deletes.10
dict({
'v_chunks': OrderedDict({
'sql': 'select * from v_chunks',
'rows': list([
OrderedDict({
'chunk_id': 1,
'size': 8,
'validity': b'\x06',
'rowids': b'\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks00': OrderedDict({
'sql': 'select * from v_metadatachunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x06',
}),
]),
}),
'v_metadatachunks01': OrderedDict({
'sql': 'select * from v_metadatachunks01',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks02': OrderedDict({
'sql': 'select * from v_metadatachunks02',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x00\x00\x00\x00\x00\x00\x00\x00\x9a\x99\x99\x99\x99\x99\x01@ffffff\n@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks03': OrderedDict({
'sql': 'select * from v_metadatachunks03',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00test2\x00\x00\x00\x00\x00\x00\x00\r\x00\x00\x00123456789012\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatatext03': OrderedDict({
'sql': 'select * from v_metadatatext03',
'rows': list([
OrderedDict({
'rowid': 3,
'data': '1234567890123',
}),
]),
}),
'v_rowids': OrderedDict({
'sql': 'select * from v_rowids',
'rows': list([
OrderedDict({
'rowid': 2,
'id': None,
'chunk_id': 1,
'chunk_offset': 1,
}),
OrderedDict({
'rowid': 3,
'id': None,
'chunk_id': 1,
'chunk_offset': 2,
}),
]),
}),
'v_vector_chunks00': OrderedDict({
'sql': 'select * from v_vector_chunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'vectors': b'\x00\x00\x00\x00""""3333\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
})
# ---
# name: test_deletes.2
OrderedDict({
'sql': 'insert into v(rowid, vector, b, n, f, t) values (?, ?, ?, ?, ?, ?)',
'rows': list([
]),
})
# ---
# name: test_deletes.3
OrderedDict({
'sql': 'select * from v',
'rows': list([
OrderedDict({
'rowid': 1,
'vector': b'\x11\x11\x11\x11',
'b': 1,
'n': 1,
'f': 1.1,
't': 'test1',
}),
OrderedDict({
'rowid': 2,
'vector': b'""""',
'b': 1,
'n': 2,
'f': 2.2,
't': 'test2',
}),
OrderedDict({
'rowid': 3,
'vector': b'3333',
'b': 1,
'n': 3,
'f': 3.3,
't': '1234567890123',
}),
]),
})
# ---
# name: test_deletes.4
dict({
'v_chunks': OrderedDict({
'sql': 'select * from v_chunks',
'rows': list([
OrderedDict({
'chunk_id': 1,
'size': 8,
'validity': b'\x07',
'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks00': OrderedDict({
'sql': 'select * from v_metadatachunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x07',
}),
]),
}),
'v_metadatachunks01': OrderedDict({
'sql': 'select * from v_metadatachunks01',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks02': OrderedDict({
'sql': 'select * from v_metadatachunks02',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x9a\x99\x99\x99\x99\x99\xf1?\x9a\x99\x99\x99\x99\x99\x01@ffffff\n@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks03': OrderedDict({
'sql': 'select * from v_metadatachunks03',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x05\x00\x00\x00test1\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00test2\x00\x00\x00\x00\x00\x00\x00\r\x00\x00\x00123456789012\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatatext03': OrderedDict({
'sql': 'select * from v_metadatatext03',
'rows': list([
OrderedDict({
'rowid': 3,
'data': '1234567890123',
}),
]),
}),
'v_rowids': OrderedDict({
'sql': 'select * from v_rowids',
'rows': list([
OrderedDict({
'rowid': 1,
'id': None,
'chunk_id': 1,
'chunk_offset': 0,
}),
OrderedDict({
'rowid': 2,
'id': None,
'chunk_id': 1,
'chunk_offset': 1,
}),
OrderedDict({
'rowid': 3,
'id': None,
'chunk_id': 1,
'chunk_offset': 2,
}),
]),
}),
'v_vector_chunks00': OrderedDict({
'sql': 'select * from v_vector_chunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'vectors': b'\x11\x11\x11\x11""""3333\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
})
# ---
# name: test_deletes.5
OrderedDict({
'sql': 'DELETE FROM v where rowid = 1',
'rows': list([
]),
})
# ---
# name: test_deletes.6
OrderedDict({
'sql': 'select * from v',
'rows': list([
OrderedDict({
'rowid': 2,
'vector': b'""""',
'b': 1,
'n': 2,
'f': 2.2,
't': 'test2',
}),
OrderedDict({
'rowid': 3,
'vector': b'3333',
'b': 1,
'n': 3,
'f': 3.3,
't': '1234567890123',
}),
]),
})
# ---
# name: test_deletes.7
dict({
'v_chunks': OrderedDict({
'sql': 'select * from v_chunks',
'rows': list([
OrderedDict({
'chunk_id': 1,
'size': 8,
'validity': b'\x06',
'rowids': b'\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks00': OrderedDict({
'sql': 'select * from v_metadatachunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x06',
}),
]),
}),
'v_metadatachunks01': OrderedDict({
'sql': 'select * from v_metadatachunks01',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x00\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks02': OrderedDict({
'sql': 'select * from v_metadatachunks02',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x00\x00\x00\x00\x00\x00\x00\x00\x9a\x99\x99\x99\x99\x99\x01@ffffff\n@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks03': OrderedDict({
'sql': 'select * from v_metadatachunks03',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00test2\x00\x00\x00\x00\x00\x00\x00\r\x00\x00\x00123456789012\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatatext03': OrderedDict({
'sql': 'select * from v_metadatatext03',
'rows': list([
OrderedDict({
'rowid': 3,
'data': '1234567890123',
}),
]),
}),
'v_rowids': OrderedDict({
'sql': 'select * from v_rowids',
'rows': list([
OrderedDict({
'rowid': 2,
'id': None,
'chunk_id': 1,
'chunk_offset': 1,
}),
OrderedDict({
'rowid': 3,
'id': None,
'chunk_id': 1,
'chunk_offset': 2,
}),
]),
}),
'v_vector_chunks00': OrderedDict({
'sql': 'select * from v_vector_chunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'vectors': b'\x00\x00\x00\x00""""3333\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
})
# ---
# name: test_deletes.8
OrderedDict({
'sql': 'DELETE FROM v where rowid = 3',
'rows': list([
]),
})
# ---
# name: test_deletes.9
OrderedDict({
'sql': 'select * from v',
'rows': list([
OrderedDict({
'rowid': 2,
'vector': b'""""',
'b': 1,
'n': 2,
'f': 2.2,
't': 'test2',
}),
OrderedDict({
'rowid': 3,
'vector': b'3333',
'b': 1,
'n': 3,
'f': 3.3,
't': '1234567890123',
}),
]),
})
# ---
# name: test_errors
OrderedDict({
'sql': 'select * from v',
'rows': list([
OrderedDict({
'rowid': 1,
'vector': b'\x00\x00\x80?',
't': 'aaaaaaaaaaaax',
}),
]),
})
# ---
# name: test_errors.1
dict({
'error': 'OperationalError',
'message': 'Could not extract metadata value for column t at rowid 1',
})
# ---
# name: test_idxstr
OrderedDict({
'sql': "select * from vec_movies where synopsis_embedding match '' and k = 0 and is_favorited = true",
'plan': list([
dict({
'detail': 'SCAN vec_movies VIRTUAL TABLE INDEX 0:3{___}___&Aa_',
'id': 2,
'parent': 0,
}),
]),
})
# ---
# name: test_idxstr[knn-constraint-float !=]
OrderedDict({
'sql': "select * from vec_movies where synopsis_embedding match '' and k = 0 and mean_rating != NULL",
'plan': list([
dict({
'detail': 'SCAN vec_movies VIRTUAL TABLE INDEX 0:3{___}___&Df_',
'id': 2,
'parent': 0,
}),
]),
})
# ---
# name: test_idxstr[knn-constraint-float <=]
OrderedDict({
'sql': "select * from vec_movies where synopsis_embedding match '' and k = 0 and mean_rating <= NULL",
'plan': list([
dict({
'detail': 'SCAN vec_movies VIRTUAL TABLE INDEX 0:3{___}___&Dc_',
'id': 2,
'parent': 0,
}),
]),
})
# ---
# name: test_idxstr[knn-constraint-float <]
OrderedDict({
'sql': "select * from vec_movies where synopsis_embedding match '' and k = 0 and mean_rating < NULL",
'plan': list([
dict({
'detail': 'SCAN vec_movies VIRTUAL TABLE INDEX 0:3{___}___&Dd_',
'id': 2,
'parent': 0,
}),
]),
})
# ---
# name: test_idxstr[knn-constraint-float >=]
OrderedDict({
'sql': "select * from vec_movies where synopsis_embedding match '' and k = 0 and mean_rating >= NULL",
'plan': list([
dict({
'detail': 'SCAN vec_movies VIRTUAL TABLE INDEX 0:3{___}___&De_',
'id': 2,
'parent': 0,
}),
]),
})
# ---
# name: test_idxstr[knn-constraint-float >]
OrderedDict({
'sql': "select * from vec_movies where synopsis_embedding match '' and k = 0 and mean_rating > NULL",
'plan': list([
dict({
'detail': 'SCAN vec_movies VIRTUAL TABLE INDEX 0:3{___}___&Db_',
'id': 2,
'parent': 0,
}),
]),
})
# ---
# name: test_idxstr[knn-constraint-int !=]
OrderedDict({
'sql': "select * from vec_movies where synopsis_embedding match '' and k = 0 and num_reviews != NULL",
'plan': list([
dict({
'detail': 'SCAN vec_movies VIRTUAL TABLE INDEX 0:3{___}___&Cf_',
'id': 2,
'parent': 0,
}),
]),
})
# ---
# name: test_idxstr[knn-constraint-int <=]
OrderedDict({
'sql': "select * from vec_movies where synopsis_embedding match '' and k = 0 and num_reviews <= NULL",
'plan': list([
dict({
'detail': 'SCAN vec_movies VIRTUAL TABLE INDEX 0:3{___}___&Cc_',
'id': 2,
'parent': 0,
}),
]),
})
# ---
# name: test_idxstr[knn-constraint-int <]
OrderedDict({
'sql': "select * from vec_movies where synopsis_embedding match '' and k = 0 and num_reviews < NULL",
'plan': list([
dict({
'detail': 'SCAN vec_movies VIRTUAL TABLE INDEX 0:3{___}___&Cd_',
'id': 2,
'parent': 0,
}),
]),
})
# ---
# name: test_idxstr[knn-constraint-int >=]
OrderedDict({
'sql': "select * from vec_movies where synopsis_embedding match '' and k = 0 and num_reviews >= NULL",
'plan': list([
dict({
'detail': 'SCAN vec_movies VIRTUAL TABLE INDEX 0:3{___}___&Ce_',
'id': 2,
'parent': 0,
}),
]),
})
# ---
# name: test_idxstr[knn-constraint-int >]
OrderedDict({
'sql': "select * from vec_movies where synopsis_embedding match '' and k = 0 and num_reviews > NULL",
'plan': list([
dict({
'detail': 'SCAN vec_movies VIRTUAL TABLE INDEX 0:3{___}___&Cb_',
'id': 2,
'parent': 0,
}),
]),
})
# ---
# name: test_idxstr[knn-constraint-text !=]
OrderedDict({
'sql': "select * from vec_movies where synopsis_embedding match '' and k = 0 and genre != NULL",
'plan': list([
dict({
'detail': 'SCAN vec_movies VIRTUAL TABLE INDEX 0:3{___}___&Bf_',
'id': 2,
'parent': 0,
}),
]),
})
# ---
# name: test_idxstr[knn-constraint-text <=]
OrderedDict({
'sql': "select * from vec_movies where synopsis_embedding match '' and k = 0 and genre <= NULL",
'plan': list([
dict({
'detail': 'SCAN vec_movies VIRTUAL TABLE INDEX 0:3{___}___&Bc_',
'id': 2,
'parent': 0,
}),
]),
})
# ---
# name: test_idxstr[knn-constraint-text <]
OrderedDict({
'sql': "select * from vec_movies where synopsis_embedding match '' and k = 0 and genre < NULL",
'plan': list([
dict({
'detail': 'SCAN vec_movies VIRTUAL TABLE INDEX 0:3{___}___&Bd_',
'id': 2,
'parent': 0,
}),
]),
})
# ---
# name: test_idxstr[knn-constraint-text >=]
OrderedDict({
'sql': "select * from vec_movies where synopsis_embedding match '' and k = 0 and genre >= NULL",
'plan': list([
dict({
'detail': 'SCAN vec_movies VIRTUAL TABLE INDEX 0:3{___}___&Be_',
'id': 2,
'parent': 0,
}),
]),
})
# ---
# name: test_idxstr[knn-constraint-text >]
OrderedDict({
'sql': "select * from vec_movies where synopsis_embedding match '' and k = 0 and genre > NULL",
'plan': list([
dict({
'detail': 'SCAN vec_movies VIRTUAL TABLE INDEX 0:3{___}___&Bb_',
'id': 2,
'parent': 0,
}),
]),
})
# ---
# name: test_knn.1
dict({
'error': 'OperationalError',
'message': 'An illegal WHERE constraint was provided on a vec0 metadata column in a KNN query. Only one of EQUALS, GREATER_THAN, LESS_THAN_OR_EQUAL, LESS_THAN, GREATER_THAN_OR_EQUAL, NOT_EQUALS is allowed.',
})
# ---
# name: test_knn[sqlite_master]
OrderedDict({
'sql': "select * from sqlite_master where type = 'table' order by name",
'rows': list([
OrderedDict({
'type': 'table',
'name': 'sqlite_sequence',
'tbl_name': 'sqlite_sequence',
'rootpage': 5,
'sql': 'CREATE TABLE sqlite_sequence(name,seq)',
}),
OrderedDict({
'type': 'table',
'name': 'v',
'tbl_name': 'v',
'rootpage': 0,
'sql': 'CREATE VIRTUAL TABLE v using vec0(vector float[1], name text, chunk_size=8)',
}),
OrderedDict({
'type': 'table',
'name': 'v_chunks',
'tbl_name': 'v_chunks',
'rootpage': 4,
'sql': 'CREATE TABLE "v_chunks"(chunk_id INTEGER PRIMARY KEY AUTOINCREMENT,size INTEGER NOT NULL,validity BLOB NOT NULL,rowids BLOB NOT NULL)',
}),
OrderedDict({
'type': 'table',
'name': 'v_info',
'tbl_name': 'v_info',
'rootpage': 2,
'sql': 'CREATE TABLE "v_info" (key text primary key, value any)',
}),
OrderedDict({
'type': 'table',
'name': 'v_metadatachunks00',
'tbl_name': 'v_metadatachunks00',
'rootpage': 9,
'sql': 'CREATE TABLE "v_metadatachunks00"(rowid PRIMARY KEY, data BLOB NOT NULL)',
}),
OrderedDict({
'type': 'table',
'name': 'v_metadatatext00',
'tbl_name': 'v_metadatatext00',
'rootpage': 11,
'sql': 'CREATE TABLE "v_metadatatext00"(rowid PRIMARY KEY, data TEXT)',
}),
OrderedDict({
'type': 'table',
'name': 'v_rowids',
'tbl_name': 'v_rowids',
'rootpage': 6,
'sql': 'CREATE TABLE "v_rowids"(rowid INTEGER PRIMARY KEY AUTOINCREMENT,id,chunk_id INTEGER,chunk_offset INTEGER)',
}),
OrderedDict({
'type': 'table',
'name': 'v_vector_chunks00',
'tbl_name': 'v_vector_chunks00',
'rootpage': 7,
'sql': 'CREATE TABLE "v_vector_chunks00"(rowid PRIMARY KEY,vectors BLOB NOT NULL)',
}),
]),
})
# ---
# name: test_long_text_knn[eq-bb]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name = ?",
'rows': list([
]),
})
# ---
# name: test_long_text_knn[eq-bbbb]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name = ?",
'rows': list([
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
]),
})
# ---
# name: test_long_text_knn[eq-bbbbbb]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name = ?",
'rows': list([
]),
})
# ---
# name: test_long_text_knn[eq-bbbbbbbbbbbb_aaa]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name = ?",
'rows': list([
]),
})
# ---
# name: test_long_text_knn[eq-bbbbbbbbbbbb_bbb]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name = ?",
'rows': list([
OrderedDict({
'rowid': 4,
'name': 'bbbbbbbbbbbb_bbb',
'distance': 96.0,
}),
]),
})
# ---
# name: test_long_text_knn[eq-bbbbbbbbbbbb_ccc]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name = ?",
'rows': list([
]),
})
# ---
# name: test_long_text_knn[eq-longlonglonglonglonglonglong]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name = ?",
'rows': list([
]),
})
# ---
# name: test_long_text_knn[ge-bb]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name >= ?",
'rows': list([
OrderedDict({
'rowid': 6,
'name': 'cccccccccccc_ccc',
'distance': 94.0,
}),
OrderedDict({
'rowid': 5,
'name': 'cccc',
'distance': 95.0,
}),
OrderedDict({
'rowid': 4,
'name': 'bbbbbbbbbbbb_bbb',
'distance': 96.0,
}),
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
]),
})
# ---
# name: test_long_text_knn[ge-bbbb]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name >= ?",
'rows': list([
OrderedDict({
'rowid': 6,
'name': 'cccccccccccc_ccc',
'distance': 94.0,
}),
OrderedDict({
'rowid': 5,
'name': 'cccc',
'distance': 95.0,
}),
OrderedDict({
'rowid': 4,
'name': 'bbbbbbbbbbbb_bbb',
'distance': 96.0,
}),
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
]),
})
# ---
# name: test_long_text_knn[ge-bbbbbb]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name >= ?",
'rows': list([
OrderedDict({
'rowid': 6,
'name': 'cccccccccccc_ccc',
'distance': 94.0,
}),
OrderedDict({
'rowid': 5,
'name': 'cccc',
'distance': 95.0,
}),
OrderedDict({
'rowid': 4,
'name': 'bbbbbbbbbbbb_bbb',
'distance': 96.0,
}),
]),
})
# ---
# name: test_long_text_knn[ge-bbbbbbbbbbbb_aaa]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name >= ?",
'rows': list([
OrderedDict({
'rowid': 6,
'name': 'cccccccccccc_ccc',
'distance': 94.0,
}),
OrderedDict({
'rowid': 5,
'name': 'cccc',
'distance': 95.0,
}),
OrderedDict({
'rowid': 4,
'name': 'bbbbbbbbbbbb_bbb',
'distance': 96.0,
}),
]),
})
# ---
# name: test_long_text_knn[ge-bbbbbbbbbbbb_bbb]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name >= ?",
'rows': list([
OrderedDict({
'rowid': 6,
'name': 'cccccccccccc_ccc',
'distance': 94.0,
}),
OrderedDict({
'rowid': 5,
'name': 'cccc',
'distance': 95.0,
}),
OrderedDict({
'rowid': 4,
'name': 'bbbbbbbbbbbb_bbb',
'distance': 96.0,
}),
]),
})
# ---
# name: test_long_text_knn[ge-bbbbbbbbbbbb_ccc]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name >= ?",
'rows': list([
OrderedDict({
'rowid': 6,
'name': 'cccccccccccc_ccc',
'distance': 94.0,
}),
OrderedDict({
'rowid': 5,
'name': 'cccc',
'distance': 95.0,
}),
]),
})
# ---
# name: test_long_text_knn[ge-longlonglonglonglonglonglong]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name >= ?",
'rows': list([
]),
})
# ---
# name: test_long_text_knn[gt-bb]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name > ?",
'rows': list([
OrderedDict({
'rowid': 6,
'name': 'cccccccccccc_ccc',
'distance': 94.0,
}),
OrderedDict({
'rowid': 5,
'name': 'cccc',
'distance': 95.0,
}),
OrderedDict({
'rowid': 4,
'name': 'bbbbbbbbbbbb_bbb',
'distance': 96.0,
}),
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
]),
})
# ---
# name: test_long_text_knn[gt-bbbb]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name > ?",
'rows': list([
OrderedDict({
'rowid': 6,
'name': 'cccccccccccc_ccc',
'distance': 94.0,
}),
OrderedDict({
'rowid': 5,
'name': 'cccc',
'distance': 95.0,
}),
OrderedDict({
'rowid': 4,
'name': 'bbbbbbbbbbbb_bbb',
'distance': 96.0,
}),
]),
})
# ---
# name: test_long_text_knn[gt-bbbbbb]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name > ?",
'rows': list([
OrderedDict({
'rowid': 6,
'name': 'cccccccccccc_ccc',
'distance': 94.0,
}),
OrderedDict({
'rowid': 5,
'name': 'cccc',
'distance': 95.0,
}),
OrderedDict({
'rowid': 4,
'name': 'bbbbbbbbbbbb_bbb',
'distance': 96.0,
}),
]),
})
# ---
# name: test_long_text_knn[gt-bbbbbbbbbbbb_aaa]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name > ?",
'rows': list([
OrderedDict({
'rowid': 6,
'name': 'cccccccccccc_ccc',
'distance': 94.0,
}),
OrderedDict({
'rowid': 5,
'name': 'cccc',
'distance': 95.0,
}),
OrderedDict({
'rowid': 4,
'name': 'bbbbbbbbbbbb_bbb',
'distance': 96.0,
}),
]),
})
# ---
# name: test_long_text_knn[gt-bbbbbbbbbbbb_bbb]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name > ?",
'rows': list([
OrderedDict({
'rowid': 6,
'name': 'cccccccccccc_ccc',
'distance': 94.0,
}),
OrderedDict({
'rowid': 5,
'name': 'cccc',
'distance': 95.0,
}),
]),
})
# ---
# name: test_long_text_knn[gt-bbbbbbbbbbbb_ccc]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name > ?",
'rows': list([
OrderedDict({
'rowid': 6,
'name': 'cccccccccccc_ccc',
'distance': 94.0,
}),
OrderedDict({
'rowid': 5,
'name': 'cccc',
'distance': 95.0,
}),
]),
})
# ---
# name: test_long_text_knn[gt-longlonglonglonglonglonglong]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name > ?",
'rows': list([
]),
})
# ---
# name: test_long_text_knn[le-bb]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name <= ?",
'rows': list([
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
OrderedDict({
'rowid': 1,
'name': 'aaaa',
'distance': 99.0,
}),
]),
})
# ---
# name: test_long_text_knn[le-bbbb]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name <= ?",
'rows': list([
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
OrderedDict({
'rowid': 1,
'name': 'aaaa',
'distance': 99.0,
}),
]),
})
# ---
# name: test_long_text_knn[le-bbbbbb]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name <= ?",
'rows': list([
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
OrderedDict({
'rowid': 1,
'name': 'aaaa',
'distance': 99.0,
}),
]),
})
# ---
# name: test_long_text_knn[le-bbbbbbbbbbbb_aaa]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name <= ?",
'rows': list([
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
OrderedDict({
'rowid': 1,
'name': 'aaaa',
'distance': 99.0,
}),
]),
})
# ---
# name: test_long_text_knn[le-bbbbbbbbbbbb_bbb]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name <= ?",
'rows': list([
OrderedDict({
'rowid': 4,
'name': 'bbbbbbbbbbbb_bbb',
'distance': 96.0,
}),
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
OrderedDict({
'rowid': 1,
'name': 'aaaa',
'distance': 99.0,
}),
]),
})
# ---
# name: test_long_text_knn[le-bbbbbbbbbbbb_ccc]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name <= ?",
'rows': list([
OrderedDict({
'rowid': 4,
'name': 'bbbbbbbbbbbb_bbb',
'distance': 96.0,
}),
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
OrderedDict({
'rowid': 1,
'name': 'aaaa',
'distance': 99.0,
}),
]),
})
# ---
# name: test_long_text_knn[le-longlonglonglonglonglonglong]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name <= ?",
'rows': list([
OrderedDict({
'rowid': 6,
'name': 'cccccccccccc_ccc',
'distance': 94.0,
}),
OrderedDict({
'rowid': 5,
'name': 'cccc',
'distance': 95.0,
}),
OrderedDict({
'rowid': 4,
'name': 'bbbbbbbbbbbb_bbb',
'distance': 96.0,
}),
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
]),
})
# ---
# name: test_long_text_knn[lt-bb]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name < ?",
'rows': list([
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
OrderedDict({
'rowid': 1,
'name': 'aaaa',
'distance': 99.0,
}),
]),
})
# ---
# name: test_long_text_knn[lt-bbbb]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name < ?",
'rows': list([
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
OrderedDict({
'rowid': 1,
'name': 'aaaa',
'distance': 99.0,
}),
]),
})
# ---
# name: test_long_text_knn[lt-bbbbbb]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name < ?",
'rows': list([
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
OrderedDict({
'rowid': 1,
'name': 'aaaa',
'distance': 99.0,
}),
]),
})
# ---
# name: test_long_text_knn[lt-bbbbbbbbbbbb_aaa]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name < ?",
'rows': list([
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
OrderedDict({
'rowid': 1,
'name': 'aaaa',
'distance': 99.0,
}),
]),
})
# ---
# name: test_long_text_knn[lt-bbbbbbbbbbbb_bbb]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name < ?",
'rows': list([
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
OrderedDict({
'rowid': 1,
'name': 'aaaa',
'distance': 99.0,
}),
]),
})
# ---
# name: test_long_text_knn[lt-bbbbbbbbbbbb_ccc]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name < ?",
'rows': list([
OrderedDict({
'rowid': 4,
'name': 'bbbbbbbbbbbb_bbb',
'distance': 96.0,
}),
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
OrderedDict({
'rowid': 1,
'name': 'aaaa',
'distance': 99.0,
}),
]),
})
# ---
# name: test_long_text_knn[lt-longlonglonglonglonglonglong]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name < ?",
'rows': list([
OrderedDict({
'rowid': 6,
'name': 'cccccccccccc_ccc',
'distance': 94.0,
}),
OrderedDict({
'rowid': 5,
'name': 'cccc',
'distance': 95.0,
}),
OrderedDict({
'rowid': 4,
'name': 'bbbbbbbbbbbb_bbb',
'distance': 96.0,
}),
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
]),
})
# ---
# name: test_long_text_knn[ne-bb]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name != ?",
'rows': list([
OrderedDict({
'rowid': 6,
'name': 'cccccccccccc_ccc',
'distance': 94.0,
}),
OrderedDict({
'rowid': 5,
'name': 'cccc',
'distance': 95.0,
}),
OrderedDict({
'rowid': 4,
'name': 'bbbbbbbbbbbb_bbb',
'distance': 96.0,
}),
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
]),
})
# ---
# name: test_long_text_knn[ne-bbbb]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name != ?",
'rows': list([
OrderedDict({
'rowid': 6,
'name': 'cccccccccccc_ccc',
'distance': 94.0,
}),
OrderedDict({
'rowid': 5,
'name': 'cccc',
'distance': 95.0,
}),
OrderedDict({
'rowid': 4,
'name': 'bbbbbbbbbbbb_bbb',
'distance': 96.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
OrderedDict({
'rowid': 1,
'name': 'aaaa',
'distance': 99.0,
}),
]),
})
# ---
# name: test_long_text_knn[ne-bbbbbb]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name != ?",
'rows': list([
OrderedDict({
'rowid': 6,
'name': 'cccccccccccc_ccc',
'distance': 94.0,
}),
OrderedDict({
'rowid': 5,
'name': 'cccc',
'distance': 95.0,
}),
OrderedDict({
'rowid': 4,
'name': 'bbbbbbbbbbbb_bbb',
'distance': 96.0,
}),
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
]),
})
# ---
# name: test_long_text_knn[ne-bbbbbbbbbbbb_aaa]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name != ?",
'rows': list([
OrderedDict({
'rowid': 6,
'name': 'cccccccccccc_ccc',
'distance': 94.0,
}),
OrderedDict({
'rowid': 5,
'name': 'cccc',
'distance': 95.0,
}),
OrderedDict({
'rowid': 4,
'name': 'bbbbbbbbbbbb_bbb',
'distance': 96.0,
}),
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
]),
})
# ---
# name: test_long_text_knn[ne-bbbbbbbbbbbb_bbb]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name != ?",
'rows': list([
OrderedDict({
'rowid': 6,
'name': 'cccccccccccc_ccc',
'distance': 94.0,
}),
OrderedDict({
'rowid': 5,
'name': 'cccc',
'distance': 95.0,
}),
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
OrderedDict({
'rowid': 1,
'name': 'aaaa',
'distance': 99.0,
}),
]),
})
# ---
# name: test_long_text_knn[ne-bbbbbbbbbbbb_ccc]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name != ?",
'rows': list([
OrderedDict({
'rowid': 6,
'name': 'cccccccccccc_ccc',
'distance': 94.0,
}),
OrderedDict({
'rowid': 5,
'name': 'cccc',
'distance': 95.0,
}),
OrderedDict({
'rowid': 4,
'name': 'bbbbbbbbbbbb_bbb',
'distance': 96.0,
}),
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
]),
})
# ---
# name: test_long_text_knn[ne-longlonglonglonglonglonglong]
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[100]' and k = 5 and name != ?",
'rows': list([
OrderedDict({
'rowid': 6,
'name': 'cccccccccccc_ccc',
'distance': 94.0,
}),
OrderedDict({
'rowid': 5,
'name': 'cccc',
'distance': 95.0,
}),
OrderedDict({
'rowid': 4,
'name': 'bbbbbbbbbbbb_bbb',
'distance': 96.0,
}),
OrderedDict({
'rowid': 3,
'name': 'bbbb',
'distance': 97.0,
}),
OrderedDict({
'rowid': 2,
'name': 'aaaaaaaaaaaa_aaa',
'distance': 98.0,
}),
]),
})
# ---
# name: test_long_text_updates
dict({
'v_chunks': OrderedDict({
'sql': 'select * from v_chunks',
'rows': list([
]),
}),
'v_metadatachunks00': OrderedDict({
'sql': 'select * from v_metadatachunks00',
'rows': list([
]),
}),
'v_metadatatext00': OrderedDict({
'sql': 'select * from v_metadatatext00',
'rows': list([
]),
}),
'v_rowids': OrderedDict({
'sql': 'select * from v_rowids',
'rows': list([
]),
}),
'v_vector_chunks00': OrderedDict({
'sql': 'select * from v_vector_chunks00',
'rows': list([
]),
}),
})
# ---
# name: test_long_text_updates.1
OrderedDict({
'sql': 'select * from v',
'rows': list([
OrderedDict({
'rowid': 1,
'vector': b'\x11\x11\x11\x11',
'name': '123456789a12',
}),
OrderedDict({
'rowid': 2,
'vector': b'\x11\x11\x11\x11',
'name': '123456789a123',
}),
]),
})
# ---
# name: test_long_text_updates.2
dict({
'v_chunks': OrderedDict({
'sql': 'select * from v_chunks',
'rows': list([
OrderedDict({
'chunk_id': 1,
'size': 8,
'validity': b'\x03',
'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks00': OrderedDict({
'sql': 'select * from v_metadatachunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x0c\x00\x00\x00123456789a12\r\x00\x00\x00123456789a12\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatatext00': OrderedDict({
'sql': 'select * from v_metadatatext00',
'rows': list([
OrderedDict({
'rowid': 2,
'data': '123456789a123',
}),
]),
}),
'v_rowids': OrderedDict({
'sql': 'select * from v_rowids',
'rows': list([
OrderedDict({
'rowid': 1,
'id': None,
'chunk_id': 1,
'chunk_offset': 0,
}),
OrderedDict({
'rowid': 2,
'id': None,
'chunk_id': 1,
'chunk_offset': 1,
}),
]),
}),
'v_vector_chunks00': OrderedDict({
'sql': 'select * from v_vector_chunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'vectors': b'\x11\x11\x11\x11\x11\x11\x11\x11\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
})
# ---
# name: test_normal.1
dict({
'v_chunks': OrderedDict({
'sql': 'select * from v_chunks',
'rows': list([
]),
}),
'v_metadatachunks00': OrderedDict({
'sql': 'select * from v_metadatachunks00',
'rows': list([
]),
}),
'v_metadatachunks01': OrderedDict({
'sql': 'select * from v_metadatachunks01',
'rows': list([
]),
}),
'v_metadatachunks02': OrderedDict({
'sql': 'select * from v_metadatachunks02',
'rows': list([
]),
}),
'v_metadatachunks03': OrderedDict({
'sql': 'select * from v_metadatachunks03',
'rows': list([
]),
}),
'v_metadatatext03': OrderedDict({
'sql': 'select * from v_metadatatext03',
'rows': list([
]),
}),
'v_rowids': OrderedDict({
'sql': 'select * from v_rowids',
'rows': list([
]),
}),
'v_vector_chunks00': OrderedDict({
'sql': 'select * from v_vector_chunks00',
'rows': list([
]),
}),
})
# ---
# name: test_normal.2
OrderedDict({
'sql': 'insert into v(vector, b, n, f, t) values (?, ?, ?, ?, ?)',
'rows': list([
]),
})
# ---
# name: test_normal.3
OrderedDict({
'sql': 'insert into v(vector, b, n, f, t) values (?, ?, ?, ?, ?)',
'rows': list([
]),
})
# ---
# name: test_normal.4
OrderedDict({
'sql': 'insert into v(vector, b, n, f, t) values (?, ?, ?, ?, ?)',
'rows': list([
]),
})
# ---
# name: test_normal.5
OrderedDict({
'sql': 'select * from v',
'rows': list([
OrderedDict({
'rowid': 1,
'vector': b'\x11\x11\x11\x11',
'b': 1,
'n': 1,
'f': 1.1,
't': 'one',
}),
OrderedDict({
'rowid': 2,
'vector': b'""""',
'b': 1,
'n': 2,
'f': 2.2,
't': 'two',
}),
OrderedDict({
'rowid': 3,
'vector': b'3333',
'b': 1,
'n': 3,
'f': 3.3,
't': 'three',
}),
]),
})
# ---
# name: test_normal.6
dict({
'v_chunks': OrderedDict({
'sql': 'select * from v_chunks',
'rows': list([
OrderedDict({
'chunk_id': 1,
'size': 8,
'validity': b'\x07',
'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks00': OrderedDict({
'sql': 'select * from v_metadatachunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x07',
}),
]),
}),
'v_metadatachunks01': OrderedDict({
'sql': 'select * from v_metadatachunks01',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks02': OrderedDict({
'sql': 'select * from v_metadatachunks02',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x9a\x99\x99\x99\x99\x99\xf1?\x9a\x99\x99\x99\x99\x99\x01@ffffff\n@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks03': OrderedDict({
'sql': 'select * from v_metadatachunks03',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x03\x00\x00\x00one\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00two\x00\x00\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00three\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatatext03': OrderedDict({
'sql': 'select * from v_metadatatext03',
'rows': list([
]),
}),
'v_rowids': OrderedDict({
'sql': 'select * from v_rowids',
'rows': list([
OrderedDict({
'rowid': 1,
'id': None,
'chunk_id': 1,
'chunk_offset': 0,
}),
OrderedDict({
'rowid': 2,
'id': None,
'chunk_id': 1,
'chunk_offset': 1,
}),
OrderedDict({
'rowid': 3,
'id': None,
'chunk_id': 1,
'chunk_offset': 2,
}),
]),
}),
'v_vector_chunks00': OrderedDict({
'sql': 'select * from v_vector_chunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'vectors': b'\x11\x11\x11\x11""""3333\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
})
# ---
# name: test_normal.7
OrderedDict({
'sql': 'drop table v',
'rows': list([
]),
})
# ---
# name: test_normal.8
OrderedDict({
'sql': 'select * from sqlite_master',
'rows': list([
OrderedDict({
'type': 'table',
'name': 'sqlite_sequence',
'tbl_name': 'sqlite_sequence',
'rootpage': 5,
'sql': 'CREATE TABLE sqlite_sequence(name,seq)',
}),
]),
})
# ---
# name: test_normal[sqlite_master]
OrderedDict({
'sql': "select * from sqlite_master where type = 'table' order by name",
'rows': list([
OrderedDict({
'type': 'table',
'name': 'sqlite_sequence',
'tbl_name': 'sqlite_sequence',
'rootpage': 5,
'sql': 'CREATE TABLE sqlite_sequence(name,seq)',
}),
OrderedDict({
'type': 'table',
'name': 'v',
'tbl_name': 'v',
'rootpage': 0,
'sql': 'CREATE VIRTUAL TABLE v using vec0(vector float[1], b boolean, n int, f float, t text, chunk_size=8)',
}),
OrderedDict({
'type': 'table',
'name': 'v_chunks',
'tbl_name': 'v_chunks',
'rootpage': 4,
'sql': 'CREATE TABLE "v_chunks"(chunk_id INTEGER PRIMARY KEY AUTOINCREMENT,size INTEGER NOT NULL,validity BLOB NOT NULL,rowids BLOB NOT NULL)',
}),
OrderedDict({
'type': 'table',
'name': 'v_info',
'tbl_name': 'v_info',
'rootpage': 2,
'sql': 'CREATE TABLE "v_info" (key text primary key, value any)',
}),
OrderedDict({
'type': 'table',
'name': 'v_metadatachunks00',
'tbl_name': 'v_metadatachunks00',
'rootpage': 9,
'sql': 'CREATE TABLE "v_metadatachunks00"(rowid PRIMARY KEY, data BLOB NOT NULL)',
}),
OrderedDict({
'type': 'table',
'name': 'v_metadatachunks01',
'tbl_name': 'v_metadatachunks01',
'rootpage': 11,
'sql': 'CREATE TABLE "v_metadatachunks01"(rowid PRIMARY KEY, data BLOB NOT NULL)',
}),
OrderedDict({
'type': 'table',
'name': 'v_metadatachunks02',
'tbl_name': 'v_metadatachunks02',
'rootpage': 13,
'sql': 'CREATE TABLE "v_metadatachunks02"(rowid PRIMARY KEY, data BLOB NOT NULL)',
}),
OrderedDict({
'type': 'table',
'name': 'v_metadatachunks03',
'tbl_name': 'v_metadatachunks03',
'rootpage': 15,
'sql': 'CREATE TABLE "v_metadatachunks03"(rowid PRIMARY KEY, data BLOB NOT NULL)',
}),
OrderedDict({
'type': 'table',
'name': 'v_metadatatext03',
'tbl_name': 'v_metadatatext03',
'rootpage': 17,
'sql': 'CREATE TABLE "v_metadatatext03"(rowid PRIMARY KEY, data TEXT)',
}),
OrderedDict({
'type': 'table',
'name': 'v_rowids',
'tbl_name': 'v_rowids',
'rootpage': 6,
'sql': 'CREATE TABLE "v_rowids"(rowid INTEGER PRIMARY KEY AUTOINCREMENT,id,chunk_id INTEGER,chunk_offset INTEGER)',
}),
OrderedDict({
'type': 'table',
'name': 'v_vector_chunks00',
'tbl_name': 'v_vector_chunks00',
'rootpage': 7,
'sql': 'CREATE TABLE "v_vector_chunks00"(rowid PRIMARY KEY,vectors BLOB NOT NULL)',
}),
]),
})
# ---
# name: test_stress
dict({
'vec_movies_auxiliary': OrderedDict({
'sql': 'select * from vec_movies_auxiliary',
'rows': list([
OrderedDict({
'rowid': 1,
'value00': 'The Conjuring',
}),
OrderedDict({
'rowid': 2,
'value00': 'Dumb and Dumber',
}),
OrderedDict({
'rowid': 3,
'value00': 'Interstellar',
}),
OrderedDict({
'rowid': 4,
'value00': 'The Lord of the Rings: The Fellowship of the Ring',
}),
OrderedDict({
'rowid': 5,
'value00': 'An Inconvenient Truth',
}),
OrderedDict({
'rowid': 6,
'value00': 'Hereditary',
}),
OrderedDict({
'rowid': 7,
'value00': 'Anchorman: The Legend of Ron Burgundy',
}),
OrderedDict({
'rowid': 8,
'value00': 'Blade Runner 2049',
}),
OrderedDict({
'rowid': 9,
'value00': "Harry Potter and the Sorcerer's Stone",
}),
OrderedDict({
'rowid': 10,
'value00': 'Free Solo',
}),
OrderedDict({
'rowid': 11,
'value00': 'Get Out',
}),
OrderedDict({
'rowid': 12,
'value00': 'The Hangover',
}),
OrderedDict({
'rowid': 13,
'value00': 'The Matrix',
}),
OrderedDict({
'rowid': 14,
'value00': "Pan's Labyrinth",
}),
OrderedDict({
'rowid': 15,
'value00': '13th',
}),
OrderedDict({
'rowid': 16,
'value00': 'It Follows',
}),
OrderedDict({
'rowid': 17,
'value00': 'Step Brothers',
}),
OrderedDict({
'rowid': 18,
'value00': 'Inception',
}),
OrderedDict({
'rowid': 19,
'value00': 'The Shape of Water',
}),
OrderedDict({
'rowid': 20,
'value00': "Won't You Be My Neighbor?",
}),
OrderedDict({
'rowid': 21,
'value00': 'Gravity',
}),
OrderedDict({
'rowid': 22,
'value00': 'Dune',
}),
OrderedDict({
'rowid': 23,
'value00': 'The Martian',
}),
OrderedDict({
'rowid': 24,
'value00': 'A Quiet Place',
}),
OrderedDict({
'rowid': 25,
'value00': 'The Chronicles of Narnia: The Lion, the Witch and the Wardrobe',
}),
]),
}),
'vec_movies_chunks': OrderedDict({
'sql': 'select * from vec_movies_chunks',
'rows': list([
OrderedDict({
'chunk_id': 1,
'size': 8,
'validity': b'\xff',
'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00\x00\x00\x00\x00\x07\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00\x00\x00\x00\x00',
}),
OrderedDict({
'chunk_id': 2,
'size': 8,
'validity': b'\xff',
'rowids': b'\t\x00\x00\x00\x00\x00\x00\x00\n\x00\x00\x00\x00\x00\x00\x00\x0b\x00\x00\x00\x00\x00\x00\x00\x0c\x00\x00\x00\x00\x00\x00\x00\r\x00\x00\x00\x00\x00\x00\x00\x0e\x00\x00\x00\x00\x00\x00\x00\x0f\x00\x00\x00\x00\x00\x00\x00\x10\x00\x00\x00\x00\x00\x00\x00',
}),
OrderedDict({
'chunk_id': 3,
'size': 8,
'validity': b'\xff',
'rowids': b'\x11\x00\x00\x00\x00\x00\x00\x00\x12\x00\x00\x00\x00\x00\x00\x00\x13\x00\x00\x00\x00\x00\x00\x00\x14\x00\x00\x00\x00\x00\x00\x00\x15\x00\x00\x00\x00\x00\x00\x00\x16\x00\x00\x00\x00\x00\x00\x00\x17\x00\x00\x00\x00\x00\x00\x00\x18\x00\x00\x00\x00\x00\x00\x00',
}),
OrderedDict({
'chunk_id': 4,
'size': 8,
'validity': b'\x01',
'rowids': b'\x19\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'vec_movies_metadatachunks00': OrderedDict({
'sql': 'select * from vec_movies_metadatachunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'p',
}),
OrderedDict({
'rowid': 2,
'data': b'U',
}),
OrderedDict({
'rowid': 3,
'data': b'\xff',
}),
OrderedDict({
'rowid': 4,
'data': b'\x01',
}),
]),
}),
'vec_movies_metadatachunks01': OrderedDict({
'sql': 'select * from vec_movies_metadatachunks01',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x06\x00\x00\x00horror\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00comedy\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00scifi\x00\x00\x00\x00\x00\x00\x00\x07\x00\x00\x00fantasy\x00\x00\x00\x00\x00\x0b\x00\x00\x00documentary\x00\x06\x00\x00\x00horror\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00comedy\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00scifi\x00\x00\x00\x00\x00\x00\x00',
}),
OrderedDict({
'rowid': 2,
'data': b'\x07\x00\x00\x00fantasy\x00\x00\x00\x00\x00\x0b\x00\x00\x00documentary\x00\x06\x00\x00\x00horror\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00comedy\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00scifi\x00\x00\x00\x00\x00\x00\x00\x07\x00\x00\x00fantasy\x00\x00\x00\x00\x00\x0b\x00\x00\x00documentary\x00\x06\x00\x00\x00horror\x00\x00\x00\x00\x00\x00',
}),
OrderedDict({
'rowid': 3,
'data': b'\x06\x00\x00\x00comedy\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00scifi\x00\x00\x00\x00\x00\x00\x00\x07\x00\x00\x00fantasy\x00\x00\x00\x00\x00\x0b\x00\x00\x00documentary\x00\x05\x00\x00\x00scifi\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00scifi\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00scifi\x00\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00horror\x00\x00\x00\x00\x00\x00',
}),
OrderedDict({
'rowid': 4,
'data': b'\x07\x00\x00\x00fantasy\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'vec_movies_metadatachunks02': OrderedDict({
'sql': 'select * from vec_movies_metadatachunks02',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x99\x00\x00\x00\x00\x00\x00\x00~\x01\x00\x00\x00\x00\x00\x005\x00\x00\x00\x00\x00\x00\x00\xd2\x00\x00\x00\x00\x00\x00\x00]\x00\x00\x00\x00\x00\x00\x00\xa7\x00\x00\x00\x00\x00\x00\x00\xe2\x01\x00\x00\x00\x00\x00\x00-\x01\x00\x00\x00\x00\x00\x00',
}),
OrderedDict({
'rowid': 2,
'data': b'\x86\x00\x00\x00\x00\x00\x00\x00B\x00\x00\x00\x00\x00\x00\x00X\x00\x00\x00\x00\x00\x00\x00;\x00\x00\x00\x00\x00\x00\x00\xa7\x01\x00\x00\x00\x00\x00\x00\x13\x01\x00\x00\x00\x00\x00\x00\xbf\x00\x00\x00\x00\x00\x00\x00:\x01\x00\x00\x00\x00\x00\x00',
}),
OrderedDict({
'rowid': 3,
'data': b'J\x00\x00\x00\x00\x00\x00\x00\xc9\x00\x00\x00\x00\x00\x00\x00\x8f\x01\x00\x00\x00\x00\x00\x00\xba\x00\x00\x00\x00\x00\x00\x00V\x01\x00\x00\x00\x00\x00\x00\xc3\x01\x00\x00\x00\x00\x00\x00\n\x02\x00\x00\x00\x00\x00\x00\x0f\x01\x00\x00\x00\x00\x00\x00',
}),
OrderedDict({
'rowid': 4,
'data': b'6\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'vec_movies_metadatachunks03': OrderedDict({
'sql': 'select * from vec_movies_metadatachunks03',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'ffffff\x12@\xcd\xcc\xcc\xcc\xcc\xcc\x04@\x00\x00\x00\x00\x00\x00\x14@\xcd\xcc\xcc\xcc\xcc\xcc\x10@333333\x0b@\xcd\xcc\xcc\xcc\xcc\xcc\x12@333333\x07@\x00\x00\x00\x00\x00\x00\x14@',
}),
OrderedDict({
'rowid': 2,
'data': b'ffffff\x10@\x9a\x99\x99\x99\x99\x99\t@\x9a\x99\x99\x99\x99\x99\x13@ffffff\x06@\x00\x00\x00\x00\x00\x00\x12@\xcd\xcc\xcc\xcc\xcc\xcc\x0c@\x9a\x99\x99\x99\x99\x99\x11@333333\x11@',
}),
OrderedDict({
'rowid': 3,
'data': b'\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x14@\x9a\x99\x99\x99\x99\x99\x05@333333\x13@\x00\x00\x00\x00\x00\x00\x10@\x9a\x99\x99\x99\x99\x99\x11@ffffff\x12@333333\x11@',
}),
OrderedDict({
'rowid': 4,
'data': b'333333\x0f@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'vec_movies_metadatatext01': OrderedDict({
'sql': 'select * from vec_movies_metadatatext01',
'rows': list([
]),
}),
'vec_movies_rowids': OrderedDict({
'sql': 'select * from vec_movies_rowids',
'rows': list([
OrderedDict({
'rowid': 1,
'id': None,
'chunk_id': 1,
'chunk_offset': 0,
}),
OrderedDict({
'rowid': 2,
'id': None,
'chunk_id': 1,
'chunk_offset': 1,
}),
OrderedDict({
'rowid': 3,
'id': None,
'chunk_id': 1,
'chunk_offset': 2,
}),
OrderedDict({
'rowid': 4,
'id': None,
'chunk_id': 1,
'chunk_offset': 3,
}),
OrderedDict({
'rowid': 5,
'id': None,
'chunk_id': 1,
'chunk_offset': 4,
}),
OrderedDict({
'rowid': 6,
'id': None,
'chunk_id': 1,
'chunk_offset': 5,
}),
OrderedDict({
'rowid': 7,
'id': None,
'chunk_id': 1,
'chunk_offset': 6,
}),
OrderedDict({
'rowid': 8,
'id': None,
'chunk_id': 1,
'chunk_offset': 7,
}),
OrderedDict({
'rowid': 9,
'id': None,
'chunk_id': 2,
'chunk_offset': 0,
}),
OrderedDict({
'rowid': 10,
'id': None,
'chunk_id': 2,
'chunk_offset': 1,
}),
OrderedDict({
'rowid': 11,
'id': None,
'chunk_id': 2,
'chunk_offset': 2,
}),
OrderedDict({
'rowid': 12,
'id': None,
'chunk_id': 2,
'chunk_offset': 3,
}),
OrderedDict({
'rowid': 13,
'id': None,
'chunk_id': 2,
'chunk_offset': 4,
}),
OrderedDict({
'rowid': 14,
'id': None,
'chunk_id': 2,
'chunk_offset': 5,
}),
OrderedDict({
'rowid': 15,
'id': None,
'chunk_id': 2,
'chunk_offset': 6,
}),
OrderedDict({
'rowid': 16,
'id': None,
'chunk_id': 2,
'chunk_offset': 7,
}),
OrderedDict({
'rowid': 17,
'id': None,
'chunk_id': 3,
'chunk_offset': 0,
}),
OrderedDict({
'rowid': 18,
'id': None,
'chunk_id': 3,
'chunk_offset': 1,
}),
OrderedDict({
'rowid': 19,
'id': None,
'chunk_id': 3,
'chunk_offset': 2,
}),
OrderedDict({
'rowid': 20,
'id': None,
'chunk_id': 3,
'chunk_offset': 3,
}),
OrderedDict({
'rowid': 21,
'id': None,
'chunk_id': 3,
'chunk_offset': 4,
}),
OrderedDict({
'rowid': 22,
'id': None,
'chunk_id': 3,
'chunk_offset': 5,
}),
OrderedDict({
'rowid': 23,
'id': None,
'chunk_id': 3,
'chunk_offset': 6,
}),
OrderedDict({
'rowid': 24,
'id': None,
'chunk_id': 3,
'chunk_offset': 7,
}),
OrderedDict({
'rowid': 25,
'id': None,
'chunk_id': 4,
'chunk_offset': 0,
}),
]),
}),
'vec_movies_vector_chunks00': OrderedDict({
'sql': 'select * from vec_movies_vector_chunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'vectors': b'\x00\x00\x80?\x00\x00\x00@\x00\x00@@\x00\x00\x80@\x00\x00\xa0@\x00\x00\xc0@\x00\x00\xe0@\x00\x00\x00A',
}),
OrderedDict({
'rowid': 2,
'vectors': b'\x00\x00\x10A\x00\x00 A\x00\x000A\x00\x00@A\x00\x00PA\x00\x00`A\x00\x00pA\x00\x00\x80A',
}),
OrderedDict({
'rowid': 3,
'vectors': b'\x00\x00\x88A\x00\x00\x90A\x00\x00\x98A\x00\x00\xa0A\x00\x00\xa8A\x00\x00\xb0A\x00\x00\xb8A\x00\x00\xc0A',
}),
OrderedDict({
'rowid': 4,
'vectors': b'\x00\x00\xc8A\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
})
# ---
# name: test_stress.1
OrderedDict({
'sql': '''
select
movie_id,
title,
genre,
num_reviews,
mean_rating,
is_favorited,
distance
from vec_movies
where synopsis_embedding match '[15.5]'
and genre = 'scifi'
and num_reviews between 100 and 500
and mean_rating > 3.5
and k = 5;
''',
'rows': list([
OrderedDict({
'movie_id': 13,
'title': 'The Matrix',
'genre': 'scifi',
'num_reviews': 423,
'mean_rating': 4.5,
'is_favorited': 1,
'distance': 2.5,
}),
OrderedDict({
'movie_id': 18,
'title': 'Inception',
'genre': 'scifi',
'num_reviews': 201,
'mean_rating': 5.0,
'is_favorited': 1,
'distance': 2.5,
}),
OrderedDict({
'movie_id': 21,
'title': 'Gravity',
'genre': 'scifi',
'num_reviews': 342,
'mean_rating': 4.0,
'is_favorited': 1,
'distance': 5.5,
}),
OrderedDict({
'movie_id': 22,
'title': 'Dune',
'genre': 'scifi',
'num_reviews': 451,
'mean_rating': 4.4,
'is_favorited': 1,
'distance': 6.5,
}),
OrderedDict({
'movie_id': 8,
'title': 'Blade Runner 2049',
'genre': 'scifi',
'num_reviews': 301,
'mean_rating': 5.0,
'is_favorited': 0,
'distance': 7.5,
}),
]),
})
# ---
# name: test_stress.2
OrderedDict({
'sql': "select movie_id, genre, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and genre = 'horror'",
'rows': list([
OrderedDict({
'movie_id': 24,
'genre': 'horror',
'distance': 76.0,
}),
OrderedDict({
'movie_id': 16,
'genre': 'horror',
'distance': 84.0,
}),
OrderedDict({
'movie_id': 11,
'genre': 'horror',
'distance': 89.0,
}),
OrderedDict({
'movie_id': 6,
'genre': 'horror',
'distance': 94.0,
}),
OrderedDict({
'movie_id': 1,
'genre': 'horror',
'distance': 99.0,
}),
]),
})
# ---
# name: test_stress.3
OrderedDict({
'sql': "select movie_id, genre, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and genre = 'comedy'",
'rows': list([
OrderedDict({
'movie_id': 17,
'genre': 'comedy',
'distance': 83.0,
}),
OrderedDict({
'movie_id': 12,
'genre': 'comedy',
'distance': 88.0,
}),
OrderedDict({
'movie_id': 7,
'genre': 'comedy',
'distance': 93.0,
}),
OrderedDict({
'movie_id': 2,
'genre': 'comedy',
'distance': 98.0,
}),
]),
})
# ---
# name: test_stress.4
OrderedDict({
'sql': "select movie_id, num_reviews, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and num_reviews between 100 and 500",
'rows': list([
OrderedDict({
'movie_id': 25,
'num_reviews': 310,
'distance': 75.0,
}),
OrderedDict({
'movie_id': 24,
'num_reviews': 271,
'distance': 76.0,
}),
OrderedDict({
'movie_id': 22,
'num_reviews': 451,
'distance': 78.0,
}),
OrderedDict({
'movie_id': 21,
'num_reviews': 342,
'distance': 79.0,
}),
OrderedDict({
'movie_id': 20,
'num_reviews': 186,
'distance': 80.0,
}),
]),
})
# ---
# name: test_stress.5
OrderedDict({
'sql': "select movie_id, num_reviews, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and num_reviews >= 500",
'rows': list([
OrderedDict({
'movie_id': 23,
'num_reviews': 522,
'distance': 77.0,
}),
]),
})
# ---
# name: test_stress.6
OrderedDict({
'sql': "select movie_id, mean_rating, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and mean_rating < 3.0",
'rows': list([
OrderedDict({
'movie_id': 19,
'mean_rating': 2.7,
'distance': 81.0,
}),
OrderedDict({
'movie_id': 12,
'mean_rating': 2.8,
'distance': 88.0,
}),
OrderedDict({
'movie_id': 7,
'mean_rating': 2.9,
'distance': 93.0,
}),
OrderedDict({
'movie_id': 2,
'mean_rating': 2.6,
'distance': 98.0,
}),
]),
})
# ---
# name: test_stress.7
OrderedDict({
'sql': "select movie_id, mean_rating, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and mean_rating between 4.0 and 5.0",
'rows': list([
OrderedDict({
'movie_id': 24,
'mean_rating': 4.3,
'distance': 76.0,
}),
OrderedDict({
'movie_id': 23,
'mean_rating': 4.6,
'distance': 77.0,
}),
OrderedDict({
'movie_id': 22,
'mean_rating': 4.4,
'distance': 78.0,
}),
OrderedDict({
'movie_id': 21,
'mean_rating': 4.0,
'distance': 79.0,
}),
OrderedDict({
'movie_id': 20,
'mean_rating': 4.8,
'distance': 80.0,
}),
]),
})
# ---
# name: test_stress[bool-eq-false]
OrderedDict({
'sql': "select movie_id, is_favorited, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and is_favorited = FALSE",
'rows': list([
OrderedDict({
'movie_id': 16,
'is_favorited': 0,
'distance': 84.0,
}),
OrderedDict({
'movie_id': 14,
'is_favorited': 0,
'distance': 86.0,
}),
OrderedDict({
'movie_id': 12,
'is_favorited': 0,
'distance': 88.0,
}),
OrderedDict({
'movie_id': 10,
'is_favorited': 0,
'distance': 90.0,
}),
OrderedDict({
'movie_id': 8,
'is_favorited': 0,
'distance': 92.0,
}),
]),
})
# ---
# name: test_stress[bool-eq-true]
OrderedDict({
'sql': "select movie_id, is_favorited, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and is_favorited = TRUE",
'rows': list([
OrderedDict({
'movie_id': 25,
'is_favorited': 1,
'distance': 75.0,
}),
OrderedDict({
'movie_id': 24,
'is_favorited': 1,
'distance': 76.0,
}),
OrderedDict({
'movie_id': 23,
'is_favorited': 1,
'distance': 77.0,
}),
OrderedDict({
'movie_id': 22,
'is_favorited': 1,
'distance': 78.0,
}),
OrderedDict({
'movie_id': 21,
'is_favorited': 1,
'distance': 79.0,
}),
]),
})
# ---
# name: test_stress[bool-ne-false]
OrderedDict({
'sql': "select movie_id, is_favorited, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and is_favorited != FALSE",
'rows': list([
OrderedDict({
'movie_id': 25,
'is_favorited': 1,
'distance': 75.0,
}),
OrderedDict({
'movie_id': 24,
'is_favorited': 1,
'distance': 76.0,
}),
OrderedDict({
'movie_id': 23,
'is_favorited': 1,
'distance': 77.0,
}),
OrderedDict({
'movie_id': 22,
'is_favorited': 1,
'distance': 78.0,
}),
OrderedDict({
'movie_id': 21,
'is_favorited': 1,
'distance': 79.0,
}),
]),
})
# ---
# name: test_stress[bool-ne-true]
OrderedDict({
'sql': "select movie_id, is_favorited, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and is_favorited != TRUE",
'rows': list([
OrderedDict({
'movie_id': 16,
'is_favorited': 0,
'distance': 84.0,
}),
OrderedDict({
'movie_id': 14,
'is_favorited': 0,
'distance': 86.0,
}),
OrderedDict({
'movie_id': 12,
'is_favorited': 0,
'distance': 88.0,
}),
OrderedDict({
'movie_id': 10,
'is_favorited': 0,
'distance': 90.0,
}),
OrderedDict({
'movie_id': 8,
'is_favorited': 0,
'distance': 92.0,
}),
]),
})
# ---
# name: test_stress[bool-other-op]
dict({
'error': 'OperationalError',
'message': 'ONLY EQUALS (=) or NOT_EQUALS (!=) operators are allowed on boolean metadata columns.',
})
# ---
# name: test_text_knn
dict({
'v_chunks': OrderedDict({
'sql': 'select * from v_chunks',
'rows': list([
]),
}),
'v_metadatachunks00': OrderedDict({
'sql': 'select * from v_metadatachunks00',
'rows': list([
]),
}),
'v_metadatatext00': OrderedDict({
'sql': 'select * from v_metadatatext00',
'rows': list([
]),
}),
'v_rowids': OrderedDict({
'sql': 'select * from v_rowids',
'rows': list([
]),
}),
'v_vector_chunks00': OrderedDict({
'sql': 'select * from v_vector_chunks00',
'rows': list([
]),
}),
})
# ---
# name: test_text_knn.1
OrderedDict({
'sql': 'select * from v',
'rows': list([
OrderedDict({
'rowid': 1,
'vector': b'\xaeG\xe1=',
'name': 'aaa',
}),
OrderedDict({
'rowid': 2,
'vector': b'\xaeGa>',
'name': 'bbb',
}),
OrderedDict({
'rowid': 3,
'vector': b'\xc3\xf5\xa8>',
'name': 'ccc',
}),
OrderedDict({
'rowid': 4,
'vector': b'\xaeG\xe1>',
'name': 'ddd',
}),
OrderedDict({
'rowid': 5,
'vector': b'\xcd\xcc\x0c?',
'name': 'eee',
}),
OrderedDict({
'rowid': 6,
'vector': b'\xc3\xf5(?',
'name': 'fff',
}),
OrderedDict({
'rowid': 7,
'vector': b'\xb8\x1eE?',
'name': 'ggg',
}),
OrderedDict({
'rowid': 8,
'vector': b'\xaeGa?',
'name': 'hhh',
}),
OrderedDict({
'rowid': 9,
'vector': b'\xa4p}?',
'name': 'iii',
}),
]),
})
# ---
# name: test_text_knn.2
dict({
'v_chunks': OrderedDict({
'sql': 'select * from v_chunks',
'rows': list([
OrderedDict({
'chunk_id': 1,
'size': 8,
'validity': b'\xff',
'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00\x00\x00\x00\x00\x07\x00\x00\x00\x00\x00\x00\x00\x08\x00\x00\x00\x00\x00\x00\x00',
}),
OrderedDict({
'chunk_id': 2,
'size': 8,
'validity': b'\x01',
'rowids': b'\t\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks00': OrderedDict({
'sql': 'select * from v_metadatachunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x03\x00\x00\x00aaa\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00bbb\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00ccc\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00ddd\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00eee\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00fff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00ggg\x00\x00\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00hhh\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
OrderedDict({
'rowid': 2,
'data': b'\x03\x00\x00\x00iii\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatatext00': OrderedDict({
'sql': 'select * from v_metadatatext00',
'rows': list([
]),
}),
'v_rowids': OrderedDict({
'sql': 'select * from v_rowids',
'rows': list([
OrderedDict({
'rowid': 1,
'id': None,
'chunk_id': 1,
'chunk_offset': 0,
}),
OrderedDict({
'rowid': 2,
'id': None,
'chunk_id': 1,
'chunk_offset': 1,
}),
OrderedDict({
'rowid': 3,
'id': None,
'chunk_id': 1,
'chunk_offset': 2,
}),
OrderedDict({
'rowid': 4,
'id': None,
'chunk_id': 1,
'chunk_offset': 3,
}),
OrderedDict({
'rowid': 5,
'id': None,
'chunk_id': 1,
'chunk_offset': 4,
}),
OrderedDict({
'rowid': 6,
'id': None,
'chunk_id': 1,
'chunk_offset': 5,
}),
OrderedDict({
'rowid': 7,
'id': None,
'chunk_id': 1,
'chunk_offset': 6,
}),
OrderedDict({
'rowid': 8,
'id': None,
'chunk_id': 1,
'chunk_offset': 7,
}),
OrderedDict({
'rowid': 9,
'id': None,
'chunk_id': 2,
'chunk_offset': 0,
}),
]),
}),
'v_vector_chunks00': OrderedDict({
'sql': 'select * from v_vector_chunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'vectors': b'\xaeG\xe1=\xaeGa>\xc3\xf5\xa8>\xaeG\xe1>\xcd\xcc\x0c?\xc3\xf5(?\xb8\x1eE?\xaeGa?',
}),
OrderedDict({
'rowid': 2,
'vectors': b'\xa4p}?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
})
# ---
# name: test_text_knn.3
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[1]' and k = 5",
'rows': list([
OrderedDict({
'rowid': 9,
'name': 'iii',
'distance': 0.009999990463256836,
}),
OrderedDict({
'rowid': 8,
'name': 'hhh',
'distance': 0.12000000476837158,
}),
OrderedDict({
'rowid': 7,
'name': 'ggg',
'distance': 0.23000001907348633,
}),
OrderedDict({
'rowid': 6,
'name': 'fff',
'distance': 0.3399999737739563,
}),
OrderedDict({
'rowid': 5,
'name': 'eee',
'distance': 0.44999998807907104,
}),
]),
})
# ---
# name: test_text_knn.4
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[1]' and k = 5 and name < 'ddd'",
'rows': list([
OrderedDict({
'rowid': 3,
'name': 'ccc',
'distance': 0.6699999570846558,
}),
OrderedDict({
'rowid': 2,
'name': 'bbb',
'distance': 0.7799999713897705,
}),
OrderedDict({
'rowid': 1,
'name': 'aaa',
'distance': 0.8899999856948853,
}),
]),
})
# ---
# name: test_text_knn.5
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[1]' and k = 5 and name <= 'ddd'",
'rows': list([
OrderedDict({
'rowid': 4,
'name': 'ddd',
'distance': 0.5600000023841858,
}),
OrderedDict({
'rowid': 3,
'name': 'ccc',
'distance': 0.6699999570846558,
}),
OrderedDict({
'rowid': 2,
'name': 'bbb',
'distance': 0.7799999713897705,
}),
OrderedDict({
'rowid': 1,
'name': 'aaa',
'distance': 0.8899999856948853,
}),
]),
})
# ---
# name: test_text_knn.6
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[1]' and k = 5 and name > 'fff'",
'rows': list([
OrderedDict({
'rowid': 9,
'name': 'iii',
'distance': 0.009999990463256836,
}),
OrderedDict({
'rowid': 8,
'name': 'hhh',
'distance': 0.12000000476837158,
}),
OrderedDict({
'rowid': 7,
'name': 'ggg',
'distance': 0.23000001907348633,
}),
]),
})
# ---
# name: test_text_knn.7
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[1]' and k = 5 and name >= 'fff'",
'rows': list([
OrderedDict({
'rowid': 9,
'name': 'iii',
'distance': 0.009999990463256836,
}),
OrderedDict({
'rowid': 8,
'name': 'hhh',
'distance': 0.12000000476837158,
}),
OrderedDict({
'rowid': 7,
'name': 'ggg',
'distance': 0.23000001907348633,
}),
OrderedDict({
'rowid': 6,
'name': 'fff',
'distance': 0.3399999737739563,
}),
]),
})
# ---
# name: test_text_knn.8
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[1]' and k = 5 and name = 'aaa'",
'rows': list([
OrderedDict({
'rowid': 1,
'name': 'aaa',
'distance': 0.8899999856948853,
}),
]),
})
# ---
# name: test_text_knn.9
OrderedDict({
'sql': "select rowid, name, distance from v where vector match '[.01]' and k = 5 and name != 'aaa'",
'rows': list([
OrderedDict({
'rowid': 2,
'name': 'bbb',
'distance': 0.20999999344348907,
}),
OrderedDict({
'rowid': 3,
'name': 'ccc',
'distance': 0.320000022649765,
}),
OrderedDict({
'rowid': 4,
'name': 'ddd',
'distance': 0.4300000071525574,
}),
OrderedDict({
'rowid': 5,
'name': 'eee',
'distance': 0.5400000214576721,
}),
OrderedDict({
'rowid': 6,
'name': 'fff',
'distance': 0.6500000357627869,
}),
]),
})
# ---
# name: test_types[illegal-boolean]
dict({
'error': 'OperationalError',
'message': 'Expected 0 or 1 for BOOLEAN metadata column b',
})
# ---
# name: test_types[illegal-type-boolean]
dict({
'error': 'OperationalError',
'message': 'Expected 0 or 1 for BOOLEAN metadata column b',
})
# ---
# name: test_types[illegal-type-float]
dict({
'error': 'OperationalError',
'message': 'Expected float for FLOAT metadata column f, received TEXT',
})
# ---
# name: test_types[illegal-type-int]
dict({
'error': 'OperationalError',
'message': 'Expected integer for INTEGER metadata column n, received TEXT',
})
# ---
# name: test_types[illegal-type-text]
dict({
'error': 'OperationalError',
'message': 'Expected text for TEXT metadata column t, received INTEGER',
})
# ---
# name: test_types[legal]
OrderedDict({
'sql': 'insert into v(vector, b, n, f, t) values (?, ?, ?, ?, ?)',
'rows': list([
]),
})
# ---
# name: test_updates[1-init-contents]
OrderedDict({
'sql': 'select * from v',
'rows': list([
OrderedDict({
'rowid': 1,
'vector': b'\x11\x11\x11\x11',
'b': 1,
'n': 1,
'f': 1.1,
't': 'test1',
}),
OrderedDict({
'rowid': 2,
'vector': b'""""',
'b': 1,
'n': 2,
'f': 2.2,
't': 'test2',
}),
OrderedDict({
'rowid': 3,
'vector': b'3333',
'b': 1,
'n': 3,
'f': 3.3,
't': '1234567890123',
}),
]),
})
# ---
# name: test_updates[1-init-shadow]
dict({
'v_chunks': OrderedDict({
'sql': 'select * from v_chunks',
'rows': list([
OrderedDict({
'chunk_id': 1,
'size': 8,
'validity': b'\x07',
'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks00': OrderedDict({
'sql': 'select * from v_metadatachunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x07',
}),
]),
}),
'v_metadatachunks01': OrderedDict({
'sql': 'select * from v_metadatachunks01',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks02': OrderedDict({
'sql': 'select * from v_metadatachunks02',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x9a\x99\x99\x99\x99\x99\xf1?\x9a\x99\x99\x99\x99\x99\x01@ffffff\n@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks03': OrderedDict({
'sql': 'select * from v_metadatachunks03',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x05\x00\x00\x00test1\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00test2\x00\x00\x00\x00\x00\x00\x00\r\x00\x00\x00123456789012\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatatext03': OrderedDict({
'sql': 'select * from v_metadatatext03',
'rows': list([
OrderedDict({
'rowid': 3,
'data': '1234567890123',
}),
]),
}),
'v_rowids': OrderedDict({
'sql': 'select * from v_rowids',
'rows': list([
OrderedDict({
'rowid': 1,
'id': None,
'chunk_id': 1,
'chunk_offset': 0,
}),
OrderedDict({
'rowid': 2,
'id': None,
'chunk_id': 1,
'chunk_offset': 1,
}),
OrderedDict({
'rowid': 3,
'id': None,
'chunk_id': 1,
'chunk_offset': 2,
}),
]),
}),
'v_vector_chunks00': OrderedDict({
'sql': 'select * from v_vector_chunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'vectors': b'\x11\x11\x11\x11""""3333\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
})
# ---
# name: test_updates[general-update-contents]
OrderedDict({
'sql': 'select * from v',
'rows': list([
OrderedDict({
'rowid': 1,
'vector': b'\x11\x11\x11\x11',
'b': 0,
'n': 11,
'f': 11.11,
't': 'newtest1',
}),
OrderedDict({
'rowid': 2,
'vector': b'""""',
'b': 1,
'n': 2,
'f': 2.2,
't': 'test2',
}),
OrderedDict({
'rowid': 3,
'vector': b'3333',
'b': 1,
'n': 3,
'f': 3.3,
't': '1234567890123',
}),
]),
})
# ---
# name: test_updates[general-update-shaodnw]
dict({
'v_chunks': OrderedDict({
'sql': 'select * from v_chunks',
'rows': list([
OrderedDict({
'chunk_id': 1,
'size': 8,
'validity': b'\x07',
'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks00': OrderedDict({
'sql': 'select * from v_metadatachunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x06',
}),
]),
}),
'v_metadatachunks01': OrderedDict({
'sql': 'select * from v_metadatachunks01',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x0b\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks02': OrderedDict({
'sql': 'select * from v_metadatachunks02',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\xb8\x1e\x85\xebQ8&@\x9a\x99\x99\x99\x99\x99\x01@ffffff\n@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks03': OrderedDict({
'sql': 'select * from v_metadatachunks03',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x08\x00\x00\x00newtest1\x00\x00\x00\x00\x05\x00\x00\x00test2\x00\x00\x00\x00\x00\x00\x00\r\x00\x00\x00123456789012\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatatext03': OrderedDict({
'sql': 'select * from v_metadatatext03',
'rows': list([
OrderedDict({
'rowid': 3,
'data': '1234567890123',
}),
]),
}),
'v_rowids': OrderedDict({
'sql': 'select * from v_rowids',
'rows': list([
OrderedDict({
'rowid': 1,
'id': None,
'chunk_id': 1,
'chunk_offset': 0,
}),
OrderedDict({
'rowid': 2,
'id': None,
'chunk_id': 1,
'chunk_offset': 1,
}),
OrderedDict({
'rowid': 3,
'id': None,
'chunk_id': 1,
'chunk_offset': 2,
}),
]),
}),
'v_vector_chunks00': OrderedDict({
'sql': 'select * from v_vector_chunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'vectors': b'\x11\x11\x11\x11""""3333\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
})
# ---
# name: test_updates[string-update-1-contents]
OrderedDict({
'sql': 'select * from v',
'rows': list([
OrderedDict({
'rowid': 1,
'vector': b'\x11\x11\x11\x11',
'b': 0,
'n': 11,
'f': 11.11,
't': 'newtest1',
}),
OrderedDict({
'rowid': 2,
'vector': b'""""',
'b': 1,
'n': 2,
'f': 2.2,
't': 'test2',
}),
OrderedDict({
'rowid': 3,
'vector': b'3333',
'b': 1,
'n': 3,
'f': 3.3,
't': '1234567890123-updated',
}),
]),
})
# ---
# name: test_updates[string-update-1-shadow]
dict({
'v_chunks': OrderedDict({
'sql': 'select * from v_chunks',
'rows': list([
OrderedDict({
'chunk_id': 1,
'size': 8,
'validity': b'\x07',
'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks00': OrderedDict({
'sql': 'select * from v_metadatachunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x06',
}),
]),
}),
'v_metadatachunks01': OrderedDict({
'sql': 'select * from v_metadatachunks01',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x0b\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks02': OrderedDict({
'sql': 'select * from v_metadatachunks02',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\xb8\x1e\x85\xebQ8&@\x9a\x99\x99\x99\x99\x99\x01@ffffff\n@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks03': OrderedDict({
'sql': 'select * from v_metadatachunks03',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x08\x00\x00\x00newtest1\x00\x00\x00\x00\x05\x00\x00\x00test2\x00\x00\x00\x00\x00\x00\x00\x15\x00\x00\x00123456789012\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatatext03': OrderedDict({
'sql': 'select * from v_metadatatext03',
'rows': list([
OrderedDict({
'rowid': 3,
'data': '1234567890123-updated',
}),
]),
}),
'v_rowids': OrderedDict({
'sql': 'select * from v_rowids',
'rows': list([
OrderedDict({
'rowid': 1,
'id': None,
'chunk_id': 1,
'chunk_offset': 0,
}),
OrderedDict({
'rowid': 2,
'id': None,
'chunk_id': 1,
'chunk_offset': 1,
}),
OrderedDict({
'rowid': 3,
'id': None,
'chunk_id': 1,
'chunk_offset': 2,
}),
]),
}),
'v_vector_chunks00': OrderedDict({
'sql': 'select * from v_vector_chunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'vectors': b'\x11\x11\x11\x11""""3333\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
})
# ---
# name: test_updates[string-update-2-contents]
OrderedDict({
'sql': 'select * from v',
'rows': list([
OrderedDict({
'rowid': 1,
'vector': b'\x11\x11\x11\x11',
'b': 0,
'n': 11,
'f': 11.11,
't': 'newtest1',
}),
OrderedDict({
'rowid': 2,
'vector': b'""""',
'b': 1,
'n': 2,
'f': 2.2,
't': 'test2-short',
}),
OrderedDict({
'rowid': 3,
'vector': b'3333',
'b': 1,
'n': 3,
'f': 3.3,
't': '1234567890123-updated',
}),
]),
})
# ---
# name: test_updates[string-update-2-shadow]
dict({
'v_chunks': OrderedDict({
'sql': 'select * from v_chunks',
'rows': list([
OrderedDict({
'chunk_id': 1,
'size': 8,
'validity': b'\x07',
'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks00': OrderedDict({
'sql': 'select * from v_metadatachunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x06',
}),
]),
}),
'v_metadatachunks01': OrderedDict({
'sql': 'select * from v_metadatachunks01',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x0b\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks02': OrderedDict({
'sql': 'select * from v_metadatachunks02',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\xb8\x1e\x85\xebQ8&@\x9a\x99\x99\x99\x99\x99\x01@ffffff\n@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks03': OrderedDict({
'sql': 'select * from v_metadatachunks03',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x08\x00\x00\x00newtest1\x00\x00\x00\x00\x0b\x00\x00\x00test2-short\x00\x15\x00\x00\x00123456789012\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatatext03': OrderedDict({
'sql': 'select * from v_metadatatext03',
'rows': list([
OrderedDict({
'rowid': 3,
'data': '1234567890123-updated',
}),
]),
}),
'v_rowids': OrderedDict({
'sql': 'select * from v_rowids',
'rows': list([
OrderedDict({
'rowid': 1,
'id': None,
'chunk_id': 1,
'chunk_offset': 0,
}),
OrderedDict({
'rowid': 2,
'id': None,
'chunk_id': 1,
'chunk_offset': 1,
}),
OrderedDict({
'rowid': 3,
'id': None,
'chunk_id': 1,
'chunk_offset': 2,
}),
]),
}),
'v_vector_chunks00': OrderedDict({
'sql': 'select * from v_vector_chunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'vectors': b'\x11\x11\x11\x11""""3333\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
})
# ---
# name: test_updates[string-update-3-contents]
OrderedDict({
'sql': 'select * from v',
'rows': list([
OrderedDict({
'rowid': 1,
'vector': b'\x11\x11\x11\x11',
'b': 0,
'n': 11,
'f': 11.11,
't': 'newtest1',
}),
OrderedDict({
'rowid': 2,
'vector': b'""""',
'b': 1,
'n': 2,
'f': 2.2,
't': 'test2-long-long-long',
}),
OrderedDict({
'rowid': 3,
'vector': b'3333',
'b': 1,
'n': 3,
'f': 3.3,
't': '1234567890123-updated',
}),
]),
})
# ---
# name: test_updates[string-update-3-shadow]
dict({
'v_chunks': OrderedDict({
'sql': 'select * from v_chunks',
'rows': list([
OrderedDict({
'chunk_id': 1,
'size': 8,
'validity': b'\x07',
'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks00': OrderedDict({
'sql': 'select * from v_metadatachunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x06',
}),
]),
}),
'v_metadatachunks01': OrderedDict({
'sql': 'select * from v_metadatachunks01',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x0b\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks02': OrderedDict({
'sql': 'select * from v_metadatachunks02',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\xb8\x1e\x85\xebQ8&@\x9a\x99\x99\x99\x99\x99\x01@ffffff\n@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks03': OrderedDict({
'sql': 'select * from v_metadatachunks03',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x08\x00\x00\x00newtest1\x00\x00\x00\x00\x14\x00\x00\x00test2-long-l\x15\x00\x00\x00123456789012\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatatext03': OrderedDict({
'sql': 'select * from v_metadatatext03',
'rows': list([
OrderedDict({
'rowid': 3,
'data': '1234567890123-updated',
}),
OrderedDict({
'rowid': 2,
'data': 'test2-long-long-long',
}),
]),
}),
'v_rowids': OrderedDict({
'sql': 'select * from v_rowids',
'rows': list([
OrderedDict({
'rowid': 1,
'id': None,
'chunk_id': 1,
'chunk_offset': 0,
}),
OrderedDict({
'rowid': 2,
'id': None,
'chunk_id': 1,
'chunk_offset': 1,
}),
OrderedDict({
'rowid': 3,
'id': None,
'chunk_id': 1,
'chunk_offset': 2,
}),
]),
}),
'v_vector_chunks00': OrderedDict({
'sql': 'select * from v_vector_chunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'vectors': b'\x11\x11\x11\x11""""3333\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
})
# ---
# name: test_updates[string-update-4-contents]
OrderedDict({
'sql': 'select * from v',
'rows': list([
OrderedDict({
'rowid': 1,
'vector': b'\x11\x11\x11\x11',
'b': 0,
'n': 11,
'f': 11.11,
't': 'newtest1',
}),
OrderedDict({
'rowid': 2,
'vector': b'""""',
'b': 1,
'n': 2,
'f': 2.2,
't': 'test2-shortx',
}),
OrderedDict({
'rowid': 3,
'vector': b'3333',
'b': 1,
'n': 3,
'f': 3.3,
't': '1234567890123-updated',
}),
]),
})
# ---
# name: test_updates[string-update-4-shadow]
dict({
'v_chunks': OrderedDict({
'sql': 'select * from v_chunks',
'rows': list([
OrderedDict({
'chunk_id': 1,
'size': 8,
'validity': b'\x07',
'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks00': OrderedDict({
'sql': 'select * from v_metadatachunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x06',
}),
]),
}),
'v_metadatachunks01': OrderedDict({
'sql': 'select * from v_metadatachunks01',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x0b\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks02': OrderedDict({
'sql': 'select * from v_metadatachunks02',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\xb8\x1e\x85\xebQ8&@\x9a\x99\x99\x99\x99\x99\x01@ffffff\n@\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatachunks03': OrderedDict({
'sql': 'select * from v_metadatachunks03',
'rows': list([
OrderedDict({
'rowid': 1,
'data': b'\x08\x00\x00\x00newtest1\x00\x00\x00\x00\x0c\x00\x00\x00test2-shortx\x15\x00\x00\x00123456789012\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_metadatatext03': OrderedDict({
'sql': 'select * from v_metadatatext03',
'rows': list([
OrderedDict({
'rowid': 3,
'data': '1234567890123-updated',
}),
]),
}),
'v_rowids': OrderedDict({
'sql': 'select * from v_rowids',
'rows': list([
OrderedDict({
'rowid': 1,
'id': None,
'chunk_id': 1,
'chunk_offset': 0,
}),
OrderedDict({
'rowid': 2,
'id': None,
'chunk_id': 1,
'chunk_offset': 1,
}),
OrderedDict({
'rowid': 3,
'id': None,
'chunk_id': 1,
'chunk_offset': 2,
}),
]),
}),
'v_vector_chunks00': OrderedDict({
'sql': 'select * from v_vector_chunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'vectors': b'\x11\x11\x11\x11""""3333\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
})
# ---
# name: test_vtab_in[allow-int-all]
OrderedDict({
'sql': "select rowid, n, distance from v where vector match '[0]' and k = 8 and n in (555, 999)",
'rows': list([
OrderedDict({
'rowid': 1,
'n': 999,
'distance': 1.0,
}),
OrderedDict({
'rowid': 2,
'n': 555,
'distance': 2.0,
}),
OrderedDict({
'rowid': 3,
'n': 999,
'distance': 3.0,
}),
OrderedDict({
'rowid': 4,
'n': 555,
'distance': 4.0,
}),
OrderedDict({
'rowid': 5,
'n': 999,
'distance': 5.0,
}),
OrderedDict({
'rowid': 6,
'n': 555,
'distance': 6.0,
}),
OrderedDict({
'rowid': 7,
'n': 999,
'distance': 7.0,
}),
OrderedDict({
'rowid': 8,
'n': 555,
'distance': 8.0,
}),
]),
})
# ---
# name: test_vtab_in[allow-int-superfluous]
OrderedDict({
'sql': "select rowid, n, distance from v where vector match '[0]' and k = 8 and n in (555, -1, -2)",
'rows': list([
OrderedDict({
'rowid': 2,
'n': 555,
'distance': 2.0,
}),
OrderedDict({
'rowid': 4,
'n': 555,
'distance': 4.0,
}),
OrderedDict({
'rowid': 6,
'n': 555,
'distance': 6.0,
}),
OrderedDict({
'rowid': 8,
'n': 555,
'distance': 8.0,
}),
]),
})
# ---
# name: test_vtab_in[allow-text-all]
OrderedDict({
'sql': "select rowid, t, distance from v where vector match '[0]' and k = 8 and t in ('aaaa', 'zzzz')",
'rows': list([
OrderedDict({
'rowid': 1,
't': 'aaaa',
'distance': 1.0,
}),
OrderedDict({
'rowid': 2,
't': 'aaaa',
'distance': 2.0,
}),
OrderedDict({
'rowid': 3,
't': 'aaaa',
'distance': 3.0,
}),
OrderedDict({
'rowid': 4,
't': 'aaaa',
'distance': 4.0,
}),
OrderedDict({
'rowid': 5,
't': 'zzzz',
'distance': 5.0,
}),
OrderedDict({
'rowid': 6,
't': 'zzzz',
'distance': 6.0,
}),
OrderedDict({
'rowid': 7,
't': 'zzzz',
'distance': 7.0,
}),
OrderedDict({
'rowid': 8,
't': 'zzzz',
'distance': 8.0,
}),
]),
})
# ---
# name: test_vtab_in[allow-text-superfluous]
OrderedDict({
'sql': "select rowid, t, distance from v where vector match '[0]' and k = 8 and t in ('aaaa', 'foo', 'bar')",
'rows': list([
OrderedDict({
'rowid': 1,
't': 'aaaa',
'distance': 1.0,
}),
OrderedDict({
'rowid': 2,
't': 'aaaa',
'distance': 2.0,
}),
OrderedDict({
'rowid': 3,
't': 'aaaa',
'distance': 3.0,
}),
OrderedDict({
'rowid': 4,
't': 'aaaa',
'distance': 4.0,
}),
]),
})
# ---
# name: test_vtab_in[block-bool]
dict({
'error': 'OperationalError',
'message': "'xxx in (...)' is only available on INTEGER or TEXT metadata columns.",
})
# ---
# name: test_vtab_in[block-float]
dict({
'error': 'OperationalError',
'message': "'xxx in (...)' is only available on INTEGER or TEXT metadata columns.",
})
# ---
# name: test_vtab_in_long_text[all]
OrderedDict({
'sql': "select rowid, t from v where vector match '[0]' and k = 10 and t in (select value from json_each(?))",
'rows': list([
OrderedDict({
'rowid': 1,
't': 'aaaa',
}),
OrderedDict({
'rowid': 2,
't': 'aaaaaaaaaaaa_aaa',
}),
OrderedDict({
'rowid': 3,
't': 'bbbb',
}),
OrderedDict({
'rowid': 4,
't': 'bbbbbbbbbbbb_bbb',
}),
OrderedDict({
'rowid': 5,
't': 'cccc',
}),
OrderedDict({
'rowid': 6,
't': 'cccccccccccc_ccc',
}),
]),
})
# ---
# name: test_vtab_in_long_text[individual-aaaa]
OrderedDict({
'sql': "select rowid, t from v where vector match '[0]' and k = 10 and t in (?, 'nonsense')",
'rows': list([
OrderedDict({
'rowid': 1,
't': 'aaaa',
}),
]),
})
# ---
# name: test_vtab_in_long_text[individual-aaaaaaaaaaaa_aaa]
OrderedDict({
'sql': "select rowid, t from v where vector match '[0]' and k = 10 and t in (?, 'nonsense')",
'rows': list([
OrderedDict({
'rowid': 2,
't': 'aaaaaaaaaaaa_aaa',
}),
]),
})
# ---
# name: test_vtab_in_long_text[individual-bbbb]
OrderedDict({
'sql': "select rowid, t from v where vector match '[0]' and k = 10 and t in (?, 'nonsense')",
'rows': list([
OrderedDict({
'rowid': 3,
't': 'bbbb',
}),
]),
})
# ---
# name: test_vtab_in_long_text[individual-bbbbbbbbbbbb_bbb]
OrderedDict({
'sql': "select rowid, t from v where vector match '[0]' and k = 10 and t in (?, 'nonsense')",
'rows': list([
OrderedDict({
'rowid': 4,
't': 'bbbbbbbbbbbb_bbb',
}),
]),
})
# ---
# name: test_vtab_in_long_text[individual-cccc]
OrderedDict({
'sql': "select rowid, t from v where vector match '[0]' and k = 10 and t in (?, 'nonsense')",
'rows': list([
OrderedDict({
'rowid': 5,
't': 'cccc',
}),
]),
})
# ---
# name: test_vtab_in_long_text[individual-cccccccccccc_ccc]
OrderedDict({
'sql': "select rowid, t from v where vector match '[0]' and k = 10 and t in (?, 'nonsense')",
'rows': list([
OrderedDict({
'rowid': 6,
't': 'cccccccccccc_ccc',
}),
]),
})
# ---
================================================
FILE: tests/__snapshots__/test-partition-keys.ambr
================================================
# serializer version: 1
# name: test_constructor_limit[max 4 partition keys]
dict({
'error': 'OperationalError',
'message': 'vec0 constructor error: More than 4 partition key columns were provided',
})
# ---
# name: test_normal[1 row]
dict({
'v_chunks': OrderedDict({
'sql': 'select * from v_chunks',
'rows': list([
OrderedDict({
'chunk_id': 1,
'size': 8,
'sequence_id': None,
'partition00': 100,
'validity': b'\x01',
'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_rowids': OrderedDict({
'sql': 'select * from v_rowids',
'rows': list([
OrderedDict({
'rowid': 1,
'id': None,
'chunk_id': 1,
'chunk_offset': 0,
}),
]),
}),
'v_vector_chunks00': OrderedDict({
'sql': 'select * from v_vector_chunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'vectors': b'\x11"3D\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
})
# ---
# name: test_normal[2 rows, same parition]
dict({
'v_chunks': OrderedDict({
'sql': 'select * from v_chunks',
'rows': list([
OrderedDict({
'chunk_id': 1,
'size': 8,
'sequence_id': None,
'partition00': 100,
'validity': b'\x03',
'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_rowids': OrderedDict({
'sql': 'select * from v_rowids',
'rows': list([
OrderedDict({
'rowid': 1,
'id': None,
'chunk_id': 1,
'chunk_offset': 0,
}),
OrderedDict({
'rowid': 2,
'id': None,
'chunk_id': 1,
'chunk_offset': 1,
}),
]),
}),
'v_vector_chunks00': OrderedDict({
'sql': 'select * from v_vector_chunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'vectors': b'\x11"3DDUfw\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
})
# ---
# name: test_normal[3 rows, 2 partitions]
dict({
'v_chunks': OrderedDict({
'sql': 'select * from v_chunks',
'rows': list([
OrderedDict({
'chunk_id': 1,
'size': 8,
'sequence_id': None,
'partition00': 100,
'validity': b'\x03',
'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
OrderedDict({
'chunk_id': 2,
'size': 8,
'sequence_id': None,
'partition00': 200,
'validity': b'\x01',
'rowids': b'\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_rowids': OrderedDict({
'sql': 'select * from v_rowids',
'rows': list([
OrderedDict({
'rowid': 1,
'id': None,
'chunk_id': 1,
'chunk_offset': 0,
}),
OrderedDict({
'rowid': 2,
'id': None,
'chunk_id': 1,
'chunk_offset': 1,
}),
OrderedDict({
'rowid': 3,
'id': None,
'chunk_id': 2,
'chunk_offset': 0,
}),
]),
}),
'v_vector_chunks00': OrderedDict({
'sql': 'select * from v_vector_chunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'vectors': b'\x11"3DDUfw\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
OrderedDict({
'rowid': 2,
'vectors': b'\x88\x99\xaa\xbb\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
})
# ---
# name: test_types[1. raises type error]
dict({
'error': 'OperationalError',
'message': 'Parition key type mismatch: The partition key column p1 has type INTEGER, but TEXT was provided.',
})
# ---
# name: test_types[2. empty DB]
dict({
'v_chunks': OrderedDict({
'sql': 'select * from v_chunks',
'rows': list([
]),
}),
'v_rowids': OrderedDict({
'sql': 'select * from v_rowids',
'rows': list([
]),
}),
'v_vector_chunks00': OrderedDict({
'sql': 'select * from v_vector_chunks00',
'rows': list([
]),
}),
})
# ---
# name: test_types[3. allow nulls]
OrderedDict({
'sql': 'insert into v(p1, a) values(?, ?)',
'rows': list([
]),
})
# ---
# name: test_types[4. show NULL partition key]
dict({
'v_chunks': OrderedDict({
'sql': 'select * from v_chunks',
'rows': list([
OrderedDict({
'chunk_id': 1,
'size': 8,
'sequence_id': None,
'partition00': None,
'validity': b'\x01',
'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
'v_rowids': OrderedDict({
'sql': 'select * from v_rowids',
'rows': list([
OrderedDict({
'rowid': 1,
'id': None,
'chunk_id': 1,
'chunk_offset': 0,
}),
]),
}),
'v_vector_chunks00': OrderedDict({
'sql': 'select * from v_vector_chunks00',
'rows': list([
OrderedDict({
'rowid': 1,
'vectors': b'\x11"3D\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
}),
]),
}),
})
# ---
# name: test_updates[1. Initial dataset]
OrderedDict({
'sql': 'select * from v',
'rows': list([
OrderedDict({
'rowid': 1,
'p': 'a',
'a': b'\x11\x11\x11\x11',
}),
OrderedDict({
'rowid': 2,
'p': 'a',
'a': b'""""',
}),
OrderedDict({
'rowid': 3,
'p': 'a',
'a': b'3333',
}),
]),
})
# ---
# name: test_updates[2. update #1]
dict({
'error': 'OperationalError',
'message': 'UPDATE on partition key columns are not supported yet. ',
})
# ---
================================================
FILE: tests/afbd/.gitignore
================================================
*.tgz
================================================
FILE: tests/afbd/.python-version
================================================
3.12
================================================
FILE: tests/afbd/Makefile
================================================
# Dataset archives for the filtered-ANN benchmark tests. Each target below
# downloads one archive with curl and writes it to the target's own file
# name ($@).
random_ints_1m.tgz:
curl -o $@ https://storage.googleapis.com/ann-filtered-benchmark/datasets/random_ints_1m.tgz
random_float_1m.tgz:
curl -o $@ https://storage.googleapis.com/ann-filtered-benchmark/datasets/random_float_1m.tgz
random_keywords_1m.tgz:
curl -o $@ https://storage.googleapis.com/ann-filtered-benchmark/datasets/random_keywords_1m.tgz
# Convenience target that depends on all three archives. NOTE(review): it is
# declared last, so a bare `make` selects the first target above rather than
# `all`; run `make all` to fetch everything.
all: random_ints_1m.tgz random_float_1m.tgz random_keywords_1m.tgz
================================================
FILE: tests/afbd/README.md
================================================
# hnm
```
tar -xOzf hnm.tgz ./tests.jsonl > tests.jsonl
solite q "select group_concat(distinct key) from lines_read('tests.jsonl'), json_each(line -> '$.conditions.and[0]')"
```
```
> python test-afbd.py build hnm.tgz --metadata product_group_name,colour_group_name,index_group_name,perceived_colour_value_name,section_name,product_type_name,department_name,graphical_appearance_name,garment_group_name,perceived_colour_master_name
```
================================================
FILE: tests/afbd/test-afbd.py
================================================
import numpy as np
from tqdm import tqdm
from deepdiff import DeepDiff
import tarfile
import json
from io import BytesIO
import sqlite3
from typing import List
from struct import pack
import time
from pathlib import Path
import argparse
def serialize_float32(vector: List[float]) -> bytes:
    """Pack *vector* into the raw float blob that sqlite-vec accepts.

    Every element is encoded with the ``struct`` ``"f"`` format code, so the
    result is ``len(vector)`` consecutive C floats; an empty list yields
    ``b""``.
    """
    float_format = f"{len(vector)}f"
    return pack(float_format, *vector)
def build_command(file_path, metadata_set=None):
    """Build a sqlite-vec database from a filtered-ANN benchmark archive.

    Reads ``./payloads.jsonl``, ``./tests.jsonl`` and ``./vectors.npy`` from
    the gzipped tarball at *file_path* and writes ``<file_path.stem>.db`` in
    the current working directory. The database contains:

    * ``tests`` -- one row per test case, stored as a JSON string, and
    * ``v`` -- a ``vec0`` virtual table (cosine distance) with one row per
      vector, whose rowid is the vector's position in the archive, plus one
      column per payload key.

    Args:
        file_path: Path to the ``.tgz`` dataset archive.
        metadata_set: Optional comma-separated list of payload keys. When
            given, the listed keys are declared as plain columns and every
            other key is declared with a ``+`` prefix (vec0's auxiliary
            column syntax). When omitted or empty, every key is declared as
            a plain column.

    Raises:
        ValueError: If the archive lacks one of the three required members,
            or if ``payloads.jsonl`` contains no records.
        Exception: If a value in the first payload record is not an int,
            float or str.
    """
    if metadata_set:
        metadata_set = set(metadata_set.split(","))

    file_path = Path(file_path)
    print(f"reading {file_path}...")
    t0 = time.time()

    # Initialise up front so that a missing member is reported explicitly
    # below instead of surfacing as an UnboundLocalError.
    payloads = tests = vectors = None
    with tarfile.open(file_path, "r:gz") as archive:
        for member in archive:
            if member.name == "./payloads.jsonl":
                payloads = [
                    json.loads(line)
                    for line in archive.extractfile(member).readlines()
                ]
            elif member.name == "./tests.jsonl":
                tests = [
                    json.loads(line)
                    for line in archive.extractfile(member).readlines()
                ]
            elif member.name == "./vectors.npy":
                # Copy into memory first: np.load is handed a plain in-memory
                # stream rather than the archive's own file object.
                vectors = np.load(BytesIO(archive.extractfile(member).read()))

    # Explicit validation (not `assert`, which is stripped under `python -O`).
    missing = [
        member_name
        for member_name, loaded in (
            ("./payloads.jsonl", payloads),
            ("./tests.jsonl", tests),
            ("./vectors.npy", vectors),
        )
        if loaded is None
    ]
    if missing:
        raise ValueError(
            f"{file_path} is missing required member(s): {', '.join(missing)}"
        )
    if not payloads:
        raise ValueError(f"{file_path}: ./payloads.jsonl contains no records")

    dimensions = vectors.shape[1]
    # The first payload record defines the column set and the column types.
    metadata_columns = sorted(payloads[0])

    def col_type(v):
        """Map a sample payload value to the column type used in the DDL."""
        if isinstance(v, int):
            return "integer"
        if isinstance(v, float):
            return "float"
        if isinstance(v, str):
            return "text"
        raise Exception(f"Unknown column type: {v}")

    metadata_columns_types = [col_type(payloads[0][col]) for col in metadata_columns]

    print(time.time() - t0)
    t0 = time.time()
    print("seeding...")

    db = sqlite3.connect(f"{file_path.stem}.db")
    try:
        db.execute("PRAGMA page_size = 16384")
        db.row_factory = sqlite3.Row
        db.enable_load_extension(True)
        db.load_extension("../../dist/vec0")
        db.enable_load_extension(False)

        with db:
            db.execute("create table tests(data)")
            for test in tests:
                db.execute("insert into tests values (?)", [json.dumps(test)])

        with db:
            # Column names come straight from the dataset's payload keys and
            # are interpolated into the DDL (identifiers cannot be bound as
            # parameters), so this must only be run on trusted archives.
            column_defs = []
            for name, sql_type in zip(metadata_columns, metadata_columns_types):
                prefix = "+" if metadata_set and name not in metadata_set else ""
                column_defs.append(f", {prefix}{name} {sql_type}")

            create_sql = (
                f"create virtual table v using vec0(vector float[{dimensions}] distance_metric=cosine"
                + "".join(column_defs)
                + ")"
            )
            insert_sql = (
                "insert into v(rowid, vector"
                + "".join(f", {name}" for name in metadata_columns)
                + ") values ("
                # Two placeholders for rowid and vector, one per payload key.
                + ",".join("?" * (2 + len(metadata_columns)))
                + ")"
            )
            print(create_sql)
            print(insert_sql)
            db.execute(create_sql)

            for idx, (payload, vector) in enumerate(
                tqdm(zip(payloads, vectors), total=len(payloads))
            ):
                params = [idx, vector]
                params.extend(payload[c] for c in metadata_columns)
                db.execute(insert_sql, params)
    finally:
        # Release the database handle even if seeding fails part-way.
        db.close()

    print(time.time() - t0)
def _diff_index(key):
    """Return the list index encoded in a DeepDiff path such as ``root[3]``.

    Slices the literal prefix/suffix off instead of using ``lstrip``/``rstrip``,
    which strip *character sets* and only happened to work for digits.

    Raises:
        ValueError: if ``key`` is not of the form ``root[<int>]``.
    """
    prefix, suffix = "root[", "]"
    if not (key.startswith(prefix) and key.endswith(suffix)):
        raise ValueError(f"Unexpected DeepDiff path: {key}")
    return int(key[len(prefix):-len(suffix)])


def _knn_query(test):
    """Build the KNN SQL statement and its bound parameters for one test case.

    The base query asks ``v`` for as many neighbours as the test case lists in
    ``closest_ids``; the test's ``and`` / ``or`` conditions are appended as
    extra metadata constraints.
    """
    conditions = test["conditions"]
    sql = "select rowid, 1 - distance as similarity from v where vector match ? and k = ?"
    params = [serialize_float32(test["query"]), len(test["closest_ids"])]

    if "and" in conditions:
        for condition in conditions["and"]:
            # Each condition is expected to be {column: {kind: {...}}}.
            assert len(condition) == 1
            column = next(iter(condition))
            assert len(condition[column]) == 1
            condition_type = next(iter(condition[column]))
            if condition_type == "match":
                sql += f" and {column} = ?"
                params.append(condition[column]["match"]["value"])
            elif condition_type == "range":
                # NOTE(review): BETWEEN is inclusive while the keys are named
                # gt/lt; kept as-is because the expected ids were recorded
                # against this query shape. Confirm against the dataset.
                sql += f" and {column} between ? and ?"
                params.append(condition[column]["range"]["gt"])
                params.append(condition[column]["range"]["lt"])
            else:
                raise Exception(f"Unknown condition type: {condition_type}")
    elif "or" in conditions:
        alternatives = conditions["or"]
        # The column and condition kind are taken from the first alternative;
        # every alternative is read with that same column below.
        column = next(iter(alternatives[0]))
        condition_type = next(iter(alternatives[0][column]))
        assert condition_type == "match"
        values = [alt[column]["match"]["value"] for alt in alternatives]
        sql += f" and {column} in ({','.join('?' * len(values))})"
        params.extend(values)
    return sql, params


def _assert_one_off(expected_ids, actual_ids):
    """Assert that two differing id lists differ only by a "1-off" error.

    Accepted differences are a swap of two adjacent positions, or a change in
    the very last position only.

    Raises:
        AssertionError: if the difference is not one of the accepted shapes.
        Exception: if more than two positions changed.
    """
    diff = DeepDiff(expected_ids, actual_ids, ignore_order=False)
    assert len(list(diff.keys())) == 1
    assert "values_changed" in diff.keys()
    changed = diff["values_changed"]
    keys_changed = list(changed.keys())
    if len(keys_changed) == 2:
        akey, bkey = keys_changed
        assert abs(_diff_index(akey) - _diff_index(bkey)) == 1
        assert changed[akey]["new_value"] == changed[bkey]["old_value"]
        assert changed[akey]["old_value"] == changed[bkey]["new_value"]
    elif len(keys_changed) == 1:
        assert (_diff_index(keys_changed[0]) + 1) == len(expected_ids)
    else:
        raise Exception(
            f"Result mismatch is not a 1-off error: {len(keys_changed)} positions changed"
        )


def tests_command(file_path):
    """Replay the stored KNN test cases against the ``v`` table and report 1-off errors.

    Opens ``<stem of file_path>.db``, loads the ``vec0`` extension, reads every
    JSON test case from the ``tests`` table and runs the corresponding filtered
    KNN query. Exact id matches pass silently; mismatches must be "1-off"
    errors (see ``_assert_one_off``) and are counted.

    Args:
        file_path: Path whose stem names the SQLite database to open.

    Raises:
        AssertionError: if a test case or a result mismatch has an unexpected shape.
        Exception: on an unknown condition type or a mismatch touching more
            than two positions.
    """
    file_path = Path(file_path)
    db = sqlite3.connect(f"{file_path.stem}.db")
    try:
        db.execute("PRAGMA cache_size = -100000000")
        db.row_factory = sqlite3.Row
        db.enable_load_extension(True)
        db.load_extension("../../dist/vec0")
        db.enable_load_extension(False)

        tests = [
            json.loads(row["data"])
            for row in db.execute("select data from tests").fetchall()
        ]

        num_or_skips = 0
        num_1off_errors = 0

        t0 = time.time()
        print("testing...")
        for test in tqdm(tests):
            expected_closest_ids = test["closest_ids"]
            sql, params = _knn_query(test)
            rows = db.execute(sql, params).fetchall()
            actual_closest_ids = [row["rowid"] for row in rows]
            if expected_closest_ids != actual_closest_ids:
                _assert_one_off(expected_closest_ids, actual_closest_ids)
                num_1off_errors += 1

        evaluated = len(tests) - num_or_skips
        # Avoid ZeroDivisionError when the tests table is empty.
        error_rate = num_1off_errors / evaluated if evaluated else 0.0
        print("Number skipped: ", num_or_skips)
        print("Num 1 off errors: ", num_1off_errors)
        print("1 off error rate: ", error_rate)
        print(time.time() - t0)
        print("done")
    finally:
        db.close()
def main():
    """Parse the command line and dispatch to the ``build`` or ``test`` subcommand."""

    def run_build(args):
        return build_command(args.file, args.metadata)

    def run_tests(args):
        return tests_command(args.file)

    cli = argparse.ArgumentParser(description="CLI tool")
    commands = cli.add_subparsers(dest="command", required=True)

    # `build <file> [--metadata ...]`
    build_cmd = commands.add_parser("build")
    build_cmd.add_argument("file", type=str, help="Path to input file")
    build_cmd.add_argument("--metadata", type=str, help="Metadata columns")
    build_cmd.set_defaults(func=run_build)

    # `test <file>`
    test_cmd = commands.add_parser("test")
    test_cmd.add_argument("file", type=str, help="Path to input file")
    test_cmd.set_defaults(func=run_tests)

    parsed = cli.parse_args()
    parsed.func(parsed)


if __name__ == "__main__":
    main()
================================================
FILE: tests/build.rs
================================================
/// Build script: compiles `sqlite-vec.c` together with the vendored SQLite
/// amalgamation into the static library `sqlite-vec-internal`.
fn main() {
    let mut build = cc::Build::new();

    // C sources, in the same order as before.
    for source in ["../sqlite-vec.c", "../vendor/sqlite3.c"] {
        build.file(source);
    }

    build.define("SQLITE_CORE", None);

    // Header search paths.
    for dir in ["../vendor", ".."] {
        build.include(dir);
    }

    build.static_flag(true);
    build.compile("sqlite-vec-internal");

    // Re-run when the build script or the extension source changes.
    println!("cargo:rerun-if-changed=build.rs");
    println!("cargo:rerun-if-changed=../sqlite-vec.c");
}
================================================
FILE: tests/conftest.py
================================================
import pytest
import sqlite3
@pytest.fixture()
def db():
    """Provide an in-memory SQLite connection with the ``dist/vec0`` extension loaded.

    Rows are returned as ``sqlite3.Row``; extension loading is switched off
    again once ``vec0`` has been loaded.
    """
    connection = sqlite3.connect(":memory:")
    connection.row_factory = sqlite3.Row

    connection.enable_load_extension(True)
    connection.load_extension("dist/vec0")
    connection.enable_load_extension(False)

    return connection
================================================
FILE: tests/correctness/build.py
================================================
import numpy as np
import duckdb
# Pull the `embedding` column of one DBpedia/OpenAI parquet shard through
# DuckDB and save it as a single stacked NumPy array.
EMBEDDINGS_QUERY = """
select
-- _id,
-- title,
-- text as contents,
embedding::float[] as embeddings
from "hf://datasets/Supabase/dbpedia-openai-3-large-1M/dbpedia_openai_3_large_00.parquet"
"""

connection = duckdb.connect(":memory:")
columns = connection.execute(EMBEDDINGS_QUERY).fetchnumpy()
embeddings = columns['embeddings']

# vstack turns the per-row arrays into one 2-D matrix before saving.
np.save("dbpedia_openai_3_large_00.npy", np.vstack(embeddings))
================================================
FILE: tests/correctness/test-correctness.py
================================================
import numpy as np
import numpy.typing as npt
import time
import tqdm
import pytest
def cosine_similarity(
    vec: npt.NDArray[np.float32], mat: npt.NDArray[np.float32], do_norm: bool = True
) -> npt.NDArray[np.float32]:
    """Return the similarity of ``vec`` against every row of ``mat``.

    With ``do_norm`` set, the dot products are divided (in place) by the
    product of the vector norms, giving cosine similarity; otherwise the raw
    dot products are returned.
    """
    scores = vec @ mat.T
    if not do_norm:
        return scores
    scale = np.linalg.norm(vec) * np.linalg.norm(mat, axis=1)
    scores /= scale
    return scores
def distance_l2(
    vec: npt.NDArray[np.float32], mat: npt.NDArray[np.float32]
) -> npt.NDArray[np.float32]:
    """Return the Euclidean (L2) distance from ``vec`` to each row of ``mat``."""
    delta = mat - vec
    squared = delta ** 2
    return np.sqrt(squared.sum(axis=1))
def topk(
    vec: npt.NDArray[np.float32],
    mat: npt.NDArray[np.float32],
    k: int = 5,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.float32]]:
    """Return the indices and L2 distances of the ``k`` rows of ``mat`` nearest to ``vec``.

    Results are ordered by ascending distance. When ``k`` is at least the
    number of rows, every row is returned (fully sorted) instead of raising.

    Raises:
        ValueError: if ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    distances = distance_l2(vec, mat)
    if k >= distances.shape[0]:
        # np.argpartition rejects kth == len(distances); with nothing to
        # discard, a plain sort of all rows is the answer.
        top_indices = np.argsort(distances)
    else:
        # Rather than sorting all distances and taking the top K, it's faster
        # to argpartition and then just sort the top K.
        # The difference is O(N logN) vs O(N + k logk)
        indices = np.argpartition(distances, kth=k)[:k]
        top_indices = indices[np.argsort(distances[indices])]
    return top_indices, distances[top_indices]
# Small hand-checkable sanity run of the NumPy reference `topk`:
# one 3-d query vector against a 3-row matrix.
vec = np.array([1.0, 2.0, 3.0], dtype=np.float32)
mat = np.array([
[4.0, 5.0, 6.0],
[1.0, 2.0, 1.0],
[7.0, 8.0, 9.0]
], dtype=np.float32)
# The two rows of `mat` nearest to `vec`, printed as indices then distances.
# `mat` is reused below for the SQL-side comparison.
indices, distances = topk(vec, mat, k=2)
print(indices)
print(distances)
import sqlite3
import json
# In-memory database with the vec0 extension loaded from ../../dist/vec0.
db = sqlite3.connect(":memory:")
db.enable_load_extension(True)
db.load_extension("../../dist/vec0")
# Second load through SQL so a non-default entry point can be named.
# NOTE(review): presumably this registers the file-reading helpers such as
# vec_npy_file used further down — confirm against the extension source.
db.execute("select load_extension('../../dist/vec0', 'sqlite3_vec_fs_read_init')")
db.enable_load_extension(False)
# Same toy problem as the NumPy demo above, computed in SQL: L2 distance from
# the query to each row of `mat` (passed as JSON), keeping the 2 nearest.
results = db.execute(
'''
select
key,
--value,
vec_distance_l2(:q, value) as distance
from json_each(:base)
order by distance
limit 2
''',
{
'base': json.dumps(mat.tolist()),
'q': '[1.0, 2.0, 3.0]'
}).fetchall()
# a: json_each keys (row positions within `mat`); b: the matching distances.
a = [row[0] for row in results]
b = [row[1] for row in results]
print(a)
print(b)
#import sys; sys.exit()
db.execute('PRAGMA page_size=16384')
print("Loading into sqlite-vec vec0 table...")
t0 = time.time()
# vec0 table with one 3072-dimension float column, filled straight from the
# .npy file on disk; the elapsed load time is printed.
db.execute("create virtual table v using vec0(a float[3072], chunk_size=16)")
db.execute('insert into v select rowid, vector from vec_npy_each(vec_npy_file("dbpedia_openai_3_large_00.npy"))')
print(time.time() - t0)
print("loading numpy array...")
t0 = time.time()
# The same file loaded into NumPy serves as the reference data set.
base = np.load('dbpedia_openai_3_large_00.npy')
print(time.time() - t0)
# Fixed seed so the 20 sampled query rows are the same on every run.
np.random.seed(1)
queries = base[np.random.choice(base.shape[0], 20, replace=False), :]
# Per-query wall-clock timings, appended to by test_all().
np_durations = []
vec_durations = []
from random import randrange
def test_all():
    """Check that vec0 KNN distances equal the NumPy reference for each query.

    For every sampled query both implementations are timed, the distance
    lists are compared for exact equality, and the mean timings are printed
    at the end.
    """
    for idx, query in tqdm.tqdm(enumerate(queries)):
        # Fixed k; earlier experiments used randrange(20, 1000) and 500.
        k = 10

        np_started = time.time()
        np_ids, np_distances = topk(query, base, k=k)
        np_durations.append(time.time() - np_started)

        vec_started = time.time()
        rows = db.execute(
            'select rowid, distance from v where a match ? and k = ?', [query, k]
        ).fetchall()
        vec_durations.append(time.time() - vec_started)

        vec_ids = [row[0] for row in rows]
        vec_distances = [row[1] for row in rows]

        # Only distances are compared; the id comparison (vec_ids vs np_ids)
        # is intentionally left disabled.
        assert vec_distances == np_distances.tolist()

    print('final', 'np' ,np.mean(np_durations), 'vec', np.mean(vec_durations))
================================================
FILE: tests/fuzz/.gitignore
================================================
*.dSYM
targets/
================================================
FILE: tests/fuzz/Makefile
================================================
# Auto-detect clang with libFuzzer support.
# Priority: Homebrew LLVM (macOS ARM) → Homebrew LLVM (macOS Intel) →
# versioned clang (Linux) → system clang
FUZZ_CC ?= $(shell \
if [ -x /opt/homebrew/opt/llvm/bin/clang ]; then \
echo "/opt/homebrew/opt/llvm/bin/clang"; \
elif [ -x /usr/local/opt/llvm/bin/clang ]; then \
echo "/usr/local/opt/llvm/bin/clang"; \
elif command -v clang-18 >/dev/null 2>&1; then \
echo "clang-18"; \
elif command -v clang-17 >/dev/null 2>&1; then \
echo "clang-17"; \
elif command -v clang >/dev/null 2>&1; then \
echo "clang"; \
else \
echo "FUZZ_CC_NOT_FOUND"; \
fi)
# AddressSanitizer + UndefinedBehaviorSanitizer + libFuzzer.
# Override FUZZ_SANITIZERS to change (e.g., drop ubsan on Windows).
FUZZ_SANITIZERS ?= -fsanitize=address,undefined,fuzzer
# On macOS, Homebrew LLVM may need -Wl,-ld_classic to work with the system linker.
FUZZ_LDFLAGS ?= $(shell \
if [ "$$(uname -s)" = "Darwin" ]; then \
echo "-Wl,-ld_classic"; \
fi)
FUZZ_CFLAGS = $(FUZZ_SANITIZERS) -I ../../ -I ../../vendor -DSQLITE_CORE -g $(FUZZ_LDFLAGS)
FUZZ_SRCS = ../../vendor/sqlite3.c ../../sqlite-vec.c
TARGET_DIR = ./targets
$(TARGET_DIR):
mkdir -p $@
# Existing targets (filename uses -, Makefile target uses _)
$(TARGET_DIR)/vec0_create: vec0-create.c $(FUZZ_SRCS) | $(TARGET_DIR)
$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
$(TARGET_DIR)/exec: exec.c $(FUZZ_SRCS) | $(TARGET_DIR)
$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
$(TARGET_DIR)/json: json.c $(FUZZ_SRCS) | $(TARGET_DIR)
$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
$(TARGET_DIR)/numpy: numpy.c $(FUZZ_SRCS) | $(TARGET_DIR)
$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
# New targets
$(TARGET_DIR)/shadow_corrupt: shadow-corrupt.c $(FUZZ_SRCS) | $(TARGET_DIR)
$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
$(TARGET_DIR)/vec0_operations: vec0-operations.c $(FUZZ_SRCS) | $(TARGET_DIR)
$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
$(TARGET_DIR)/scalar_functions: scalar-functions.c $(FUZZ_SRCS) | $(TARGET_DIR)
$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
$(TARGET_DIR)/vec0_create_full: vec0-create-full.c $(FUZZ_SRCS) | $(TARGET_DIR)
$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
$(TARGET_DIR)/metadata_columns: metadata-columns.c $(FUZZ_SRCS) | $(TARGET_DIR)
$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
$(TARGET_DIR)/vec_each: vec-each.c $(FUZZ_SRCS) | $(TARGET_DIR)
$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
$(TARGET_DIR)/vec_mismatch: vec-mismatch.c $(FUZZ_SRCS) | $(TARGET_DIR)
$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
$(TARGET_DIR)/vec0_delete_completeness: vec0-delete-completeness.c $(FUZZ_SRCS) | $(TARGET_DIR)
$(FUZZ_CC) $(FUZZ_CFLAGS) $(FUZZ_SRCS) $< -o $@
FUZZ_TARGETS = vec0_create exec json numpy \
shadow_corrupt vec0_operations scalar_functions \
vec0_create_full metadata_columns vec_each vec_mismatch \
vec0_delete_completeness
all: $(addprefix $(TARGET_DIR)/,$(FUZZ_TARGETS))
clean:
rm -rf $(TARGET_DIR)/*
.PHONY: all clean
================================================
FILE: tests/fuzz/README.md
================================================
```
ASAN_OPTIONS=detect_leaks=1 ./targets/vec0_create \
-dict=./vec0-create.dict -max_total_time=5 \
./corpus/vec0-create
```
```
export PATH="/opt/homebrew/opt/llvm/bin:$PATH"
export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
LDFLAGS="-L/opt/homebrew/opt/llvm/lib/c++ -Wl,-rpath,/opt/homebrew/opt/llvm/lib/c++"
```
================================================
FILE: tests/fuzz/TODO.md
================================================
# Fuzz Testing TODO: Undefined Behavior Findings
UBSAN findings from fuzz targets. None are crash-level bugs, but all are
formally undefined behavior per the C standard.
## Class 1: Function pointer type mismatch (~20 sites)
`fvec_cleanup_noop` is defined as `void (f32 *)` but called through
`vector_cleanup` which is `void (*)(void *)`. Two cleanup typedefs exist
with incompatible signatures:
```c
typedef void (*vector_cleanup)(void *p); // line 597
typedef void (*fvec_cleanup)(f32 *vector); // line 695
```
Affected lines: 1031, 1049, 1050, 1160, 1200, 1201, 1241, 1242, 1282,
1283, 1324, 1325, 1356, 1424, 1524, 1525, 1582, 1583, 1699, 1749, 1798,
2520, 7236, 8501, and sqlite3.c:82930 (via sqlite3_result_blob destructor).
Low practical severity — calling conventions on all real platforms pass
`f32 *` and `void *` identically — but flags on every UBSAN run.
Fix: change `fvec_cleanup_noop` to take `void *`, or unify the typedefs.
## Class 2: Misaligned f32 reads (~10 sites)
`f32` (4-byte alignment required) read from potentially unaligned addresses.
Happens when a blob from SQLite's internal storage is cast to `f32 *` and
dereferenced. The blob pointer may not be 4-byte aligned.
Affected lines: 369, 446, 473-475, 1401, 1461, 1501, 1559, 1653, 1726,
1789, 1793.
Medium severity — silent on x86/ARM64 (hardware supports unaligned float
access) but UB on strict-alignment architectures.
Fix: use `memcpy` to load floats from potentially-unaligned memory, or
ensure blob pointers are aligned before use.
## Class 3: Float-to-integer overflow (1 site)
`vec_quantize_int8` at line 1461 — when `srcVector[i]` is a large float,
the expression `((srcVector[i] - (-1.0)) / step) - 128` overflows
`signed char` range. Assigning this to `i8 out[i]` is UB.
Low-medium severity — silent truncation in practice.
Fix: clamp the result before cast.
================================================
FILE: tests/fuzz/corpus/exec/select1
================================================
SELECT 1
================================================
FILE: tests/fuzz/corpus/exec/vec_version
================================================
SELECT vec_version()
================================================
FILE: tests/fuzz/corpus/json/empty
================================================
[]
================================================
FILE: tests/fuzz/corpus/json/valid_2d
================================================
[0.5, -0.5]
================================================
FILE: tests/fuzz/corpus/json/valid_4d
================================================
[1.0, 2.0, 3.0, 4.0]
================================================
FILE: tests/fuzz/corpus/vec-mismatch/json_1d_blob_5byte
================================================
ABCDE
================================================
FILE: tests/fuzz/corpus/vec-mismatch/json_2d_blob_3byte
================================================
ABC
================================================
FILE: tests/fuzz/corpus/vec0-create/normal1
================================================
aaa float[12]
================================================
FILE: tests/fuzz/corpus/vec0-create/normal2
================================================
aaa float[12], bbb int8[6]
================================================
FILE: tests/fuzz/exec.c
================================================
#include
#include
#include
#include
#include
#include "sqlite-vec.h"
#include "sqlite3.h"
#include
/*
 * libFuzzer entry point: interprets the input bytes as SQL text and executes
 * it against a fresh in-memory database with sqlite-vec registered.
 * Errors from the SQL itself are ignored; always returns 0.
 */
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
  int rc = SQLITE_OK;
  sqlite3 *db;

  if (size < 1) return 0;

  rc = sqlite3_open(":memory:", &db);
  assert(rc == SQLITE_OK);
  rc = sqlite3_vec_init(db, NULL, NULL);
  assert(rc == SQLITE_OK);

  /* "%.*s" reads its precision as an int from the varargs, so a size_t must
   * not be passed directly (undefined behaviour where the widths differ).
   * Clamp, then convert explicitly. */
  int nSrc = (size > (size_t)0x7fffffff) ? 0x7fffffff : (int)size;

  /* NUL-terminated copy of the input; sqlite3_free() takes a non-const
   * pointer, so the buffer is held as char *. */
  char *zSrc = sqlite3_mprintf("%.*s", nSrc, data);
  assert(zSrc);

  sqlite3_exec(db, zSrc, NULL, NULL, NULL);

  sqlite3_free(zSrc);
  sqlite3_close(db);
  return 0;
}
================================================
FILE: tests/fuzz/exec.dict
================================================
select="select"
from="from"
cname1="aaa"
cname1="bbb"
cname1="ccc"
type1="float"
type2="int8"
type3="bit"
lparen="["
rparen="]"
pk="primary key"
text="text"
distance_metric="distance_metric"
eq="="
l1="l1"
l2="l2"
cosine="cosine"
hamming="hamming"
vec_distance_l2="vec_distance_l2"
vec_distance_l1="vec_distance_l1"
comma=","
================================================
FILE: tests/fuzz/json.c
================================================
#include
#include
#include
#include
#include
#include "sqlite-vec.h"
#include "sqlite3.h"
#include
/*
 * libFuzzer entry point: feeds the input, cast to TEXT, to vec_f32() so the
 * text-vector parsing path is exercised. Always returns 0.
 */
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
  sqlite3 *db;
  sqlite3_stmt *stmt;
  int rc;

  /* Fresh in-memory database with sqlite-vec registered directly
   * (not via sqlite3_auto_extension). */
  rc = sqlite3_open(":memory:", &db);
  assert(rc == SQLITE_OK);
  rc = sqlite3_vec_init(db, NULL, NULL);
  assert(rc == SQLITE_OK);

  rc = sqlite3_prepare_v2(db, "SELECT vec_f32(cast(? as text))", -1, &stmt, NULL);
  assert(rc == SQLITE_OK);

  /* The fuzz buffer outlives the statement, so SQLITE_STATIC is sufficient. */
  sqlite3_bind_blob(stmt, 1, data, (int)size, SQLITE_STATIC);
  sqlite3_step(stmt);

  sqlite3_finalize(stmt);
  sqlite3_close(db);
  return 0;
}
================================================
FILE: tests/fuzz/metadata-columns.c
================================================
#include
#include
#include
#include
#include
#include "sqlite-vec.h"
#include "sqlite3.h"
#include
/*
 * libFuzzer entry point: drives a vec0 table that has metadata and auxiliary
 * columns with a fuzz-derived sequence of INSERT / KNN / filtered-KNN / DELETE
 * operations. Inputs shorter than 8 bytes are ignored. Always returns 0.
 */
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
if (size < 8) return 0;
int rc;
sqlite3 *db;
rc = sqlite3_open(":memory:", &db);
assert(rc == SQLITE_OK);
rc = sqlite3_vec_init(db, NULL, NULL);
assert(rc == SQLITE_OK);
// One 4-d float vector column, four metadata columns (one per declared
// metadata type used here) and one auxiliary text column.
rc = sqlite3_exec(db,
"CREATE VIRTUAL TABLE v USING vec0("
" emb float[4],"
" flag boolean metadata,"
" count integer metadata,"
" score float metadata,"
" label text metadata,"
" aux_data text auxiliary"
")", NULL, NULL, NULL);
if (rc != SQLITE_OK) { sqlite3_close(db); return 0; }
// Prepare statements for insert and query
sqlite3_stmt *stmtInsert = NULL;
sqlite3_stmt *stmtKnn = NULL;
sqlite3_stmt *stmtKnnFilter = NULL;
sqlite3_stmt *stmtDelete = NULL;
sqlite3_prepare_v2(db,
"INSERT INTO v(rowid, emb, flag, count, score, label, aux_data) "
"VALUES (?, ?, ?, ?, ?, ?, ?)", -1, &stmtInsert, NULL);
sqlite3_prepare_v2(db,
"SELECT rowid, distance FROM v WHERE emb MATCH ? LIMIT 3",
-1, &stmtKnn, NULL);
sqlite3_prepare_v2(db,
"SELECT rowid, distance FROM v WHERE emb MATCH ? AND flag = 1 LIMIT 3",
-1, &stmtKnnFilter, NULL);
sqlite3_prepare_v2(db,
"DELETE FROM v WHERE rowid = ?", -1, &stmtDelete, NULL);
// Return codes above are not checked individually; a failed prepare leaves
// its statement pointer NULL, which is what this test detects.
if (!stmtInsert || !stmtKnn || !stmtKnnFilter || !stmtDelete) goto cleanup;
size_t i = 0;
// Each iteration starts only if 6 bytes remain: 2 header bytes (op, rowid)
// plus the 4 vector bytes an INSERT consumes.
while (i + 6 <= size) {
// Byte 0 selects one of the four operations; byte 1 maps to a rowid in 1..50.
uint8_t op = data[i++] % 4;
uint8_t rowid_byte = data[i++];
int64_t rowid = (int64_t)(rowid_byte % 50) + 1;
switch (op) {
case 0: {
// INSERT with fuzz-derived vector and metadata
float vec[4];
// Each component is one signed input byte scaled by 1/10.
for (int j = 0; j < 4 && i < size; j++, i++) {
vec[j] = (float)((int8_t)data[i]) / 10.0f;
}
// Metadata values take one further byte each while input remains,
// otherwise they default to zero.
int flag_val = (i < size) ? data[i++] % 2 : 0;
int count_val = (i < size) ? (int)((int8_t)data[i++]) : 0;
float score_val = (i < size) ? (float)((int8_t)data[i++]) / 10.0f : 0.0f;
sqlite3_reset(stmtInsert);
sqlite3_bind_int64(stmtInsert, 1, rowid);
// vec lives on the stack of this case block, so SQLite must copy it.
sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT);
sqlite3_bind_int(stmtInsert, 3, flag_val);
sqlite3_bind_int(stmtInsert, 4, count_val);
sqlite3_bind_double(stmtInsert, 5, (double)score_val);
// label and aux_data are fixed strings; only the numeric columns are fuzzed.
sqlite3_bind_text(stmtInsert, 6, "label", -1, SQLITE_STATIC);
sqlite3_bind_text(stmtInsert, 7, "aux", -1, SQLITE_STATIC);
sqlite3_step(stmtInsert);
break;
}
case 1: {
// KNN query (no filter)
float qvec[4] = {1.0f, 0.0f, 0.0f, 0.0f};
sqlite3_reset(stmtKnn);
// The query runs to completion below, while qvec is still in scope.
sqlite3_bind_blob(stmtKnn, 1, qvec, sizeof(qvec), SQLITE_STATIC);
while (sqlite3_step(stmtKnn) == SQLITE_ROW) {}
break;
}
case 2: {
// KNN query WITH metadata filter
float qvec[4] = {0.0f, 1.0f, 0.0f, 0.0f};
sqlite3_reset(stmtKnnFilter);
sqlite3_bind_blob(stmtKnnFilter, 1, qvec, sizeof(qvec), SQLITE_STATIC);
while (sqlite3_step(stmtKnnFilter) == SQLITE_ROW) {}
break;
}
case 3: {
// DELETE
sqlite3_reset(stmtDelete);
sqlite3_bind_int64(stmtDelete, 1, rowid);
sqlite3_step(stmtDelete);
break;
}
}
}
// Final full scan over whatever state the operation sequence left behind.
sqlite3_exec(db, "SELECT * FROM v", NULL, NULL, NULL);
cleanup:
// Reached with some statements possibly NULL when a prepare failed.
sqlite3_finalize(stmtInsert);
sqlite3_finalize(stmtKnn);
sqlite3_finalize(stmtKnnFilter);
sqlite3_finalize(stmtDelete);
sqlite3_close(db);
return 0;
}
================================================
FILE: tests/fuzz/numpy.c
================================================
#include
#include
#include
#include
#include
#include "sqlite-vec.h"
#include "sqlite3.h"
#include
extern int sqlite3_vec_numpy_init(sqlite3 *db, char **pzErrMsg,
const sqlite3_api_routines *pApi);
/*
 * libFuzzer entry point: hands the input to vec_npy_each() as a blob and
 * iterates every row it produces. Always returns 0.
 */
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
  sqlite3 *db;
  sqlite3_stmt *stmt;
  int rc;

  rc = sqlite3_open(":memory:", &db);
  assert(rc == SQLITE_OK);

  /* Register both the core extension and the numpy helpers. */
  rc = sqlite3_vec_init(db, NULL, NULL);
  assert(rc == SQLITE_OK);
  rc = sqlite3_vec_numpy_init(db, NULL, NULL);
  assert(rc == SQLITE_OK);

  rc = sqlite3_prepare_v2(db, "select * from vec_npy_each(?)", -1, &stmt, NULL);
  assert(rc == SQLITE_OK);
  sqlite3_bind_blob(stmt, 1, data, (int)size, SQLITE_STATIC);

  /* Step until the statement stops yielding rows (done or error). */
  do {
    rc = sqlite3_step(stmt);
  } while (rc == SQLITE_ROW);

  sqlite3_finalize(stmt);
  sqlite3_close(db);
  return 0;
}
================================================
FILE: tests/fuzz/numpy.dict
================================================
magic="\x93NUMPY"
lparen="("
rparen=")"
lbrace="{"
rbrace="}"
sq1="\""
sq2="'"
================================================
FILE: tests/fuzz/scalar-functions.c
================================================
#include
#include
#include
#include
#include
#include "sqlite-vec.h"
#include "sqlite3.h"
#include
/*
 * libFuzzer entry point: picks one scalar SQL function from a fixed table
 * using the first input byte and calls it with the remaining bytes as
 * argument data. Inputs shorter than 2 bytes are ignored. Always returns 0.
 */
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
  if (size < 2) return 0;

  int rc;
  sqlite3 *db;
  sqlite3_stmt *stmt = NULL;

  rc = sqlite3_open(":memory:", &db);
  assert(rc == SQLITE_OK);
  rc = sqlite3_vec_init(db, NULL, NULL);
  assert(rc == SQLITE_OK);

  static const char *queries[] = {
    "SELECT vec_f32(cast(? as text))",        /* 0: JSON text -> f32 */
    "SELECT vec_f32(?)",                      /* 1: blob -> f32 */
    "SELECT vec_int8(?)",                     /* 2: blob -> int8 */
    "SELECT vec_bit(?)",                      /* 3: blob -> bit */
    "SELECT vec_length(?)",                   /* 4: vector length */
    "SELECT vec_type(?)",                     /* 5: vector type string */
    "SELECT vec_to_json(?)",                  /* 6: vector -> JSON */
    "SELECT vec_normalize(?)",                /* 7: normalize */
    "SELECT vec_quantize_binary(?)",          /* 8: quantize to binary */
    "SELECT vec_quantize_int8(?, 'unit')",    /* 9: quantize to int8 */
    "SELECT vec_distance_l2(?, ?)",           /* 10: L2 distance */
    "SELECT vec_distance_cosine(?, ?)",       /* 11: cosine distance */
    "SELECT vec_distance_l1(?, ?)",           /* 12: L1 distance */
    "SELECT vec_distance_hamming(?, ?)",      /* 13: hamming distance */
    "SELECT vec_add(?, ?)",                   /* 14: vector add */
    "SELECT vec_sub(?, ?)",                   /* 15: vector subtract */
    "SELECT vec_slice(?, 0, ?)",              /* 16: vector slice */
  };
  static const int nQueries = sizeof(queries) / sizeof(queries[0]);

  int qIdx = data[0] % nQueries;
  const uint8_t *payload = data + 1;
  int payload_size = (int)(size - 1);

  rc = sqlite3_prepare_v2(db, queries[qIdx], -1, &stmt, NULL);
  if (rc != SQLITE_OK) {
    sqlite3_close(db);
    return 0;
  }

  /* No query above has more than two bind parameters, so only parameters
   * 1 and 2 ever need a value. */
  int nParams = sqlite3_bind_parameter_count(stmt);

  /* Bind param 1: fuzz payload as blob */
  sqlite3_bind_blob(stmt, 1, payload, payload_size, SQLITE_STATIC);

  if (nParams >= 2) {
    if (qIdx == 16) {
      /* vec_slice(?, 0, ?): the start is the literal 0 in the SQL, so bind
       * parameter 2 is the end index (the function's third argument). */
      int end_idx = (payload_size > 0) ? (payload[0] % 64) : 0;
      sqlite3_bind_int(stmt, 2, end_idx);
    } else {
      /* Two-vector functions (distance, add, sub): the second argument is
       * the back half of the payload; the first is the whole payload. */
      int half = payload_size / 2;
      sqlite3_bind_blob(stmt, 2, payload + half,
                        payload_size - half, SQLITE_STATIC);
    }
  }

  sqlite3_step(stmt);
  sqlite3_finalize(stmt);
  sqlite3_close(db);
  return 0;
}
================================================
FILE: tests/fuzz/scalar-functions.dict
================================================
json_vec1="[1.0, 2.0, 3.0, 4.0]"
json_vec2="[0.5, -0.5]"
json_empty="[]"
json_nan="[NaN]"
json_inf="[Infinity]"
json_large="[1e38, -1e38]"
unit="unit"
null="null"
================================================
FILE: tests/fuzz/shadow-corrupt.c
================================================
#include
#include
#include
#include
#include
#include "sqlite-vec.h"
#include "sqlite3.h"
#include
/* Run `zSql` (an UPDATE with a single `?`) with the given bytes bound as a
 * blob. Does nothing if the statement cannot be prepared. */
static void overwrite_shadow_blob(sqlite3 *db, const char *zSql,
                                  const uint8_t *blob, int nBlob) {
  sqlite3_stmt *stmt = NULL;
  if (sqlite3_prepare_v2(db, zSql, -1, &stmt, NULL) != SQLITE_OK) return;
  sqlite3_bind_blob(stmt, 1, blob, nBlob, SQLITE_STATIC);
  sqlite3_step(stmt);
  sqlite3_finalize(stmt);
}

/*
 * libFuzzer entry point: builds a small valid vec0 table, corrupts one of its
 * shadow tables in a way chosen by the first input byte, then exercises the
 * read, delete, insert and drop paths. Inputs shorter than 2 bytes are
 * ignored. Always returns 0.
 */
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
  /* Strategies 0-2: replace a shadow-table blob with the fuzz payload
   * (_chunks validity, _chunks rowids, _vector_chunks00 vectors). */
  static const char *const blob_updates[] = {
    "UPDATE v_chunks SET validity = ? WHERE rowid = 1",
    "UPDATE v_chunks SET rowids = ? WHERE rowid = 1",
    "UPDATE v_vector_chunks00 SET vectors = ? WHERE rowid = 1",
  };
  /* Strategies 3-5: payload-independent corruption (validity set to NULL,
   * rowids set to NULL, vector chunk row deleted outright). */
  static const char *const fixed_corruptions[] = {
    "UPDATE v_chunks SET validity = NULL WHERE rowid = 1",
    "UPDATE v_chunks SET rowids = NULL WHERE rowid = 1",
    "DELETE FROM v_vector_chunks00 WHERE rowid = 1",
  };
  /* Statements run after corruption — none of them should crash. */
  static const char *const followups[] = {
    "SELECT * FROM v",
    "SELECT * FROM v WHERE rowid = 1",
    "SELECT * FROM v WHERE rowid = 2",
    "SELECT rowid, distance FROM v "
    "WHERE emb MATCH X'0000803f000000000000000000000000' LIMIT 3",
    "DELETE FROM v WHERE rowid = 2",
    "INSERT INTO v(rowid, emb) VALUES (4, X'0000803f000000000000000000000000')",
    "DROP TABLE v",
  };

  sqlite3 *db;
  int rc;

  if (size < 2) return 0;

  rc = sqlite3_open(":memory:", &db);
  assert(rc == SQLITE_OK);
  rc = sqlite3_vec_init(db, NULL, NULL);
  assert(rc == SQLITE_OK);

  /* Valid table holding three float[4] vectors (16 bytes each):
   * [1,0,0,0], [0,-1,0,1], [1,1,0,1] as little-endian float32 hex. */
  rc = sqlite3_exec(db,
    "CREATE VIRTUAL TABLE v USING vec0(emb float[4]);"
    "INSERT INTO v(rowid, emb) VALUES (1, X'0000803f000000000000000000000000');"
    "INSERT INTO v(rowid, emb) VALUES (2, X'00000000000080bf000000000000803f');"
    "INSERT INTO v(rowid, emb) VALUES (3, X'0000803f0000803f000000000000803f');",
    NULL, NULL, NULL);
  if (rc != SQLITE_OK) {
    sqlite3_close(db);
    return 0;
  }

  /* First byte selects the corruption strategy; the rest is the payload. */
  const int target = data[0] % 6;
  const uint8_t *payload = data + 1;
  const int payload_size = (int)(size - 1);

  if (target < 3) {
    overwrite_shadow_blob(db, blob_updates[target], payload, payload_size);
  } else {
    sqlite3_exec(db, fixed_corruptions[target - 3], NULL, NULL, NULL);
  }

  for (size_t q = 0; q < sizeof(followups) / sizeof(followups[0]); q++) {
    sqlite3_exec(db, followups[q], NULL, NULL, NULL);
  }

  sqlite3_close(db);
  return 0;
}
================================================
FILE: tests/fuzz/targets/.gitignore
================================================
*
!.gitignore
================================================
FILE: tests/fuzz/vec-each.c
================================================
#include
#include
#include
#include
#include
#include "sqlite-vec.h"
#include "sqlite3.h"
#include
/*
 * libFuzzer entry point: passes the input to vec_each() as a blob and walks
 * every row it yields. Always returns 0.
 */
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
  sqlite3 *db;
  sqlite3_stmt *stmt = NULL;
  int rc;

  rc = sqlite3_open(":memory:", &db);
  assert(rc == SQLITE_OK);
  rc = sqlite3_vec_init(db, NULL, NULL);
  assert(rc == SQLITE_OK);

  rc = sqlite3_prepare_v2(db,
    "SELECT * FROM vec_each(?)", -1, &stmt, NULL);
  assert(rc == SQLITE_OK);

  sqlite3_bind_blob(stmt, 1, data, (int)size, SQLITE_STATIC);

  /* Rows are discarded; the point is only to exercise the iteration path. */
  for (;;) {
    if (sqlite3_step(stmt) != SQLITE_ROW) break;
  }

  sqlite3_finalize(stmt);
  sqlite3_close(db);
  return 0;
}
================================================
FILE: tests/fuzz/vec-mismatch.c
================================================
#include
#include
#include
#include
#include
#include "sqlite-vec.h"
#include "sqlite3.h"
#include
/*
* Fuzz target that exercises error-path cleanup in vector functions.
*
* The key insight: when a vector is parsed from JSON TEXT, the cleanup
* function is sqlite3_free (heap allocator). When parsed from BLOB,
* cleanup is a no-op. Bugs in cleanup code (wrong pointer, missing
* cleanup, double-free) are only observable with the sqlite3_free path.
*
* This fuzzer systematically covers:
* 1. Valid JSON arg1 + invalid fuzz arg2 (parse failure → cleanup arg1)
* 2. Valid JSON arg1 + valid JSON arg2 with different dimensions
* (dimension mismatch → cleanup both)
* 3. Valid JSON arg1 + int8/bit blob arg2 with mismatched type
* (type mismatch → cleanup both)
* 4. Fuzz arg1 + valid JSON arg2 (parse failure of arg1, no cleanup)
* 5. Single-arg functions with JSON text (normal cleanup path)
* 6. Single-arg functions with fuzz text (parse failure path)
*/
/* Bind one of seven fixed, well-formed values to parameter `param` of `stmt`.
 * `mode % 7` picks the value: 0-2 are JSON text vectors, 3-6 are blobs.
 * A negative `mode` matches no entry and leaves the parameter unbound. */
static void bind_valid_vector(sqlite3_stmt *stmt, int param, int mode) {
  /* JSON text vectors (4-d, 2-d, 1-d). Per the file header, vectors parsed
   * from text are the ones released with sqlite3_free. */
  static const char *const json_vectors[] = {
    "[1.0, 0.0, 0.0, 0.0]",
    "[1.0, 2.0]",
    "[1.0]",
  };
  /* float32 blobs (4-d and 2-d). */
  static const float f32_4d[] = {1.0f, 0.0f, 0.0f, 0.0f};
  static const float f32_2d[] = {1.0f, 2.0f};
  /* Four int8 values and a single byte of bits. They are bound as plain
   * blobs; no subtype is attached here.
   * NOTE(review): the original comments said a subtype "must" be set —
   * confirm how the SQL functions interpret these untyped blobs. */
  static const int8_t int8_4d[] = {10, 20, 30, 40};
  static const uint8_t bit_1b[] = {0xAA};

  const int which = mode % 7;

  if (which >= 0 && which <= 2) {
    sqlite3_bind_text(stmt, param, json_vectors[which], -1, SQLITE_STATIC);
    return;
  }

  switch (which) {
    case 3:
      sqlite3_bind_blob(stmt, param, f32_4d, sizeof(f32_4d), SQLITE_STATIC);
      break;
    case 4:
      sqlite3_bind_blob(stmt, param, f32_2d, sizeof(f32_2d), SQLITE_STATIC);
      break;
    case 5:
      sqlite3_bind_blob(stmt, param, int8_4d, sizeof(int8_4d), SQLITE_STATIC);
      break;
    case 6:
      sqlite3_bind_blob(stmt, param, bit_1b, sizeof(bit_1b), SQLITE_STATIC);
      break;
    default:
      break;
  }
}
/* Prepare `sql`, bind every parameter, step once and finalize.
 * Parameter number `fuzz_arg` receives the fuzz bytes (as text when
 * `fuzz_as_text` is non-zero, otherwise as a blob). Every other parameter
 * receives a fixed valid vector: `arg1_mode` selects it for parameter 1,
 * `arg2_mode` for all later ones. With `fuzz_arg` = 0 no parameter is fuzzed.
 * Returns silently if the statement cannot be prepared. */
static void run_query(sqlite3 *db, const char *sql,
                      int arg1_mode, int arg2_mode,
                      const uint8_t *fuzz, int fuzz_len,
                      int fuzz_arg, int fuzz_as_text) {
  sqlite3_stmt *stmt = NULL;
  if (sqlite3_prepare_v2(db, sql, -1, &stmt, NULL) != SQLITE_OK) return;

  const int total = sqlite3_bind_parameter_count(stmt);
  int p = 1;
  while (p <= total) {
    if (p != fuzz_arg) {
      bind_valid_vector(stmt, p, (p == 1) ? arg1_mode : arg2_mode);
    } else if (fuzz_as_text) {
      sqlite3_bind_text(stmt, p, (const char *)fuzz, fuzz_len, SQLITE_STATIC);
    } else {
      sqlite3_bind_blob(stmt, p, fuzz, fuzz_len, SQLITE_STATIC);
    }
    p++;
  }

  sqlite3_step(stmt);
  sqlite3_finalize(stmt);
}
/*
 * libFuzzer entry point for the scalar vector functions.
 *
 * Input layout: byte 0 selects the SQL function (two-arg list modulo 6,
 * one-arg list modulo 8), bytes 1 and 2 select the valid-vector mode for
 * argument 1 and argument 2 (modulo 7), and the remaining bytes are the fuzz
 * payload. Inputs shorter than 3 bytes are ignored.
 */
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
  /* Two-argument vector functions */
  static const char *two_arg[] = {
      "SELECT vec_distance_l2(?, ?)",
      "SELECT vec_distance_cosine(?, ?)",
      "SELECT vec_distance_l1(?, ?)",
      "SELECT vec_distance_hamming(?, ?)",
      "SELECT vec_add(?, ?)",
      "SELECT vec_sub(?, ?)",
  };
  /* Single-argument vector functions that call cleanup */
  static const char *one_arg[] = {
      "SELECT vec_f32(?)",
      "SELECT vec_int8(?)",
      "SELECT vec_bit(?)",
      "SELECT vec_length(?)",
      "SELECT vec_type(?)",
      "SELECT vec_to_json(?)",
      "SELECT vec_normalize(?)",
      "SELECT vec_quantize_binary(?)",
  };

  if (size < 3) return 0;

  sqlite3 *db;
  int rc = sqlite3_open(":memory:", &db);
  assert(rc == SQLITE_OK);
  rc = sqlite3_vec_init(db, NULL, NULL);
  assert(rc == SQLITE_OK);

  /* --- Decode fuzz control bytes --- */
  const char *two_arg_sql = two_arg[data[0] % 6];
  const char *one_arg_sql = one_arg[data[0] % 8];
  const int arg1_mode = data[1] % 7;
  const int arg2_mode = data[2] % 7;
  const uint8_t *payload = data + 3;
  const int payload_size = (int)(size - 3);

  /*
   * Phase 1: two-arg functions, valid arg1 and fuzzed arg2 (blob, then text).
   * Exercises cleanup of the already-parsed arg1 when arg2 fails to parse,
   * plus type and dimension mismatch cleanup.
   */
  for (int as_text = 0; as_text <= 1; as_text++) {
    run_query(db, two_arg_sql, arg1_mode, 0,
              payload, payload_size, /*fuzz_arg=*/2, as_text);
  }

  /*
   * Phase 2: two-arg functions, fuzzed arg1 (blob, then text) and valid arg2.
   * Exercises parse failure of arg1 and type/dimension mismatch when the
   * fuzz data parses to an unexpected type.
   */
  for (int as_text = 0; as_text <= 1; as_text++) {
    run_query(db, two_arg_sql, 0, arg2_mode,
              payload, payload_size, /*fuzz_arg=*/1, as_text);
  }

  /*
   * Phase 3: two-arg functions with both arguments valid; the two modes
   * often differ in element type or dimension count, which exercises the
   * mismatch error paths that must release both parsed vectors.
   */
  run_query(db, two_arg_sql, arg1_mode, arg2_mode,
            NULL, 0, /*fuzz_arg=*/0, /*as_text=*/0);

  /*
   * Phase 4: single-arg functions with the fuzz payload as blob, then text.
   * Exercises parse failures, and normal cleanup when the payload happens
   * to be a valid vector.
   */
  for (int as_text = 0; as_text <= 1; as_text++) {
    run_query(db, one_arg_sql, 0, 0,
              payload, payload_size, /*fuzz_arg=*/1, as_text);
  }

  sqlite3_close(db);
  return 0;
}
================================================
FILE: tests/fuzz/vec0-create-full.c
================================================
#include
#include
#include
#include
#include
#include "sqlite-vec.h"
#include "sqlite3.h"
#include
/*
 * libFuzzer entry point: use the fuzz input as the column-definition list of
 * a vec0 virtual table and, if creation succeeds, run a few basic statements
 * against the new table. Statement errors are acceptable; crashes are not.
 */
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
  /* Run in order against a successfully created table. */
  static const char *const follow_ups[] = {
      "INSERT INTO v(rowid) VALUES (1)",
      "SELECT * FROM v",
      "SELECT * FROM v WHERE rowid = 1",
      "DELETE FROM v WHERE rowid = 1",
      "DROP TABLE v",
  };
  const size_t follow_up_count = sizeof(follow_ups) / sizeof(follow_ups[0]);

  sqlite3 *db;
  int rc = sqlite3_open(":memory:", &db);
  assert(rc == SQLITE_OK);
  rc = sqlite3_vec_init(db, NULL, NULL);
  assert(rc == SQLITE_OK);

  /* Build "CREATE VIRTUAL TABLE v USING vec0(<fuzz bytes>)". */
  sqlite3_str *builder = sqlite3_str_new(NULL);
  assert(builder);
  sqlite3_str_appendf(builder, "CREATE VIRTUAL TABLE v USING vec0(%.*s)",
                      (int)size, data);
  char *create_sql = sqlite3_str_finish(builder);
  assert(create_sql);

  rc = sqlite3_exec(db, create_sql, NULL, NULL, NULL);
  sqlite3_free(create_sql);

  if (rc == SQLITE_OK) {
    for (size_t k = 0; k < follow_up_count; k++) {
      sqlite3_exec(db, follow_ups[k], NULL, NULL, NULL);
    }
  }

  sqlite3_close(db);
  return 0;
}
================================================
FILE: tests/fuzz/vec0-create.c
================================================
#include
#include
#include
#include
#include
#include "sqlite-vec.h"
#include "sqlite3.h"
#include
/*
 * libFuzzer entry point: wrap the fuzz input in
 * "CREATE VIRTUAL TABLE v USING vec0(...)", then prepare and step the
 * statement once. Failures to prepare or step are ignored; only crashes and
 * sanitizer reports matter.
 */
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
  int rc = SQLITE_OK;
  sqlite3 *db;
  /* NULL so the unconditional finalize below is always well-defined. */
  sqlite3_stmt *stmt = NULL;

  rc = sqlite3_open(":memory:", &db);
  assert(rc == SQLITE_OK);
  rc = sqlite3_vec_init(db, NULL, NULL);
  assert(rc == SQLITE_OK);

  sqlite3_str *s = sqlite3_str_new(NULL);
  assert(s);
  sqlite3_str_appendall(s, "CREATE VIRTUAL TABLE v USING vec0(");
  /* The '*' precision is read as an int; passing a size_t through varargs
   * is undefined behaviour, so cast explicitly. */
  sqlite3_str_appendf(s, "%.*s", (int)size, data);
  sqlite3_str_appendall(s, ")");

  /* Non-const: the buffer is owned here and released with sqlite3_free(). */
  char *zSql = sqlite3_str_finish(s);
  assert(zSql);

  rc = sqlite3_prepare_v2(db, zSql, -1, &stmt, NULL);
  sqlite3_free(zSql);
  if (rc == SQLITE_OK) {
    sqlite3_step(stmt);
  }

  sqlite3_finalize(stmt);
  sqlite3_close(db);
  return 0;
}
================================================
FILE: tests/fuzz/vec0-create.dict
================================================
cname1="aaa"
cname2="bbb"
cname3="ccc"
type1="float"
type2="int8"
type3="bit"
lbracket="["
rbracket="]"
pk="primary key"
text="text"
distance_metric="distance_metric"
eq="="
l1="l1"
l2="l2"
cosine="cosine"
hamming="hamming"
================================================
FILE: tests/fuzz/vec0-delete-completeness.c
================================================
#include
#include
#include
#include
#include
#include "sqlite-vec.h"
#include "sqlite3.h"
#include
/*
 * libFuzzer entry point: replay a fuzz-driven sequence of INSERT / DELETE /
 * full-scan operations against a small vec0 table (chunk_size=4), then delete
 * everything and assert that the v_rowids, v_chunks and v_vector_chunks00
 * shadow tables are empty.
 *
 * Each operation consumes two bytes (op = byte % 3, rowid = byte % 16 + 1).
 * An INSERT additionally consumes up to four bytes, one per vector element.
 * Inputs shorter than 6 bytes are ignored.
 */
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
  /* Shadow tables that must hold no rows after the final "DELETE FROM v". */
  static const char *const count_sql[] = {
      "SELECT count(*) FROM v_rowids",
      "SELECT count(*) FROM v_chunks",
      "SELECT count(*) FROM v_vector_chunks00",
  };
  enum { OP_INSERT = 0, OP_DELETE = 1 /* anything else: full scan */ };

  if (size < 6) return 0;

  int rc;
  sqlite3 *db;
  sqlite3_stmt *stmtInsert = NULL;
  sqlite3_stmt *stmtDelete = NULL;
  sqlite3_stmt *stmtScan = NULL;
  sqlite3_stmt *stmtCount = NULL;
  size_t pos = 0;

  rc = sqlite3_open(":memory:", &db);
  assert(rc == SQLITE_OK);
  rc = sqlite3_vec_init(db, NULL, NULL);
  assert(rc == SQLITE_OK);

  rc = sqlite3_exec(
      db, "CREATE VIRTUAL TABLE v USING vec0(emb float[4], chunk_size=4)",
      NULL, NULL, NULL);
  if (rc != SQLITE_OK) {
    sqlite3_close(db);
    return 0;
  }

  sqlite3_prepare_v2(db, "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1,
                     &stmtInsert, NULL);
  sqlite3_prepare_v2(db, "DELETE FROM v WHERE rowid = ?", -1, &stmtDelete,
                     NULL);
  sqlite3_prepare_v2(db, "SELECT rowid FROM v", -1, &stmtScan, NULL);
  if (!stmtInsert || !stmtDelete || !stmtScan) goto cleanup;

  while (pos + 2 <= size) {
    const uint8_t op = data[pos] % 3;
    const int64_t rowid = (int64_t)(data[pos + 1] % 16) + 1;
    pos += 2;

    if (op == OP_INSERT) {
      /* One signed byte per element, scaled by 1/10; missing bytes stay 0. */
      float vec[4] = {0.0f, 0.0f, 0.0f, 0.0f};
      for (int j = 0; j < 4 && pos < size; j++, pos++) {
        vec[j] = (float)((int8_t)data[pos]) / 10.0f;
      }
      sqlite3_reset(stmtInsert);
      sqlite3_bind_int64(stmtInsert, 1, rowid);
      sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT);
      sqlite3_step(stmtInsert);
    } else if (op == OP_DELETE) {
      sqlite3_reset(stmtDelete);
      sqlite3_bind_int64(stmtDelete, 1, rowid);
      sqlite3_step(stmtDelete);
    } else {
      /* Full scan: drain every row. */
      sqlite3_reset(stmtScan);
      while (sqlite3_step(stmtScan) == SQLITE_ROW) {
      }
    }
  }

  /* Delete all remaining rows */
  sqlite3_exec(db, "DELETE FROM v", NULL, NULL, NULL);

  /* Assert all shadow tables are empty after full deletion. A count query
   * that fails to prepare is skipped rather than treated as a failure. */
  for (size_t q = 0; q < sizeof(count_sql) / sizeof(count_sql[0]); q++) {
    sqlite3_prepare_v2(db, count_sql[q], -1, &stmtCount, NULL);
    if (!stmtCount) continue;
    rc = sqlite3_step(stmtCount);
    assert(rc == SQLITE_ROW);
    assert(sqlite3_column_int(stmtCount, 0) == 0);
    sqlite3_finalize(stmtCount);
    stmtCount = NULL;
  }

cleanup:
  sqlite3_finalize(stmtInsert);
  sqlite3_finalize(stmtDelete);
  sqlite3_finalize(stmtScan);
  sqlite3_close(db);
  return 0;
}
================================================
FILE: tests/fuzz/vec0-operations.c
================================================
#include
#include
#include
#include
#include
#include "sqlite-vec.h"
#include "sqlite3.h"
#include
/*
 * libFuzzer entry point: replay a fuzz-driven sequence of INSERT / DELETE /
 * KNN / full-scan operations against a vec0 table with one float[4] column.
 *
 * Each operation consumes two bytes (op = byte % 4, rowid = byte % 32 + 1).
 * An INSERT additionally consumes up to four bytes, one per vector element.
 * Inputs shorter than 6 bytes are ignored. Statement errors are ignored;
 * only crashes and sanitizer reports matter.
 */
int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {
  /* Static storage: the KNN statement keeps this pointer bound (SQLITE_STATIC)
   * after the switch case that binds it has finished, so it must not live on
   * the stack of that block. */
  static const float qvec[4] = {1.0f, 0.0f, 0.0f, 0.0f};

  if (size < 6) return 0;

  int rc;
  sqlite3 *db;
  sqlite3_stmt *stmtInsert = NULL;
  sqlite3_stmt *stmtDelete = NULL;
  sqlite3_stmt *stmtKnn = NULL;
  sqlite3_stmt *stmtScan = NULL;
  size_t i = 0;

  rc = sqlite3_open(":memory:", &db);
  assert(rc == SQLITE_OK);
  rc = sqlite3_vec_init(db, NULL, NULL);
  assert(rc == SQLITE_OK);

  rc = sqlite3_exec(db, "CREATE VIRTUAL TABLE v USING vec0(emb float[4])",
                    NULL, NULL, NULL);
  if (rc != SQLITE_OK) {
    sqlite3_close(db);
    return 0;
  }

  sqlite3_prepare_v2(db, "INSERT INTO v(rowid, emb) VALUES (?, ?)", -1,
                     &stmtInsert, NULL);
  sqlite3_prepare_v2(db, "DELETE FROM v WHERE rowid = ?", -1, &stmtDelete,
                     NULL);
  sqlite3_prepare_v2(db,
                     "SELECT rowid, distance FROM v WHERE emb MATCH ? LIMIT 3",
                     -1, &stmtKnn, NULL);
  sqlite3_prepare_v2(db, "SELECT rowid FROM v", -1, &stmtScan, NULL);
  if (!stmtInsert || !stmtDelete || !stmtKnn || !stmtScan) goto cleanup;

  while (i + 2 <= size) {
    uint8_t op = data[i++] % 4;
    uint8_t rowid_byte = data[i++];
    int64_t rowid = (int64_t)(rowid_byte % 32) + 1;

    switch (op) {
    case 0: {
      /* INSERT: consume up to 4 bytes, one signed byte per element scaled
       * by 1/10; elements without a byte stay 0. */
      float vec[4] = {0.0f, 0.0f, 0.0f, 0.0f};
      for (int j = 0; j < 4 && i < size; j++, i++) {
        vec[j] = (float)((int8_t)data[i]) / 10.0f;
      }
      sqlite3_reset(stmtInsert);
      sqlite3_bind_int64(stmtInsert, 1, rowid);
      /* TRANSIENT: vec is a stack local, so SQLite must take a copy. */
      sqlite3_bind_blob(stmtInsert, 2, vec, sizeof(vec), SQLITE_TRANSIENT);
      sqlite3_step(stmtInsert);
      break;
    }
    case 1: {
      /* DELETE */
      sqlite3_reset(stmtDelete);
      sqlite3_bind_int64(stmtDelete, 1, rowid);
      sqlite3_step(stmtDelete);
      break;
    }
    case 2: {
      /* KNN query with a fixed query vector */
      sqlite3_reset(stmtKnn);
      sqlite3_bind_blob(stmtKnn, 1, qvec, sizeof(qvec), SQLITE_STATIC);
      while (sqlite3_step(stmtKnn) == SQLITE_ROW) {
      }
      break;
    }
    case 3: {
      /* Full scan */
      sqlite3_reset(stmtScan);
      while (sqlite3_step(stmtScan) == SQLITE_ROW) {
      }
      break;
    }
    }
  }

  /* Final operations — must not crash regardless of prior state */
  sqlite3_exec(db, "SELECT * FROM v", NULL, NULL, NULL);

cleanup:
  sqlite3_finalize(stmtInsert);
  sqlite3_finalize(stmtDelete);
  sqlite3_finalize(stmtKnn);
  sqlite3_finalize(stmtScan);
  sqlite3_close(db);
  return 0;
}
================================================
FILE: tests/fuzz.py
================================================
import sqlite3
# Path handed to load_extension(); the platform-specific suffix is omitted.
EXT_PATH = "dist/vec0"

db = sqlite3.connect(":memory:")

# Record the built-in functions and modules *before* loading the extension so
# that whatever the extension registers can be found by set difference.
db.execute(
    "create temp table base_functions as select name from pragma_function_list"
)
db.execute("create temp table base_modules as select name from pragma_module_list")

db.enable_load_extension(True)
db.load_extension(EXT_PATH)

# Rows are accessed by column name below; set the row factory once.
db.row_factory = sqlite3.Row

# Every (name, narg) pair that exists only because the extension was loaded.
loaded_functions = db.execute(
    "select name, narg from pragma_function_list where name not in (select name from base_functions) order by name"
).fetchall()

# Modules registered by the extension; stored in a temp table for inspection.
db.execute(
    "create temp table loaded_modules as select name from pragma_module_list where name not in (select name from base_modules) order by name"
)
def trace(sql):
    """Print the SQL text handed to the connection's trace callback."""
    print(sql)


# Echo every statement as it runs.
# NOTE(review): presumably so that a crash inside the extension can be traced
# to the last statement printed — confirm.
db.set_trace_callback(trace)
def spread_args(n):
    """Return ``n`` comma-separated ``?`` placeholders (empty when n <= 0)."""
    placeholders = ("?" for _ in range(n))
    return ",".join(placeholders)
# Probe values bound to every argument position: NULL, integer, float, empty
# and non-empty blobs and strings, and a string of NUL characters. Built once,
# since the list does not depend on the function being called.
PROBE_VALUES = [None, 1, 1.2, b"", '', "asdf", b"\xff", b"\x00", "\0\0\0\0"]

for fn in loaded_functions:
    name = fn["name"]
    narg = fn["narg"]
    # The same statement text is reused for every probe value.
    sql = f"select {name}({spread_args(narg)}); "
    for value in PROBE_VALUES:
        try:
            db.execute(sql, [value] * narg)
        except sqlite3.OperationalError:
            # SQL-level errors are the expected outcome for bad arguments;
            # the script only looks for crashes.
            pass
================================================
FILE: tests/helpers.py
================================================
import sqlite3
import struct
from collections import OrderedDict
def _f32(list):
    """Pack a sequence of numbers as consecutive ``struct`` ``f`` values."""
    fmt = f"{len(list)}f"
    return struct.pack(fmt, *list)
def _i64(list):
    """Pack a sequence of integers as consecutive ``struct`` ``q`` values."""
    fmt = f"{len(list)}q"
    return struct.pack(fmt, *list)
def _int8(list):
    """Pack a sequence of integers as consecutive ``struct`` ``b`` values."""
    fmt = f"{len(list)}b"
    return struct.pack(fmt, *list)
def exec(db, sql, parameters=()):
    """Run ``sql`` and return a snapshot-friendly description of the outcome.

    Args:
        db: sqlite3 connection whose rows expose ``keys()`` (for example with
            ``row_factory = sqlite3.Row``).
        sql: statement text to execute.
        parameters: values bound to the statement's placeholders.

    Returns:
        On success, an ``OrderedDict`` with ``"sql"`` (the statement text) and
        ``"rows"`` (one ``OrderedDict`` per row, keyed by column name).
        On ``sqlite3.DatabaseError`` (which includes ``OperationalError``), a
        dict with ``"error"`` (the exception class name) and ``"message"``.
    """
    try:
        rows = db.execute(sql, parameters).fetchall()
    except sqlite3.DatabaseError as e:
        return {
            "error": e.__class__.__name__,
            "message": str(e),
        }

    result = OrderedDict()
    result["sql"] = sql
    result["rows"] = [
        OrderedDict((key, row[key]) for key in row.keys()) for row in rows
    ]
    return result
def vec0_shadow_table_contents(db, v, skip_info=True):
    """Return the contents of every ``sqlite_master`` entry named ``<v>_*``.

    Args:
        db: sqlite3 connection.
        v: name of the virtual table whose shadow tables are dumped.
        skip_info: when true, entries whose name ends in ``_info`` are left out.

    Returns:
        A dict mapping each matching name to the result of
        ``exec(db, "select * from <name>")``, in name order.
    """
    # "_" and "%" are LIKE wildcards: escape them so that the separator only
    # matches a literal underscore (otherwise "v_%" also matches "vec_x").
    escaped = v.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
    shadow_tables = [
        row[0]
        for row in db.execute(
            "select name from sqlite_master where name like ? escape '\\' order by 1",
            [f"{escaped}\\_%"],
        ).fetchall()
    ]
    return {
        shadow_table: exec(db, f"select * from {shadow_table}")
        for shadow_table in shadow_tables
        if not (skip_info and shadow_table.endswith("_info"))
    }
================================================
FILE: tests/leak-fixtures/each.sql
================================================
.load dist/vec0
.mode box
.header on
.eqp on
.echo on
-- Leak fixture for the vec_each table function.

-- Report the SQLite and extension versions in use.
select sqlite_version(), vec_version();

-- Iterate the elements of a single JSON vector literal.
select * from vec_each('[1,2,3]');

-- Call vec_each once per row produced by json_each (two 4-element vectors).
select *
from json_each('[
[1,2,3,4],
[1,2,3,4]
]')
join vec_each(json_each.value);
================================================
FILE: tests/leak-fixtures/knn.sql
================================================
.load dist/vec0
.mode box
.header on
.eqp on
.echo on
-- Leak fixture for vec0 KNN queries.

-- Report the SQLite and extension versions in use.
select sqlite_version(), vec_version();

-- One-dimensional table with 100 rows (rowid n holds [n / 100.0]).
create virtual table v using vec0(a float[1], chunk_size=8);
insert into v
select value, format('[%f]', value / 100.0)
from generate_series(1, 100);

-- Plain KNN query.
select
rowid,
vec_to_json(a)
from v
where a match '[.3]'
and k = 2;

-- KNN query with k = 0.
select
rowid,
vec_to_json(a)
from v
where a match '[.3]'
and k = 0;

-- KNN query restricted to an explicit set of rowids.
select
rowid,
vec_to_json(a)
from v
where a match '[2.0]'
and k = 2
and rowid in (1,2,3,4,5);

-- One KNN query per row of the "queries" CTE.
with queries as (
select
rowid as query_id,
json_array(value / 100.0) as value
from generate_series(24, 39)
)
select
query_id,
rowid,
distance,
vec_to_json(a)
from queries, v
where a match queries.value
and k =5;

-- Lookup by rowid without a KNN constraint.
select *
from v
where rowid in (1,2,3,4);

drop table v;
================================================
FILE: tests/leak-fixtures/vec0-create.sql
================================================
.load dist/vec0
.mode box
.header on
.eqp on
.echo on
-- Leak fixture for the vec0 constructor.
-- NOTE(review): "y" carries no type or dimension; presumably this is an
-- intentionally invalid definition that exercises the constructor's error
-- path — confirm.
create virtual table v using vec0(y);
================================================
FILE: tests/minimum/.gitignore
================================================
dist/
================================================
FILE: tests/minimum/Makefile
================================================
# Builds and runs demo.c against the SQLite 3.31.1 amalgamation to check that
# sqlite-vec compiles and loads with that SQLite version.

.PHONY: test clean

dist/.stamp:
	mkdir -p dist
	touch $@

# Download and unpack the SQLite 3.31.1 amalgamation into dist/.
dist/sqlite-amalgamation-3310100/.stamp: dist/.stamp
	rm -rf dist/sqlite-amalgamation-3310100/ || true
	curl -q -o sqlite-amalgamation-3310100.zip https://www.sqlite.org/2020/sqlite-amalgamation-3310100.zip
	unzip -d dist/ sqlite-amalgamation-3310100.zip
	rm sqlite-amalgamation-3310100.zip
	touch $@

# -lm comes after the sources: linkers that resolve libraries in command-line
# order would otherwise drop it before any math symbol is referenced.
dist/t3310100: demo.c dist/sqlite-amalgamation-3310100/.stamp ../../sqlite-vec.c
	gcc \
		-DSQLITE_CORE -DSQLITE_ENABLE_JSON1 \
		-I dist/sqlite-amalgamation-3310100 \
		-I ../../ \
		$< dist/sqlite-amalgamation-3310100/sqlite3.c \
		../../sqlite-vec.c \
		-lm \
		-o $@

# Same build with SQLITE_THREADSAFE=0 (threading support compiled out).
dist/t3310100-threadsafe: demo.c dist/sqlite-amalgamation-3310100/.stamp ../../sqlite-vec.c
	gcc \
		-DSQLITE_CORE -DSQLITE_ENABLE_JSON1 -DSQLITE_THREADSAFE=0 \
		-I dist/sqlite-amalgamation-3310100 \
		-I ../../ \
		$< dist/sqlite-amalgamation-3310100/sqlite3.c \
		../../sqlite-vec.c \
		-lm \
		-o $@

test: dist/t3310100 dist/t3310100-threadsafe
	./dist/t3310100
	./dist/t3310100-threadsafe

clean:
	rm -rf dist/
================================================
FILE: tests/minimum/demo.c
================================================
#include "sqlite3.h"
#include "sqlite-vec.h"
#include
#include
#include
int main(int argc, char *argv[]) {
int rc = SQLITE_OK;
sqlite3 *db;
sqlite3_stmt *stmt;
rc = sqlite3_open(":memory:", &db);
assert(rc == SQLITE_OK);
rc = sqlite3_vec_init(db, NULL, NULL);
assert(rc == SQLITE_OK);
rc = sqlite3_prepare_v2(db, "SELECT sqlite_version(), vec_version(), (select json_group_array(compile_options) from pragma_compile_options)", -1, &stmt, NULL);
assert(rc == SQLITE_OK);
rc = sqlite3_step(stmt);
printf("sqlite_version=%s, vec_version=%s %s\n", sqlite3_column_text(stmt, 0), sqlite3_column_text(stmt, 1), sqlite3_column_text(stmt, 2));
sqlite3_finalize(stmt);
sqlite3_close(db);
return 0;
}
================================================
FILE: tests/pyproject.toml
================================================
[project]
name = "tests"
version = "0.1.0"
description = "Test suite for sqlite-vec"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
"pytest", "numpy", "syrupy"
]
================================================
FILE: tests/skip.test-correctness.py
================================================
import sqlite3
import json
# Recall check: run the first 10 SIFT1M queries through the vec_sift1m table
# with k = 100 and print the mean fraction of returned rowids that appear in
# the stored ground truth.
from statistics import mean

db = sqlite3.connect("test2.db")
db.enable_load_extension(True)
db.load_extension("dist/vec0")
db.enable_load_extension(False)
db.row_factory = sqlite3.Row
db.execute('attach database "sift1m-base.db" as sift1m')

#def test_sift1m():
rows = db.execute(
    '''
with q as (
select rowid, vector, k100 from sift1m.sift1m_query limit 10
),
results as (
select
q.rowid as query_rowid,
vec_sift1m.rowid as vec_rowid,
distance,
k100 as k100_groundtruth
from q
join vec_sift1m
where
vec_sift1m.vector match q.vector
and k = 100
order by distance
)
select
query_rowid,
json_group_array(vec_rowid order by distance) as topk,
k100_groundtruth,
json_group_array(vec_rowid order by distance) == k100_groundtruth
from results
group by 1;
''').fetchall()

results = []
for row in rows:
    actual = json.loads(row["topk"])
    # A set makes each membership test O(1) instead of scanning the list.
    expected = set(json.loads(row["k100_groundtruth"]))
    ncorrect = sum(x in expected for x in actual)
    # Divide by k (100), the number of neighbours requested per query.
    results.append(ncorrect / 100.0)

print(mean(results))
================================================
FILE: tests/sqlite-vec-internal.h
================================================
#ifndef SQLITE_VEC_INTERNAL_H
#define SQLITE_VEC_INTERNAL_H
#include
#include
// Declaration only; the definition lives outside this header.
// NOTE(review): judging by the parameter names this presumably picks the `k`
// smallest entries of `distances[0..n)` among the positions enabled in
// `candidates`, writes their indexes to `out` and the number written to
// `*k_used`, using `bTaken` as scratch space — confirm against the definition.
int min_idx(
    const float *distances,
    int32_t n,
    uint8_t *candidates,
    int32_t *out,
    int32_t k,
    uint8_t *bTaken,
    int32_t *k_used
);
// Scanner / tokenizer types and functions

// Kinds of token the scanner can report.
enum Vec0TokenType {
  TOKEN_TYPE_IDENTIFIER = 0,
  TOKEN_TYPE_DIGIT = 1,
  TOKEN_TYPE_LBRACKET = 2,
  TOKEN_TYPE_RBRACKET = 3,
  TOKEN_TYPE_PLUS = 4,
  TOKEN_TYPE_EQ = 5,
  TOKEN_TYPE_LPAREN = 6,
  TOKEN_TYPE_RPAREN = 7,
  TOKEN_TYPE_COMMA = 8,
};

// NOTE(review): presumably the possible return values of vec0_scanner_next()
// and vec0_token_next() — confirm against the definitions.
#define VEC0_TOKEN_RESULT_EOF 1
#define VEC0_TOKEN_RESULT_SOME 2
#define VEC0_TOKEN_RESULT_ERROR 3

// A single token: its type plus two pointers delimiting its text.
// NOTE(review): whether `end` is inclusive or exclusive is not visible here.
struct Vec0Token {
  enum Vec0TokenType token_type;
  char *start;
  char *end;
};

// Scanner state: two pointers delimiting the source text plus a cursor.
struct Vec0Scanner {
  char *start;
  char *end;
  char *ptr;
};

// Point `scanner` at `source` (of `source_length` bytes).
void vec0_scanner_init(struct Vec0Scanner *scanner, const char *source, int source_length);
// Read the next token from `scanner` into `out`.
int vec0_scanner_next(struct Vec0Scanner *scanner, struct Vec0Token *out);
// Read one token from the text delimited by `start` and `end` into `out`.
int vec0_token_next(char *start, char *end, struct Vec0Token *out);
// Vector column definition types and parser

// Element types a vector can hold. The values start at 223.
// NOTE(review): these must stay in sync with the definitions in sqlite-vec.c
// — confirm whenever either side changes.
enum VectorElementType {
  SQLITE_VEC_ELEMENT_TYPE_FLOAT32 = 223 + 0,
  SQLITE_VEC_ELEMENT_TYPE_BIT = 223 + 1,
  SQLITE_VEC_ELEMENT_TYPE_INT8 = 223 + 2,
};

// Distance metrics a vector column can be declared with.
enum Vec0DistanceMetrics {
  VEC0_DISTANCE_METRIC_L2 = 1,
  VEC0_DISTANCE_METRIC_COSINE = 2,
  VEC0_DISTANCE_METRIC_L1 = 3,
};

// Parsed form of one vector column declaration.
struct VectorColumnDefinition {
  char *name;       // column name; NOTE(review): ownership not visible here
  int name_length;  // length of `name`
  size_t dimensions;
  enum VectorElementType element_type;
  enum Vec0DistanceMetrics distance_metric;
};

// Parse a vector column definition from `source` into `outColumn`.
// NOTE(review): the meaning of the int return value is not visible here.
int vec0_parse_vector_column(const char *source, int source_length,
                             struct VectorColumnDefinition *outColumn);

// Parse a partition key definition from `source`, reporting the column name,
// its length and its type through the out-parameters.
// NOTE(review): the meaning of the int return value is not visible here.
int vec0_parse_partition_key_definition(const char *source, int source_length,
                                        char **out_column_name,
                                        int *out_column_name_length,
                                        int *out_column_type);
// Distance entry points that are only declared when SQLITE_VEC_TEST is
// defined. NOTE(review): presumably thin wrappers around the internal
// distance kernels so unit tests can call them directly — confirm.
#ifdef SQLITE_VEC_TEST
float _test_distance_l2_sqr_float(const float *a, const float *b, size_t dims);
float _test_distance_cosine_float(const float *a, const float *b, size_t dims);
float _test_distance_hamming(const unsigned char *a, const unsigned char *b, size_t dims);
#endif
#endif /* SQLITE_VEC_INTERNAL_H */
================================================
FILE: tests/test-auxiliary.py
================================================
import sqlite3
from helpers import exec, vec0_shadow_table_contents
def test_constructor_limit(db, snapshot):
    """Creating a vec0 table with 17 auxiliary columns matches the stored snapshot."""
    # NOTE(review): there is no comma between the last generated aux column
    # and `v float[1]` — confirm this is intentional.
    assert exec(
        db,
        f"""
create virtual table v using vec0(
{",".join([f"+aux{x} integer" for x in range(17)])}
v float[1]
)
""",
    ) == snapshot(name="max 16 auxiliary columns")
def test_normal(db, snapshot):
    """Create, fill and drop a table with one auxiliary column, snapshotting each stage."""
    db.execute(
        "create virtual table v using vec0(a float[1], +name text, chunk_size=8)"
    )
    # Schema objects that exist right after creation.
    assert exec(db, "select * from sqlite_master order by name") == snapshot(
        name="sqlite_master"
    )

    # Each vector is a raw 4-byte blob, i.e. one float32 element.
    db.execute("insert into v(a, name) values (?, ?)", [b"\x11\x11\x11\x11", "alex"])
    db.execute("insert into v(a, name) values (?, ?)", [b"\x22\x22\x22\x22", "brian"])
    db.execute("insert into v(a, name) values (?, ?)", [b"\x33\x33\x33\x33", "craig"])

    # Unnamed snapshots are matched by position, so their order matters.
    assert exec(db, "select * from v") == snapshot()
    assert vec0_shadow_table_contents(db, "v") == snapshot()

    # Schema objects that remain after the table is dropped.
    db.execute("drop table v;")
    assert exec(db, "select * from sqlite_master order by name") == snapshot(
        name="sqlite_master post drop"
    )
def test_types(db, snapshot):
    """Insert matching, mismatching and NULL values into typed auxiliary columns."""
    db.execute(
        """
create virtual table v using vec0(
vector float[1],
+aux_int integer,
+aux_float float,
+aux_text text,
+aux_blob blob
)
"""
    )
    # Unnamed snapshots are matched by position, so their order matters.
    assert exec(db, "select * from v") == snapshot()
    INSERT = "insert into v(vector, aux_int, aux_float, aux_text, aux_blob) values (?, ?, ?, ?, ?)"

    # A row whose values match the declared types.
    assert (
        exec(db, INSERT, [b"\x11\x11\x11\x11", 1, 1.22, "text", b"blob"]) == snapshot()
    )
    assert exec(db, "select * from v") == snapshot()

    # TODO: integrity test transaction failures in shadow tables
    db.commit()

    # bad types: one wrongly-typed value per auxiliary column, inside an
    # explicit transaction that is rolled back afterwards.
    db.execute("BEGIN")
    assert (
        exec(db, INSERT, [b"\x11\x11\x11\x11", "not int", 1.2, "text", b"blob"])
        == snapshot()
    )
    assert (
        exec(db, INSERT, [b"\x11\x11\x11\x11", 1, "not float", "text", b"blob"])
        == snapshot()
    )
    assert exec(db, INSERT, [b"\x11\x11\x11\x11", 1, 1.2, 1, b"blob"]) == snapshot()
    assert exec(db, INSERT, [b"\x11\x11\x11\x11", 1, 1.2, "text", 1]) == snapshot()
    db.execute("ROLLBACK")

    # NULLs are totally chill
    assert exec(db, INSERT, [b"\x11\x11\x11\x11", None, None, None, None]) == snapshot()
    assert exec(db, "select * from v") == snapshot()
def test_updates(db, snapshot):
    """Updating an auxiliary column is reflected in the table and its shadow tables."""
    select_all = "select rowid, * from v"
    people = [("[1]", "alex"), ("[2]", "brian"), ("[3]", "craig")]

    db.execute(
        "create virtual table v using vec0(vector float[1], +name text, chunk_size=8)"
    )
    db.executemany("insert into v(vector, name) values (?, ?)", people)

    # State before the update (unnamed snapshots are matched by position).
    rows_before = exec(db, select_all)
    assert rows_before == snapshot()
    shadow_before = vec0_shadow_table_contents(db, "v")
    assert shadow_before == snapshot()

    update_result = exec(db, "update v set name = 'ALEX' where rowid = 1")
    assert update_result == snapshot()

    # State after the update.
    rows_after = exec(db, select_all)
    assert rows_after == snapshot()
    shadow_after = vec0_shadow_table_contents(db, "v")
    assert shadow_after == snapshot()
def test_deletes(db, snapshot):
    """Deleting a row is reflected in the table and its shadow tables."""
    select_all = "select rowid, * from v"
    people = [("[1]", "alex"), ("[2]", "brian"), ("[3]", "craig")]

    db.execute(
        "create virtual table v using vec0(vector float[1], +name text, chunk_size=8)"
    )
    db.executemany("insert into v(vector, name) values (?, ?)", people)

    # State before the delete (unnamed snapshots are matched by position).
    rows_before = exec(db, select_all)
    assert rows_before == snapshot()
    shadow_before = vec0_shadow_table_contents(db, "v")
    assert shadow_before == snapshot()

    delete_result = exec(db, "delete from v where rowid = 1")
    assert delete_result == snapshot()

    # State after the delete.
    rows_after = exec(db, select_all)
    assert rows_after == snapshot()
    shadow_after = vec0_shadow_table_contents(db, "v")
    assert shadow_after == snapshot()
def test_knn(db, snapshot):
    """KNN queries with and without a constraint on an auxiliary column."""
    people = [("[1]", "alex"), ("[2]", "brian"), ("[3]", "craig")]
    knn_sql = "select *, distance from v where vector match '[5]' and k = 10"
    knn_with_aux_sql = (
        "select *, distance from v where vector match '[5]' and k = 10 and name = 'alex'"
    )

    db.execute("create virtual table v using vec0(vector float[1], +name text)")
    db.executemany("insert into v(vector, name) values (?, ?)", people)

    all_rows = exec(db, "select * from v")
    assert all_rows == snapshot()

    legal = exec(db, knn_sql)
    assert legal == snapshot(name="legal KNN w/ aux")

    # EVIDENCE-OF: V25623_09693 No aux constraint allowed on KNN queries
    illegal = exec(db, knn_with_aux_sql)
    assert illegal == snapshot(name="illegal KNN w/ aux")
================================================
FILE: tests/test-general.py
================================================
import sqlite3
import pytest
from helpers import exec
@pytest.mark.skipif(
    # Compare (major, minor) as a tuple so the gate does not depend on the
    # minor component alone.
    sqlite3.sqlite_version_info < (3, 37),
    reason="pragma_table_list was added in SQLite 3.37",
)
def test_shadow(db, snapshot):
    """Shadow tables are listed by pragma_table_list while the table exists and after it is dropped."""
    db.execute(
        "create virtual table v using vec0(a float[1], partition text partition key, metadata text, +name text, chunk_size=8)"
    )
    # Unnamed snapshots are matched by position, so their order matters.
    assert exec(db, "select * from sqlite_master order by name") == snapshot()
    assert (
        exec(db, "select * from pragma_table_list where type = 'shadow' order by name") == snapshot()
    )

    db.execute("drop table v;")
    assert (
        exec(db, "select * from pragma_table_list where type = 'shadow' order by name") == snapshot()
    )
def test_info(db, snapshot):
    """The keys of v_info and the storage type of each value match the snapshot."""
    db.execute("create virtual table v using vec0(a float[1])")
    info_rows = exec(db, "select key, typeof(value) from v_info order by 1")
    assert info_rows == snapshot()
================================================
FILE: tests/test-insert-delete.py
================================================
import sqlite3
import struct
import pytest
from helpers import _f32, _i64, _int8, exec
def test_insert_creates_chunks_and_vectors(db, snapshot):
    """Inserted vectors populate the shadow tables and read back unchanged."""
    db.execute("create virtual table v using vec0(emb float[4], chunk_size=8)")

    # rowid -> vector, inserted in rowid order starting at 1.
    vectors_by_rowid = {
        1: [1.0, 2.0, 3.0, 4.0],
        2: [5.0, 6.0, 7.0, 8.0],
        3: [0.1, 0.2, 0.3, 0.4],
        4: [10.0, 20.0, 30.0, 40.0],
        5: [0.5, 0.5, 0.5, 0.5],
    }
    for rowid, vector in vectors_by_rowid.items():
        db.execute("insert into v(rowid, emb) values (?, ?)", [rowid, _f32(vector)])

    rowids_count = exec(db, "select count(*) as cnt from v_rowids")
    assert rowids_count == snapshot(name="rowids_count")

    chunks_count = exec(db, "select count(*) as cnt from v_vector_chunks00")
    assert chunks_count == snapshot(name="vector_chunks_count")

    # Verify round-trip: each inserted vector comes back identical
    for rowid, vector in vectors_by_rowid.items():
        fetched = db.execute("select emb from v where rowid = ?", [rowid]).fetchall()
        assert len(fetched) == 1
        assert fetched[0][0] == _f32(vector)
def test_insert_auto_rowid(db):
    """Rows inserted without a rowid are numbered 1, 2, 3 in insertion order."""
    db.execute("create virtual table v using vec0(emb float[4], chunk_size=8)")

    unit_vectors = [
        [1.0, 0.0, 0.0, 0.0],
        [0.0, 1.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 0.0],
    ]
    for vector in unit_vectors:
        db.execute("insert into v(emb) values (?)", [_f32(vector)])

    assigned = [
        record[0]
        for record in db.execute("select rowid from v order by rowid").fetchall()
    ]
    assert assigned == [1, 2, 3]

    # Each vector is stored under the rowid it was assigned.
    for expected_rowid, vector in enumerate(unit_vectors, start=1):
        stored = db.execute(
            "select emb from v where rowid = ?", [expected_rowid]
        ).fetchone()
        assert stored[0] == _f32(vector)
def test_insert_text_primary_key(db, snapshot):
    """Rows keyed by a text primary key are recorded in v_rowids and retrievable by id."""
    db.execute(
        "create virtual table v using vec0(id text primary key, emb float[4], chunk_size=8)"
    )

    vec_a = _f32([1.0, 2.0, 3.0, 4.0])
    vec_b = _f32([5.0, 6.0, 7.0, 8.0])
    db.execute("insert into v(id, emb) values ('doc_a', ?)", [vec_a])
    db.execute("insert into v(id, emb) values ('doc_b', ?)", [vec_b])

    rowid_rows = exec(
        db, "select rowid, id, chunk_id, chunk_offset from v_rowids order by rowid"
    )
    assert rowid_rows == snapshot(name="rowids")

    # Look each vector up through its text id.
    fetched_a = db.execute("select emb from v where id = 'doc_a'").fetchone()
    assert fetched_a[0] == vec_a
    fetched_b = db.execute("select emb from v where id = 'doc_b'").fetchone()
    assert fetched_b[0] == vec_b
def test_delete_clears_validity(db):
    """A deleted row disappears from scans and from v_rowids; later inserts still work."""
    db.execute("create virtual table v using vec0(emb float[4], chunk_size=8)")

    unit_vectors = [
        [1.0, 0.0, 0.0, 0.0],
        [0.0, 1.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 0.0],
    ]
    for rowid, vector in enumerate(unit_vectors, start=1):
        db.execute("insert into v(rowid, emb) values (?, ?)", [rowid, _f32(vector)])

    db.execute("delete from v where rowid = 2")

    visible = [
        record[0]
        for record in db.execute("select rowid from v order by rowid").fetchall()
    ]
    assert visible == [1, 3]

    tracked = [
        record[0]
        for record in db.execute("select rowid from v_rowids order by rowid").fetchall()
    ]
    assert 2 not in tracked

    # Inserting a new vector after deletion still works
    replacement = _f32([0.0, 0.0, 0.0, 1.0])
    db.execute("insert into v(rowid, emb) values (4, ?)", [replacement])
    stored = db.execute("select emb from v where rowid = 4").fetchone()
    assert stored[0] == replacement
def test_insert_delete_reinsert(db):
    """After insert, delete and a second insert, only the second row is visible to scans and KNN."""
    db.execute("create virtual table v using vec0(emb float[4], chunk_size=8)")

    ones = _f32([1.0, 1.0, 1.0, 1.0])
    twos = _f32([2.0, 2.0, 2.0, 2.0])
    db.execute("insert into v(rowid, emb) values (1, ?)", [ones])
    db.execute("delete from v where rowid = 1")
    db.execute("insert into v(rowid, emb) values (2, ?)", [twos])

    remaining = [
        record[0]
        for record in db.execute("select rowid from v order by rowid").fetchall()
    ]
    assert remaining == [2]

    # KNN query works and returns rowid 2
    nearest = db.execute(
        "select rowid, distance from v where emb match ? and k = 1",
        [twos],
    ).fetchall()
    assert len(nearest) == 1
    assert nearest[0][0] == 2
def test_insert_validates_dimensions(db):
    """Vectors with too few or too many dimensions are rejected with a descriptive error."""
    db.execute("create virtual table v using vec0(emb float[4], chunk_size=8)")

    # (vector with the wrong number of elements, count expected in the message)
    wrong_sized = (
        ([1.0, 2.0, 3.0], "3"),
        ([1.0, 2.0, 3.0, 4.0, 5.0], "5"),
    )
    for vector, reported in wrong_sized:
        result = exec(
            db, "insert into v(rowid, emb) values (1, ?)", [_f32(vector)]
        )
        assert result["error"] == "OperationalError"
        message = result["message"]
        assert "Dimension mismatch" in message
        assert "Expected 4" in message
        assert reported in message
def test_insert_validates_type(db):
    """Inserting an int8 vector into a float column fails and names both element types."""
    db.execute("create virtual table v using vec0(emb float[4], chunk_size=8)")

    outcome = exec(
        db,
        "insert into v(rowid, emb) values (1, vec_int8(?))",
        [_int8([1, 2, 3, 4])],
    )

    assert "error" in outcome
    message = outcome["message"]
    assert "float32" in message
    assert "int8" in message
def test_info_table_contents(db, snapshot):
    """v_info holds the snapshotted settings plus the four CREATE_VERSION* keys."""
    db.execute("create virtual table v using vec0(emb float[4], chunk_size=8)")

    non_version_rows = exec(
        db,
        "select key, value from v_info where key not like 'CREATE_VERSION%' order by key",
    )
    assert non_version_rows == snapshot()

    # Smoke-check that version keys exist without pinning exact values
    version_rows = exec(
        db, "select key from v_info where key like 'CREATE_VERSION%' order by key"
    )
    version_keys = [entry["key"] for entry in version_rows["rows"]]
    assert version_keys == [
        "CREATE_VERSION",
        "CREATE_VERSION_MAJOR",
        "CREATE_VERSION_MINOR",
        "CREATE_VERSION_PATCH",
    ]
def test_delete_zeroes_rowid_blob(db):
    """Deleting a row zeroes its slot in the chunk's rowids blob and leaves its neighbours intact."""
    db.execute("create virtual table v using vec0(emb float[4], chunk_size=8)")
    for rowid in (1, 2, 3):
        db.execute(
            "insert into v(rowid, emb) values (?, ?)",
            [rowid, _f32([float(rowid)] * 4)],
        )

    db.execute("delete from v where rowid = 2")

    # The blob is decoded as eight little-endian int64 slots (chunk_size=8).
    (blob,) = db.execute("select rowids from v_chunks where rowid = 1").fetchone()[:1]
    slots = struct.unpack("<8q", blob)
    assert slots[0] == 1  # slot 0 intact
    assert slots[1] == 0  # slot 1 zeroed (was rowid 2)
    assert slots[2] == 3  # slot 2 intact
def test_delete_zeroes_vector_blob(db):
    """Deleting a row zeroes its 16-byte slot in the vector chunk and leaves the next slot intact."""
    db.execute("create virtual table v using vec0(emb float[4], chunk_size=8)")
    db.execute("insert into v(rowid, emb) values (1, ?)", [_f32([1.0, 2.0, 3.0, 4.0])])
    db.execute("insert into v(rowid, emb) values (2, ?)", [_f32([5.0, 6.0, 7.0, 8.0])])

    db.execute("delete from v where rowid = 1")

    chunk_row = db.execute(
        "select vectors from v_vector_chunks00 where rowid = 1"
    ).fetchone()
    vectors_blob = chunk_row[0]

    # First slot (4 floats = 16 bytes) should be zeroed
    deleted_slot = struct.unpack("<4f", vectors_blob[:16])
    assert deleted_slot == (0.0, 0.0, 0.0, 0.0)

    # Second slot should be unchanged
    surviving_slot = struct.unpack("<4f", vectors_blob[16:32])
    assert surviving_slot == (5.0, 6.0, 7.0, 8.0)
def test_delete_all_rows_deletes_chunk(db):
    """Deleting every row of a full chunk removes the chunk rows; inserting afterwards still works."""
    db.execute("create virtual table v using vec0(emb float[4], chunk_size=8)")

    rowids = list(range(1, 9))  # exactly one chunk's worth (chunk_size=8)
    for rowid in rowids:
        db.execute(
            "insert into v(rowid, emb) values (?, ?)",
            [rowid, _f32([float(rowid)] * 4)],
        )
    for rowid in rowids:
        db.execute("delete from v where rowid = ?", [rowid])

    chunk_count = db.execute("select count(*) from v_chunks").fetchone()[0]
    assert chunk_count == 0
    vector_chunk_count = db.execute(
        "select count(*) from v_vector_chunks00"
    ).fetchone()[0]
    assert vector_chunk_count == 0

    # Inserting after full deletion still works
    nines = _f32([9.0, 9.0, 9.0, 9.0])
    db.execute("insert into v(rowid, emb) values (100, ?)", [nines])
    stored = db.execute("select emb from v where rowid = 100").fetchone()
    assert stored[0] == nines
def test_delete_chunk_multiple_chunks(db):
    """Emptying the first of two chunks removes only that chunk; the other rows stay readable."""
    db.execute("create virtual table v using vec0(emb float[4], chunk_size=8)")

    first_chunk = range(1, 9)
    second_chunk = range(9, 17)
    for rowid in (*first_chunk, *second_chunk):
        db.execute(
            "insert into v(rowid, emb) values (?, ?)",
            [rowid, _f32([float(rowid)] * 4)],
        )

    # Delete all rows from the first chunk (rows 1-8)
    for rowid in first_chunk:
        db.execute("delete from v where rowid = ?", [rowid])

    # Only 1 chunk should remain
    remaining_chunks = db.execute("select count(*) from v_chunks").fetchone()[0]
    assert remaining_chunks == 1

    # Rows 9-16 still queryable
    for rowid in second_chunk:
        stored = db.execute("select emb from v where rowid = ?", [rowid]).fetchone()
        assert stored[0] == _f32([float(rowid)] * 4)
def test_delete_with_metadata_columns(db):
    """Deleting a whole chunk also clears every metadata chunk shadow table.

    The table declares four metadata columns (boolean, integer, float,
    text); after inserting and deleting eight rows, the chunk table, the
    vector chunk table and all four ``v_metadatachunksNN`` tables must be
    empty.
    """
    db.execute(
        "create virtual table v using vec0("
        "emb float[4], "
        "m_bool boolean, "
        "m_int integer, "
        "m_float float, "
        "m_text text, "
        "chunk_size=8"
        ")"
    )

    rowids = range(1, 9)
    for rowid in rowids:
        params = [
            rowid,
            _f32([float(rowid)] * 4),
            rowid % 2 == 0,
            rowid * 10,
            float(rowid) / 2.0,
            f"text_{rowid}",
        ]
        db.execute(
            "insert into v(rowid, emb, m_bool, m_int, m_float, m_text) "
            "values (?, ?, ?, ?, ?, ?)",
            params,
        )
    for rowid in rowids:
        db.execute("delete from v where rowid = ?", [rowid])

    # Checked in the same order as the shadow tables are listed here.
    count_queries = (
        "select count(*) from v_chunks",
        "select count(*) from v_vector_chunks00",
        "select count(*) from v_metadatachunks00",
        "select count(*) from v_metadatachunks01",
        "select count(*) from v_metadatachunks02",
        "select count(*) from v_metadatachunks03",
    )
    for query in count_queries:
        assert db.execute(query).fetchone()[0] == 0
def test_delete_with_auxiliary_columns(db):
    """Deleting all rows empties both ``v_chunks`` and ``v_auxiliary``."""
    db.execute(
        "create virtual table v using vec0("
        "emb float[4], "
        "+aux_text text, "
        "chunk_size=8"
        ")"
    )

    rowids = range(1, 9)
    for rowid in rowids:
        params = [rowid, _f32([float(rowid)] * 4), f"aux_{rowid}"]
        db.execute(
            "insert into v(rowid, emb, aux_text) values (?, ?, ?)",
            params,
        )
    for rowid in rowids:
        db.execute("delete from v where rowid = ?", [rowid])

    chunk_rows = db.execute("select count(*) from v_chunks").fetchone()[0]
    assert chunk_rows == 0
    auxiliary_rows = db.execute("select count(*) from v_auxiliary").fetchone()[0]
    assert auxiliary_rows == 0
def test_delete_with_text_primary_key(db):
    """Delete by text primary key: the vector slot is zeroed, the other row survives."""
    db.execute(
        "create virtual table v using vec0("
        "id text primary key, emb float[4], chunk_size=8"
        ")"
    )
    vector_a = _f32([1.0, 2.0, 3.0, 4.0])
    vector_b = _f32([5.0, 6.0, 7.0, 8.0])
    db.execute("insert into v(id, emb) values ('a', ?)", [vector_a])
    db.execute("insert into v(id, emb) values ('b', ?)", [vector_b])

    db.execute("delete from v where id = 'a'")

    # Vector blob slot 0 should be zeroed.
    raw = db.execute(
        "select vectors from v_vector_chunks00 where rowid = 1"
    ).fetchone()[0]
    assert struct.unpack("<4f", raw[:16]) == (0.0, 0.0, 0.0, 0.0)

    # The remaining row is still queryable.
    survivor = db.execute("select emb from v where id = 'b'").fetchone()
    assert survivor[0] == _f32([5.0, 6.0, 7.0, 8.0])
def test_delete_with_partition_keys(db):
    """Emptying one partition's chunk must not disturb another partition.

    Rowids 1-8 go into partition 'A' and 9-16 into partition 'B'. After
    deleting 1-8, one chunk is expected to remain, partition B must read
    back unchanged, and a new insert into partition 'A' must succeed.
    """
    db.execute(
        "create virtual table v using vec0("
        "part text partition key, emb float[4], chunk_size=8"
        ")"
    )

    def expected_vector(rowid):
        return _f32([float(rowid)] * 4)

    partition_a = range(1, 9)
    partition_b = range(9, 17)
    for rowid in partition_a:
        db.execute(
            "insert into v(rowid, part, emb) values (?, 'A', ?)",
            [rowid, expected_vector(rowid)],
        )
    for rowid in partition_b:
        db.execute(
            "insert into v(rowid, part, emb) values (?, 'B', ?)",
            [rowid, expected_vector(rowid)],
        )

    # Delete everything from partition A.
    for rowid in partition_a:
        db.execute("delete from v where rowid = ?", [rowid])

    # One chunk should remain (partition B's).
    remaining_chunks = db.execute("select count(*) from v_chunks").fetchone()[0]
    assert remaining_chunks == 1

    # Partition B rows are intact.
    for rowid in partition_b:
        fetched = db.execute("select emb from v where rowid = ?", [rowid]).fetchone()
        assert fetched[0] == expected_vector(rowid)

    # Re-inserting into partition A works.
    db.execute(
        "insert into v(rowid, part, emb) values (100, 'A', ?)",
        [_f32([99.0, 99.0, 99.0, 99.0])],
    )
    reinserted = db.execute("select emb from v where rowid = 100").fetchone()
    assert reinserted[0] == _f32([99.0, 99.0, 99.0, 99.0])
def test_delete_int8_vectors(db):
    """Deleting an int8[4] row zeroes its 4-byte slot and keeps the next slot."""
    db.execute("create virtual table v using vec0(emb int8[4], chunk_size=8)")
    doomed_vector = _int8([1, 2, 3, 4])
    kept_vector = _int8([5, 6, 7, 8])
    db.execute("insert into v(rowid, emb) values (1, vec_int8(?))", [doomed_vector])
    db.execute("insert into v(rowid, emb) values (2, vec_int8(?))", [kept_vector])

    db.execute("delete from v where rowid = 1")

    raw = db.execute(
        "select vectors from v_vector_chunks00 where rowid = 1"
    ).fetchone()[0]

    # int8[4] = 4 bytes per slot.
    assert struct.unpack("<4b", raw[:4]) == (0, 0, 0, 0)
    assert struct.unpack("<4b", raw[4:8]) == (5, 6, 7, 8)
def test_delete_bit_vectors(db):
    """Deleting a bit[8] row zeroes its 1-byte slot and keeps the next slot."""
    db.execute("create virtual table v using vec0(emb bit[8], chunk_size=8)")
    all_ones = bytes([0xFF])
    alternating = bytes([0xAA])
    db.execute("insert into v(rowid, emb) values (1, vec_bit(?))", [all_ones])
    db.execute("insert into v(rowid, emb) values (2, vec_bit(?))", [alternating])

    db.execute("delete from v where rowid = 1")

    raw = db.execute(
        "select vectors from v_vector_chunks00 where rowid = 1"
    ).fetchone()[0]

    # bit[8] = 1 byte per slot.
    deleted_slot, kept_slot = raw[0:1], raw[1:2]
    assert deleted_slot == bytes([0x00])
    assert kept_slot == bytes([0xAA])
def _file_db(tmp_path):
    """Return a connection to ``tmp_path/test.db`` with the vec0 extension loaded.

    A file-backed database is used because page_count only shrinks after
    VACUUM on a real file. Rows are returned as ``sqlite3.Row`` and
    extension loading is switched off again once ``dist/vec0`` is loaded.
    """
    database_file = tmp_path / "test.db"
    conn = sqlite3.connect(str(database_file))
    conn.row_factory = sqlite3.Row

    conn.enable_load_extension(True)
    conn.load_extension("dist/vec0")
    conn.enable_load_extension(False)

    return conn
def test_delete_chunk_shrinks_pages(tmp_path):
    """Deleting every chunk and running VACUUM must reduce ``page_count``.

    Large vectors (float[256]) are used so each chunk blob spans multiple
    pages, making the page_count difference measurable after VACUUM.

    The connection is closed in a ``finally`` clause so that a failing
    assertion does not leave the file-backed database handle open.
    """
    dims = 256
    db = _file_db(tmp_path)
    try:
        db.execute(
            f"create virtual table v using vec0(emb float[{dims}], chunk_size=8)"
        )
        for i in range(1, 25):  # 3 full chunks of 8
            db.execute(
                "insert into v(rowid, emb) values (?, ?)",
                [i, _f32([float(i)] * dims)],
            )
        db.commit()
        pages_before = db.execute("pragma page_count").fetchone()[0]

        # Delete all rows.
        for i in range(1, 25):
            db.execute("delete from v where rowid = ?", [i])
        db.commit()
        assert db.execute("select count(*) from v_chunks").fetchone()[0] == 0

        db.execute("vacuum")
        pages_after = db.execute("pragma page_count").fetchone()[0]
        assert pages_after < pages_before, (
            f"page_count should shrink after deleting all chunks and vacuum: "
            f"{pages_before} -> {pages_after}"
        )
    finally:
        db.close()
def test_delete_one_chunk_of_two_shrinks_pages(tmp_path):
    """Deleting one of two chunks and running VACUUM must reduce ``page_count``.

    Large vectors (float[256]) are used so each chunk blob spans multiple
    pages, making the page_count difference measurable after VACUUM. The
    rows of the surviving chunk are read back after the VACUUM.

    The connection is closed in a ``finally`` clause so that a failing
    assertion does not leave the file-backed database handle open.
    """
    dims = 256
    db = _file_db(tmp_path)
    try:
        db.execute(
            f"create virtual table v using vec0(emb float[{dims}], chunk_size=8)"
        )
        for i in range(1, 17):  # 2 full chunks of 8
            db.execute(
                "insert into v(rowid, emb) values (?, ?)",
                [i, _f32([float(i)] * dims)],
            )
        db.commit()
        pages_before = db.execute("pragma page_count").fetchone()[0]

        # Delete all rows from the first chunk (rows 1-8).
        for i in range(1, 9):
            db.execute("delete from v where rowid = ?", [i])
        db.commit()
        assert db.execute("select count(*) from v_chunks").fetchone()[0] == 1

        db.execute("vacuum")
        pages_after = db.execute("pragma page_count").fetchone()[0]
        assert pages_after < pages_before, (
            f"page_count should shrink after deleting one chunk and vacuum: "
            f"{pages_before} -> {pages_after}"
        )

        # Remaining rows are still queryable after vacuum.
        for i in range(9, 17):
            row = db.execute("select emb from v where rowid = ?", [i]).fetchone()
            assert row[0] == _f32([float(i)] * dims)
    finally:
        db.close()
================================================
FILE: tests/test-knn-distance-constraints.py
================================================
import sqlite3
from helpers import exec
def test_normal(db, snapshot):
    """Snapshot KNN results under various constraints on the ``distance`` column.

    Seventeen one-dimensional vectors ``[1]`` .. ``[17]`` are inserted, with
    ``is_odd`` computed in SQL as ``rowid % 2``. Each query result is
    compared against a snapshot; the queries run in a fixed order.
    """
    db.execute("create virtual table v using vec0(embedding float[1], is_odd boolean, chunk_size=8)")

    # Rowids 1..17, each paired with the JSON vector "[rowid]".
    rows = [[rowid, f"[{rowid}]"] for rowid in range(1, 18)]
    db.executemany(
        "insert into v(rowid, is_odd, embedding) values (?1, ?1 % 2, ?2)",
        rows,
    )

    assert exec(db,"SELECT * FROM v") == snapshot()

    BASE_KNN = "select rowid, distance from v where embedding match ? and k = ? "
    assert exec(db, BASE_KNN, ["[1]", 5]) == snapshot()

    # Extra WHERE clauses appended to the base KNN query, in snapshot order.
    distance_clauses = (
        "AND distance > 5",
        "AND distance >= 5",
        "AND distance < 3",
        "AND distance <= 3",
        "AND distance > 7 AND distance <= 10",
        "AND distance BETWEEN 7 AND 10",
        "AND is_odd == TRUE AND distance BETWEEN 7 AND 10",
    )
    for clause in distance_clauses:
        assert exec(db, BASE_KNN + clause, ["[1]", 5]) == snapshot()
class Row:
    """Plain attribute container whose repr shows its instance attributes."""

    def __init__(self):
        pass

    def __repr__(self) -> str:
        """Return the repr of the instance's attribute dictionary.

        The previous implementation called ``repr()`` with no argument,
        which raises ``TypeError`` every time the object is printed.
        """
        return repr(vars(self))
================================================
FILE: tests/test-loadable.py
================================================
# ruff: noqa: E731
import re
from typing import List
import sqlite3
import unittest
from random import random
import struct
import inspect
import pytest
import json
import numpy as np
from math import isclose
# Path of the loadable extension handed to sqlite3's load_extension().
EXT_PATH = "./dist/vec0"

# Feature gates keyed on the SQLite library version linked into Python.
# The full version tuple is compared (not just the minor component) so the
# checks stay correct regardless of the major version number.
SUPPORTS_SUBTYPE = sqlite3.sqlite_version_info >= (3, 39)
SUPPORTS_DROP_COLUMN = sqlite3.sqlite_version_info >= (3, 35)
SUPPORTS_VTAB_IN = sqlite3.sqlite_version_info >= (3, 38)
SUPPORTS_VTAB_LIMIT = sqlite3.sqlite_version_info >= (3, 41)
def bitmap_full(n: int) -> bytes:
    """Return an ``n``-bit bitmap with every bit set, as ``n / 8`` bytes of 0xFF.

    ``n`` must be a multiple of 8 (checked with an assertion). The result is
    an immutable ``bytes`` object.
    """
    assert (n % 8) == 0
    # Integer arithmetic avoids the float round-trip of int(n / 8).
    return b"\xff" * (int(n) // 8)
def bitmap_zerod(n: int) -> bytes:
    """Return an ``n``-bit bitmap with every bit clear, as ``n / 8`` zero bytes.

    ``n`` must be a multiple of 8 (checked with an assertion). The result is
    an immutable ``bytes`` object.
    """
    assert (n % 8) == 0
    # Integer arithmetic avoids the float round-trip of int(n / 8).
    return b"\x00" * (int(n) // 8)
def f32_zerod(n: int) -> bytes:
    """Return ``4 * n`` zero bytes, i.e. ``n`` float32 zeros in blob form.

    The result is an immutable ``bytes`` object; a non-positive ``n`` yields
    an empty blob.
    """
    return b"\x00\x00\x00\x00" * int(n)
# Presumably the number of bits per byte, named after C's CHAR_BIT.
CHAR_BIT = 8
def _f32(list):
    """Pack a sequence of numbers into a native-order float32 blob."""
    fmt = f"{len(list)}f"
    return struct.pack(fmt, *list)
def _i64(list):
    """Pack a sequence of integers into a native-order int64 blob."""
    fmt = f"{len(list)}q"
    return struct.pack(fmt, *list)
def _int8(list):
    """Pack a sequence of integers into a signed int8 blob."""
    fmt = f"{len(list)}b"
    return struct.pack(fmt, *list)
def bitmap(bitstring):
    """Convert a binary digit string (e.g. ``"00001111"``) into a single byte."""
    value = int(bitstring, 2)
    return bytes((value,))
def connect(ext, path=":memory:", extra_entrypoint=None):
    """Open a database, load extension ``ext`` and record what it registered.

    Before loading, the names in ``pragma_function_list`` and
    ``pragma_module_list`` are saved to the temp tables ``base_functions``
    and ``base_modules``. After loading, the names not present in those
    baselines are stored, ordered by name, in ``loaded_functions`` and
    ``loaded_modules``.

    When ``extra_entrypoint`` is truthy the extension is loaded a second
    time through SQL ``load_extension(?, ?)`` with that entry point.

    Returns the connection with ``row_factory`` set to ``sqlite3.Row``.
    """
    conn = sqlite3.connect(path)

    # Baseline snapshot, taken before the extension is loaded.
    conn.execute(
        "create temp table base_functions as select name from pragma_function_list"
    )
    conn.execute("create temp table base_modules as select name from pragma_module_list")

    conn.enable_load_extension(True)
    conn.load_extension(ext)
    if extra_entrypoint:
        conn.execute("select load_extension(?, ?)", [ext, extra_entrypoint])

    # Whatever appeared since the baseline came from the extension.
    conn.execute(
        "create temp table loaded_functions as select name from pragma_function_list where name not in (select name from base_functions) order by name"
    )
    conn.execute(
        "create temp table loaded_modules as select name from pragma_module_list where name not in (select name from base_modules) order by name"
    )

    conn.row_factory = sqlite3.Row
    return conn
# Module-level connection with the extension loaded; used by the tests below
# that do not open their own connection.
db = connect(EXT_PATH)
def explain_query_plan(sql, db=db):
    """Return the ``detail`` column of the first EXPLAIN QUERY PLAN row for ``sql``."""
    first_row = db.execute("explain query plan " + sql).fetchone()
    return first_row["detail"]
def execute_all(cursor, sql, args=None):
    """Run ``sql`` with ``args`` and return every result row as a plain dict.

    ``args`` defaults to an empty parameter list. Each fetched row is passed
    to ``dict()``, so rows must be mapping-like (e.g. ``sqlite3.Row``).
    """
    params = [] if args is None else args
    fetched = cursor.execute(sql, params).fetchall()
    return [dict(row) for row in fetched]
def spread_args(args):
    """Return one ``?`` placeholder per element of ``args``, comma-separated."""
    # Joining the characters of "???" yields "?,?,?".
    placeholders = "?" * len(args)
    return ",".join(placeholders)
# SQL function names the extension is expected to register. test_funcs
# compares this list for equality against the loaded_functions temp table,
# which connect() builds with "order by name", so keep it sorted.
FUNCTIONS = [
    "vec_add",
    "vec_bit",
    "vec_debug",
    "vec_distance_cosine",
    "vec_distance_hamming",
    "vec_distance_l1",
    "vec_distance_l2",
    "vec_f32",
    "vec_int8",
    "vec_length",
    "vec_normalize",
    "vec_quantize_binary",
    "vec_quantize_int8",
    "vec_slice",
    "vec_sub",
    "vec_to_json",
    "vec_type",
    "vec_version",
]
# Virtual table module names the extension is expected to register.
# test_modules compares this list for equality against the loaded_modules
# temp table, which connect() builds with "order by name".
MODULES = [
    "vec0",
    "vec_each",
    # NOTE(review): the two entries below are disabled; presumably these
    # modules are not registered by the default entry point. Confirm.
    #    "vec_static_blob_entries",
    #    "vec_static_blobs",
]
def register_numpy(db, name: str, array):
ptr = array.__array_interface__["data"][0]
nvectors, dimensions = array.__array_interface__["shape"]
element_type = array.__array_interface__["typestr"]
assert element_type == "\x9a\x99\x99>",
},
{
"vector": b"fff?\xcd\xccL?",
},
]
assert execute_all(db, "select rowid, (vector) from z") == [
{
"rowid": 0,
"vector": b"\xcd\xcc\xcc=\xcd\xcc\xcc=\xcd\xcc\xcc=\xcd\xcc\xcc=",
},
{
"rowid": 1,
"vector": b"\xcd\xccL>\xcd\xccL>\xcd\xccL>\xcd\xccL>",
},
{
"rowid": 2,
"vector": b"\x9a\x99\x99>\x9a\x99\x99>\x9a\x99\x99>\x9a\x99\x99>",
},
{
"rowid": 3,
"vector": b"\xcd\xcc\xcc>\xcd\xcc\xcc>\xcd\xcc\xcc>\xcd\xcc\xcc>",
},
{
"rowid": 4,
"vector": b"\x00\x00\x00?\x00\x00\x00?\x00\x00\x00?\x00\x00\x00?",
},
]
assert execute_all(
db,
"select rowid, vec_to_json(vector) as v from z where vector match ? and k = 3 order by distance;",
[np.array([0.3, 0.3, 0.3, 0.3], dtype=np.float32)],
) == [
{
"rowid": 2,
"v": "[0.300000,0.300000,0.300000,0.300000]",
},
{
"rowid": 3,
"v": "[0.400000,0.400000,0.400000,0.400000]",
},
{
"rowid": 1,
"v": "[0.200000,0.200000,0.200000,0.200000]",
},
]
assert execute_all(
db,
"select rowid, vec_to_json(vector) as v from z where vector match ? and k = 3 order by distance;",
[np.array([0.6, 0.6, 0.6, 0.6], dtype=np.float32)],
) == [
{
"rowid": 4,
"v": "[0.500000,0.500000,0.500000,0.500000]",
},
{
"rowid": 3,
"v": "[0.400000,0.400000,0.400000,0.400000]",
},
{
"rowid": 2,
"v": "[0.300000,0.300000,0.300000,0.300000]",
},
]
def test_limits():
    """Exercise vec0's limits on vector dimensions, chunk_size and KNN ``k``."""
    conn = connect(EXT_PATH)

    # (expected error message, CREATE statement that must be rejected)
    rejected_ddl = (
        (
            "vec0 constructor error: Dimension on vector column too large, provided 8193, maximum 8192",
            "create virtual table v using vec0(a float[8193])",
        ),
        (
            "vec0 constructor error: chunk_size too large",
            "create virtual table v using vec0(a float[4], chunk_size=8200)",
        ),
    )
    for message, ddl in rejected_ddl:
        with _raises(message):
            conn.execute(ddl)

    conn.execute("create virtual table v using vec0(a float[1])")
    with _raises("k value in knn query too large, provided 8193 and the limit is 4096"):
        conn.execute("select * from v where a match '[0.1]' and k = 8193")
def test_funcs():
    """The functions registered by the extension must equal ``FUNCTIONS``."""
    rows = db.execute("select name from loaded_functions").fetchall()
    registered = [row[0] for row in rows]
    assert registered == FUNCTIONS
def test_modules():
    """The modules registered by the extension must equal ``MODULES``."""
    rows = db.execute("select name from loaded_modules").fetchall()
    registered = [row[0] for row in rows]
    assert registered == MODULES
def test_vec_version():
    """``vec_version()`` returns a string whose first character is ``v``."""

    def vec_version(*args):
        return db.execute("select vec_version()", args).fetchone()[0]

    version = vec_version()
    assert version[0] == "v"
def test_vec_debug():
    """``vec_debug()`` output splits into exactly four newline-separated parts."""

    def vec_debug(*args):
        return db.execute("select vec_debug()", args).fetchone()[0]

    parts = vec_debug().split("\n")
    assert len(parts) == 4
def test_vec_bit():
    """``vec_bit()`` round-trips blobs and rejects empty or non-blob input."""

    def vec_bit(*args):
        return db.execute("select vec_bit(?)", args).fetchone()[0]

    assert vec_bit(b"\xff") == b"\xff"

    if SUPPORTS_SUBTYPE:
        subtype = db.execute("select subtype(vec_bit(X'FF'))").fetchone()[0]
        assert subtype == 224

    with pytest.raises(
        sqlite3.OperationalError, match="zero-length vectors are not supported."
    ):
        db.execute("select vec_bit(X'')").fetchone()

    # NULL, TEXT, INTEGER and REAL arguments are all rejected.
    for bad_value in [None, "text", 1, 1.999]:
        with pytest.raises(
            sqlite3.OperationalError, match="Unknown type for bitvector."
        ):
            db.execute("select vec_bit(?)", [bad_value]).fetchone()
def test_vec_f32():
    """``vec_f32()`` accepts float32 blobs and JSON text and rejects bad input."""

    def vec_f32(*args):
        return db.execute("select vec_f32(?)", args).fetchone()[0]

    zero = b"\x00\x00\x00\x00"
    assert vec_f32(b"\x00\x00\x00\x00") == zero
    assert vec_f32("[0.0000]") == zero

    # JSON input must produce the same bytes as packing the values directly.
    # fmt: off
    json_cases = [
        [0],
        [0, 0, 0, 0],
        [1, -1, 10, -10],
        [-0, 0, .0001, -.0001],
    ]
    # fmt: on
    for values in json_cases:
        assert vec_f32(json.dumps(values)) == _f32(values)

    if SUPPORTS_SUBTYPE:
        subtype = db.execute("select subtype(vec_f32(X'00000000'))").fetchone()[0]
        assert subtype == 223

    with pytest.raises(
        sqlite3.OperationalError, match="zero-length vectors are not supported."
    ):
        vec_f32(b"")

    wrong_type_message = re.escape(
        "Input must have type BLOB (compact format) or TEXT (JSON)",
    )
    for invalid in [None, 1, 1.2]:
        with pytest.raises(sqlite3.OperationalError, match=wrong_type_message):
            vec_f32(invalid)

    with pytest.raises(
        sqlite3.OperationalError,
        match="invalid float32 vector BLOB length. Must be divisible by 4, found 5",
    ):
        vec_f32(b"aaaaa")

    with pytest.raises(
        sqlite3.OperationalError,
        match=re.escape("JSON array parsing error: Input does not start with '['"),
    ):
        vec_f32("1]")

    # TODO more tests
    # TODO different error message
    for truncated_json in ("[", "[]"):
        with _raises("zero-length vectors are not supported."):
            vec_f32(truncated_json)

    # with _raises("zero-length vectors are not supported."):
    #    vec_f32("[1.2")

    # vec_f32("[]")
def test_vec_int8():
    """``vec_int8()`` accepts raw blobs and JSON arrays of integers."""

    def vec_int8(*args):
        return db.execute("select vec_int8(?)", args).fetchone()[0]

    # Blob input.
    assert vec_int8(b"\x00") == _int8([0])
    assert vec_int8(b"\x00\x0f") == _int8([0, 15])
    # JSON text input.
    assert vec_int8("[0]") == _int8([0])
    assert vec_int8("[1, 2, 3]") == _int8([1, 2, 3])

    if SUPPORTS_SUBTYPE:
        subtype = db.execute("select subtype(vec_int8(?))", [b"\x00"]).fetchone()[0]
        assert subtype == 225
def npy_cosine(a, b):
    """Return the cosine distance ``1 - cos(a, b)`` computed with NumPy."""
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return 1 - (dot_product / norm_product)
def npy_l2(a, b):
    """Return the Euclidean (L2) distance between ``a`` and ``b``."""
    difference = a - b
    return np.linalg.norm(difference)
def test_vec_distance_cosine():
    """Compare ``vec_distance_cosine()`` against the NumPy reference."""

    def vec_distance_cosine(*args, a="?", b="?"):
        return db.execute(
            f"select vec_distance_cosine({a}, {b})", args
        ).fetchone()[0]

    def check(a, b, dtype=np.float32):
        # float32 arrays bind directly; int8 blobs are wrapped in vec_int8().
        if dtype == np.float32:
            transform = "?"
        elif dtype == np.int8:
            transform = "vec_int8(?)"
        a = np.array(a, dtype=dtype)
        b = np.array(b, dtype=dtype)

        actual = vec_distance_cosine(a, b, a=transform, b=transform)
        expected = npy_cosine(a, b)
        assert isclose(actual, expected, abs_tol=1e-6)

    check([1.2, 0.1], [0.4, -0.4])
    check([-1.2, -0.1], [-0.4, 0.4])
    check([1, 2, 3], [-9, -8, -7], dtype=np.int8)

    # JSON text inputs, compared against an exact expected value.
    assert vec_distance_cosine("[1.1, 1.0]", "[1.2, 1.2]") == 0.001131898257881403
def test_vec_distance_hamming():
    """``vec_distance_hamming()`` counts differing bits and rejects non-bit vectors."""

    def vec_distance_hamming(*args):
        return db.execute(
            "select vec_distance_hamming(vec_bit(?), vec_bit(?))", args
        ).fetchone()[0]

    assert vec_distance_hamming(b"\xff", b"\x00") == 8
    assert vec_distance_hamming(b"\xff", b"\x01") == 7
    assert vec_distance_hamming(b"\xab", b"\xab") == 0

    # (expected error pattern, statement that must fail)
    rejected = (
        (
            "Cannot calculate hamming distance between two float32 vectors.",
            "select vec_distance_hamming(vec_f32('[1.0]'), vec_f32('[1.0]'))",
        ),
        (
            "Cannot calculate hamming distance between two int8 vectors.",
            "select vec_distance_hamming(vec_int8(X'FF'), vec_int8(X'FF'))",
        ),
    )
    for pattern, statement in rejected:
        with pytest.raises(sqlite3.OperationalError, match=pattern):
            db.execute(statement)
def test_vec_distance_l1():
    """Compare ``vec_distance_l1()`` against a NumPy reference, including overflow cases."""

    def vec_distance_l1(*args, a="?", b="?"):
        return db.execute(f"select vec_distance_l1({a}, {b})", args).fetchone()[0]

    def check(a, b, dtype=np.float32):
        # float32 arrays bind directly; int8 blobs are wrapped in vec_int8().
        if dtype == np.float32:
            transform = "?"
        elif dtype == np.int8:
            transform = "vec_int8(?)"
        left = np.array(a, dtype=dtype)
        right = np.array(b, dtype=dtype)

        actual = vec_distance_l1(left, right, a=transform, b=transform)
        # The reference deliberately skips dtype so int8 inputs cannot overflow.
        expected = np.sum(np.abs(np.array(a) - np.array(b)))
        assert isclose(actual, expected, abs_tol=1e-6)

    int8_cases = [
        ([1, 2, 3], [-9, -8, -7]),
        # check overflow
        ([127] * 20, [-128] * 20),
        ([-128, 127], [127, -128]),
        (
            [1, 2, 3, 4, 5, 6, 7, 8, 1] * 2,
            [1, 20, 38, 23, 29, 4, 10, 9, 3] * 2,
        ),
        ([0] * 20, [0] * 20),
        ([5, 15, -20] * 6, [5, 15, -20] * 6),
        ([100] * 20, [-100] * 20),
        ([127] * 1000000, [-128] * 1000000),
    ]
    for a, b in int8_cases:
        check(a, b, dtype=np.int8)

    f32_max = np.finfo(np.float32).max
    f32_min = np.finfo(np.float32).min
    float32_cases = [
        ([1.2, 0.1, 0.5, 0.9, 1.4, 4.5], [0.4, -0.4, 0.1, 0.1, 0.5, 0.9]),
        ([1.0, 2.0, 3.0], [-1.0, -2.0, -3.0]),
        ([1e10, 2e10, f32_max], [-1e10, -2e10, f32_min]),
        # overflow in leftover elements
        ([1e10, 2e10, 1e10, 2e10, f32_max], [-1e10, -2e10, -1e10, -2e10, f32_min]),
        # overflow in neon elements
        ([f32_max, 1e10, 2e10, 1e10, 2e10], [f32_min, -1e10, -2e10, -1e10, -2e10]),
    ]
    for a, b in float32_cases:
        check(a, b, dtype=np.float32)
def test_vec_distance_l2():
    """Compare ``vec_distance_l2()`` against the NumPy reference."""

    def vec_distance_l2(*args, a="?", b="?"):
        return db.execute(f"select vec_distance_l2({a}, {b})", args).fetchone()[0]

    def check(a, b, dtype=np.float32):
        # float32 arrays bind directly; int8 blobs are wrapped in vec_int8().
        if dtype == np.float32:
            transform = "?"
        elif dtype == np.int8:
            transform = "vec_int8(?)"
        left = np.array(a, dtype=dtype)
        right = np.array(b, dtype=dtype)

        actual = vec_distance_l2(left, right, a=transform, b=transform)
        # The reference is computed from the untyped inputs.
        expected = npy_l2(np.array(a), np.array(b))
        assert isclose(actual, expected, abs_tol=1e-6)

    check([1.2, 0.1], [0.4, -0.4])
    check([-1.2, -0.1], [-0.4, 0.4])
    check([1, 2, 3], [-9, -8, -7], dtype=np.int8)
def test_vec_length():
    """``vec_length()`` reports element counts for float32, int8 and bit vectors."""
    zero_length = "zero-length vectors are not supported."

    def check_f32():
        def vec_length(*args):
            return db.execute("select vec_length(?)", args).fetchone()[0]

        # 4 bytes per float32 element.
        assert vec_length(b"\xAA\xBB\xCC\xDD") == 1
        assert vec_length(b"\xAA\xBB\xCC\xDD\x01\x02\x03\x04") == 2
        assert vec_length(f32_zerod(1024)) == 1024

        with pytest.raises(sqlite3.OperationalError, match=zero_length):
            assert vec_length(b"") == 0
        with pytest.raises(sqlite3.OperationalError, match=zero_length):
            vec_length("[]")

    def check_int8():
        def vec_length_int8(*args):
            return db.execute(
                "select vec_length(vec_int8(?))", args
            ).fetchone()[0]

        # 1 byte per int8 element.
        assert vec_length_int8(b"\xAA") == 1
        assert vec_length_int8(b"\xAA\xBB\xCC\xDD") == 4
        assert vec_length_int8(b"\xAA\xBB\xCC\xDD\x01\x02\x03\x04") == 8

        with pytest.raises(sqlite3.OperationalError, match=zero_length):
            assert vec_length_int8(b"") == 0

    def check_bit():
        def vec_length_bit(*args):
            return db.execute(
                "select vec_length(vec_bit(?))", args
            ).fetchone()[0]

        # 8 elements per byte.
        assert vec_length_bit(b"\xAA") == 8
        assert vec_length_bit(b"\xAA\xBB\xCC\xDD") == 8 * 4
        assert vec_length_bit(b"\xAA\xBB\xCC\xDD\x01\x02\x03\x04") == 8 * 8

        with pytest.raises(sqlite3.OperationalError, match=zero_length):
            assert vec_length_bit(b"") == 0

    check_f32()
    check_int8()
    check_bit()
def test_vec_normalize():
    """``vec_normalize()`` of ``[1, 2, -1, -2]`` yields the expected four floats."""

    def vec_normalize(*args):
        return db.execute("select vec_normalize(?)", args).fetchone()[0]

    normalized_blob = vec_normalize(_f32([1, 2, -1, -2]))
    components = list(struct.unpack_from("4f", normalized_blob))
    assert components == [
        0.3162277638912201,
        0.6324555277824402,
        -0.3162277638912201,
        -0.6324555277824402,
    ]
def test_vec_slice():
    """``vec_slice()`` slices float32, int8 and bit vectors and validates its bounds."""

    def vec_slice(*args, f="?"):
        return db.execute(f"select vec_slice({f}, ?, ?)", args).fetchone()[0]

    # float32 slices.
    assert vec_slice(_f32([1.1, 2.2, 3.3]), 0, 3) == _f32([1.1, 2.2, 3.3])
    assert vec_slice(_f32([1.1, 2.2, 3.3]), 0, 2) == _f32([1.1, 2.2])
    assert vec_slice(_f32([1.1, 2.2, 3.3]), 0, 1) == _f32([1.1])

    # int8 slices.
    assert vec_slice(_int8([1, 2, 3]), 0, 3, f="vec_int8(?)") == _int8([1, 2, 3])
    assert vec_slice(_int8([1, 2, 3]), 0, 2, f="vec_int8(?)") == _int8([1, 2])
    assert vec_slice(_int8([1, 2, 3]), 0, 1, f="vec_int8(?)") == _int8([1])

    # bit slices: indices are bit positions.
    assert vec_slice(b"\xAA\xBB\xCC\xDD", 0, 8, f="vec_bit(?)") == b"\xAA"
    assert vec_slice(b"\xAA\xBB\xCC\xDD", 8, 16, f="vec_bit(?)") == b"\xBB"
    assert vec_slice(b"\xAA\xBB\xCC\xDD", 8, 24, f="vec_bit(?)") == b"\xBB\xCC"
    assert vec_slice(b"\xAA\xBB\xCC\xDD", 0, 32, f="vec_bit(?)") == b"\xAA\xBB\xCC\xDD"

    # (expected error pattern, positional arguments, SQL wrapper for the vector)
    rejected = [
        ("start index must be divisible by 8.", (b"\xAA\xBB\xCC\xDD", 2, 32), "vec_bit(?)"),
        ("end index must be divisible by 8.", (b"\xAA\xBB\xCC\xDD", 0, 31), "vec_bit(?)"),
        ("slice 'start' index must be a postive number.", (b"\xab\xab\xab\xab", -1, 1), "?"),
        ("slice 'end' index must be a postive number.", (b"\xab\xab\xab\xab", 0, -3), "?"),
        (
            "slice 'start' index is greater than the number of dimensions",
            (b"\xab\xab\xab\xab", 2, 3),
            "?",
        ),
        (
            "slice 'end' index is greater than the number of dimensions",
            (b"\xab\xab\xab\xab", 0, 2),
            "?",
        ),
        (
            "slice 'start' index is greater than 'end' index",
            (b"\xab\xab\xab\xab", 1, 0),
            "?",
        ),
    ]
    for pattern, call_args, wrapper in rejected:
        with pytest.raises(sqlite3.OperationalError, match=pattern):
            vec_slice(*call_args, f=wrapper)

    with _raises(
        "slice 'start' index is equal to the 'end' index, vectors must have non-zero length"
    ):
        vec_slice(b"\xab\xab\xab\xab", 0, 0)
def test_vec_type():
    """``vec_type()`` names the element type and rejects invalid or NULL input."""

    def vec_type(*args, a="?"):
        return db.execute(f"select vec_type({a})", args).fetchone()[0]

    # Bare JSON text and 4-byte blobs are both reported as float32.
    assert vec_type("[1]") == "float32"
    assert vec_type(b"\xaa\xbb\xcc\xdd") == "float32"
    # Explicit constructors determine the reported type.
    assert vec_type("[1]", a="vec_f32(?)") == "float32"
    assert vec_type("[1]", a="vec_int8(?)") == "int8"
    assert vec_type(b"\xaa", a="vec_bit(?)") == "bit"

    with _raises("invalid float32 vector"):
        vec_type(b"\xaa")
    with _raises("found NULL"):
        vec_type(None)
def test_vec_add():
    """``vec_add()`` adds same-typed vectors and rejects bit or mixed-type input."""

    def vec_add(*args, a="?", b="?"):
        return db.execute(f"select vec_add({a}, {b})", args).fetchone()[0]

    assert vec_add("[1]", "[2]") == _f32([3])
    assert vec_add("[.1]", "[.2]") == _f32([0.3])

    int8_sum = vec_add(_int8([1]), _int8([2]), a="vec_int8(?)", b="vec_int8(?)")
    assert int8_sum == _int8([3])

    with pytest.raises(
        sqlite3.OperationalError,
        match="Cannot add two bitvectors together.",
    ):
        vec_add(b"0xff", b"0xff", a="vec_bit(?)", b="vec_bit(?)")

    # Mixed element types fail in either argument order.
    with pytest.raises(
        sqlite3.OperationalError,
        match="Vector type mistmatch. First vector has type float32, while the second has type int8.",
    ):
        vec_add(_f32([1]), _int8([2]), b="vec_int8(?)")

    with pytest.raises(
        sqlite3.OperationalError,
        match="Vector type mistmatch. First vector has type int8, while the second has type float32.",
    ):
        vec_add(_int8([2]), _f32([1]), a="vec_int8(?)")
def test_vec_sub():
    """``vec_sub()`` subtracts same-typed vectors and rejects bit or mixed-type input."""

    def vec_sub(*args, a="?", b="?"):
        return db.execute(f"select vec_sub({a}, {b})", args).fetchone()[0]

    assert vec_sub("[1]", "[2]") == _f32([-1])
    assert vec_sub("[.1]", "[.2]") == _f32([-0.1])

    int8_difference = vec_sub(
        _int8([11]), _int8([2]), a="vec_int8(?)", b="vec_int8(?)"
    )
    assert int8_difference == _int8([9])

    with pytest.raises(
        sqlite3.OperationalError,
        match="Cannot subtract two bitvectors together.",
    ):
        vec_sub(b"0xff", b"0xff", a="vec_bit(?)", b="vec_bit(?)")

    # Mixed element types fail in either argument order.
    with pytest.raises(
        sqlite3.OperationalError,
        match="Vector type mistmatch. First vector has type float32, while the second has type int8.",
    ):
        vec_sub(_f32([1]), _int8([2]), b="vec_int8(?)")

    with pytest.raises(
        sqlite3.OperationalError,
        match="Vector type mistmatch. First vector has type int8, while the second has type float32.",
    ):
        vec_sub(_int8([2]), _f32([1]), a="vec_int8(?)")
def test_vec_to_json():
    """``vec_to_json()`` renders float32, int8 and bit vectors as JSON text."""

    def vec_to_json(*args, input="?"):
        return db.execute(f"select vec_to_json({input})", args).fetchone()[0]

    # float32: six decimal places.
    assert vec_to_json("[1, 2, 3]") == "[1.000000,2.000000,3.000000]"
    assert vec_to_json(b"\x00\x00\x00\x00\x00\x00\x80\xbf") == "[0.000000,-1.000000]"

    # int8: plain signed integers.
    assert vec_to_json(b"\x04", input="vec_int8(?)") == "[4]"
    assert vec_to_json(b"\x04\xff", input="vec_int8(?)") == "[4,-1]"

    # bit: one 0/1 entry per bit.
    assert vec_to_json(b"\xff", input="vec_bit(?)") == "[1,1,1,1,1,1,1,1]"
    assert vec_to_json(b"\x0f", input="vec_bit(?)") == "[1,1,1,1,0,0,0,0]"
@pytest.mark.skip(reason="TODO")
def test_vec_quantize_int8():
    """Placeholder for ``vec_quantize_int8()``; skipped until implemented."""

    def vec_quantize_int8(*args):
        return db.execute("select vec_quantize_int8()", args).fetchone()[0]

    quantized = vec_quantize_int8()
    assert quantized == 111
def test_vec_quantize_binary():
    """``vec_quantize_binary()`` of four negatives then four positives yields 0xF0."""

    def vec_quantize_binary(*args, input="?"):
        return db.execute(
            f"select vec_quantize_binary({input})", args
        ).fetchone()[0]

    quantized = vec_quantize_binary("[-1, -1, -1, -1, 1, 1, 1, 1]")
    assert quantized == b"\xf0"
@pytest.mark.skip(reason="TODO")
def test_vec0():
    """Placeholder for general vec0 tests; skipped and intentionally empty."""
    return None
def test_vec0_inserts():
    """Exercise vec0 INSERT handling: one happy path plus many error paths.

    The test opens its own connection and walks through, in order:

    * a successful insert into a table with float, int8 and bit columns;
    * validation errors (NULL vector, wrong vector type, wrong dimension,
      values supplied for the hidden ``distance`` / ``k`` columns, duplicate
      or non-integer primary keys);
    * failures injected with ``set_authorizer`` on the shadow tables;
    * failures caused by altering or corrupting shadow tables inside a
      transaction that is rolled back afterwards;
    * the same classes of checks on a table with a TEXT primary key.

    The statements are order-dependent: later checks rely on rows and
    transaction state left behind by earlier ones.
    """
    db = connect(EXT_PATH)
    db.execute(
        """
create virtual table t using vec0(
aaa float[128],
bbb int8[128],
ccc bit[128]
);
"""
    )

    # Happy path: one row with all three vector types round-trips unchanged.
    db.execute(
        "insert into t values (?, ?, vec_int8(?), vec_bit(?))",
        [
            1,
            np.full((128,), 0.0001, dtype="float32"),
            np.full((128,), 4, dtype="int8"),
            bitmap_full(128),
        ],
    )

    assert execute_all(db, "select * from t") == [
        {
            "rowid": 1,
            "aaa": _f32([0.0001] * 128),
            "bbb": _int8([4] * 128),
            "ccc": bitmap_full(128),
        }
    ]

    # NOTE(review): disabled UPDATE round-trip check, kept as found.
    # db.execute(
    #     "update t set aaa = ? where rowid = ?",
    #     [np.full((128,), 0.00011, dtype="float32"), 1],
    # )

    # assert execute_all(db, "select * from t") == [
    #     {
    #         "rowid": 1,
    #         "aaa": _f32([0.00011] * 128),
    #         "bbb": _int8([4] * 128),
    #         "ccc": bitmap_full(128),
    #     }
    # ]

    # Tables used by the error-path checks below.
    db.execute("create virtual table t1 using vec0(aaa float[4], chunk_size=8)")
    db.execute(
        "create virtual table txt_pk using vec0( txt_id text primary key, aaa float[4])"
    )

    # EVIDENCE-OF: V06519_23358 vec0 INSERT validates vector
    with _raises(
        'Inserted vector for the "aaa" column is invalid: Input must have type BLOB (compact format) or TEXT (JSON)'
    ):
        db.execute("insert into t1 values (1, ?)", [None])

    # EVIDENCE-OF: V08221_25059 vec0 INSERT validates vector type
    with _raises(
        'Inserted vector for the "aaa" column is expected to be of type float32, but a bit vector was provided.'
    ):
        db.execute("insert into t1 values (1, vec_bit(?))", [b"\xff\xff\xff\xff"])

    # EVIDENCE-OF: V01145_17984 vec0 INSERT validates vector dimension match
    with _raises(
        'Dimension mismatch for inserted vector for the "aaa" column. Expected 4 dimensions but received 3.'
    ):
        db.execute("insert into t1 values (1, ?)", ["[1,2,3]"])

    # EVIDENCE-OF: V24228_08298 vec0 INSERT ensure no value provided for "distance" hidden column.
    with _raises('A value was provided for the hidden "distance" column.'):
        db.execute("insert into t1(rowid, aaa, distance) values (1, '[1,2,3,4]', 1)")

    # EVIDENCE-OF: V11875_28713 vec0 INSERT ensure no value provided for "k" hidden column.
    with _raises('A value was provided for the hidden "k" column.'):
        db.execute("insert into t1(rowid, aaa, k) values (1, '[1,2,3,4]', 1)")

    # EVIDENCE-OF: V17090_01160 vec0 INSERT duplicated int primary key raises uniqueness error
    db.execute("insert into t1 values (1, '[1,1,1,1]')")
    with _raises("UNIQUE constraint failed on t1 primary key"):
        db.execute("insert into t1 values (1, '[2,2,2,2]')")

    # Simulate an error on the rowids shadow table.
    # NOTE(review): authorizer_deny_on is defined outside this view; it
    # presumably builds an authorizer callback that denies the given action
    # on the named table/column. Confirm against its definition.
    db.commit()
    db.set_authorizer(authorizer_deny_on(sqlite3.SQLITE_INSERT, "t1_rowids"))

    # EVIDENCE-OF: V04679_21517 vec0 INSERT failed on _rowid shadow insert raises error
    with _raises(
        "Internal sqlite-vec error: could not initialize 'insert rowids' statement",
        sqlite3.DatabaseError,
    ):
        db.execute("insert into t1 values (2, '[2,2,2,2]')")
    db.set_authorizer(None)
    db.rollback()
    # With the authorizer removed, the same insert succeeds.
    db.execute("insert into t1 values (2, '[2,2,2,2]')")

    # Test inserts where no rowid is provided.
    db.execute("insert into t1(aaa) values ('[3,3,3,3]')")

    # EVIDENCE-OF: V30855_14925 vec0 INSERT non-integer/text primary key value raises error
    with _raises("Only integers are allows for primary key values on t1"):
        db.execute("insert into t1 values (1.2, '[4,4,4,4]')")

    # Simulate an error on the rowids shadow table when no rowid is provided.
    # EVIDENCE-OF: V15177_32015 vec0 INSERT error on _rowids shadow insert raises error
    db.set_authorizer(authorizer_deny_on(sqlite3.SQLITE_INSERT, "t1_rowids"))
    with _raises("Error inserting id into rowids shadow table: not authorized"):
        db.execute("insert into t1(aaa) values ('[2,2,2,2]')")
    db.set_authorizer(None)

    # EVIDENCE-OF: V31559_15629 vec0 INSERT error on _chunks shadow insert raises error
    db.set_authorizer(authorizer_deny_on(sqlite3.SQLITE_READ, "t1_chunks", "chunk_id"))
    with _raises("Internal sqlite-vec error: Could not find latest chunk"):
        db.execute("insert into t1 values (999, '[2,2,2,2]')")
    db.set_authorizer(None)

    # The blocks below each damage a shadow table inside an explicit
    # transaction, check the resulting error, then roll the damage back.

    # EVIDENCE-OF: V22053_06123 vec0 INSERT error on reading validity blob
    if SUPPORTS_DROP_COLUMN:
        db.commit()
        db.execute("begin")
        db.execute("ALTER TABLE t1_chunks DROP COLUMN validity")
        with _raises(
            "Internal sqlite-vec error: could not open validity blob on main.t1_chunks.1"
        ):
            db.execute("insert into t1 values (9999, '[2,2,2,2]')")
        db.rollback()

    # EVIDENCE-OF: V29362_13432 vec0 INSERT validity blob size mismatch with chunk_size
    db.commit()
    db.execute("begin")
    db.execute("UPDATE t1_chunks SET validity = zeroblob(101)")
    with _raises(
        "Internal sqlite-vec error: validity blob size mismatch on main.t1_chunks.1, expected 1 but received 101."
    ):
        db.execute("insert into t1 values (9999, '[2,2,2,2]')")
    db.rollback()

    # EVIDENCE-OF: V16386_00456 vec0 INSERT validates vector blob column sizes
    db.commit()
    db.execute("begin")
    db.execute("UPDATE t1_vector_chunks00 SET vectors = zeroblob(101)")
    with _raises(
        "Internal sqlite-vec error: vector blob size mismatch on main.t1_vector_chunks00.1. Expected 128, actual 101"
    ):
        db.execute("insert into t1 values (9999, '[2,2,2,2]')")
    db.rollback()

    # EVIDENCE-OF: V09221_26060 vec0 INSERT rowids blob open error
    if SUPPORTS_DROP_COLUMN:
        db.commit()
        db.execute("begin")
        db.execute("ALTER TABLE t1_chunks DROP COLUMN rowids")
        with _raises(
            "Internal sqlite-vec error: could not open rowids blob on main.t1_chunks.1"
        ):
            db.execute("insert into t1 values (9999, '[2,2,2,2]')")
        db.rollback()

    # EVIDENCE-OF: V12779_29618 vec0 INSERT rowids blob validates size
    db.commit()
    db.execute("begin")
    db.execute("UPDATE t1_chunks SET rowids = zeroblob(101)")
    with _raises(
        "Internal sqlite-vec error: rowids blob size mismatch on main.t1_chunks.1. Expected 64, actual 101"
    ):
        db.execute("insert into t1 values (9999, '[2,2,2,2]')")
    db.rollback()

    # EVIDENCE-OF: V21925_05995 vec0 INSERT error on "rowids update position" raises error
    db.commit()
    db.execute("begin")
    # This insert succeeds before the authorizer is installed; the expected
    # error message below then reports chunk_offset=4 for the failing insert.
    db.execute("insert into t1 values (9998, '[2,2,2,2]')")
    db.set_authorizer(
        authorizer_deny_on(sqlite3.SQLITE_UPDATE, "t1_rowids", "chunk_id")
    )
    with _raises(
        "Internal sqlite-vec error: could not update rowids position for rowid=9999, chunk_rowid=1, chunk_offset=4"
    ):
        db.execute("insert into t1 values (9999, '[2,2,2,2]')")
    db.set_authorizer(None)
    db.rollback()

    ########## testing inserts on text primary key tables ##########

    # EVIDENCE-OF: V04200_21039 vec0 table with text primary key ensure text values
    with _raises(
        "The txt_pk virtual table was declared with a TEXT primary key, but a non-TEXT value was provided in an INSERT."
    ):
        db.execute("insert into txt_pk(txt_id, aaa) values (1, '[1,2,3,4]')")

    db.execute("insert into txt_pk(txt_id, aaa) values ('a', '[1,2,3,4]')")

    # EVIDENCE-OF: V20497_04568 vec0 table with text primary key raises uniqueness error on duplicate values
    with _raises("UNIQUE constraint failed on txt_pk primary key"):
        db.execute("insert into txt_pk(txt_id, aaa) values ('a', '[5,6,7,8]')")

    # EVIDENCE-OF: V24016_08086 vec0 table with text primary key raises error on rowid write error
    db.set_authorizer(authorizer_deny_on(sqlite3.SQLITE_INSERT, "txt_pk_rowids"))
    with _raises("Error inserting id into rowids shadow table: not authorized"):
        db.execute("insert into txt_pk(txt_id, aaa) values ('b', '[2,2,2,2]')")
    db.set_authorizer(None)
    # With the authorizer removed, the same insert succeeds.
    db.execute("insert into txt_pk(txt_id, aaa) values ('b', '[2,2,2,2]')")
def test_vec0_insert_errors2():
    """Fill one 8-slot chunk, then check that a denied chunk insert is reported.

    Six rows are inserted and the single ``t1_chunks`` row is inspected, then
    two more rows fill the chunk. Finally an authorizer denies INSERTs on
    ``t1_chunks`` and one more insert is expected to fail.
    """
    db = connect(EXT_PATH)
    db.execute("create virtual table t1 using vec0(aaa float[4], chunk_size=8)")

    def only_chunk(rowids, validity_bits):
        # The rowids blob is compared as consecutive 8-byte little-endian ints.
        packed = b"".join(rowid.to_bytes(8, "little") for rowid in rowids)
        return [
            {
                "chunk_id": 1,
                "rowids": packed,
                "size": 8,
                "validity": bytes([validity_bits]),
            }
        ]

    db.execute(
        """
insert into t1(aaa) values
('[1,1,1,1]'),
('[2,1,1,1]'),
('[3,1,1,1]'),
('[4,1,1,1]'),
('[5,1,1,1]'),
('[6,1,1,1]')
"""
    )
    after_six = execute_all(db, "select * from t1_chunks")
    assert after_six == only_chunk([1, 2, 3, 4, 5, 6, 0, 0], 0b00111111)

    db.execute(
        """
insert into t1(aaa) values
('[7,1,1,1]'),
('[8,1,1,1]')
"""
    )
    after_eight = execute_all(db, "select * from t1_chunks")
    assert after_eight == only_chunk([1, 2, 3, 4, 5, 6, 7, 8], 0b11111111)

    # EVIDENCE-OF: V08441_25279 vec0 INSERT error on new chunk creation raises error
    db.set_authorizer(authorizer_deny_on(sqlite3.SQLITE_INSERT, "t1_chunks"))
    with _raises("Internal sqlite-vec error: Could not insert a new vector chunk"):
        db.execute("insert into t1(aaa) values ('[9,1,1,1]')")
    db.set_authorizer(None)
def test_vec0_drops():
    """Dropping a vec0 table removes every ``t1*`` entry from ``sqlite_master``."""
    db = connect(EXT_PATH)
    db.execute(
        "create virtual table t1 using vec0(aaa float[4], bbb float[4], chunk_size=8)"
    )

    def t1_object_names():
        found = execute_all(
            db, "select name from sqlite_master where name like 't1%' order by 1"
        )
        return [entry["name"] for entry in found]

    # The virtual table plus its shadow tables, one vector table per column.
    assert t1_object_names() == [
        "t1",
        "t1_chunks",
        "t1_info",
        "t1_rowids",
        "t1_vector_chunks00",
        "t1_vector_chunks01",
    ]

    db.execute("drop table t1")
    assert t1_object_names() == []
def test_vec0_delete():
    """Delete one row and check the ``t1_rowids`` shadow table afterwards.

    The chunk and vector shadow tables are verified before the delete only;
    their post-delete expectations are still a TODO (see the end of the test).
    """
    db = connect(EXT_PATH)
    db.execute("create virtual table t1 using vec0(aaa float[4], chunk_size=8)")
    db.execute(
        """
insert into t1(aaa) values
('[1,1,1,1]'),
('[2,1,1,1]'),
('[3,1,1,1]'),
('[4,1,1,1]'),
('[5,1,1,1]'),
('[6,1,1,1]')
"""
    )

    def rowids_entry(offset):
        # In this test the row with rowid N is expected at chunk offset N - 1.
        return {
            "chunk_id": 1,
            "chunk_offset": offset,
            "id": None,
            "rowid": offset + 1,
        }

    assert execute_all(db, "select * from t1_rowids") == [
        rowids_entry(offset) for offset in range(6)
    ]

    expected_chunk = {
        "chunk_id": 1,
        "rowids": _i64([1, 2, 3, 4, 5, 6, 0, 0]),
        "size": 8,
        "validity": bitmap("00111111"),
    }
    assert execute_all(db, "select * from t1_chunks") == [expected_chunk]

    # Six stored vectors followed by two zeroed, unused slots.
    stored = _f32([1, 1, 1, 1])
    for slot in (
        [2, 1, 1, 1],
        [3, 1, 1, 1],
        [4, 1, 1, 1],
        [5, 1, 1, 1],
        [6, 1, 1, 1],
        [0, 0, 0, 0],
        [0, 0, 0, 0],
    ):
        stored = stored + _f32(slot)
    assert execute_all(db, "select * from t1_vector_chunks00") == [
        {"rowid": 1, "vectors": stored}
    ]

    db.execute("DELETE FROM t1 WHERE rowid = 1")

    # Only the entry for rowid 1 (offset 0) disappears.
    assert execute_all(db, "select * from t1_rowids") == [
        rowids_entry(offset) for offset in range(1, 6)
    ]

    # TODO finish delete support. Once done, also assert that:
    #   t1_chunks          -> rowids   == _i64([0,2,3,4,5,6,0,0]),
    #                         validity == bitmap("00111110"), size == 8
    #   t1_vector_chunks00 -> vectors  == _f32([0,0,0,0]) + _f32([2,1,1,1])
    #                         + _f32([3,1,1,1]) + _f32([4,1,1,1]) + _f32([5,1,1,1])
    #                         + _f32([6,1,1,1]) + _f32([0,0,0,0]) + _f32([0,0,0,0])
    # TODO test with text primary keys
def test_vec0_delete_errors():
    """Tamper with the ``t1_chunks`` shadow table and check DELETE error messages.

    Each scenario commits, opens a transaction, damages the shadow table,
    expects ``delete from t1 where rowid = 1`` to fail with a specific message,
    and then rolls the damage back.
    """
    db = connect(EXT_PATH)
    db.execute("create virtual table t1 using vec0(aaa float[4], chunk_size=8)")
    db.execute(
        """
insert into t1(aaa) values
('[1,1,1,1]'),
('[2,1,1,1]'),
('[3,1,1,1]'),
('[4,1,1,1]'),
('[5,1,1,1]'),
('[6,1,1,1]')
"""
    )

    def expect_delete_failure(tamper_sql, message):
        db.commit()
        db.execute("begin")
        db.execute(tamper_sql)
        with _raises(message):
            db.execute("delete from t1 where rowid = 1")
        db.rollback()

    # EVIDENCE-OF: V26002_10073 vec0 DELETE error on reading validity blob
    if SUPPORTS_DROP_COLUMN:
        expect_delete_failure(
            "ALTER TABLE t1_chunks DROP COLUMN validity",
            "could not open validity blob for main.t1_chunks.1",
        )

    # EVIDENCE-OF: V21193_05263 vec0 DELETE verifies that the validity bit is 1 before clearing
    expect_delete_failure(
        "UPDATE t1_chunks SET validity = zeroblob(1)",
        "vec0 deletion error: validity bit is not set for main.t1_chunks.1 at 0",
    )

    # EVIDENCE-OF: V21193_05263 vec0 DELETE raises error on validity blob error
    expect_delete_failure(
        "UPDATE t1_chunks SET validity = zeroblob(0)",
        "could not read validity blob for main.t1_chunks.1 at 0",
    )

    # The two scenarios below are disabled placeholders: the expected error
    # messages ("XXX") have not been decided yet.
    if False:  # TODO
        with _raises("XXX"):
            db.execute("DELETE FROM t1 WHERE rowid = 999")

    if False:  # TODO
        db.commit()
        db.execute("begin")
        db.execute("DELETE FROM t1_rowids WHERE rowid = 1")
        with _raises("XXX"):
            db.execute("DELETE FROM t1 where rowid = 1")
        db.rollback()
def test_vec0_updates():
    """UPDATE a float vector column and check validation of bad updates.

    Also verifies that a multi-column UPDATE whose second column fails
    validation leaves the row untouched.
    """
    db = connect(EXT_PATH)
    db.execute(
        """
create virtual table t3 using vec0(
aaa float[8],
bbb int8[8],
ccc bit[8]
);
"""
    )
    db.execute(
        """
INSERT INTO t3 VALUES
(1, :x, vec_quantize_int8(:x, 'unit') ,vec_quantize_binary(:x)),
(2, :y, vec_quantize_int8(:y, 'unit') ,vec_quantize_binary(:y)),
(3, :z, vec_quantize_int8(:z, 'unit') ,vec_quantize_binary(:z));
""",
        {
            "x": "[.1, .1, .1, .1, -.1, -.1, -.1, -.1]",
            "y": "[-.2, .2, .2, .2, .2, .2, -.2, .2]",
            "z": "[.3, .3, .3, .3, .3, .3, .3, .3]",
        },
    )

    expected_t3 = [
        {
            "rowid": 1,
            "aaa": _f32([0.1, 0.1, 0.1, 0.1, -0.1, -0.1, -0.1, -0.1]),
            "bbb": _int8([12, 12, 12, 12, -13, -13, -13, -13]),
            "ccc": bitmap("00001111"),
        },
        {
            "rowid": 2,
            "aaa": _f32([-0.2, 0.2, 0.2, 0.2, 0.2, 0.2, -0.2, 0.2]),
            "bbb": _int8([-26, 24, 24, 24, 24, 24, -26, 24]),
            "ccc": bitmap("10111110"),
        },
        {
            "rowid": 3,
            "aaa": _f32([0.3] * 8),
            "bbb": _int8([37] * 8),
            "ccc": bitmap("11111111"),
        },
    ]
    assert execute_all(db, "select * from t3") == expected_t3

    # Only the float column of row 1 changes; the other columns keep their
    # originally inserted values.
    db.execute("UPDATE t3 SET aaa = ? WHERE rowid = 1", ["[.9,.9,.9,.9,.9,.9,.9,.9]"])
    expected_t3[0]["aaa"] = _f32([0.9] * 8)
    assert execute_all(db, "select * from t3") == expected_t3

    # EVIDENCE-OF: V15203_32042 vec0 UPDATE validates vector
    with _raises(
        'Updated vector for the "aaa" column is invalid: invalid float32 vector BLOB length. Must be divisible by 4, found 1'
    ):
        db.execute("UPDATE t3 SET aaa = X'AB' WHERE rowid = 1")

    # EVIDENCE-OF: V25739_09810 vec0 UPDATE validates dimension length
    with _raises(
        'Dimension mismatch for new updated vector for the "aaa" column. Expected 8 dimensions but received 1.'
    ):
        db.execute("UPDATE t3 SET aaa = vec_bit(X'AABBCCDD') WHERE rowid = 1")

    # EVIDENCE-OF: V03643_20481 vec0 UPDATE validates vector column type
    with _raises(
        'Updated vector for the "bbb" column is expected to be of type int8, but a float32 vector was provided.'
    ):
        db.execute("UPDATE t3 SET bbb = X'ABABABAB' WHERE rowid = 1")

    db.execute("CREATE VIRTUAL TABLE t2 USING vec0(a float[2], b float[2])")
    db.execute("INSERT INTO t2(rowid, a, b) VALUES (1, '[.1, .1]', '[.2, .2]')")
    expected_t2 = [
        {
            "rowid": 1,
            "a": _f32([0.1, 0.1]),
            "b": _f32([0.2, 0.2]),
        }
    ]
    assert execute_all(db, "select * from t2") == expected_t2

    # Sanity check: the new value for column "a" is valid, but the one for "b"
    # is rejected, so the whole row (including "a") must remain unchanged.
    with _raises(
        'Dimension mismatch for new updated vector for the "b" column. Expected 2 dimensions but received 3.'
    ):
        db.execute(
            "UPDATE t2 SET a = '[.11, .11]', b = '[.22, .22, .22]' WHERE rowid = 1"
        )
    assert execute_all(db, "select * from t2") == expected_t2

    # TODO: set UPDATEs on int8/bit columns, e.g.
    #   db.execute("UPDATE t3 SET ccc = vec_bit(?) WHERE rowid = 3", [bitmap('01010101')])
    # and then re-check the full contents of t3.
def test_vec0_point():
    """Point lookups by rowid and by text primary key, for hits and misses."""
    db = connect(EXT_PATH)
    # Raw bytes of the X'AABBCCDD' / X'00112233' literals inserted below.
    vec_a = bytes.fromhex("aabbccdd")
    vec_b = bytes.fromhex("00112233")

    # Table keyed by rowid.
    db.execute("CREATE VIRTUAL TABLE t USING vec0(a float[1], b float[1])")
    db.execute(
        "INSERT INTO t VALUES (1, X'AABBCCDD', X'00112233'), (2, X'AABBCCDD', X'99887766');"
    )
    by_rowid = execute_all(db, "select * from t where rowid = 1")
    assert by_rowid == [{"a": vec_a, "b": vec_b, "rowid": 1}]
    missing_rowid = execute_all(db, "select * from t where rowid = 999")
    assert missing_rowid == []

    # Table keyed by a text primary key.
    db.execute(
        "CREATE VIRTUAL TABLE t2 USING vec0(id text primary key, a float[1], b float[1])"
    )
    db.execute(
        "INSERT INTO t2 VALUES ('A', X'AABBCCDD', X'00112233'), ('B', X'AABBCCDD', X'99887766');"
    )
    by_id = execute_all(db, "select * from t2 where id = 'A'")
    assert by_id == [{"a": vec_a, "b": vec_b, "id": "A"}]
    missing_id = execute_all(db, "select * from t2 where id = 'xxx'")
    assert missing_id == []
def test_vec0_text_pk():
    """Exercise a vec0 table declared with a TEXT primary key.

    Covers: rejecting non-TEXT keys on INSERT, full scans, a full-scan error
    path forced through an authorizer, KNN queries (optionally restricted with
    ``t_id in (...)``), deletes and updates addressed by key, and the ban on
    updating the key itself.
    """
    db = connect(EXT_PATH)
    db.execute(
        """
create virtual table t using vec0(
t_id text primary key,
aaa float[1],
bbb float8[1],
chunk_size=8
);
"""
    )
    assert execute_all(db, "select * from t") == []

    with _raises(
        "The t virtual table was declared with a TEXT primary key, but a non-TEXT value was provided in an INSERT."
    ):
        db.execute("INSERT INTO t VALUES (1, X'AABBCCDD', X'AABBCCDD')")

    db.executemany(
        "INSERT INTO t VALUES (:t_id, :aaa, :bbb)",
        [
            {"t_id": "t_1", "aaa": "[.1]", "bbb": "[-.1]"},
            {"t_id": "t_2", "aaa": "[.2]", "bbb": "[-.2]"},
            {"t_id": "t_3", "aaa": "[.3]", "bbb": "[-.3]"},
        ],
    )
    assert execute_all(db, "select t_id from t") == [
        {"t_id": "t_1"},
        {"t_id": "t_2"},
        {"t_id": "t_3"},
    ]
    assert execute_all(db, "select * from t") == [
        {"t_id": "t_1", "aaa": _f32([0.1]), "bbb": _f32([-0.1])},
        {"t_id": "t_2", "aaa": _f32([0.2]), "bbb": _f32([-0.2])},
        {"t_id": "t_3", "aaa": _f32([0.3]), "bbb": _f32([-0.3])},
    ]

    # EVIDENCE-OF: V09901_26739 vec0 full scan catches _rowid prep error
    db.set_authorizer(authorizer_deny_on(sqlite3.SQLITE_READ, "t_rowids", "rowid"))
    with _raises(
        "Error preparing rowid scan: access to t_rowids.rowid is prohibited",
        sqlite3.DatabaseError,
    ):
        db.execute("select * from t")
    db.set_authorizer(None)

    assert execute_all(
        db, "select t_id, distance from t where aaa match ? and k = 3", ["[.01]"]
    ) == [
        {"t_id": "t_1", "distance": 0.09000000357627869},
        {"t_id": "t_2", "distance": 0.1899999976158142},
        {"t_id": "t_3", "distance": 0.2900000214576721},
    ]

    if SUPPORTS_VTAB_IN:
        # Raw string: the pattern contains a regex escape (``\[``), which is an
        # invalid *string* escape and triggers a warning in a plain literal.
        assert re.match(
            r"SCAN (TABLE )?t VIRTUAL TABLE INDEX 0:3{___}___\[___",
            explain_query_plan(
                "select t_id, distance from t where aaa match '' and k = 3 and t_id in ('t_2', 't_3')",
                db=db,
            ),
        )
        assert execute_all(
            db,
            "select t_id, distance from t where aaa match ? and k = 3 and t_id in ('t_2', 't_3')",
            ["[.01]"],
        ) == [
            {"t_id": "t_2", "distance": 0.1899999976158142},
            {"t_id": "t_3", "distance": 0.2900000214576721},
        ]

    # test deletes on text primary keys
    db.execute("delete from t where t_id = 't_1'")
    assert execute_all(db, "select * from t") == [
        {"t_id": "t_2", "aaa": _f32([0.2]), "bbb": _f32([-0.2])},
        {"t_id": "t_3", "aaa": _f32([0.3]), "bbb": _f32([-0.3])},
    ]

    # test updates on text primary keys
    db.execute("update t set aaa = '[999]' where t_id = 't_2'")
    assert execute_all(db, "select * from t") == [
        {"t_id": "t_2", "aaa": _f32([999]), "bbb": _f32([-0.2])},
        {"t_id": "t_3", "aaa": _f32([0.3]), "bbb": _f32([-0.3])},
    ]

    # EVIDENCE-OF: V08886_25725 vec0 primary keys don't allow updates on PKs
    with _raises("UPDATEs on vec0 primary key values are not allowed."):
        db.execute("update t set t_id = 'xxx' where t_id = 't_2'")
def test_vec0_best_index():
    """Queries with unsupported constraint combinations are rejected with clear errors."""
    db = connect(EXT_PATH)
    db.execute(
        """
create virtual table t using vec0(
aaa float[1],
bbb float8[1]
);
"""
    )

    def rejected(message, sql):
        # The statement must fail with exactly this (escaped) message.
        with _raises(message):
            db.execute(sql)

    rejected(
        "only 1 MATCH operator is allowed in a single vec0 query",
        "select * from t where aaa match NULL and bbb match NULL",
    )

    if SUPPORTS_VTAB_IN:
        rejected(
            "only 1 'rowid in (..)' operator is allowed in a single vec0 query",
            "select * from t where rowid in(4,5,6) and rowid in (1, 2,3)",
        )

    rejected(
        "A LIMIT or 'k = ?' constraint is required on vec0 knn queries.",
        "select * from t where aaa MATCH ?",
    )

    if SUPPORTS_VTAB_LIMIT:
        rejected(
            "Only LIMIT or 'k =?' can be provided, not both",
            "select * from t where aaa MATCH ? and k = 10 limit 20",
        )

    rejected(
        "Only a single 'ORDER BY distance' clause is allowed on vec0 KNN queries",
        "select * from t where aaa MATCH NULL and k = 10 order by distance, distance",
    )
    rejected(
        "Only ascending in ORDER BY distance clause is supported, DESC is not supported yet.",
        "select * from t where aaa MATCH NULL and k = 10 order by distance desc",
    )
def authorizer_deny_on(operation, x1, x2=None):
    """Build an authorizer callback that denies exactly one kind of action.

    The returned callback answers ``sqlite3.SQLITE_DENY`` when the action code
    equals ``operation`` and its first two arguments equal ``x1`` and ``x2``;
    every other action gets ``sqlite3.SQLITE_OK``.
    """
    denied = (operation, x1, x2)

    def _auth(op, p1, p2, p3, p4):
        if (op, p1, p2) == denied:
            return sqlite3.SQLITE_DENY
        return sqlite3.SQLITE_OK

    return _auth
def authorizer_debug(op, p1, p2, p3, p4):
    """Authorizer callback for debugging: print every request and allow it."""
    request = (op, p1, p2, p3, p4)
    print(*request)
    return sqlite3.SQLITE_OK
from contextlib import contextmanager
@contextmanager
def _raises(message, error=sqlite3.OperationalError):
    """Expect the wrapped code to raise ``error`` whose text contains ``message``.

    ``message`` is treated as literal text: it is regex-escaped before being
    handed to ``pytest.raises(match=...)``.
    """
    literal_pattern = re.escape(message)
    with pytest.raises(error, match=literal_pattern):
        yield
def test_vec_each():
    """``vec_each`` yields one row per element and rejects a NULL input."""

    def vec_each_f32(*args):
        return execute_all(db, "select rowid, * from vec_each(vec_f32(?))", args)

    elements = [1.0, 2.0, 3.0]
    assert vec_each_f32(_f32(elements)) == [
        {"rowid": position, "value": value}
        for position, value in enumerate(elements)
    ]

    with _raises("Input must have type BLOB (compact format) or TEXT (JSON), found NULL"):
        vec_each_f32(None)
import io
def to_npy(arr):
    """Serialize ``arr`` with ``np.save`` and return the resulting bytes."""
    sink = io.BytesIO()
    np.save(sink, arr)
    # getvalue() returns the whole buffer regardless of the stream position.
    return sink.getvalue()
def test_vec_npy_each():
    """``vec_npy_each`` returns one row per vector for 1-D, 2-D and empty arrays."""
    db = connect(EXT_PATH, extra_entrypoint="sqlite3_vec_numpy_init")

    def vec_npy_each(*args):
        return execute_all(db, "select rowid, * from vec_npy_each(?)", args)

    def npy_f32(values):
        return to_npy(np.array(values, dtype=np.float32))

    first = [1.1, 2.2, 3.3]
    second = [9.9, 8.8, 7.7]

    # A 1-D array and a single-row 2-D array both produce exactly one vector.
    assert vec_npy_each(npy_f32(first)) == [
        {"rowid": 0, "vector": _f32(first)},
    ]
    assert vec_npy_each(npy_f32([first])) == [
        {"rowid": 0, "vector": _f32(first)},
    ]

    assert vec_npy_each(npy_f32([first, second])) == [
        {"rowid": 0, "vector": _f32(first)},
        {"rowid": 1, "vector": _f32(second)},
    ]

    assert vec_npy_each(npy_f32([])) == []
def test_vec_npy_each_errors():
db = connect(EXT_PATH, extra_entrypoint="sqlite3_vec_numpy_init")
vec_npy_each = lambda *args: execute_all(
db, "select rowid, * from vec_npy_each(?)", args
)
full = b"\x93NUMPY\x01\x00v\x00{'descr': ' npt.NDArray[np.float32]:
return np.sqrt(np.sum((mat - vec) ** 2, axis=1))
def np_topk(
    vec: npt.NDArray[np.float32],
    mat: npt.NDArray[np.float32],
    k: int = 5,
) -> tuple[npt.NDArray[np.int32], npt.NDArray[np.float32]]:
    """Return the ``k`` smallest L2 distances between ``vec`` and ``mat``.

    Distances come from ``np_distance_l2(vec, mat)``.

    Returns:
        ``(indices, distances)``: indices into the distance array ordered by
        ascending distance, and the distances at those indices.  When ``k`` is
        greater than or equal to the number of distances, all of them are
        returned (sorted) instead of raising.
    """
    distances = np_distance_l2(vec, mat)
    if k >= distances.shape[0]:
        # np.argpartition requires kth < len(distances); asking for at least
        # as many results as exist is simply a full sort.
        top_indices = np.argsort(distances)
    else:
        # Rather than sorting all distances and taking the top K, it's faster
        # to argpartition and then just sort the top K.
        # The difference is O(N logN) vs O(N + k logk)
        indices = np.argpartition(distances, kth=k)[:k]
        top_indices = indices[np.argsort(distances[indices])]
    return top_indices, distances[top_indices]
# import faiss
@pytest.mark.skip(reason="TODO")
def test_correctness_npy():
    """Cross-check vec0 KNN results against scalar SQL functions and faiss.

    Currently skipped.  ``faiss`` is not imported at module level, so it is
    imported here via ``pytest.importorskip``: if the test is re-enabled on a
    machine without faiss it is skipped instead of failing with a NameError.
    """
    faiss = pytest.importorskip("faiss")

    db = connect(EXT_PATH)
    np.random.seed(420 + 1 + 2)
    mat = np.random.uniform(low=-1.0, high=1.0, size=(10000, 24)).astype(np.float32)
    queries = np.random.uniform(low=-1.0, high=1.0, size=(1000, 24)).astype(np.float32)

    # sqlite-vec with vec0
    db.execute("create virtual table v using vec0(a float[24], chunk_size=8)")
    for v in mat:
        db.execute("insert into v(a) values (?)", [v])

    # sqlite-vec with scalar functions
    db.execute("create table t(a float[24])")
    for v in mat:
        db.execute("insert into t(a) values (?)", [v])

    faiss_index = faiss.IndexFlatL2(24)
    faiss_index.add(mat)

    k = 10000 - 1
    for idx, q in enumerate(queries):
        print(idx)

        result = execute_all(
            db,
            "select rowid - 1 as idx, distance from v where a match ? and k = ?",
            [q, k],
        )
        vec_vtab_rowids = [row["idx"] for row in result]
        vec_vtab_distances = [row["distance"] for row in result]

        result = execute_all(
            db,
            "select rowid - 1 as idx, vec_distance_l2(a, ?) as distance from t order by 2 limit ?",
            [q, k],
        )
        vec_scalar_rowids = [row["idx"] for row in result]
        vec_scalar_distances = [row["distance"] for row in result]
        assert vec_scalar_rowids == vec_vtab_rowids
        assert vec_scalar_distances == vec_vtab_distances

        faiss_distances, faiss_rowids = faiss_index.search(np.array([q]), k)
        # The result is square-rooted so it can be compared with the L2
        # distances produced by sqlite-vec.
        faiss_distances = np.sqrt(faiss_distances)
        assert faiss_rowids[0].tolist() == vec_scalar_rowids
        assert faiss_distances[0].tolist() == vec_scalar_distances
        assert faiss_distances[0].tolist() == vec_vtab_distances
        assert faiss_rowids[0].tolist() == vec_vtab_rowids

        # np_topk takes the query vector first, then the matrix.
        np_rowids, np_distances = np_topk(q, mat, k=k)
        # TODO: enable once the numpy reference agrees exactly:
        # assert vec_vtab_rowids == np_rowids.tolist()
        # assert vec_vtab_distances == np_distances.tolist()
def test_smoke():
    """End-to-end smoke test: shadow tables, query plans and raw chunk layout.

    Runs against the module-level ``db`` connection.  The size expectations
    below correspond to 1024 slots per chunk and one ``float[2]`` column.
    """
    slots = 1024
    validity_len = int(slots / 8)  # one validity bit per slot
    rowids_len = int(slots * 8)  # one 8-byte rowid per slot
    vectors_len = int(slots * 4 * 2)  # two 4-byte floats per slot

    def zero_padded(prefix, total):
        # `prefix` followed by zero bytes up to `total` bytes overall.
        return prefix + bytearray(total - len(prefix))

    def first_row(sql):
        return db.execute(sql).fetchone()

    db.execute("drop table if exists vec_xyz")
    db.execute("create virtual table vec_xyz using vec0( a float[2] )")
    shadow_names = [
        "vec_xyz",
        "vec_xyz_chunks",
        "vec_xyz_info",
        "vec_xyz_rowids",
        "vec_xyz_vector_chunks00",
    ]
    assert execute_all(
        db,
        "select name from sqlite_master where name like 'vec_xyz%' order by name;",
    ) == [{"name": name} for name in shadow_names]

    # as of TODO, no initial row is inside the chunks table
    assert first_row("select * from vec_xyz_chunks") is None

    knn_plan = "SCAN (TABLE )?vec_xyz VIRTUAL TABLE INDEX 0:3{___}___"
    assert re.match(
        knn_plan,
        explain_query_plan(
            "select * from vec_xyz where a match X'' and k = 10 order by distance"
        ),
    )
    if SUPPORTS_VTAB_LIMIT:
        assert re.match(
            knn_plan,
            explain_query_plan(
                "select * from vec_xyz where a match X'' order by distance limit 10"
            ),
        )
    assert re.match(
        "SCAN (TABLE )?vec_xyz VIRTUAL TABLE INDEX 0:1",
        explain_query_plan("select * from vec_xyz"),
    )
    assert re.match(
        "SCAN (TABLE )?vec_xyz VIRTUAL TABLE INDEX 3:2",
        explain_query_plan("select * from vec_xyz where rowid = 4"),
    )

    # (rowid, hex payload of the float[2] vector) inserted one at a time; after
    # each insert the single chunk must contain exactly the rows so far.
    inserts = [
        (1, "000000000000803f"),
        (2, "0000000000000040"),
        (3, "00000000000080bf"),
    ]
    rowids_so_far = b""
    vectors_so_far = b""
    for count, (rowid, payload) in enumerate(inserts, start=1):
        db.execute(f"insert into vec_xyz(rowid, a) select {rowid}, X'{payload}'")
        rowids_so_far = rowids_so_far + rowid.to_bytes(8, "little")
        vectors_so_far = vectors_so_far + bytes.fromhex(payload)

        chunk = first_row("select * from vec_xyz_chunks")
        assert chunk["chunk_id"] == 1
        # The low `count` bits of the first validity byte are set.
        assert chunk["validity"] == zero_padded(
            bytes([(1 << count) - 1]), validity_len
        )
        assert chunk["rowids"] == zero_padded(rowids_so_far, rowids_len)

        vchunk = first_row("select * from vec_xyz_vector_chunks00")
        assert vchunk["rowid"] == 1
        assert vchunk["vectors"] == zero_padded(vectors_so_far, vectors_len)

    assert execute_all(db, "select * from vec_xyz") == [
        {"rowid": rowid, "a": bytes.fromhex(payload)} for rowid, payload in inserts
    ]
def test_vec0_stress_small_chunks():
    """Insert 1000 rows with ``chunk_size=8`` and check scans, count and KNN.

    Row ``i`` (1-based) holds eight copies of ``i * 0.1``.  Runs against the
    module-level ``db`` connection.
    """
    data = np.array(
        [[(i + 1) * 0.1] * 8 for i in range(1000)],
        dtype=np.float32,
    )

    db.execute("drop table if exists vec_small")
    db.execute("create virtual table vec_small using vec0(chunk_size=8, a float[8])")
    assert execute_all(db, "select rowid, * from vec_small") == []

    with db:
        for vector in data:
            db.execute("insert into vec_small(a) values (?) ", [vector])

    def scan_row(rowid, value):
        return {"rowid": rowid, "a": _f32([value] * 8)}

    first_rows = execute_all(db, "select rowid, * from vec_small limit 8")
    assert first_rows == [
        scan_row(1, 0.1),
        scan_row(2, 0.2),
        scan_row(3, 0.3),
        scan_row(4, 0.4),
        scan_row(5, 0.5),
        scan_row(6, 0.6),
        scan_row(7, 0.7),
        scan_row(8, 0.8),
    ]

    assert db.execute("select count(*) from vec_small").fetchone()[0] == 1000

    last_rows = execute_all(
        db, "select rowid, * from vec_small order by rowid desc limit 8"
    )
    assert last_rows == [
        scan_row(1000, 100.0),
        scan_row(999, 99.9),
        scan_row(998, 99.8),
        scan_row(997, 99.7),
        scan_row(996, 99.6),
        scan_row(995, 99.5),
        scan_row(994, 99.4),
        scan_row(993, 99.3),
    ]

    # Nine nearest neighbours of [50.0] * 8, as (rowid, distance) in the
    # order the query is expected to return them.
    nearest = [
        (500, 0.0),
        (501, 0.2828384041786194),
        (499, 0.2828384041786194),
        (502, 0.5656875967979431),
        (498, 0.5656875967979431),
        (503, 0.8485260009765625),
        (497, 0.8485260009765625),
        (496, 1.1313751935958862),
        (504, 1.1313751935958862),
    ]
    knn_rows = execute_all(
        db,
        """
select rowid, a, distance
from vec_small
where a match ?
and k = 9
order by distance
""",
        [_f32([50.0] * 8)],
    )
    assert knn_rows == [
        {"a": _f32([rowid * 0.1] * 8), "distance": distance, "rowid": rowid}
        for rowid, distance in nearest
    ]
def test_vec0_distance_metric():
    """KNN distances for the default metric and for explicit l2, l1 and cosine."""
    base = "('[1, 2]'), ('[3, 4]'), ('[5, 6]')"
    q = "[-1, -2]"
    db = connect(EXT_PATH)

    # Table name -> text appended to the column definition.
    column_options = {
        "v1": "",
        "v2": " distance_metric=l2",
        "v3": " distance_metric=l1",
        "v4": " distance_metric=cosine",
    }
    for table, option in column_options.items():
        db.execute(f"create virtual table {table} using vec0( a float[2]{option})")
        db.execute(f"insert into {table}(a) values {base}")

    l2_results = [(1, 4.4721360206604), (2, 7.211102485656738), (3, 10.0)]
    expected = {
        "v1": l2_results,  # default (L2)
        "v2": l2_results,  # l2
        "v3": [(1, 6), (2, 10), (3, 14)],  # l1
        "v4": [  # cosine
            (3, 1.9734171628952026),
            (2, 1.9838699102401733),
            (1, 2),
        ],
    }
    for table, neighbours in expected.items():
        found = execute_all(
            db, f"select rowid, distance from {table} where a match ? and k = 3", [q]
        )
        assert found == [
            {"rowid": rowid, "distance": distance} for rowid, distance in neighbours
        ]
def test_vec0_vacuum():
    """VACUUM runs without error on a database containing a populated vec0 table."""
    db = connect(EXT_PATH)
    setup_statements = (
        "create virtual table vec_t using vec0(a float[1]);",
        "begin",
        "insert into vec_t(a) values (X'AABBCCDD')",
    )
    for statement in setup_statements:
        db.execute(statement)
    db.commit()
    # The transaction is committed above before VACUUM is issued.
    db.execute("vacuum")
def rowids_value(buffer: bytearray) -> List[int]:
    """Decode a rowids blob into a list of little-endian signed 64-bit integers.

    The buffer length must be a multiple of 8 bytes.
    """
    entries, leftover = divmod(len(buffer), 8)
    assert leftover == 0
    return list(struct.unpack_from(f"<{entries}q", buffer))
import numpy.typing as npt
def cosine_similarity(
    vec: npt.NDArray[np.float32], mat: npt.NDArray[np.float32], do_norm: bool = True
) -> npt.NDArray[np.float32]:
    """Return the similarity between ``vec`` and every row of ``mat``.

    With ``do_norm=True`` the dot products are divided by the product of the
    vector norms (cosine similarity); otherwise the raw dot products are
    returned.
    """
    sim = vec @ mat.T
    if do_norm:
        # Out-of-place division: an in-place `/=` fails when the dot products
        # have an integer dtype, because the quotient is floating point.
        sim = sim / (np.linalg.norm(vec) * np.linalg.norm(mat, axis=1))
    return sim


def topk(
    vec: npt.NDArray[np.float32],
    mat: npt.NDArray[np.float32],
    k: int = 5,
    do_norm: bool = True,
) -> tuple[npt.NDArray[np.int32], npt.NDArray[np.float32]]:
    """Return the ``k`` rows of ``mat`` most similar to ``vec``.

    Returns:
        ``(indices, similarities)``: row indices of ``mat`` ordered from most
        to least similar, and the similarity of each of those rows.  When
        ``k`` is greater than or equal to the number of rows, all rows are
        returned (sorted) instead of raising.
    """
    sim = cosine_similarity(vec, mat, do_norm=do_norm)
    if k >= sim.shape[0]:
        # np.argpartition requires kth < len(sim); fall back to a full sort.
        ranked = np.argsort(-sim)
    else:
        # Rather than sorting all similarities and taking the top K, it's
        # faster to argpartition and then just sort the top K.
        # The difference is O(N logN) vs O(N + k logk)
        candidates = np.argpartition(-sim, kth=k)[:k]
        # argsort yields positions *within* `candidates`; map them back to row
        # indices before using them to index `sim`.
        ranked = candidates[np.argsort(-sim[candidates])]
    return ranked, sim[ranked]
def test_stress1():
    """Compare vec0 cosine KNN row ids with the numpy ``topk`` reference.

    8000 random 128-dimensional vectors are inserted with their index as the
    rowid, then 100 random queries are checked.  Runs against the module-level
    ``db`` connection.
    """
    np.random.seed(1234)
    data = np.random.uniform(-1.0, 1.0, (8000, 128)).astype(np.float32)

    db.execute("drop table if exists vec_stress1")
    db.execute(
        "create virtual table vec_stress1 using vec0( a float[128] distance_metric=cosine)"
    )
    with db:
        for rowid, vector in enumerate(data):
            db.execute("insert into vec_stress1 values (?, ?)", [rowid, vector])

    queries = np.random.uniform(-1.0, 1.0, (100, 128)).astype(np.float32)
    knn_sql = """
select rowid, distance
from vec_stress1
where a match ? and k = ?
order by distance
"""
    for query in queries:
        # Only the ids are compared; the reference similarities are ignored.
        reference_ids, _ = topk(query, data, k=10)
        found = db.execute(knn_sql, [query, 10]).fetchall()
        assert len(reference_ids) == 10
        assert len(found) == 10
        found_ids = [entry[0] for entry in found]
        assert reference_ids.tolist() == found_ids
@pytest.mark.skip(reason="slow")
def test_stress():
    """Insert 1025 vectors so a second chunk is needed, then check both chunks.

    Runs against the module-level ``db`` connection.
    """
    db.execute("create virtual table vec_t1 using vec0( a float[1536])")

    def rand_vec(n):
        # n random floats packed as 4-byte values.
        return struct.pack(f"{n}f", *(random() for _ in range(n)))

    for _ in range(1025):
        db.execute("insert into vec_t1(a) values (?)", [rand_vec(1536)])

    rows = db.execute("select validity, rowids from vec_t1_chunks").fetchall()
    assert len(rows) == 2
    full_chunk, partial_chunk = rows

    # First chunk: all 1024 slots used, holding rowids 1..1024.
    assert len(full_chunk["validity"]) == 1024 / CHAR_BIT
    assert len(full_chunk["rowids"]) == 1024 * CHAR_BIT
    assert full_chunk["validity"] == bitmap_full(1024)
    assert rowids_value(full_chunk["rowids"]) == [x + 1 for x in range(1024)]

    # Second chunk: only the first slot is used, holding rowid 1025.
    assert len(partial_chunk["validity"]) == 1024 / CHAR_BIT
    assert len(partial_chunk["rowids"]) == 1024 * CHAR_BIT
    assert partial_chunk["validity"] == bytes([0b0000_0001]) + bitmap_zerod(1024)[1:]
    assert rowids_value(partial_chunk["rowids"])[0] == 1025
def test_coverage():
    """Every name in ``FUNCTIONS`` and ``MODULES`` has a matching ``test_<name>``."""
    this_module = inspect.getmodule(inspect.currentframe())
    # Names of all module members that look like tests, with "test_" removed.
    funcs_with_tests = {
        name.replace("test_", "")
        for name, _member in inspect.getmembers(this_module)
        if name.startswith("test_")
    }
    for func in [*FUNCTIONS, *MODULES]:
        assert func in funcs_with_tests, f"{func} is not tested"
# Allow running this file directly as a script.
# NOTE(review): the tests visible in this part of the file are plain
# pytest-style functions; confirm unittest.main() is still the intended runner.
if __name__ == "__main__":
    unittest.main()
================================================
FILE: tests/test-metadata.py
================================================
import pytest
import sqlite3
from collections import OrderedDict
import json
from helpers import exec, vec0_shadow_table_contents
def test_constructor_limit(db, snapshot):
    """Declaring 17 metadata columns is checked against the stored snapshot."""
    # NOTE(review): there is no comma between the generated column list and
    # `v float[1]`; kept as-is because the snapshot is recorded against this
    # exact statement -- confirm whether that is intentional.
    metadata_columns = ",".join(f"metadata{x} integer" for x in range(17))
    statement = f"""
create virtual table v using vec0(
{metadata_columns}
v float[1]
)
"""
    assert exec(db, statement) == snapshot(name="max 16 metadata columns")
def test_normal(db, snapshot):
    """Create, fill, read and drop a vec0 table with one metadata column of each type.

    Every step is compared against a snapshot, so the order of the
    ``snapshot()`` calls must not change.
    """
    db.execute(
        "create virtual table v using vec0(vector float[1], b boolean, n int, f float, t text, chunk_size=8)"
    )
    schema = exec(db, "select * from sqlite_master where type = 'table' order by name")
    assert schema == snapshot(name="sqlite_master")
    assert vec0_shadow_table_contents(db, "v") == snapshot()

    INSERT = "insert into v(vector, b, n, f, t) values (?, ?, ?, ?, ?)"
    new_rows = [
        [b"\x11\x11\x11\x11", 1, 1, 1.1, "one"],
        [b"\x22\x22\x22\x22", 1, 2, 2.2, "two"],
        [b"\x33\x33\x33\x33", 1, 3, 3.3, "three"],
    ]
    for parameters in new_rows:
        assert exec(db, INSERT, parameters) == snapshot()

    assert exec(db, "select * from v") == snapshot()
    assert vec0_shadow_table_contents(db, "v") == snapshot()

    assert exec(db, "drop table v") == snapshot()
    assert exec(db, "select * from sqlite_master") == snapshot()
def test_text_knn(db, snapshot):
    """KNN queries filtered on a text metadata column, one snapshot per query."""
    db.execute(
        "create virtual table v using vec0(vector float[1], name text, chunk_size=8)"
    )
    assert vec0_shadow_table_contents(db, "v") == snapshot()

    db.execute(
        """
INSERT INTO v(vector, name) VALUES
('[.11]', 'aaa'),
('[.22]', 'bbb'),
('[.33]', 'ccc'),
('[.44]', 'ddd'),
('[.55]', 'eee'),
('[.66]', 'fff'),
('[.77]', 'ggg'),
('[.88]', 'hhh'),
('[.99]', 'iii');
"""
    )
    assert exec(db, "select * from v") == snapshot()
    assert vec0_shadow_table_contents(db, "v") == snapshot()

    # Unnamed snapshots are matched by call order, so the order of these
    # queries must not change.
    knn_queries = (
        "select rowid, name, distance from v where vector match '[1]' and k = 5",
        "select rowid, name, distance from v where vector match '[1]' and k = 5 and name < 'ddd'",
        "select rowid, name, distance from v where vector match '[1]' and k = 5 and name <= 'ddd'",
        "select rowid, name, distance from v where vector match '[1]' and k = 5 and name > 'fff'",
        "select rowid, name, distance from v where vector match '[1]' and k = 5 and name >= 'fff'",
        "select rowid, name, distance from v where vector match '[1]' and k = 5 and name = 'aaa'",
        "select rowid, name, distance from v where vector match '[.01]' and k = 5 and name != 'aaa'",
    )
    for query in knn_queries:
        assert exec(db, query) == snapshot()
def test_long_text_updates(db, snapshot):
    """Insert a 12-character and a 13-character text value and snapshot the result."""
    db.execute(
        "create virtual table v using vec0(vector float[1], name text, chunk_size=8)"
    )
    assert vec0_shadow_table_contents(db, "v") == snapshot()

    INSERT = "insert into v(vector, name) values (?, ?)"
    vector = b"\x11\x11\x11\x11"
    for name in ("123456789a12", "123456789a123"):
        exec(db, INSERT, [vector, name])

    assert exec(db, "select * from v") == snapshot()
    assert vec0_shadow_table_contents(db, "v") == snapshot()
def test_long_text_knn(db, snapshot):
    """KNN with every comparison operator against short and long text values.

    Each (operand, operator) pair gets its own named snapshot,
    ``"<op_name>-<operand>"``.
    """
    db.execute(
        "create virtual table v using vec0(vector float[1], name text, chunk_size=8)"
    )

    insert_sql = "insert into v(vector, name) values (?, ?)"
    seed = [
        ("[1]", "aaaa"),
        ("[2]", "aaaaaaaaaaaa_aaa"),
        ("[3]", "bbbb"),
        ("[4]", "bbbbbbbbbbbb_bbb"),
        ("[5]", "cccc"),
        ("[6]", "cccccccccccc_ccc"),
    ]
    for vector, name in seed:
        exec(db, insert_sql, [vector, name])

    operands = [
        "bbbb",
        "bb",
        "bbbbbb",
        "bbbbbbbbbbbb_bbb",
        "bbbbbbbbbbbb_aaa",
        "bbbbbbbbbbbb_ccc",
        "longlonglonglonglonglonglong",
    ]
    # SQL operator -> label used in the snapshot name (insertion order matters).
    operators = {"=": "eq", "!=": "ne", "<": "lt", "<=": "le", ">": "gt", ">=": "ge"}

    for operand in operands:
        for op, op_name in operators.items():
            result = exec(
                db,
                f"select rowid, name, distance from v where vector match '[100]' and k = 5 and name {op} ?",
                [operand],
            )
            assert result == snapshot(name=f"{op_name}-{operand}")
def test_types(db, snapshot):
    """Snapshot the result of inserting well- and ill-typed metadata values."""
    db.execute(
        "create virtual table v using vec0(vector float[1], b boolean, n int, f float, t text, chunk_size=8)"
    )
    insert_sql = "insert into v(vector, b, n, f, t) values (?, ?, ?, ?, ?)"
    vector = b"\x11\x11\x11\x11"

    # (snapshot name, [b, n, f, t]) - exactly one value is off in each
    # "illegal-*" case.
    cases = [
        ("legal", [1, 1, 1.1, "test"]),
        ("illegal-type-boolean", ["illegal", 1, 1.1, "test"]),
        ("illegal-type-int", [1, "illegal", 1.1, "test"]),
        ("illegal-type-float", [1, 1, "illegal", "test"]),
        ("illegal-type-text", [1, 1, 1.1, 420]),
        # right type for a boolean column, but not 0/1
        ("illegal-boolean", [44, 1, 1.1, "test"]),
    ]
    for label, metadata in cases:
        assert exec(db, insert_sql, [vector, *metadata]) == snapshot(name=label)
def test_updates(db, snapshot):
    """Snapshot table and shadow-table state after UPDATEs of metadata columns."""
    db.execute(
        "create virtual table v using vec0(vector float[1], b boolean, n int, f float, t text, chunk_size=8)"
    )
    insert_sql = "insert into v(rowid, vector, b, n, f, t) values (?, ?, ?, ?, ?, ?)"
    initial_rows = [
        [1, b"\x11\x11\x11\x11", 1, 1, 1.1, "test1"],
        [2, b"\x22\x22\x22\x22", 1, 2, 2.2, "test2"],
        [3, b"\x33\x33\x33\x33", 1, 3, 3.3, "1234567890123"],
    ]
    for params in initial_rows:
        exec(db, insert_sql, params)
    assert exec(db, "select * from v") == snapshot(name="1-init-contents")
    assert vec0_shadow_table_contents(db, "v") == snapshot(name="1-init-shadow")

    # NOTE(review): this only asserts that exec() returned something truthy;
    # it is not compared against a snapshot - confirm that is intentional.
    assert exec(
        db, "UPDATE v SET b = 0, n = 11, f = 11.11, t = 'newtest1' where rowid = 1"
    )
    assert exec(db, "select * from v") == snapshot(name="general-update-contents")
    # The misspelling in this snapshot name is kept on purpose: renaming it
    # would no longer match the snapshot recorded under this key.
    assert vec0_shadow_table_contents(db, "v") == snapshot(
        name="general-update-shaodnw"
    )

    string_updates = [
        # 1: long string updated to long string
        "UPDATE v SET t = '1234567890123-updated' where rowid = 3",
        # 2: short string updated to short string
        "UPDATE v SET t = 'test2-short' where rowid = 2",
        # 3: short string updated to long string
        "UPDATE v SET t = 'test2-long-long-long' where rowid = 2",
        # 4: long string updated to short string
        "UPDATE v SET t = 'test2-shortx' where rowid = 2",
    ]
    for step, sql in enumerate(string_updates, start=1):
        exec(db, sql)
        assert exec(db, "select * from v") == snapshot(
            name=f"string-update-{step}-contents"
        )
        assert vec0_shadow_table_contents(db, "v") == snapshot(
            name=f"string-update-{step}-shadow"
        )
def test_deletes(db, snapshot):
    """Snapshot table and shadow-table state while rows are deleted."""
    db.execute(
        "create virtual table v using vec0(vector float[1], b boolean, n int, f float, t text, chunk_size=8)"
    )
    insert_sql = "insert into v(rowid, vector, b, n, f, t) values (?, ?, ?, ?, ?, ?)"
    rows = [
        [1, b"\x11\x11\x11\x11", 1, 1, 1.1, "test1"],
        [2, b"\x22\x22\x22\x22", 1, 2, 2.2, "test2"],
        [3, b"\x33\x33\x33\x33", 1, 3, 3.3, "1234567890123"],
    ]
    for params in rows:
        assert exec(db, insert_sql, params) == snapshot()

    def check_state():
        # Visible rows first, then the backing shadow tables.
        assert exec(db, "select * from v") == snapshot()
        assert vec0_shadow_table_contents(db, "v") == snapshot()

    check_state()
    for rowid in (1, 3):
        assert exec(db, f"DELETE FROM v where rowid = {rowid}") == snapshot()
        check_state()
def test_knn(db, snapshot):
    """Snapshot the created tables and a KNN query using LIKE on metadata."""
    db.execute(
        "create virtual table v using vec0(vector float[1], name text, chunk_size=8)"
    )
    tables = exec(db, "select * from sqlite_master where type = 'table' order by name")
    assert tables == snapshot(name="sqlite_master")

    people = [("[1]", "alex"), ("[2]", "brian"), ("[3]", "craig")]
    db.executemany("insert into v(vector, name) values (?, ?)", people)

    # EVIDENCE-OF: V16511_00582 catches "illegal" constraints on metadata columns
    like_result = exec(
        db,
        "select *, distance from v where vector match '[5]' and k = 3 and name like 'illegal'",
    )
    assert like_result == snapshot()
# True when the linked SQLite is 3.38 or newer. Compare the whole version
# tuple rather than only the minor component, so the major version is part
# of the check as well.
SUPPORTS_VTAB_IN = sqlite3.sqlite_version_info >= (3, 38)
@pytest.mark.skipif(
    not SUPPORTS_VTAB_IN, reason="requires vtab `x in (...)` support in SQLite >=3.38"
)
def test_vtab_in(db, snapshot):
    """Snapshot KNN queries that use ``IN (...)`` on each metadata column type."""
    db.execute(
        "create virtual table v using vec0(vector float[1], n int, t text, b boolean, f float, chunk_size=8)"
    )
    # rowids 1..8: n alternates 999/555, t is 'aaaa' for 1-4 and 'zzzz' for 5-8.
    rows = [
        (
            rowid,
            f"[{rowid}]",
            999 if rowid % 2 else 555,
            "aaaa" if rowid <= 4 else "zzzz",
            0,
            1.1,
        )
        for rowid in range(1, 9)
    ]
    db.executemany(
        "insert into v(rowid, vector, n, t, b, f) values (?, ?, ?, ?, ?, ?)",
        rows,
    )

    # EVIDENCE-OF: V15248_32086
    cases = [
        (
            "block-bool",
            "select * from v where vector match '[0]' and k = 8 and b in (1, 0)",
        ),
        (
            "block-float",
            "select * from v where vector match '[0]' and k = 8 and f in (1.1, 0.0)",
        ),
        (
            "allow-int-all",
            "select rowid, n, distance from v where vector match '[0]' and k = 8 and n in (555, 999)",
        ),
        (
            "allow-int-superfluous",
            "select rowid, n, distance from v where vector match '[0]' and k = 8 and n in (555, -1, -2)",
        ),
        (
            "allow-text-all",
            "select rowid, t, distance from v where vector match '[0]' and k = 8 and t in ('aaaa', 'zzzz')",
        ),
        (
            "allow-text-superfluous",
            "select rowid, t, distance from v where vector match '[0]' and k = 8 and t in ('aaaa', 'foo', 'bar')",
        ),
    ]
    for label, sql in cases:
        assert exec(db, sql) == snapshot(name=label)
# Guarded like test_vtab_in: every query here uses `t in (...)` on the
# virtual table.
@pytest.mark.skipif(
    not SUPPORTS_VTAB_IN, reason="requires vtab `x in (...)` support in SQLite >=3.38"
)
def test_vtab_in_long_text(db, snapshot):
    """Snapshot ``IN (...)`` lookups on a text column with short and long values."""
    db.execute(
        "create virtual table v using vec0(vector float[1], t text, chunk_size=8)"
    )
    data = [
        (1, "aaaa"),
        (2, "aaaaaaaaaaaa_aaa"),
        (3, "bbbb"),
        (4, "bbbbbbbbbbbb_bbb"),
        (5, "cccc"),
        (6, "cccccccccccc_ccc"),
    ]
    # The vector is derived from the rowid in SQL; the second bound value is
    # the text column, so the parameter is named after it.
    db.executemany(
        "insert into v(rowid, vector, t) values (:rowid, printf('[%d]', :rowid), :t)",
        [{"rowid": rowid, "t": text} for rowid, text in data],
    )

    # Look each value up on its own, next to a value that is never stored.
    for _, lookup in data:
        assert exec(
            db,
            "select rowid, t from v where vector match '[0]' and k = 10 and t in (?, 'nonsense')",
            [lookup],
        ) == snapshot(name=f"individual-{lookup}")

    # Then all values at once, supplied through a json_each() subquery.
    assert exec(
        db,
        "select rowid, t from v where vector match '[0]' and k = 10 and t in (select value from json_each(?))",
        [json.dumps([text for _, text in data])],
    ) == snapshot(name="all")
def test_idxstr(db, snapshot):
    """Snapshot query plans for KNN queries constrained on metadata columns."""
    db.execute(
        """
create virtual table vec_movies using vec0(
movie_id integer primary key,
synopsis_embedding float[1],
+title text,
is_favorited boolean,
genre text,
num_reviews int,
mean_rating float,
chunk_size=8
);
"""
    )

    favorited_plan = eqp(
        db,
        "select * from vec_movies where synopsis_embedding match '' and k = 0 and is_favorited = true",
    )
    assert favorited_plan == snapshot()

    # (snapshot label, column) - one plan per column per operator.
    # A boolean variant on is_favorited existed here but was disabled:
    # ("boolean", "is_favorited")
    columns = [
        ("text", "genre"),
        ("int", "num_reviews"),
        ("float", "mean_rating"),
    ]
    for label, column in columns:
        for op in ("<", ">", "<=", ">=", "!="):
            plan = eqp(
                db,
                f"select * from vec_movies where synopsis_embedding match '' and k = 0 and {column} {op} NULL",
            )
            assert plan == snapshot(name=f"knn-constraint-{label} {op}")
def eqp(db, sql):
    """Return ``sql`` together with its EXPLAIN QUERY PLAN rows.

    The result is an OrderedDict with keys ``"sql"`` and ``"plan"``; each plan
    row is a dict with the ``notused`` column removed. Rows must be
    convertible with ``dict(row)``.
    """
    steps = []
    for record in db.execute(f"explain query plan {sql}").fetchall():
        step = dict(record)
        # value is different on macos-aarch64 in github actions, not sure why
        del step["notused"]
        steps.append(step)
    return OrderedDict([("sql", sql), ("plan", steps)])
def test_stress(db, snapshot):
    """Snapshot KNN queries combining filters over a 25-row movie table."""
    db.execute(
        """
create virtual table vec_movies using vec0(
movie_id integer primary key,
synopsis_embedding float[1],
+title text,
is_favorited boolean,
genre text,
num_reviews int,
mean_rating float,
chunk_size=8
);
"""
    )
    db.execute(
        """
INSERT INTO vec_movies(movie_id, synopsis_embedding, is_favorited, genre, title, num_reviews, mean_rating)
VALUES
(1, '[1]', 0, 'horror', 'The Conjuring', 153, 4.6),
(2, '[2]', 0, 'comedy', 'Dumb and Dumber', 382, 2.6),
(3, '[3]', 0, 'scifi', 'Interstellar', 53, 5.0),
(4, '[4]', 0, 'fantasy', 'The Lord of the Rings: The Fellowship of the Ring', 210, 4.2),
(5, '[5]', 1, 'documentary', 'An Inconvenient Truth', 93, 3.4),
(6, '[6]', 1, 'horror', 'Hereditary', 167, 4.7),
(7, '[7]', 1, 'comedy', 'Anchorman: The Legend of Ron Burgundy', 482, 2.9),
(8, '[8]', 0, 'scifi', 'Blade Runner 2049', 301, 5.0),
(9, '[9]', 1, 'fantasy', 'Harry Potter and the Sorcerer''s Stone', 134, 4.1),
(10, '[10]', 0, 'documentary', 'Free Solo', 66, 3.2),
(11, '[11]', 1, 'horror', 'Get Out', 88, 4.9),
(12, '[12]', 0, 'comedy', 'The Hangover', 59, 2.8),
(13, '[13]', 1, 'scifi', 'The Matrix', 423, 4.5),
(14, '[14]', 0, 'fantasy', 'Pan''s Labyrinth', 275, 3.6),
(15, '[15]', 1, 'documentary', '13th', 191, 4.4),
(16, '[16]', 0, 'horror', 'It Follows', 314, 4.3),
(17, '[17]', 1, 'comedy', 'Step Brothers', 74, 3.0),
(18, '[18]', 1, 'scifi', 'Inception', 201, 5.0),
(19, '[19]', 1, 'fantasy', 'The Shape of Water', 399, 2.7),
(20, '[20]', 1, 'documentary', 'Won''t You Be My Neighbor?', 186, 4.8),
(21, '[21]', 1, 'scifi', 'Gravity', 342, 4.0),
(22, '[22]', 1, 'scifi', 'Dune', 451, 4.4),
(23, '[23]', 1, 'scifi', 'The Martian', 522, 4.6),
(24, '[24]', 1, 'horror', 'A Quiet Place', 271, 4.3),
(25, '[25]', 1, 'fantasy', 'The Chronicles of Narnia: The Lion, the Witch and the Wardrobe', 310, 3.9);
"""
    )
    assert vec0_shadow_table_contents(db, "vec_movies") == snapshot()

    # Text, integer-range and float constraints combined in one KNN query.
    combined = exec(
        db,
        """
select
movie_id,
title,
genre,
num_reviews,
mean_rating,
is_favorited,
distance
from vec_movies
where synopsis_embedding match '[15.5]'
and genre = 'scifi'
and num_reviews between 100 and 500
and mean_rating > 3.5
and k = 5;
""",
    )
    assert combined == snapshot()

    # Single-constraint queries; their snapshots are unnamed, so keep the order.
    unnamed_queries = [
        "select movie_id, genre, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and genre = 'horror'",
        "select movie_id, genre, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and genre = 'comedy'",
        "select movie_id, num_reviews, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and num_reviews between 100 and 500",
        "select movie_id, num_reviews, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and num_reviews >= 500",
        "select movie_id, mean_rating, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and mean_rating < 3.0",
        "select movie_id, mean_rating, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and mean_rating between 4.0 and 5.0",
    ]
    for sql in unnamed_queries:
        assert exec(db, sql) == snapshot()

    # Boolean constraints, each with a named snapshot.
    boolean_queries = [
        (
            "bool-eq-true",
            "select movie_id, is_favorited, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and is_favorited = TRUE",
        ),
        (
            "bool-ne-true",
            "select movie_id, is_favorited, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and is_favorited != TRUE",
        ),
        (
            "bool-eq-false",
            "select movie_id, is_favorited, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and is_favorited = FALSE",
        ),
        (
            "bool-ne-false",
            "select movie_id, is_favorited, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and is_favorited != FALSE",
        ),
        # EVIDENCE-OF: V10145_26984
        (
            "bool-other-op",
            "select movie_id, is_favorited, distance from vec_movies where synopsis_embedding match '[100]' and k = 5 and is_favorited >= 999",
        ),
    ]
    for label, sql in boolean_queries:
        assert exec(db, sql) == snapshot(name=label)
def test_errors(db, snapshot):
    """Snapshot a select before and after reads of the text shadow column are denied."""
    db.execute("create virtual table v using vec0(vector float[1], t text)")
    db.execute("insert into v(vector, t) values ('[1]', 'aaaaaaaaaaaax')")

    before = exec(db, "select * from v")
    assert before == snapshot()

    # EVIDENCE-OF: V15466_32305
    deny_text_reads = authorizer_deny_on(
        sqlite3.SQLITE_READ, "v_metadatatext00", "data"
    )
    db.set_authorizer(deny_text_reads)

    after = exec(db, "select * from v")
    assert after == snapshot()
def authorizer_deny_on(operation, x1, x2=None):
    """Build an authorizer callback that denies exactly one kind of action.

    The returned callback answers ``sqlite3.SQLITE_DENY`` when the action code
    equals ``operation`` and its first two arguments equal ``x1`` and ``x2``;
    every other call gets ``sqlite3.SQLITE_OK``.
    """

    def _auth(op, p1, p2, p3, p4):
        denied = op == operation and p1 == x1 and p2 == x2
        return sqlite3.SQLITE_DENY if denied else sqlite3.SQLITE_OK

    return _auth
================================================
FILE: tests/test-partition-keys.py
================================================
import sqlite3
from helpers import exec, vec0_shadow_table_contents
def test_constructor_limit(db, snapshot):
    """Snapshot the result of declaring five partition key columns."""
    result = exec(
        db,
        """
create virtual table v using vec0(
p1 int partition key,
p2 int partition key,
p3 int partition key,
p4 int partition key,
p5 int partition key,
v float[1]
)
""",
    )
    assert result == snapshot(name="max 4 partition keys")
def test_normal(db, snapshot):
    """Snapshot the shadow tables as rows land in one, then two, partitions."""
    db.execute(
        "create virtual table v using vec0(p1 int partition key, a float[1], chunk_size=8)"
    )
    # (insert statement, snapshot name). The names are stored snapshot keys
    # and are reproduced verbatim, spelling included.
    steps = [
        ("insert into v(rowid, p1, a) values (1, 100, X'11223344')", "1 row"),
        (
            "insert into v(rowid, p1, a) values (2, 100, X'44556677')",
            "2 rows, same parition",
        ),
        (
            "insert into v(rowid, p1, a) values (3, 200, X'8899aabb')",
            "3 rows, 2 partitions",
        ),
    ]
    for sql, label in steps:
        db.execute(sql)
        assert vec0_shadow_table_contents(db, "v") == snapshot(name=label)
def test_types(db, snapshot):
    """Snapshot inserts of a non-integer and a NULL value into an int partition key."""
    db.execute(
        "create virtual table v using vec0(p1 int partition key, a float[1], chunk_size=8)"
    )
    insert_sql = "insert into v(p1, a) values(?, ?)"
    vector = b"\x11\x22\x33\x44"

    # EVIDENCE-OF: V11454_28292
    wrong_type = exec(db, insert_sql, ["not int", vector])
    assert wrong_type == snapshot(name="1. raises type error")
    assert vec0_shadow_table_contents(db, "v") == snapshot(name="2. empty DB")

    # but allow NULLs
    null_key = exec(db, insert_sql, [None, vector])
    assert null_key == snapshot(name="3. allow nulls")
    assert vec0_shadow_table_contents(db, "v") == snapshot(
        name="4. show NULL partition key"
    )
def test_updates(db, snapshot):
    """Snapshot the result of updating a row's text partition key."""
    db.execute(
        "create virtual table v using vec0(p text partition key, a float[1], chunk_size=8)"
    )
    # Three rows, all in partition "a".
    seed = [
        [1, "a", b"\x11\x11\x11\x11"],
        [2, "a", b"\x22\x22\x22\x22"],
        [3, "a", b"\x33\x33\x33\x33"],
    ]
    for params in seed:
        db.execute("insert into v(rowid, p, a) values (?, ?, ?)", params)
    assert exec(db, "select * from v") == snapshot(name="1. Initial dataset")

    moved = exec(db, "update v set p = ? where rowid = ?", ["new", 1])
    assert moved == snapshot(name="2. update #1")
class Row:
    """Plain attribute container with a readable ``repr``."""

    def __init__(self):
        pass

    def __repr__(self) -> str:
        """Return ``Row(name=value, ...)`` built from the instance attributes.

        The previous body called ``repr()`` with no argument, which raises
        ``TypeError`` whenever an instance is printed.
        """
        fields = ", ".join(f"{key}={value!r}" for key, value in vars(self).items())
        return f"{type(self).__name__}({fields})"
================================================
FILE: tests/test-unit.c
================================================
#include "../sqlite-vec.h"
#include "sqlite-vec-internal.h"
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <string.h>
#define countof(x) (sizeof(x) / sizeof((x)[0]))
// Tests vec0_token_next(), the low-level tokenizer that extracts the next
// token from a raw char range. Covers every token type (identifier, digit,
// brackets, parens, comma, plus, equals), whitespace skipping, EOF on
// empty/whitespace-only input, error on unrecognised characters, and boundary
// behaviour where identifiers and digits stop at the next non-matching
// character.
//
// The end of each range is derived from strlen() instead of a hand-counted
// offset, so the scanned range can never include the NUL terminator or point
// past the literal.
void test_vec0_token_next() {
  printf("Starting %s...\n", __func__);

  struct {
    char *input;
    int expected_rc;
    int expected_type;  // compared only when expected_rc is ..._SOME
    int expected_start; // offset of token.start from input; -1 = not checked
    int expected_len;   // token.end - token.start; -1 = not checked
  } cases[] = {
      // Single-character tokens
      {"+", VEC0_TOKEN_RESULT_SOME, TOKEN_TYPE_PLUS, -1, -1},
      {"[", VEC0_TOKEN_RESULT_SOME, TOKEN_TYPE_LBRACKET, -1, -1},
      {"]", VEC0_TOKEN_RESULT_SOME, TOKEN_TYPE_RBRACKET, -1, -1},
      {"=", VEC0_TOKEN_RESULT_SOME, TOKEN_TYPE_EQ, -1, -1},
      // Identifier
      {"hello", VEC0_TOKEN_RESULT_SOME, TOKEN_TYPE_IDENTIFIER, 0, 5},
      // Identifier with underscores and digits
      {"col_1a", VEC0_TOKEN_RESULT_SOME, TOKEN_TYPE_IDENTIFIER, -1, 6},
      // Digit sequence
      {"1234", VEC0_TOKEN_RESULT_SOME, TOKEN_TYPE_DIGIT, 0, 4},
      // Leading whitespace is skipped
      {" abc", VEC0_TOKEN_RESULT_SOME, TOKEN_TYPE_IDENTIFIER, -1, 3},
      // Tab/newline whitespace
      {"\t\n\r X", VEC0_TOKEN_RESULT_SOME, TOKEN_TYPE_IDENTIFIER, -1, -1},
      // Empty input
      {"", VEC0_TOKEN_RESULT_EOF, 0, -1, -1},
      // Only whitespace
      {" ", VEC0_TOKEN_RESULT_EOF, 0, -1, -1},
      // Unrecognized characters
      {"@", VEC0_TOKEN_RESULT_ERROR, 0, -1, -1},
      {"!", VEC0_TOKEN_RESULT_ERROR, 0, -1, -1},
      // Identifier stops at bracket
      {"foo[", VEC0_TOKEN_RESULT_SOME, TOKEN_TYPE_IDENTIFIER, -1, 3},
      // Digit stops at non-digit
      {"42abc", VEC0_TOKEN_RESULT_SOME, TOKEN_TYPE_DIGIT, -1, 2},
      // Parens and comma
      {"(", VEC0_TOKEN_RESULT_SOME, TOKEN_TYPE_LPAREN, -1, -1},
      {")", VEC0_TOKEN_RESULT_SOME, TOKEN_TYPE_RPAREN, -1, -1},
      {",", VEC0_TOKEN_RESULT_SOME, TOKEN_TYPE_COMMA, -1, -1},
  };

  for (size_t i = 0; i < countof(cases); i++) {
    struct Vec0Token token;
    char *input = cases[i].input;
    int rc = vec0_token_next(input, input + strlen(input), &token);
    assert(rc == cases[i].expected_rc);
    if (rc != VEC0_TOKEN_RESULT_SOME) {
      // token is only inspected when one was produced
      continue;
    }
    assert((int)token.token_type == cases[i].expected_type);
    if (cases[i].expected_start >= 0) {
      assert(token.start == input + cases[i].expected_start);
    }
    if (cases[i].expected_len >= 0) {
      assert(token.end - token.start == cases[i].expected_len);
    }
  }

  printf(" All vec0_token_next tests passed.\n");
}
// Reads the next token from `scanner` and asserts that one was produced with
// the given type. `length` (when >= 0) must equal token.end - token.start;
// `prefix` (when non-NULL) must match the first strlen(prefix) bytes at
// token.start.
static void expect_scanned_token(struct Vec0Scanner *scanner, int token_type,
                                 int length, const char *prefix) {
  struct Vec0Token token;
  int rc = vec0_scanner_next(scanner, &token);
  assert(rc == VEC0_TOKEN_RESULT_SOME);
  assert((int)token.token_type == token_type);
  if (length >= 0) {
    assert(token.end - token.start == length);
  }
  if (prefix != NULL) {
    assert(strncmp(token.start, prefix, strlen(prefix)) == 0);
  }
}

// Asserts that `scanner` has no more tokens to yield.
static void expect_scanner_eof(struct Vec0Scanner *scanner) {
  struct Vec0Token token;
  int rc = vec0_scanner_next(scanner, &token);
  assert(rc == VEC0_TOKEN_RESULT_EOF);
}

// Tests Vec0Scanner, the stateful wrapper around vec0_token_next() that
// tracks position and yields successive tokens. Verifies correct tokenisation
// of full sequences like "abc float[128]", "key=value" and
// "diskann(k=v, k2=v2)", empty input, whitespace-heavy input, and expressions
// with operators ("a+b").
void test_vec0_scanner() {
  printf("Starting %s...\n", __func__);
  struct Vec0Scanner scanner;
  const char *text;

  // Column-definition shape: name, type, bracketed dimension count.
  text = "abc float[128]";
  vec0_scanner_init(&scanner, text, (int)strlen(text));
  expect_scanned_token(&scanner, TOKEN_TYPE_IDENTIFIER, 3, "abc");
  expect_scanned_token(&scanner, TOKEN_TYPE_IDENTIFIER, 5, "float");
  expect_scanned_token(&scanner, TOKEN_TYPE_LBRACKET, -1, NULL);
  expect_scanned_token(&scanner, TOKEN_TYPE_DIGIT, -1, "128");
  expect_scanned_token(&scanner, TOKEN_TYPE_RBRACKET, -1, NULL);
  expect_scanner_eof(&scanner);

  // Option shape: key, '=', value.
  text = "key=value";
  vec0_scanner_init(&scanner, text, (int)strlen(text));
  expect_scanned_token(&scanner, TOKEN_TYPE_IDENTIFIER, -1, "key");
  expect_scanned_token(&scanner, TOKEN_TYPE_EQ, -1, NULL);
  expect_scanned_token(&scanner, TOKEN_TYPE_IDENTIFIER, -1, "value");
  expect_scanner_eof(&scanner);

  // Empty input yields EOF straight away.
  text = "";
  vec0_scanner_init(&scanner, text, (int)strlen(text));
  expect_scanner_eof(&scanner);

  // Whitespace around and between tokens is skipped.
  text = " a b ";
  vec0_scanner_init(&scanner, text, (int)strlen(text));
  expect_scanned_token(&scanner, TOKEN_TYPE_IDENTIFIER, 1, "a");
  expect_scanned_token(&scanner, TOKEN_TYPE_IDENTIFIER, 1, "b");
  expect_scanner_eof(&scanner);

  // Operator between two identifiers, no whitespace.
  text = "a+b";
  vec0_scanner_init(&scanner, text, (int)strlen(text));
  expect_scanned_token(&scanner, TOKEN_TYPE_IDENTIFIER, -1, NULL);
  expect_scanned_token(&scanner, TOKEN_TYPE_PLUS, -1, NULL);
  expect_scanned_token(&scanner, TOKEN_TYPE_IDENTIFIER, -1, NULL);
  expect_scanner_eof(&scanner);

  // Call-like shape with parens and a comma-separated option list.
  text = "diskann(k=v, k2=v2)";
  vec0_scanner_init(&scanner, text, (int)strlen(text));
  expect_scanned_token(&scanner, TOKEN_TYPE_IDENTIFIER, -1, "diskann");
  expect_scanned_token(&scanner, TOKEN_TYPE_LPAREN, -1, NULL);
  expect_scanned_token(&scanner, TOKEN_TYPE_IDENTIFIER, -1, "k");
  expect_scanned_token(&scanner, TOKEN_TYPE_EQ, -1, NULL);
  expect_scanned_token(&scanner, TOKEN_TYPE_IDENTIFIER, -1, "v");
  expect_scanned_token(&scanner, TOKEN_TYPE_COMMA, -1, NULL);
  expect_scanned_token(&scanner, TOKEN_TYPE_IDENTIFIER, -1, "k2");
  expect_scanned_token(&scanner, TOKEN_TYPE_EQ, -1, NULL);
  expect_scanned_token(&scanner, TOKEN_TYPE_IDENTIFIER, -1, "v2");
  expect_scanned_token(&scanner, TOKEN_TYPE_RPAREN, -1, NULL);
  expect_scanner_eof(&scanner);

  printf(" All vec0_scanner tests passed.\n");
}
// Tests vec0_parse_vector_column(), which parses a vec0 column definition
// string like "embedding float[768] distance_metric=cosine" into a
// VectorColumnDefinition struct. Covers all element types (float/f32, int8/i8,
// bit), column names with underscores/digits, all distance metrics (L2, L1,
// cosine), the default metric, and error cases: empty input, missing type,
// unknown type, missing dimensions, unknown metric, unknown option key, and
// distance_metric on bit columns.
void test_vec0_parse_vector_column() {
printf("Starting %s...\n", __func__);
struct VectorColumnDefinition col;
int rc;
// Basic float column
{
const char *input = "embedding float[768]";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_OK);
assert(col.name_length == 9);
assert(strncmp(col.name, "embedding", 9) == 0);
assert(col.element_type == SQLITE_VEC_ELEMENT_TYPE_FLOAT32);
assert(col.dimensions == 768);
assert(col.distance_metric == VEC0_DISTANCE_METRIC_L2);
sqlite3_free(col.name);
}
// f32 alias
{
const char *input = "v f32[3]";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_OK);
assert(col.element_type == SQLITE_VEC_ELEMENT_TYPE_FLOAT32);
assert(col.dimensions == 3);
sqlite3_free(col.name);
}
// int8 column
{
const char *input = "quantized int8[256]";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_OK);
assert(col.element_type == SQLITE_VEC_ELEMENT_TYPE_INT8);
assert(col.dimensions == 256);
assert(col.name_length == 9);
assert(strncmp(col.name, "quantized", 9) == 0);
sqlite3_free(col.name);
}
// i8 alias
{
const char *input = "q i8[64]";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_OK);
assert(col.element_type == SQLITE_VEC_ELEMENT_TYPE_INT8);
assert(col.dimensions == 64);
sqlite3_free(col.name);
}
// bit column
{
const char *input = "bvec bit[1024]";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_OK);
assert(col.element_type == SQLITE_VEC_ELEMENT_TYPE_BIT);
assert(col.dimensions == 1024);
sqlite3_free(col.name);
}
// Column name with underscores and digits
{
const char *input = "col_name_2 float[10]";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_OK);
assert(col.name_length == 10);
assert(strncmp(col.name, "col_name_2", 10) == 0);
sqlite3_free(col.name);
}
// distance_metric=cosine
{
const char *input = "emb float[128] distance_metric=cosine";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_OK);
assert(col.distance_metric == VEC0_DISTANCE_METRIC_COSINE);
assert(col.dimensions == 128);
sqlite3_free(col.name);
}
// distance_metric=L2 (explicit)
{
const char *input = "emb float[128] distance_metric=L2";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_OK);
assert(col.distance_metric == VEC0_DISTANCE_METRIC_L2);
sqlite3_free(col.name);
}
// distance_metric=L1
{
const char *input = "emb float[128] distance_metric=l1";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_OK);
assert(col.distance_metric == VEC0_DISTANCE_METRIC_L1);
sqlite3_free(col.name);
}
// SQLITE_EMPTY: empty string
{
const char *input = "";
rc = vec0_parse_vector_column(input, 0, &col);
assert(rc == SQLITE_EMPTY);
}
// SQLITE_EMPTY: non-vector column (text primary key)
{
const char *input = "document_id text primary key";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_EMPTY);
}
// SQLITE_EMPTY: non-vector column (partition key)
{
const char *input = "user_id integer partition key";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_EMPTY);
}
// SQLITE_EMPTY: no type (single identifier)
{
const char *input = "emb";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_EMPTY);
}
// SQLITE_EMPTY: unknown type
{
const char *input = "emb double[128]";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_EMPTY);
}
// SQLITE_EMPTY: unknown type (unknowntype)
{
const char *input = "v unknowntype[128]";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_EMPTY);
}
// SQLITE_EMPTY: missing brackets entirely
{
const char *input = "emb float";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_EMPTY);
}
// Error: zero dimensions
{
const char *input = "v float[0]";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_ERROR);
}
// Error: empty brackets (no dimensions)
{
const char *input = "v float[]";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_ERROR);
}
// Error: unknown distance metric
{
const char *input = "emb float[128] distance_metric=hamming";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_ERROR);
}
// Error: unknown distance metric (foo)
{
const char *input = "v float[128] distance_metric=foo";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_ERROR);
}
// Error: unknown option key
{
const char *input = "emb float[128] foobar=baz";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_ERROR);
}
// Error: distance_metric on bit type
{
const char *input = "emb bit[64] distance_metric=cosine";
rc = vec0_parse_vector_column(input, (int)strlen(input), &col);
assert(rc == SQLITE_ERROR);
}
printf(" All vec0_parse_vector_column tests passed.\n");
}
// Tests vec0_parse_partition_key_definition(), which parses a vec0 partition
// key column definition like "user_id integer partition key". Verifies correct
// parsing of integer and text partition keys, column name extraction, and
// the SQLITE_EMPTY answer for inputs that are not partition key definitions:
// empty strings, "primary key" definitions, and misspelled keywords.
void test_vec0_parse_partition_key_definition() {
  printf("Starting %s...\n", __func__);
  typedef struct {
    char * test;
    int expected_rc;
    const char *expected_column_name;
    int expected_column_type;
  } TestCase;
  TestCase suite[] = {
    {"user_id integer partition key", SQLITE_OK, "user_id", SQLITE_INTEGER},
    {"USER_id int partition key", SQLITE_OK, "USER_id", SQLITE_INTEGER},
    {"category text partition key", SQLITE_OK, "category", SQLITE_TEXT},
    {"", SQLITE_EMPTY, "", 0},
    {"document_id text primary key", SQLITE_EMPTY, "", 0},
    {"document_id text partition keyy", SQLITE_EMPTY, "", 0},
  };
  // countof() yields a size_t, so index with one to avoid a signed/unsigned
  // comparison.
  for(size_t i = 0; i < countof(suite); i++) {
    // Initialised so a failing parse never leaves indeterminate values behind.
    char * out_column_name = NULL;
    int out_column_name_length = 0;
    int out_column_type = 0;
    int rc;
    rc = vec0_parse_partition_key_definition(
      suite[i].test,
      // explicit narrowing, as in the other parser tests in this file
      (int)strlen(suite[i].test),
      &out_column_name,
      &out_column_name_length,
      &out_column_type
    );
    assert(rc == suite[i].expected_rc);
    // The outputs are only inspected when parsing succeeded.
    if(rc == SQLITE_OK) {
      assert(out_column_name_length == (int)strlen(suite[i].expected_column_name));
      assert(strncmp(out_column_name, suite[i].expected_column_name, out_column_name_length) == 0);
      assert(out_column_type == suite[i].expected_column_type);
    }
    printf(" Passed: \"%s\"\n", suite[i].test);
  }
}
// Checks _test_distance_l2_sqr_float() against hand-computed values.
// Note that the expected values asserted here are square roots of the summed
// squared differences (e.g. sqrt(27) for [1,2,3] vs [4,5,6]).
void test_distance_l2_sqr_float() {
  printf("Starting %s...\n", __func__);

  // Identical vectors: distance is exactly 0.
  float same_a[] = {1.0f, 2.0f, 3.0f};
  float same_b[] = {1.0f, 2.0f, 3.0f};
  float dist_same = _test_distance_l2_sqr_float(same_a, same_b, 3);
  assert(dist_same == 0.0f);

  // Orthogonal unit vectors: sqrt(1 + 1) = sqrt(2).
  float unit_x[] = {1.0f, 0.0f, 0.0f};
  float unit_y[] = {0.0f, 1.0f, 0.0f};
  float dist_orthogonal = _test_distance_l2_sqr_float(unit_x, unit_y, 3);
  assert(fabsf(dist_orthogonal - sqrtf(2.0f)) < 1e-6f);

  // [1,2,3] vs [4,5,6]: sqrt(9 + 9 + 9) = sqrt(27).
  float low[] = {1.0f, 2.0f, 3.0f};
  float high[] = {4.0f, 5.0f, 6.0f};
  float dist_known = _test_distance_l2_sqr_float(low, high, 3);
  assert(fabsf(dist_known - sqrtf(27.0f)) < 1e-5f);

  // Single dimension: |3 - 7| = sqrt(16) = 4.
  float one_a[] = {3.0f};
  float one_b[] = {7.0f};
  float dist_single = _test_distance_l2_sqr_float(one_a, one_b, 1);
  assert(dist_single == 4.0f);

  printf(" All distance_l2_sqr_float tests passed.\n");
}
// Checks _test_distance_cosine_float() on three 2-D configurations: parallel
// vectors (expected 0), orthogonal vectors (expected 1) and opposite vectors
// (expected 2), each within a 1e-6 tolerance.
void test_distance_cosine_float() {
  printf("Starting %s...\n", __func__);

  // Same direction, different magnitude: distance = 0.0
  float east[] = {1.0f, 0.0f};
  float east_scaled[] = {2.0f, 0.0f};
  float dist_parallel = _test_distance_cosine_float(east, east_scaled, 2);
  assert(fabsf(dist_parallel - 0.0f) < 1e-6f);

  // Orthogonal: distance = 1.0
  float axis_x[] = {1.0f, 0.0f};
  float axis_y[] = {0.0f, 1.0f};
  float dist_orthogonal = _test_distance_cosine_float(axis_x, axis_y, 2);
  assert(fabsf(dist_orthogonal - 1.0f) < 1e-6f);

  // Opposite direction: distance = 2.0
  float forward[] = {1.0f, 0.0f};
  float backward[] = {-1.0f, 0.0f};
  float dist_opposite = _test_distance_cosine_float(forward, backward, 2);
  assert(fabsf(dist_opposite - 2.0f) < 1e-6f);

  printf(" All distance_cosine_float tests passed.\n");
}
// Checks _test_distance_hamming() on packed bit vectors. The third argument
// passed below is the number of bits (8 for one byte, 16 for two bytes), and
// the expected result is the count of differing bits.
void test_distance_hamming() {
  printf("Starting %s...\n", __func__);

  // Identical bitmaps: 0 bits differ.
  unsigned char ones_a[] = {0xFF};
  unsigned char ones_b[] = {0xFF};
  float dist_identical = _test_distance_hamming(ones_a, ones_b, 8);
  assert(dist_identical == 0.0f);

  // Every bit differs: 0xFF vs 0x00 = 8.
  unsigned char all_set[] = {0xFF};
  unsigned char all_clear[] = {0x00};
  float dist_all = _test_distance_hamming(all_set, all_clear, 8);
  assert(dist_all == 8.0f);

  // Half of the bits differ: 0xFF vs 0x0F = 4.
  unsigned char full_byte[] = {0xFF};
  unsigned char low_nibble[] = {0x0F};
  float dist_half = _test_distance_hamming(full_byte, low_nibble, 8);
  assert(dist_half == 4.0f);

  // Two bytes, all bits differ: [0xFF, 0x00] vs [0x00, 0xFF] = 16.
  unsigned char hi_lo[] = {0xFF, 0x00};
  unsigned char lo_hi[] = {0x00, 0xFF};
  float dist_multibyte = _test_distance_hamming(hi_lo, lo_hi, 16);
  assert(dist_multibyte == 16.0f);

  printf(" All distance_hamming tests passed.\n");
}
// Entry point of the unit-test binary: prints which SIMD build flags are
// defined, then runs every test function in sequence. The tests use assert(),
// so a failing check aborts the process before the final message is printed.
int main() {
  printf("Starting unit tests...\n");

  // Report the compile-time SIMD configuration.
#ifdef SQLITE_VEC_ENABLE_AVX
  printf("SQLITE_VEC_ENABLE_AVX=1\n");
#endif
#ifdef SQLITE_VEC_ENABLE_NEON
  printf("SQLITE_VEC_ENABLE_NEON=1\n");
#endif
#if !(defined(SQLITE_VEC_ENABLE_AVX) || defined(SQLITE_VEC_ENABLE_NEON))
  printf("SIMD: none\n");
#endif

  // Parser tests.
  test_vec0_token_next();
  test_vec0_scanner();
  test_vec0_parse_vector_column();
  test_vec0_parse_partition_key_definition();

  // Distance function tests.
  test_distance_l2_sqr_float();
  test_distance_cosine_float();
  test_distance_hamming();

  printf("All unit tests passed.\n");
  return 0;
}
================================================
FILE: tests/test-wasm.mjs
================================================
// Smoke test for the WASM build: loads the generated sqlite3 module, opens an
// in-memory database and logs the result of `select vec_version()`.
async function main() {
  const sqliteModule = await import("../dist/.wasm/sqlite3.mjs");
  // Call the default export as a plain function, exactly as a destructured
  // `init()` call would (no `this` binding to the module namespace).
  const init = sqliteModule.default;
  const sqlite3 = await init();

  const db = new sqlite3.oo1.DB(":memory:");
  const vec_version = db.selectValue("select vec_version()");
  console.log(vec_version);
}
main();
================================================
FILE: tests/unittest.rs
================================================
/// Ad-hoc driver: runs `_min_idx` with k = 2 over one 8-element distance
/// array (three real values followed by `f32::MAX` entries) and
/// debug-prints the returned indices.
fn main() {
    println!("Hello, world!");

    let distances = vec![
        3.0,
        2.0,
        1.0,
        f32::MAX,
        f32::MAX,
        f32::MAX,
        f32::MAX,
        f32::MAX,
    ];
    let nearest = _min_idx(distances, 2);
    println!("{:?}", nearest);
}
fn _min_idx(distances: Vec, k: i32) -> Vec {
let n = distances.len();
assert!(n % 8 == 0, "distances.len() must be a multiple of 8");
let mut out: Vec = vec![0; k as usize];
let bitmap_bytes = n / 8;
let mut candidates: Vec = vec![0xFF; bitmap_bytes];
let mut b_taken: Vec = vec![0; bitmap_bytes];
let mut k_used: i32 = 0;
unsafe {
min_idx(
distances.as_ptr(),
n as i32,
candidates.as_mut_ptr(),
out.as_mut_ptr(),
k,
b_taken.as_mut_ptr(),
&mut k_used,
);
}
out.truncate(k_used as usize);
out
}
fn _merge_sorted_lists(
a: &Vec,
a_rowids: &Vec,
b: &Vec,
b_rowids: &Vec,
b_top_idx: &Vec,
n: usize,
) -> (Vec, Vec) {
let mut out_used: i64 = 0;
let mut out: Vec = Vec::with_capacity(n);
let mut out_rowids: Vec = Vec::with_capacity(n);
unsafe {
merge_sorted_lists(
a.as_ptr().cast(),
a_rowids.as_ptr().cast(),
a.len() as i64,
b.as_ptr().cast(),
b_rowids.as_ptr().cast(),
b_top_idx.as_ptr().cast(),
b.len() as i64,
out.as_ptr().cast(),
out_rowids.as_ptr().cast(),
n as i64,
&mut out_used,
);
out.set_len(out_used as usize);
out_rowids.set_len(out_used as usize);
}
(out_rowids, out)
}
// Foreign declarations for the C routines under test. The `#[link]` attribute
// asks the linker for a library named `sqlite-vec-internal`; every call into
// these functions is `unsafe` and relies on these signatures matching the C
// definitions (not visible from this file).
#[link(name = "sqlite-vec-internal")]
extern "C" {
// Called by `_min_idx` with: a distance array of length `n`, a candidate
// bitmap and a "taken" bitmap of `n / 8` bytes each, an output index buffer
// with room for `k` entries, and an out-parameter receiving the number of
// entries used.
fn min_idx(
distances: *const f32,
n: i32,
candidates: *mut u8,
out: *mut i32,
k: i32,
b_taken: *mut u8,
k_used: *mut i32,
) -> i32;
// Called by `_merge_sorted_lists`. NOTE(review): `out` and `out_rowids` are
// declared `*const`, yet the wrapper treats them as output buffers (it calls
// `set_len(out_used)` on both afterwards) -- presumably the C side writes
// through them; confirm against the C prototype.
fn merge_sorted_lists(
a: *const f32,
a_rowids: *const i64,
a_length: i64,
b: *const f32,
b_rowids: *const i64,
b_top_idx: *const i32,
b_length: i64,
out: *const f32,
out_rowids: *const i64,
out_length: i64,
out_used: *mut i64,
);
}
/// Tests for the safe wrappers around the C `min_idx` and
/// `merge_sorted_lists` routines.
#[cfg(test)]
mod tests {
    use super::*;

    /// `_min_idx` returns the indices of the `k` smallest distances, in
    /// ascending order of distance.
    #[test]
    fn test_basic() {
        // `_min_idx` requires a length that is a multiple of 8, so pad each
        // input up to 8 entries with `f32::MAX`.
        let pad = |v: &[f32]| -> Vec<f32> {
            let mut r = v.to_vec();
            r.resize(8, f32::MAX);
            r
        };
        assert_eq!(_min_idx(pad(&[1.0, 2.0, 3.0]), 3), vec![0, 1, 2]);
        assert_eq!(_min_idx(pad(&[3.0, 2.0, 1.0]), 3), vec![2, 1, 0]);
        assert_eq!(_min_idx(pad(&[1.0, 2.0, 3.0]), 2), vec![0, 1]);
        assert_eq!(_min_idx(pad(&[3.0, 2.0, 1.0]), 2), vec![2, 1]);
    }

    /// Merges a sorted list `a` with an unsorted list `b` whose ascending
    /// order is given by `b_top_idx`, for every output size from 0 to 8.
    /// The two lists hold 7 elements in total, so n = 8 still yields 7.
    #[test]
    fn test_merge_sorted_lists() {
        let a = &vec![0.01, 0.02, 0.03];
        let a_rowids = &vec![1, 2, 3];

        //let b = &vec![0.1, 0.2, 0.3, 0.4];
        //let b_rowids = &vec![4, 5, 6, 7];
        let b = &vec![0.4, 0.2, 0.3, 0.1];
        let b_rowids = &vec![7, 5, 6, 4];
        let b_top_idx = &vec![3, 1, 2, 0];

        assert_eq!(
            _merge_sorted_lists(a, a_rowids, b, b_rowids, b_top_idx, 0),
            (vec![], vec![])
        );
        assert_eq!(
            _merge_sorted_lists(a, a_rowids, b, b_rowids, b_top_idx, 1),
            (vec![1], vec![0.01])
        );
        assert_eq!(
            _merge_sorted_lists(a, a_rowids, b, b_rowids, b_top_idx, 2),
            (vec![1, 2], vec![0.01, 0.02])
        );
        assert_eq!(
            _merge_sorted_lists(a, a_rowids, b, b_rowids, b_top_idx, 3),
            (vec![1, 2, 3], vec![0.01, 0.02, 0.03])
        );
        assert_eq!(
            _merge_sorted_lists(a, a_rowids, b, b_rowids, b_top_idx, 4),
            (vec![1, 2, 3, 4], vec![0.01, 0.02, 0.03, 0.1])
        );
        assert_eq!(
            _merge_sorted_lists(a, a_rowids, b, b_rowids, b_top_idx, 5),
            (vec![1, 2, 3, 4, 5], vec![0.01, 0.02, 0.03, 0.1, 0.2])
        );
        assert_eq!(
            _merge_sorted_lists(a, a_rowids, b, b_rowids, b_top_idx, 6),
            (
                vec![1, 2, 3, 4, 5, 6],
                vec![0.01, 0.02, 0.03, 0.1, 0.2, 0.3]
            )
        );
        assert_eq!(
            _merge_sorted_lists(a, a_rowids, b, b_rowids, b_top_idx, 7),
            (
                vec![1, 2, 3, 4, 5, 6, 7],
                vec![0.01, 0.02, 0.03, 0.1, 0.2, 0.3, 0.4]
            )
        );
        assert_eq!(
            _merge_sorted_lists(a, a_rowids, b, b_rowids, b_top_idx, 8),
            (
                vec![1, 2, 3, 4, 5, 6, 7],
                vec![0.01, 0.02, 0.03, 0.1, 0.2, 0.3, 0.4]
            )
        );
    }

    // Disabled: these calls pass five arguments, but `_merge_sorted_lists`
    // takes six (there is no `b_top_idx` argument here).
    /*
    #[test]
    fn test_merge_sorted_lists_empty() {
    let x = vec![0.1, 0.2, 0.3];
    let x_rowids = vec![666, 888, 777];
    assert_eq!(
    _merge_sorted_lists(&x, &x_rowids, &vec![], &vec![], 3),
    (vec![666, 888, 777], vec![0.1, 0.2, 0.3])
    );
    assert_eq!(
    _merge_sorted_lists(&vec![], &vec![], &x, &x_rowids, 3),
    (vec![666, 888, 777], vec![0.1, 0.2, 0.3])
    );
    assert_eq!(
    _merge_sorted_lists(&vec![], &vec![], &x, &x_rowids, 4),
    (vec![666, 888, 777], vec![0.1, 0.2, 0.3])
    );
    assert_eq!(
    _merge_sorted_lists(&vec![], &vec![], &x, &x_rowids, 2),
    (vec![666, 888], vec![0.1, 0.2])
    );
    }*/
}
================================================
FILE: tests/utils.py
================================================
import numpy as np
from io import BytesIO
def to_npy(arr):
    """Return *arr* serialized in NumPy's ``.npy`` format as ``bytes``."""
    with BytesIO() as stream:
        np.save(stream, arr)
        # getvalue() yields the whole buffer regardless of stream position.
        return stream.getvalue()
# Scratch calls exercising to_npy() on a few float32 shapes. Only the 1x2
# matrix and the 2-element vector are printed; the other results are
# discarded.
matrix_2x3 = np.array([[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]], dtype=np.float32)
to_npy(matrix_2x3)

matrix_1x2 = np.array([[1.0, 2.0]], dtype=np.float32)
print(to_npy(matrix_1x2))

vector_2 = np.array([1.0, 2.0], dtype=np.float32)
print(to_npy(vector_2))

# Five rows of ten zeros each.
zeros_5x10 = np.zeros((5, 10), dtype=np.float32)
to_npy(zeros_5x10)
================================================
FILE: tmp-static.py
================================================
import sqlite3
import numpy as np
# Open an in-memory database and load the vec0 extension twice: once through
# the Python API, and once through SQL's load_extension() with the explicit
# 'sqlite3_vec_raw_init' entry point. Extension loading is switched off again
# afterwards.
db = sqlite3.connect(":memory:")
db.enable_load_extension(True)
db.load_extension("./dist/vec0")
db.execute("select load_extension('./dist/vec0', 'sqlite3_vec_raw_init')")
db.enable_load_extension(False)

# Sample float32 matrices: x is 2x4, y is 3x2, z is 5x4.
x = np.array(
    [
        [0.1, 0.2, 0.3, 0.4],
        [0.9, 0.8, 0.7, 0.6],
    ],
    dtype=np.float32,
)
y = np.array(
    [
        [0.2, 0.3],
        [0.9, 0.8],
        [0.6, 0.5],
    ],
    dtype=np.float32,
)
# Each row of z repeats a single value four times.
z = np.array(
    [[value] * 4 for value in (0.1, 0.2, 0.3, 0.4, 0.5)],
    dtype=np.float32,
)
def register_np(array, name):
ptr = array.__array_interface__["data"][0]
nvectors, dimensions = array.__array_interface__["shape"]
element_type = array.__array_interface__["typestr"]
assert element_type == "