Repository: oasysai/oasysdb Branch: main Commit: 9aebb4426dae Files: 42 Total size: 105.5 KB Directory structure: gitextract_y86uggxu/ ├── .cargo/ │ └── config.toml ├── .editorconfig ├── .flake8 ├── .github/ │ ├── CODE_OF_CONDUCT.md │ ├── ISSUE_TEMPLATE/ │ │ ├── bug_report.md │ │ ├── config.yml │ │ ├── do_chore.md │ │ └── feature_request.md │ ├── PULL_REQUEST_TEMPLATE.md │ ├── SECURITY.md │ └── workflows/ │ ├── publish-docs.yml │ └── quality-check.yml ├── .gitignore ├── .prettierrc.yml ├── Cargo.toml ├── LICENSE ├── README.md ├── build.rs ├── docs/ │ ├── CNAME │ ├── blog/ │ │ ├── .authors.yml │ │ └── index.md │ ├── changelog.md │ ├── contributing.md │ ├── css/ │ │ └── style.css │ └── index.md ├── mkdocs.yml ├── protos/ │ └── database.proto ├── requirements.txt ├── rustfmt.toml └── src/ ├── cores/ │ ├── database.rs │ ├── index.rs │ ├── mod.rs │ └── storage.rs ├── main.rs ├── protos.rs ├── types/ │ ├── filter.rs │ ├── metric.rs │ ├── mod.rs │ ├── record.rs │ └── vector.rs └── utils/ ├── kmeans.rs └── mod.rs ================================================ FILE CONTENTS ================================================ ================================================ FILE: .cargo/config.toml ================================================ [env] RUST_TEST_THREADS = "1" ================================================ FILE: .editorconfig ================================================ root=true [*] charset = utf-8 indent_style = space insert_final_newline = true trim_trailing_whitespace = true max_line_length = 80 [*.{rs, py}] indent_size = 4 [*.{yml, html, css, js, ts, md}] indent_size = 2 ================================================ FILE: .flake8 ================================================ [flake8] exclude = .venv, target ================================================ FILE: .github/CODE_OF_CONDUCT.md ================================================ # Contributor Covenant Code of Conduct ## Our Pledge We as members, contributors, and leaders pledge to make 
participation in our community a harassment-free experience for everyone, regardless of age, body size, visible or invisible disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, caste, color, religion, or sexual identity and orientation. We pledge to act and interact in ways that contribute to an open, welcoming, diverse, inclusive, and healthy community. ## Our Standards Examples of behavior that contributes to a positive environment for our community include: - Demonstrating empathy and kindness toward other people - Being respectful of differing opinions, viewpoints, and experiences - Giving and gracefully accepting constructive feedback - Accepting responsibility and apologizing to those affected by our mistakes, and learning from the experience - Focusing on what is best not just for us as individuals, but for the overall community Examples of unacceptable behavior include: - The use of sexualized language or imagery, and sexual attention or advances of any kind - Trolling, insulting or derogatory comments, and personal or political attacks - Public or private harassment - Publishing others' private information, such as a physical or email address, without their explicit permission - Other conduct which could reasonably be considered inappropriate in a professional setting ## Enforcement Responsibilities Community leaders are responsible for clarifying and enforcing our standards of acceptable behavior and will take appropriate and fair corrective action in response to any behavior that they deem inappropriate, threatening, offensive, or harmful. Community leaders have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, and will communicate reasons for moderation decisions when appropriate. 
## Scope This Code of Conduct applies within all community spaces, and also applies when an individual is officially representing the community in public spaces. Examples of representing our community include using an official email address, posting via an official social media account, or acting as an appointed representative at an online or offline event. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at edwin@oasysai.com. All complaints will be reviewed and investigated promptly and fairly. All community leaders are obligated to respect the privacy and security of the reporter of any incident. ## Enforcement Guidelines Community leaders will follow these Community Impact Guidelines in determining the consequences for any action they deem in violation of this Code of Conduct: ### 1. Correction **Community Impact**: Use of inappropriate language or other behavior deemed unprofessional or unwelcome in the community. **Consequence**: A private, written warning from community leaders, providing clarity around the nature of the violation and an explanation of why the behavior was inappropriate. A public apology may be requested. ### 2. Warning **Community Impact**: A violation through a single incident or series of actions. **Consequence**: A warning with consequences for continued behavior. No interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, for a specified period of time. This includes avoiding interactions in community spaces as well as external channels like social media. Violating these terms may lead to a temporary or permanent ban. ### 3. Temporary Ban **Community Impact**: A serious violation of community standards, including sustained inappropriate behavior. **Consequence**: A temporary ban from any sort of interaction or public communication with the community for a specified period of time. 
No public or private interaction with the people involved, including unsolicited interaction with those enforcing the Code of Conduct, is allowed during this period. Violating these terms may lead to a permanent ban. ### 4. Permanent Ban **Community Impact**: Demonstrating a pattern of violation of community standards, including sustained inappropriate behavior, harassment of an individual, or aggression toward or disparagement of classes of individuals. **Consequence**: A permanent ban from any sort of public interaction within the community. ## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 2.1. The Community Impact Guidelines were inspired by [Mozilla's Code of Conduct Enforcement Ladder][mozilla_coc]. [homepage]: https://www.contributor-covenant.org [mozilla_coc]: https://github.com/mozilla/diversity ================================================ FILE: .github/ISSUE_TEMPLATE/bug_report.md ================================================ --- name: 🐞 Report Bug about: Report an unexpected behavior or a malfunctioning feature. title: "BUG: " labels: bug assignees: "" --- ### Short Description Please describe the issue you are experiencing in a few sentences. ### Error Message If you received an error message, please paste some parts of it here. ```txt ``` ### Steps to Reproduce What are the minimal steps to reproduce the behavior? Example: 1. Import the library in ... 2. Initialize the object with ... 3. Call the function ... ### Expected Behavior What do you expect to happen? ### Additional Context Add any other context about the problem here like error traces, etc. ================================================ FILE: .github/ISSUE_TEMPLATE/config.yml ================================================ blank_issues_enabled: false contact_links: - name: ❓ Ask Question url: https://github.com/oasysai/oasysdb/discussions about: Ask general questions or share ideas on Discussions. 
- name: 💬 Join Discord url: https://discord.gg/bDhQrkqNP4 about: Join the Discord server to help shape the future of OasysDB. ================================================ FILE: .github/ISSUE_TEMPLATE/do_chore.md ================================================ --- name: 🧹 Do Chore about: Documentation updates, code refactoring, or other chores. title: "CHORE: " labels: chore assignees: "" --- ### Description Please describe the chore you suggest in a few sentences. Chore examples: - Updating documentation - Adding tests or examples - Refactoring parts of the codebase ### Context Why is this chore beneficial for the project and its community? ================================================ FILE: .github/ISSUE_TEMPLATE/feature_request.md ================================================ --- name: 🛠️ Feature Request about: Request a new feature or an improvement to an existing feature. title: "FEAT: " labels: enhancement assignees: "" --- ### Use Case What's the use case for this feature? How would you use it? ### Potential Solution On the high level, how would you like the feature to be implemented? ### Additional Context Add context about the feature like links to similar implementations. For example: - Link to a similar feature in another project - Screenshot of the feature functionality - Research papers or articles about the feature ================================================ FILE: .github/PULL_REQUEST_TEMPLATE.md ================================================ ### Purpose Describe the problem solved or feature added by this PR. ### Approach How does this PR solve the problem or add the feature? ### Testing - [ ] I have tested this PR locally. - [ ] If applicable, I added tests to cover my changes. How did you test this PR? How should the reviewer test this PR? ### Chore Checklist - [ ] I formatted my code according to the style and linter guidelines. - [ ] If applicable, I updated the documentation accordingly. 
================================================ FILE: .github/SECURITY.md ================================================ # Security Policy Thank you for taking the time to report a security issue. We are trying our best to make this project safe for everyone. We appreciate your efforts to disclose the issue responsibly and will make every effort to acknowledge your contributions. ## Reporting a vulnerability **Please do not report security vulnerabilities through public GitHub issues.** If you believe you have found a security vulnerability, please send an email to edwin@oasysai.com. Please include as many details as possible, these may include: - Impact of the vulnerability. - Steps to reproduce. - Possible solutions. - Location of the vulnerability like file or line number. - If applicable, proof-of-concept or exploit code. ================================================ FILE: .github/workflows/publish-docs.yml ================================================ name: Publish Docs on: workflow_dispatch: push: branches: - main paths: - "docs/**" - "mkdocs.yml" permissions: id-token: write pages: write contents: write jobs: build-docs: name: Build documentation runs-on: ubuntu-latest steps: - name: Checkout the code uses: actions/checkout@v4 - name: Install Python uses: actions/setup-python@v5 with: python-version: 3.x - name: Install dependencies run: pip install mkdocs-material - name: Publish the documentation run: | mkdocs gh-deploy --force --message "cd: deploy docs from {sha}" publish-docs: name: Publish documentation runs-on: ubuntu-latest needs: build-docs environment: name: Docs url: ${{ steps.deployment.outputs.page_url }} steps: - name: Checkout uses: actions/checkout@v4 with: ref: gh-pages - name: Setup pages uses: actions/configure-pages@v5 - name: Upload artifact uses: actions/upload-pages-artifact@v3 with: path: "." 
- name: Deploy to GitHub Pages id: deployment uses: actions/deploy-pages@v4 ================================================ FILE: .github/workflows/quality-check.yml ================================================ name: Quality Check on: workflow_dispatch: pull_request: paths-ignore: - "docs/**" - "clients/**" push: branches: - main paths-ignore: - "docs/**" - "clients/**" jobs: quality-check: name: Run All Checks runs-on: ubuntu-latest steps: - name: Checkout Code uses: actions/checkout@v4 - name: Install Rust Toolchain uses: dtolnay/rust-toolchain@stable with: components: rustfmt, clippy - name: Install Protobuf Compiler run: | sudo apt update && sudo apt upgrade -y sudo apt install -y protobuf-compiler libprotobuf-dev - name: Run Formatter run: cargo fmt -- --check - name: Run Linter run: cargo clippy -- -D warnings - name: Run Tests run: cargo test --all-features -- --test-threads 1 ================================================ FILE: .gitignore ================================================ # OasysDB tests. odb* oasysdb* # Rust stuff. debug target # Python stuff. __pycache__ .pytest_cache .venv *.so *.py[cod] # Benchmarking. *.ivecs *.fvecs # Misc. .vscode .ds_store # Environment variables. 
.env .env.* !.env.example ================================================ FILE: .prettierrc.yml ================================================ bracketSpacing: true singleQuote: false trailingComma: "none" semi: false tabWidth: 2 printWidth: 80 proseWrap: "always" ================================================ FILE: Cargo.toml ================================================ [package] name = "oasysdb" version = "0.8.0" edition = "2021" authors = ["Edwin Kys"] [dependencies] tokio = { version = "1.39.3", features = ["rt-multi-thread", "macros"] } hashbrown = { version = "0.15.0", features = ["serde", "rayon"] } uuid = { version = "1.10.0", features = ["v4", "serde"] } clap = "4.5.16" # gRPC-related dependencies tonic = "0.12.1" prost = "0.13.1" # Serialization-related dependencies serde = { version = "1.0.208", features = ["derive"] } bincode = "1.3.3" # Parallelism-related dependencies simsimd = "5.0.1" rayon = "1.10.0" # Logging-related dependencies tracing = "0.1.40" tracing-subscriber = "0.3.18" # Utility dependencies rand = "0.8.5" dotenv = "0.15.0" [build-dependencies] tonic-build = "0.12" ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. 
For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. 
If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS ================================================ FILE: README.md ================================================ ![OasysDB Use Case](https://odb-assets.s3.amazonaws.com/banners/0.7.0.png) [![GitHub Stars](https://img.shields.io/github/stars/oasysai/oasysdb?style=for-the-badge&logo=github&logoColor=%23000000&labelColor=%23fcd34d&color=%236b7280)](https://github.com/oasysai/oasysdb) [![Crates.io](https://img.shields.io/crates/d/oasysdb?style=for-the-badge&logo=rust&logoColor=%23000&label=crates.io&labelColor=%23fdba74&color=%236b7280)](https://crates.io/crates/oasysdb) ## Notice This repository is not currently maintained. 
I initially created this project to learn more about databases and Rust. As time went on, I actually learned from this project and the people who used it. Unfortunately, most open-source projects don't generate enough revenue to sustain themselves. I'm currently looking for a new opportunity to work as a **Software Engineer in AI Infrastructure**. If you have or know someone who has an open position, please let me know. I'm open to working remotely or anywhere in the United States. You can reach me via [LinkedIn](https://www.linkedin.com/in/edwinkys). If you're interested in taking over this project, please let me know. I'll be happy to discuss the details with you. Other than that, I'll just leave this project as is for historical purposes. Thank you all for your support and understanding. It's been a great journey! ================================================ FILE: build.rs ================================================ use std::error::Error; use tonic_build::compile_protos; fn main() -> Result<(), Box<dyn Error>> { compile_protos("protos/database.proto")?; Ok(()) } ================================================ FILE: docs/CNAME ================================================ docs.oasysdb.com ================================================ FILE: docs/blog/.authors.yml ================================================ authors: edwinkys: name: Edwin Kys description: Author of OasysDB avatar: https://avatars.githubusercontent.com/u/51223060?v=4 ================================================ FILE: docs/blog/index.md ================================================ # Latest Posts Bite-sized blog posts about generative AI, machine learning, and more. 
================================================ FILE: docs/changelog.md ================================================ # Changelog ## v0.7.2 ### What's Changed This release includes a fix for the file system issue happening on Windows which happens when the default temporary directory is in a different drive than the current working directory. This issue is fixed by creating a temporary directory in the root of the database directory. ### Contributors - @edwinkys ### Full Changelog [v0.7.1...v0.7.2](https://github.com/oasysai/oasysdb/compare/v0.7.1...v0.7.2) ## v0.7.1 ### What's Changed This release includes a low-level CRUD API for the index implementation from the Database layer. Once the index is built, when necessary, you can use the CRUD API to manage the index data directly. This API allows you to perform the following operations: - Insert new records into the index. - Update existing records in the index. - Delete records from the index. ### Contributors - @edwinkys ### Full Changelog [v0.7.0...v0.7.1](https://github.com/oasysai/oasysdb/compare/v0.7.0...v0.7.1) ## v0.7.0 ### What's Changed OasysDB v0.7.0 is a major release that includes a complete overhaul of the system. Instead of being a dedicated vector database, OasysDB is now a hybrid vector database that integrates with SQL databases such as SQLite and PostgreSQL which you can configure to store the vector records. This approach gives various advantages such as: - Reliability and durability of the data due to SQL database ACID properties. - Separation of vector storage and computation allowing you to scale the system independently. These are some of the key changes in this release: - **SQL Storage Layer**: OasysDB can be configured to source vector records from a SQL database such as SQLite or PostgreSQL. - **Multi-index Support**: OasysDB can support multiple indices for the same SQL table allowing users to improve the search performance. 
- **Pre-filtering**: OasysDB can pre-filter the vector records from SQL tables based on the metadata before inserting them into the index. - **Configurable Algorithm**: Each index in OasysDB can be configured with different algorithms and parameters to fit the performance requirements. ### Contributors - @edwinkys ### Full Changelog [v0.6.1...v0.7.0](https://github.com/oasysai/oasysdb/compare/v0.6.1...v0.7.0) ## v0.6.1 ### What's Changed - Add support for boolean metadata type. This allows full compatibility with JSON-like object or dictionary metadata when storing vector records in the collection. - We optimize the database save and get collection operations performance by 10-20% by reducing the number of IO operations. Also, the save collection operation is now atomic which means that the collection is saved to the disk only when the operation is completed successfully. - We launch our own documentation website at [docs.oasysdb.com](https://docs.oasysdb.com) to provide a better user experience and more comprehensive documentation for the OasysDB library. It's still a work in progress and we will continue to improve the documentation over time. ### Contributors - @edwinkys ### Full Changelog [v0.6.0...v0.6.1](https://github.com/oasysai/oasysdb/compare/v0.6.0...v0.6.1) ## v0.6.0 ### What's Changed - **CONDITIONAL BREAKING CHANGE**: We remove support for dot distance metric and we replace cosine similarity with cosine distance metric. This change is made to make the distance metric consistent with the other distance metrics. - The default configuration for the collection (EF Construction and EF Search) is increased to a more sensible value according to the common real-world use cases. The default EF Construction is set to 128 and the default EF Search is set to 64. - We add a new script to measure the recall rate of the collection search functionality. 
And with this, we improve the search recall rate of OasysDB to match the recall rate of HNSWLib with the same configuration. ```sh cargo run --example measure-recall ``` - We add a new benchmark to measure the performance of saving and getting the collection. The benchmark can be run by running the command below. ```sh cargo bench ``` ### Contributors - @edwinkys ### Full Changelog [v0.5.1...v0.6.0](https://github.com/oasysai/oasysdb/compare/v0.5.1...v0.6.0) ## v0.5.1 ### What's Changed We add a new method `Collection.filter` to filter the vector records based on the metadata. This method returns a HashMap of the filtered vector records and their corresponding vector IDs. This implementation performs a linear search through the collection and thus might be slow for large datasets. This implementation includes support for the following metadata to filter: - `String`: Stored value must include the filter string. - `Float`: Stored value must be equal to the filter float. - `Integer`: Stored value must be equal to the filter integer. - `Object`: Stored value must match all the key-value pairs in the filter object. We currently don't support filtering based on the array type metadata because I am not sure of the best way to implement it. If you have any suggestions, please let me know. ### Contributors - @edwinkys ### Full Changelog [v0.5.0...v0.5.1](https://github.com/oasysai/oasysdb/compare/v0.5.0...v0.5.1) ## v0.5.0 ### What's Changed - **BREAKING CHANGE**: Although there is no change in the database API, the underlying storage format has been changed to save the collection data to dedicated files directly. The details of the new persistent system and how to migrate from v0.4.x to v0.5.0 can be found in this migration guide. - By adding the feature `gen`, you can now use the `EmbeddingModel` trait and OpenAI's embedding models to generate vectors or records from text without external dependencies. 
This feature is optional and can be enabled by adding the feature to the `Cargo.toml` file. ```toml [dependencies] oasysdb = { version = "0.5.0", features = ["gen"] } ``` ### Contributors - @edwinkys ### Full Changelog [v0.4.5...v0.5.0](https://github.com/oasysai/oasysdb/compare/v0.4.5...v0.5.0) ## v0.4.5 ### What's Changed - Add insert benchmark to measure the performance of inserting vectors into the collection. The benchmark can be run using the `cargo bench` command. - Fix the issue with large-size dirty IO buffers caused by the database operation. This issue is fixed by flushing the dirty IO buffers after the operation is completed. This operation can be done synchronously or asynchronously based on the user's preference since this operation might take some time to complete. ### Contributors - @edwinkys ### Full Changelog [v0.4.4...v0.4.5](https://github.com/oasysai/oasysdb/compare/v0.4.4...v0.4.5) ## v0.4.4 ### What's Changed - Maximize compatibility with the standard library error types to allow users to convert OasysDB errors to most commonly used error handling libraries such as `anyhow`, `thiserror`, etc. - Add conversion methods to convert metadata to JSON value by `serde_json` and vice versa. This allows users to store JSON format metadata easily. - Add normalized cosine distance metric to the collection search functionality. Read more about the normalized cosine distance metric here. - Fix the search distance calculation to use the correct distance metric and sort it accordingly based on the collection configuration. - Add vector ID utility methods to the `VectorID` struct to make it easier to work with the vector ID. ### Additional Notes - Add a new benchmark to measure the true search AKA brute-force search performance of the collection. If possible, dealing with a small dataset, it is recommended to use the true search method for better accuracy. The benchmark can be run using the `cargo bench` command. 
- Improve the documentation to include more examples and explanations on how to use the library: Comprehensive Guide. ### Contributors - @edwinkys ### Full Changelog [v0.4.3...v0.4.4](https://github.com/oasysai/oasysdb/compare/v0.4.3...v0.4.4) ## v0.4.3 ### What's Changed - Add SIMD acceleration to calculate the distance between vectors. This improves the performance of inserting and searching vectors in the collection. - Improve OasysDB native error type implementation to include the type/kind of error that occurred in addition to the error message. For example, `ErrorKind::CollectionError` is used to represent errors that occur during collection operations. - Fix the `Config.ml` default value from 0.3 to 0.2885 which is the optimal value for the HNSW with M of 32. The optimal value formula for ml is `1/ln(M)`. ### Contributors - @edwinkys ### Full Changelog [v0.4.2...v0.4.3](https://github.com/oasysai/oasysdb/compare/v0.4.2...v0.4.3) ## v0.4.2 ### What's Changed Due to an issue (#62) with the Python release of v0.4.1, this patch version is released to fix the build wheels for Python users. The issue is caused due to the new optional PyO3 feature for the v0.4.1 Rust crate release which exclude PyO3 dependencies from the build process. To solve this, the Python package build and deploy script now includes `--features py` argument. For Rust users, this version doesn't offer any additional features or functionality compared to v0.4.1 release. ### Full Changelog [v0.4.1...v0.4.2](https://github.com/oasysai/oasysdb/compare/v0.4.1...v0.4.2) ## v0.4.1 ### What's Changed - Added quality of life improvements to the `VectorID` type interoperability. - Improved the `README.md` file with additional data points on the database performance. - Changed to `Collection.insert` method to return the new `VectorID` after inserting a new vector record. - Pyo3 dependencies are now hidden behind the `py` feature. 
This allows users to build the library without the Python bindings if they don't need it, which is probably all of them. ### Contributors - @dteare - @edwinkys - @noneback ### Full Changelog [v0.4.0...v0.4.1](https://github.com/oasysai/oasysdb/compare/v0.4.0...v0.4.1) ## v0.4.0 ### What's Changed - **CONDITIONAL BREAKING CHANGE**: Add an option to configure distance for the vector collection via `Config` struct. The new field `distance` can be set using the `Distance` enum. This includes Euclidean, Cosine, and Dot distance metrics. The default distance metric is Euclidean. This change is backward compatible if you are creating a config using the `Config::default()` method. Otherwise, you need to update the config to include the distance metric. ```rs let config = Config { ... distance: Distance::Cosine, }; ``` - With the new distance metric feature, now, you can set a `relevancy` threshold for the search results. This will filter out the results that are below or above the threshold depending on the distance metric used. This feature is disabled by default which is set to -1.0. To enable this feature, you can set the `relevancy` field in the `Collection` struct. ```rs ... let mut collection = Collection::new(&config)?; collection.relevancy = 3.0; ``` - Add a new method `Collection::insert_many` to insert multiple vector records into the collection at once. This method is more optimized than using the `Collection::insert` method in a loop. ### Contributors - @noneback - @edwinkys ### Full Changelog [v0.3.0...v0.4.0](https://github.com/oasysai/oasysdb/compare/v0.3.0...v0.4.0) ## v0.3.0 This release introduces a BREAKING CHANGE to one of the method from the `Database` struct. The `Database::create_collection` method has been removed from the library due to redundancy. The `Database::save_collection` method can be used to create a new collection or update an existing one. 
This change is made to simplify the API and to make it more consistent with the other methods in the `Database` struct. ### What's Changed - **BREAKING CHANGE**: Removed the `Database::create_collection` method from the library. To replace this, you can use the code snippet below: ```rs // Before: this creates a new empty collection. db.create_collection("vectors", None, Some(records))?; // After: create new or build a collection then save it. // let collection = Collection::new(&config)?; let collection = Collection::build(&config, &records)?; db.save_collection("vectors", &collection)?; ``` - Added the `Collection::list` method to list all the vector records in the collection. - Created a full Python binding for OasysDB which is available on PyPI. This allows you to use OasysDB directly from Python. The Python binding is available at https://pypi.org/project/oasysdb. ### Contributors - @edwinkys - @Zelaren - @FebianFebian1 ### Full Changelog [v0.2.1...v0.3.0](https://github.com/oasysai/oasysdb/compare/v0.2.1...v0.3.0) ## v0.2.1 ### What's Changed - `Metadata` enum can now be accessed publicly using `oasysdb::metadata::Metadata`. This allows users to use `match` statements to extract the data from it. - Added a `prelude` module that re-exports the most commonly used types and traits. This makes it easier to use the library by importing the prelude module by `use oasysdb::prelude::*`. ### Contributors - @edwinkys ### Full Changelog [v0.2.0...v0.2.1](https://github.com/oasysai/oasysdb/compare/v0.2.0...v0.2.1) ## v0.2.0 ### What's Changed - For `Collection` struct, the generic parameter `D` has been replaced with `Metadata` enum which allows one collection to store different types of data as needed. - The `Vector` now uses `Vec` instead of `[f32, N]` which removes the `N` generic parameter from the `Vector` struct. 
Since there is a chance of using different vector dimensions in the same collection with this change, An additional functionality is added to the `Collection` to make sure that the vector dimension is uniform. - The `M` generic parameter in the `Collection` struct has been replaced with a constant of 32. This removes the flexibility to tweak the indexing configuration for this value. But for most use cases, this value should be sufficient. - Added multiple utility functions to structs such as `Record`, `Vector`, and `Collection` to make it easier to work with the data. ### Contributors - @edwinkys ### Full Changelog [v0.1.0...v0.2.0](https://github.com/oasysai/oasysdb/compare/v0.1.0...v0.2.0) ## v0.1.0 ### What's Changed - OasysDB release as an embedded vector database available directly via `cargo add oasysdb` command. - Using HNSW algorithm implementation for the collection indexing along with Euclidean distance metrics. - Incremental updates on the vector collections allowing inserts, deletes, and modifications without rebuilding the index. - Add a benchmark on the collection search functionality using SIFT dataset that can be run using `cargo bench` command. ### Contributors - @edwinkys ### Full Changelog [v0.1.0](https://github.com/oasysai/oasysdb/commits/v0.1.0) ================================================ FILE: docs/contributing.md ================================================ # Contributing to OasysDB First of all, thank you for considering to contribute to OasysDB! We welcome contributions from the community, and this documentation outlines the process to start contributing to our project. ## Code of Conduct We are committed to building an inclusive and welcoming community because we believe that it will lead to a more successful project and a better experience for everyone involved. To achieve that, any participant in our project is expected to act respectfully and to follow the Code of Conduct. ## Have questions or suggestions? 
[![Discord](https://img.shields.io/discord/1182432298382131200?logo=discord&logoColor=%23ffffff&label=Discord&labelColor=%235865F2&style=for-the-badge)][discord] There is no such thing as a stupid question. If you have a question, chances are, someone else does too. So, please feel free to ask questions whether it's on our [Discord][discord] server or by opening a new discussion on [GitHub Discussions][gh_discussions]. ## Encounter a bug? Have a feature request? If you encounter a bug or have a feature request, please open an issue on [GitHub Issues][gh_issues]. Please include enough information for us to understand the issue or the feature request. For this reason, we recommend you to follow the issue templates we have provided when creating a new issue. ## Want to contribute code? **TLDR: Check or open an issue first before working on a PR.** Before you start working on a pull request, we encourage you to check out the existing issues and pull requests to make sure that the feature you want to work on is in our roadmap and is aligned with the project's vision. After all, we don't want you to waste your time working on something that might not be merged. We try to prioritize features and bug fixes that are on our roadmap or requested a lot by the community. If you want to work on a feature or a fix that isn't already in the issue tracker, please open an issue first to discuss it with the project maintainers and the community. For features, we try to prioritize features that are backed by real-world use cases. If you have a use case for a feature, please include it in the issue. We'd love to hear about it! ## Getting started OasysDB is written in Rust. So, you need to have Rust installed on your local machine. If you haven't installed Rust yet, you can install it by following the instructions on the [Rust Installation Guide][rustup]. After you have installed Rust, you can clone the repository into your local machine. 
Before you start making changes in the codebase, you should run the tests to make sure that everything is working as expected:

```sh
cargo test
```

OasysDB uses a couple of third-party dependencies that might be useful for you to get familiar with. These are the most important ones along with their documentation:

- [gRPC](https://grpc.io/)
- [Tonic](https://github.com/hyperium/tonic)
- [Tokio](https://tokio.rs/)

## Style guide

We mostly use the default linting and style guide for Rust except for some linting changes listed in the rustfmt.toml file. For more information about the code style, see the [Rust Style Guide][style_guide].

For commit messages, we use the [Conventional Commits][conventional_commits] format. This allows us to maintain consistency and readability in our Git commit history making it easier to understand the changes made to the codebase at a high-level.

When commenting your code, please try your best to write comments that are clear and concise with proper English sentence capitalization and punctuation. This will help us and the community understand your code better and keep the codebase maintainable.

## Submitting a pull request

Once you have made your changes, you can submit a pull request. We will review your pull request and provide feedback. If your pull request is accepted, we will merge it into the main branch.

For organization purposes, we ask that you use the [Conventional Commits][conventional_commits] format for your pull request title in lowercase:

```
<type>: <description>
```

For example:

```
feat: add support ...
fix: fix issue ...
```

## Conclusion

Thank you for taking the time to read this documentation. We look forward to your contributions! Another way to support this project is to star this project, share it with your circles, and join us on [Discord][discord].

Best regards,
Edwin Kys [discord]: https://discord.gg/bDhQrkqNP4 [gh_issues]: https://github.com/oasysai/oasysdb/issues [gh_discussions]: https://github.com/oasysai/oasysdb/discussions [rustup]: https://www.rust-lang.org/tools/install [style_guide]: https://doc.rust-lang.org/beta/style-guide/index.html [conventional_commits]: https://www.conventionalcommits.org/en/v1.0.0/ ================================================ FILE: docs/css/style.css ================================================ h1, h2, h3 { font-weight: bold !important; } .odb-button { text-align: center; width: 100%; } .odb-button.disabled { opacity: 0.5; cursor: not-allowed; } /* Tables will be displayed at full width. */ .md-typeset__table { width: 100%; } .md-typeset__table table:not([class]) { display: table; } ================================================ FILE: docs/index.md ================================================ # Welcome to OasysDB 🎉 ================================================ FILE: mkdocs.yml ================================================ site_name: OasysDB repo_name: oasysai/oasysdb repo_url: https://github.com/oasysai/oasysdb theme: name: material logo: assets/wordmark.png favicon: assets/favicon64.png icon: repo: fontawesome/brands/github palette: - media: "(prefers-color-scheme: light)" scheme: default primary: black toggle: name: Light Mode icon: material/brightness-7 - media: "(prefers-color-scheme: dark)" scheme: slate primary: black toggle: name: Dark Mode icon: material/brightness-4 font: text: Space Grotesk code: Space Mono features: - header.autohide - navigation.tabs - navigation.tabs.sticky - navigation.expand - navigation.footer - content.code.copy copyright: Copyright © 2024 OasysDB extra: generator: false social: - icon: fontawesome/brands/x-twitter link: https://x.com/oasysai - icon: fontawesome/brands/linkedin link: https://www.linkedin.com/company/oasysai - icon: fontawesome/brands/discord link: https://discord.gg/bDhQrkqNP4 extra_css: - css/style.css nav: - 
Documentation: - Introduction: index.md - Other: - Changelog: changelog.md - Contributing: contributing.md - Blog: - blog/index.md markdown_extensions: - admonition - attr_list - md_in_html - pymdownx.details - pymdownx.inlinehilite - pymdownx.snippets - pymdownx.superfences - pymdownx.tabbed: alternate_style: true - pymdownx.emoji: emoji_index: !!python/name:material.extensions.emoji.twemoji emoji_generator: !!python/name:material.extensions.emoji.to_svg - toc: permalink: "#" plugins: - blog: post_readtime: true post_excerpt: required authors: true categories_allowed: - Log - Rust ================================================ FILE: protos/database.proto ================================================ syntax = "proto3"; package database; import "google/protobuf/empty.proto"; // OasysDB gRPC service definition. service Database { // Check if the connection to the database is alive. rpc Heartbeat(google.protobuf.Empty) returns (HeartbeatResponse); // Manually create a snapshot of the database. rpc Snapshot(google.protobuf.Empty) returns (SnapshotResponse); // Insert a new record into the database. rpc Insert(InsertRequest) returns (InsertResponse); // Retrieve an existing record from the database. rpc Get(GetRequest) returns (GetResponse); // Delete a record from the database. rpc Delete(DeleteRequest) returns (google.protobuf.Empty); // Update a record metadata in the database. rpc Update(UpdateRequest) returns (google.protobuf.Empty); // Query the database for nearest neighbors. 
rpc Query(QueryRequest) returns (QueryResponse); } message HeartbeatResponse { string version = 1; } message SnapshotResponse { int32 count = 1; } message InsertRequest { Record record = 1; } message InsertResponse { string id = 1; } message GetRequest { string id = 1; } message GetResponse { Record record = 1; } message DeleteRequest { string id = 1; } message UpdateRequest { string id = 1; map metadata = 2; } message QueryRequest { Vector vector = 1; int32 k = 2; string filter = 3; QueryParameters params = 4; } message QueryParameters { int32 probes = 1; float radius = 2; } message QueryResponse { repeated QueryResult results = 1; } message QueryResult { string id = 1; map metadata = 2; float distance = 3; } // List shared types below. message Record { Vector vector = 1; map metadata = 2; } message Vector { repeated float data = 1; } message Value { oneof value { string text = 1; double number = 2; bool boolean = 4; } } ================================================ FILE: requirements.txt ================================================ # Documentation website. mkdocs-material==9.5.26 ================================================ FILE: rustfmt.toml ================================================ tab_spaces = 4 reorder_imports = true max_width = 80 use_small_heuristics = "Max" merge_derives = false ================================================ FILE: src/cores/database.rs ================================================ use super::*; use protos::database_server::Database as DatabaseService; use std::io::{BufReader, BufWriter}; use tonic::{Request, Response}; const TMP_DIR: &str = "tmp"; const PARAMS_FILE: &str = "odb_params"; const STORAGE_FILE: &str = "odb_storage"; const INDEX_FILE: &str = "odb_index"; /// Database parameters. /// /// Fields: /// - dimension: Vector dimension. /// - metric: Metric to calculate distance. /// - density: Max number of records per IVF cluster. 
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] pub struct Parameters { pub dimension: usize, pub metric: Metric, pub density: usize, } /// Dynamic query-time parameters. /// /// Fields: /// - probes: Suggested number of clusters to visit. /// - radius: Maximum distance to include in the result. #[derive(Debug, Clone, Copy, PartialEq)] pub struct QueryParameters { pub probes: usize, pub radius: f32, } impl Default for QueryParameters { /// Default query parameters: /// - probes: 32 /// - radius: ∞ fn default() -> Self { QueryParameters { probes: 32, radius: f32::INFINITY } } } impl TryFrom for QueryParameters { type Error = Status; fn try_from(value: protos::QueryParameters) -> Result { Ok(QueryParameters { probes: value.probes as usize, radius: value.radius, }) } } /// Database snapshot statistics. /// /// The snapshot statistics include the information that might be useful /// for monitoring the state of the database. This stats will be returned /// by the `create_snapshot` method. #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub struct SnapshotStats { pub count: usize, } impl From for protos::SnapshotResponse { fn from(value: SnapshotStats) -> Self { protos::SnapshotResponse { count: value.count as i32 } } } #[derive(Debug)] pub struct Database { dir: PathBuf, params: Parameters, index: RwLock, storage: RwLock, } impl Database { pub fn configure(params: &Parameters) { let index = Index::new() .with_metric(params.metric) .with_density(params.density); let db = Database { dir: Self::dir(), params: *params, index: RwLock::new(index), storage: RwLock::new(Storage::new()), }; if db.dir.join(PARAMS_FILE).exists() { let stdin = std::io::stdin(); let overwrite = { eprint!("Database is already configured. Overwrite? 
(y/n): "); let mut input = String::new(); stdin.read_line(&mut input).unwrap(); matches!(input.to_lowercase().trim(), "y") }; if !overwrite { return; } fs::remove_dir_all(&db.dir).expect("Failed to reset the database"); println!("The database has been reset successfully"); } db.setup_dir().expect("Failed to setup database directory"); } pub fn open() -> Result> { let dir = Self::dir(); let params = Self::load_binary(dir.join(PARAMS_FILE))?; let index = Self::load_binary(dir.join(INDEX_FILE))?; let storage: Storage = Self::load_binary(dir.join(STORAGE_FILE))?; let count = storage.count(); tracing::info!("Restored {count} record(s) from the disk"); Ok(Database { dir, params, index: RwLock::new(index), storage: RwLock::new(storage), }) } fn dir() -> PathBuf { match env::var("ODB_DIR") { Ok(dir) => PathBuf::from(dir), Err(_) => PathBuf::from("oasysdb"), } } fn setup_dir(&self) -> Result<(), Box> { if self.dir.try_exists()? { return Ok(()); } fs::create_dir_all(&self.dir)?; fs::create_dir_all(self.dir.join(TMP_DIR))?; self.create_snapshot()?; Ok(()) } fn load_binary( path: impl AsRef, ) -> Result> { let file = OpenOptions::new().read(true).open(path)?; let reader = BufReader::new(file); Ok(bincode::deserialize_from(reader)?) 
} fn persist_as_binary( &self, path: impl AsRef, data: T, ) -> Result<(), Box> { let file_name = path.as_ref().file_name().unwrap(); let tmp_file = self.dir.join(TMP_DIR).join(file_name); let file = OpenOptions::new() .write(true) .create(true) .truncate(true) .open(&tmp_file)?; let writer = BufWriter::new(file); bincode::serialize_into(writer, &data)?; fs::rename(&tmp_file, &path)?; Ok(()) } pub fn create_snapshot(&self) -> Result> { self.persist_as_binary(self.dir.join(PARAMS_FILE), self.params)?; let index = self.index.read().unwrap(); self.persist_as_binary(self.dir.join(INDEX_FILE), &*index)?; let storage = self.storage.read().unwrap(); self.persist_as_binary(self.dir.join(STORAGE_FILE), &*storage)?; let count = storage.count(); tracing::info!("Created a snapshot with {count} record(s)"); Ok(SnapshotStats { count }) } fn validate_dimension(&self, vector: &Vector) -> Result<(), Status> { if vector.len() != self.params.dimension { return Err(Status::invalid_argument(format!( "Invalid vector dimension: expected {}, got {}", self.params.dimension, vector.len() ))); } Ok(()) } } #[tonic::async_trait] impl DatabaseService for Arc { async fn heartbeat( &self, _request: Request<()>, ) -> Result, Status> { let response = protos::HeartbeatResponse { version: env!("CARGO_PKG_VERSION").to_string(), }; Ok(Response::new(response)) } async fn snapshot( &self, _request: Request<()>, ) -> Result, Status> { let stats = self.create_snapshot().map_err(|e| { let message = format!("Failed to create a snapshot: {e}"); Status::internal(message) })?; Ok(Response::new(stats.into())) } async fn insert( &self, request: Request, ) -> Result, Status> { let record = match request.into_inner().record { Some(record) => Record::try_from(record)?, None => { let message = "Record data is required for insertion"; return Err(Status::invalid_argument(message)); } }; self.validate_dimension(&record.vector)?; let id = RecordID::new(); // Insert the record into the storage. 
// This operation must be done before updating the index. Otherwise, // the index won't have access to the record data. let mut storage = self.storage.write().unwrap(); storage.insert(&id, &record)?; let mut index = self.index.write().unwrap(); index.insert(&id, &record, storage.records())?; tracing::info!("Inserted a new record with ID: {id}"); Ok(Response::new(protos::InsertResponse { id: id.to_string() })) } async fn get( &self, request: Request, ) -> Result, Status> { let request = request.into_inner(); let id = request.id.parse::()?; let storage = self.storage.read().unwrap(); let record = storage.get(&id)?.to_owned(); let response = protos::GetResponse { record: Some(record.into()) }; Ok(Response::new(response)) } async fn delete( &self, request: Request, ) -> Result, Status> { let request = request.into_inner(); let id = request.id.parse::()?; let mut index = self.index.write().unwrap(); index.delete(&id)?; let mut storage = self.storage.write().unwrap(); storage.delete(&id)?; tracing::info!("Deleted a record with ID: {id}"); Ok(Response::new(())) } async fn update( &self, request: Request, ) -> Result, Status> { let request = request.into_inner(); let id = request.id.parse::()?; let mut metadata = HashMap::new(); for (key, value) in request.metadata { metadata.insert(key, value.try_into()?); } let mut storage = self.storage.write().unwrap(); storage.update(&id, &metadata)?; tracing::info!("Updated metadata for a record: {id}"); Ok(Response::new(())) } async fn query( &self, request: Request, ) -> Result, Status> { let request = request.into_inner(); let vector = match request.vector { Some(vector) => Vector::try_from(vector)?, None => { let message = "Vector is required for query operation"; return Err(Status::invalid_argument(message)); } }; self.validate_dimension(&vector)?; let k = request.k as usize; if k == 0 { let message = "Invalid k value, k must be greater than 0"; return Err(Status::invalid_argument(message)); } let filter = 
Filters::try_from(request.filter.as_str())?; let params = match request.params { Some(params) => QueryParameters::try_from(params)?, None => QueryParameters::default(), }; let storage = self.storage.read().unwrap(); let records = storage.records(); let index = self.index.read().unwrap(); let results = index .query(&vector, k, &filter, ¶ms, records)? .into_iter() .map(Into::into) .collect(); Ok(Response::new(protos::QueryResponse { results })) } } #[cfg(test)] mod tests { use super::*; use uuid::Uuid; #[test] fn test_open() { let db = setup_db(); assert_eq!(db.params, Parameters::default()); } #[tokio::test] async fn test_heartbeat() { let db = setup_db(); let request = Request::new(()); let response = db.heartbeat(request).await.unwrap(); assert_eq!(response.get_ref().version, env!("CARGO_PKG_VERSION")); } #[tokio::test] async fn test_insert() { let params = Parameters::default(); let db = setup_db(); let vector = Vector::random(params.dimension); let request = Request::new(protos::InsertRequest { record: Some(protos::Record { vector: Some(vector.into()), metadata: std::collections::HashMap::new(), }), }); let response = db.insert(request).await.unwrap(); assert!(response.get_ref().id.parse::().is_ok()); assert_eq!(db.storage.read().unwrap().records().len(), 1); } fn setup_db() -> Arc { if Database::dir().exists() { fs::remove_dir_all(Database::dir()).unwrap(); } let params = Parameters::default(); Database::configure(¶ms); Arc::new(Database::open().unwrap()) } impl Default for Parameters { fn default() -> Self { Parameters { dimension: 128, metric: Metric::Euclidean, density: 64, } } } } ================================================ FILE: src/cores/index.rs ================================================ use super::*; use std::cmp::{min, Ordering}; use std::collections::BinaryHeap; use std::rc::Rc; type ClusterIndex = usize; /// ANNS search result containing the metadata of the record. 
/// /// We exclude the vector data from the result because it doesn't provide /// any additional value on the search result. If users are interested in /// the vector data, they can use the get method to retrieve the record. #[derive(Debug, Clone)] pub struct QueryResult { pub id: RecordID, pub metadata: HashMap, pub distance: f32, } impl Eq for QueryResult {} impl PartialEq for QueryResult { /// Compare two query results based on their IDs. fn eq(&self, other: &Self) -> bool { self.id == other.id } } impl Ord for QueryResult { fn cmp(&self, other: &Self) -> Ordering { self.distance.partial_cmp(&other.distance).unwrap_or(Ordering::Equal) } } impl PartialOrd for QueryResult { /// Allow the query results to be sorted based on their distance. fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } impl From for protos::QueryResult { fn from(value: QueryResult) -> Self { let metadata = value .metadata .into_iter() .map(|(key, value)| (key, value.into())) .collect(); protos::QueryResult { id: value.id.to_string(), metadata, distance: value.distance, } } } /// ANNS Index interface. /// /// OasysDB uses a modified version of IVF index algorithm. This custom index /// implementation allows OasysDB to maintain a balanced index structure /// allowing the clusters to grow to accommodate data growth. #[repr(C)] #[derive(Debug, Serialize, Deserialize)] pub struct Index { centroids: Vec, clusters: Vec>, // Index parameters. metric: Metric, density: usize, } impl Index { /// Create a new index instance with default parameters. /// /// Default parameters: /// - metric: Euclidean /// - density: 256 pub fn new() -> Self { Index { centroids: vec![], clusters: vec![], metric: Metric::Euclidean, density: 256, } } /// Configure the metric used for distance calculations. pub fn with_metric(mut self, metric: Metric) -> Self { self.metric = metric; self } /// Configure the density of the index. 
pub fn with_density(mut self, density: usize) -> Self { self.density = density; self } /// Insert a new record into the index. /// /// This method required the reference to all the records because /// during the cluster splitting process, the record assignments /// will be re-calculated pub fn insert( &mut self, id: &RecordID, record: &Record, records: &HashMap, ) -> Result<(), Status> { let vector = &record.vector; let nearest_centroid = self.find_nearest_centroid(vector); // If the index is empty, the record's vector will be // the first centroid. if nearest_centroid.is_none() { let cluster_id = self.insert_centroid(vector); self.clusters[cluster_id].push(*id); return Ok(()); } let nearest_centroid = nearest_centroid.unwrap(); if self.clusters[nearest_centroid].len() < self.density { self.update_centroid(&nearest_centroid, vector); self.clusters[nearest_centroid].push(*id); } else { // If the cluster is full, insert the record into the cluster // and split the cluster with KMeans algorithm. self.clusters[nearest_centroid].push(*id); self.split_cluster(&nearest_centroid, records); } Ok(()) } /// Delete a record from the index by its ID. /// /// This method will iterate over all the clusters and remove the record /// from the cluster if it exists. This method doesn't update the value of /// the cluster's centroid. pub fn delete(&mut self, id: &RecordID) -> Result<(), Status> { // Find the cluster and record indices where the record is stored. let cluster_record_index = self.clusters.iter().enumerate().find_map(|(i, cluster)| { cluster.par_iter().position_first(|x| x == id).map(|x| (i, x)) }); if let Some((cluster_ix, record_ix)) = cluster_record_index { // If the cluster has only one record, remove the cluster and // centroid from the index. This won't happen often. 
if self.clusters[cluster_ix].len() == 1 { self.clusters.remove(cluster_ix); self.centroids.remove(cluster_ix); } else { self.clusters[cluster_ix].remove(record_ix); } } Ok(()) } /// Search for the nearest neighbors of a given vector. /// /// This method uses the IVF search algorithm to find the nearest neighbors /// of the query vector. The filtering process of the search is done within /// the boundaries of the nearest clusters to the query vector. pub fn query( &self, vector: &Vector, k: usize, filters: &Filters, params: &QueryParameters, records: &HashMap, ) -> Result, Status> { let QueryParameters { probes, radius } = params.to_owned(); let probes = min(probes, self.centroids.len()); let nearest_clusters = self.sort_nearest_centroids(vector); let mut results = BinaryHeap::new(); for cluster_id in nearest_clusters.iter().take(probes) { for record_id in &self.clusters[*cluster_id] { let record = match records.get(record_id) { Some(record) => record, None => continue, }; let distance = self.metric.distance(&record.vector, vector); let distance = match distance { Some(distance) => distance as f32, None => continue, }; // Check if the record is within the search radius and // the record's metadata passes the filters. if distance > radius || !filters.apply(&record.metadata) { continue; } results.push(QueryResult { id: *record_id, metadata: record.metadata.clone(), distance, }); if results.len() > k { results.pop(); } } } Ok(results.into_sorted_vec()) } /// Insert a new centroid and cluster into the index. /// - vector: Centroid vector. fn insert_centroid(&mut self, vector: &Vector) -> ClusterIndex { self.centroids.push(vector.to_owned()); self.clusters.push(vec![]); self.centroids.len() - 1 } /// Recalculate the centroid of a cluster with the new vector. 
///
/// This method must be called before inserting the new vector into the
/// cluster: the new centroid is the weighted average of the current
/// centroid (weighted by the current cluster size) and the new vector,
/// normalized by the new cluster size (count + 1).
fn update_centroid(&mut self, cluster_id: &ClusterIndex, vector: &Vector) {
    let count = self.clusters[*cluster_id].len() as f32;
    self.centroids[*cluster_id] = self.centroids[*cluster_id]
        .as_slice()
        .iter()
        .zip(vector.as_slice())
        // Weighted average: ((a * count) + b) / (count + 1).
        // The previous expression `(a * count) + b / count + 1.0` parsed
        // as `(a * count) + (b / count) + 1.0` due to operator precedence
        // and therefore never produced the documented average.
        .map(|(a, b)| ((a * count) + b) / (count + 1.0))
        .collect::<Vec<f32>>()
        .into();
}

/// Find the nearest centroid to a given vector.
///
/// If the index is empty, this method will return None. Otherwise, it will
/// calculate the distance between the given vector and all centroids and
/// return the index of the centroid with the smallest distance.
fn find_nearest_centroid(&self, vector: &Vector) -> Option<ClusterIndex> {
    self.centroids
        .par_iter()
        .map(|centroid| self.metric.distance(centroid, vector))
        .enumerate()
        .min_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap())
        .map(|(index, _)| index)
}

/// Sort the centroids by their distance to a given vector.
///
/// This method returns an array of cluster indices sorted by their
/// distance to the vector. The first element will be the index of the
/// nearest centroid.
fn sort_nearest_centroids(&self, vector: &Vector) -> Vec<ClusterIndex> {
    let mut distances = self
        .centroids
        .par_iter()
        .enumerate()
        .map(|(i, centroid)| (i, self.metric.distance(centroid, vector)))
        .collect::<Vec<_>>();

    // Sort the distances in ascending order. If the distance is NaN or
    // something else, it will be placed at the end.
    distances.sort_by(|(_, a), (_, b)| {
        a.partial_cmp(b).unwrap_or(Ordering::Greater)
    });

    distances.iter().map(|(i, _)| *i).collect()
}

/// Split a cluster into two new clusters.
///
/// The current cluster will be halved. The first half will be assigned to
/// the current cluster, and the second half will be assigned to a new
/// cluster with a new centroid.
fn split_cluster(
    &mut self,
    cluster_id: &ClusterIndex,
    records: &HashMap<RecordID, Record>,
) {
    let record_ids = &self.clusters[*cluster_id];
    let vectors = record_ids
        .iter()
        .map(|id| &records.get(id).unwrap().vector)
        .collect::<Vec<_>>();

    let mut kmeans = KMeans::new(2).with_metric(self.metric);
    kmeans.fit(Rc::from(vectors)).unwrap();

    let centroids = kmeans.centroids();
    self.centroids[*cluster_id] = centroids[0].to_owned();
    self.centroids.push(centroids[1].to_owned());

    let mut clusters = [vec![], vec![]];
    let assignments = kmeans.assignments();
    for (i, cluster_id) in assignments.iter().enumerate() {
        clusters[*cluster_id].push(record_ids[i]);
    }

    self.clusters[*cluster_id] = clusters[0].to_vec();
    self.clusters.push(clusters[1].to_vec());
}
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_insert_many() {
        let params = Parameters::default();
        let mut index = setup_index(&params);

        let mut records = HashMap::new();
        for _ in 0..1000 {
            let id = RecordID::new();
            let record = Record::random(params.dimension);
            records.insert(id, record);
        }

        for (id, record) in records.iter() {
            index.insert(id, record, &records).unwrap();
        }

        assert!(index.centroids.len() > 20);
    }

    #[test]
    fn test_delete() {
        let params = Parameters::default();
        let mut index = setup_index(&params);

        let mut ids = vec![];
        for _ in 0..10 {
            let centroid = Vector::random(params.dimension);
            let mut cluster = vec![];
            for _ in 0..10 {
                let id = RecordID::new();
                cluster.push(id);
                ids.push(id);
            }

            index.centroids.push(centroid);
            index.clusters.push(cluster);
        }

        assert_eq!(ids.len(), 100);
        assert_eq!(index.centroids.len(), 10);

        index.delete(&ids[0]).unwrap();
        for cluster in index.clusters.iter() {
            assert!(!cluster.contains(&ids[0]));
        }

        for i in 1..10 {
            index.delete(&ids[i]).unwrap();
        }

        assert_eq!(index.centroids.len(), 9);
    }

    #[test]
    fn test_query() {
        let params = Parameters::default();
        let mut index = setup_index(&params);

        // Populate the index with 1000 sequential records.
        // This allows us to predict the order of the results.
        let mut ids = vec![];
        let mut records = HashMap::new();
        for i in 0..1000 {
            let id = RecordID::new();
            let vector = Vector::from(vec![i as f32; params.dimension]);

            let mut metadata = HashMap::new();
            let value = Value::Number((1000 + i) as f64);
            metadata.insert("number".to_string(), value);

            let record = Record { vector, metadata };
            records.insert(id, record);
            ids.push(id);
        }

        for (id, record) in records.iter() {
            index.insert(id, record, &records).unwrap();
        }

        let query = Vector::from(vec![1.0; params.dimension]);
        let query_params = QueryParameters::default();

        let result = index
            .query(&query, 10, &Filters::None, &query_params, &records)
            .unwrap();

        assert_eq!(result.len(), 10);
        assert!(result.iter().any(|r| r.id == ids[0]));

        let metadata_filters = Filters::try_from("number > 1050").unwrap();
        let result = index
            .query(&query, 10, &metadata_filters, &query_params, &records)
            .unwrap();

        assert_eq!(result.len(), 10);
        assert!(result.iter().any(|r| r.id == ids[51]));
    }

    #[test]
    fn test_insert_centroid() {
        let params = Parameters::default();
        let mut index = setup_index(&params);

        let vector = Vector::random(params.dimension);
        let cluster_id = index.insert_centroid(&vector);

        assert_eq!(index.centroids.len(), 1);
        assert_eq!(index.clusters.len(), 1);
        assert_eq!(index.centroids[0], vector);
        assert_eq!(cluster_id, 0);
    }

    #[test]
    fn test_update_centroid() {
        let params = Parameters::default();
        let mut index = setup_index(&params);

        let initial_centroid = Vector::from(vec![0.0; params.dimension]);
        let cluster_id = index.insert_centroid(&initial_centroid);
        index.clusters[cluster_id].push(RecordID::new());

        let vector = Vector::from(vec![1.0; params.dimension]);
        index.update_centroid(&cluster_id, &vector);

        // With one existing member averaged at 0.0 and a new vector at 1.0,
        // the recalculated centroid is the midpoint: 0.5. This previously
        // used assert_ne!, pinning the operator-precedence bug in
        // update_centroid instead of the documented weighted average.
        let centroid = Vector::from(vec![0.5; params.dimension]);
        assert_eq!(index.centroids[cluster_id], centroid);
    }

    #[test]
    fn test_find_nearest_centroid_empty() {
        let params = Parameters::default();
        let index = setup_index(&params);
        let query = Vector::random(params.dimension);
assert_eq!(index.find_nearest_centroid(&query), None); } #[test] fn test_find_nearest_centroid() { let params = Parameters::default(); let mut index = setup_index(¶ms); for i in 1..5 { let centroid = Vector::from(vec![i as f32; params.dimension]); index.centroids.push(centroid); } let query = Vector::from(vec![0.0; params.dimension]); assert_eq!(index.find_nearest_centroid(&query), Some(0)); } #[test] fn test_split_cluster() { let params = Parameters::default(); let mut index = setup_index(¶ms); let mut ids = vec![]; let mut records = HashMap::new(); for i in 1..5 { let id = RecordID::new(); let vector = Vector::from(vec![i as f32; params.dimension]); let record = Record { vector, metadata: HashMap::new() }; ids.push(id); records.insert(id, record); } let centroid = Vector::from(vec![2.5; params.dimension]); index.centroids.push(centroid); index.clusters.push(ids); index.split_cluster(&0, &records); assert_eq!(index.centroids.len(), 2); } #[test] fn test_sort_nearest_centroids() { let params = Parameters::default(); let mut index = setup_index(¶ms); for i in 1..5 { let centroid = Vector::from(vec![i as f32; params.dimension]); index.centroids.push(centroid); } let query = Vector::from(vec![5.0; params.dimension]); let nearest = index.sort_nearest_centroids(&query); assert_eq!(nearest, vec![3, 2, 1, 0]); } fn setup_index(params: &Parameters) -> Index { let index = Index::new() .with_metric(params.metric) .with_density(params.density); index } } ================================================ FILE: src/cores/mod.rs ================================================ // Initialize the modules without making them public. mod database; mod index; mod storage; // Re-export types from the modules. pub use database::*; pub use index::*; pub use storage::*; // Import common dependencies below. 
use crate::protos; use crate::types::*; use crate::utils::kmeans::KMeans; use hashbrown::HashMap; use rayon::prelude::*; use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; use std::error::Error; use std::fs::OpenOptions; use std::path::{Path, PathBuf}; use std::sync::{Arc, RwLock}; use std::{env, fs}; use tonic::Status; ================================================ FILE: src/cores/storage.rs ================================================ use super::*; /// Record storage interface. /// /// This interface wraps around Hashbrown's HashMap implementation to store /// the records. In the future, if needed, we can modify the storage /// implementation without changing the rest of the code. #[repr(C)] #[derive(Debug, Serialize, Deserialize)] pub struct Storage { count: usize, records: HashMap, } impl Storage { /// Create a new empty storage instance. pub fn new() -> Self { Storage { count: 0, records: HashMap::new() } } /// Insert a new record into the record storage. pub fn insert( &mut self, id: &RecordID, record: &Record, ) -> Result<(), Status> { self.records.insert(*id, record.to_owned()); self.count += 1; Ok(()) } /// Retrieve a record from the storage given its ID. pub fn get(&self, id: &RecordID) -> Result<&Record, Status> { let record = self.records.get(id); if record.is_none() { let message = "The specified record is not found"; return Err(Status::not_found(message)); } Ok(record.unwrap()) } /// Delete a record from the storage given its ID. pub fn delete(&mut self, id: &RecordID) -> Result<(), Status> { self.records.remove(id); self.count -= 1; Ok(()) } /// Update a record metadata given its ID. /// /// Vector data should be immutable as it is tightly coupled with the /// semantic meaning of the record. If the vector data changes, users /// should create a new record instead. 
pub fn update( &mut self, id: &RecordID, metadata: &HashMap, ) -> Result<(), Status> { let record = match self.records.get_mut(id) { Some(record) => record, None => { let message = "The specified record is not found"; return Err(Status::not_found(message)); } }; record.metadata = metadata.to_owned(); Ok(()) } /// Return a reference to the records in the storage. pub fn records(&self) -> &HashMap { &self.records } /// Return the number of records in the storage. pub fn count(&self) -> usize { self.count } } #[cfg(test)] mod tests { use super::*; #[test] fn test_insert() { let mut storage = Storage::new(); let record = Record::random(128); let id = RecordID::new(); storage.insert(&id, &record).unwrap(); assert_eq!(storage.count, 1); assert_eq!(storage.count, storage.records.len()); } #[test] fn test_delete() { let mut storage = Storage::new(); let record = Record::random(128); let id = RecordID::new(); storage.insert(&id, &record).unwrap(); storage.delete(&id).unwrap(); assert_eq!(storage.count, 0); assert_eq!(storage.count, storage.records.len()); } #[test] fn test_update() { let mut storage = Storage::new(); let record = Record::random(128); let id = RecordID::new(); storage.insert(&id, &record).unwrap(); let mut metadata = HashMap::new(); metadata.insert("key".to_string(), Value::random()); storage.update(&id, &metadata).unwrap(); let updated_record = storage.records.get(&id).unwrap(); assert_eq!(updated_record.metadata, metadata); } } ================================================ FILE: src/main.rs ================================================ mod cores; mod protos; mod types; mod utils; use clap::{arg, ArgMatches, Command}; use cores::{Database, Parameters}; use dotenv::dotenv; use protos::database_server::DatabaseServer; use std::sync::Arc; use std::thread; use std::time::Duration; use tonic::transport::Server; use types::Metric; const SNAPSHOT_INTERVAL: Duration = Duration::from_secs(600); #[tokio::main] async fn main() { dotenv().ok(); 
tracing_subscriber::fmt::init(); let command = Command::new(env!("CARGO_PKG_NAME")) .version(env!("CARGO_PKG_VERSION")) .about("Interface to setup and manage OasysDB server") .arg_required_else_help(true) .subcommand(start()) .subcommand(configure()) .get_matches(); match command.subcommand() { Some(("start", args)) => start_handler(args).await, Some(("configure", args)) => configure_handler(args).await, _ => unreachable!(), } } fn start() -> Command { let arg_port = arg!(--port "Port to listen on") .default_value("2505") .value_parser(clap::value_parser!(u16)) .allow_negative_numbers(false); Command::new("start") .alias("run") .about("Start the database server") .arg(arg_port) } async fn start_handler(args: &ArgMatches) { // Unwrap is safe because Clap validates the arguments. let port = args.get_one::("port").unwrap(); let addr = format!("[::]:{port}").parse().unwrap(); let db = Arc::new(Database::open().expect("Failed to open the database")); let db_clone = db.clone(); thread::spawn(move || loop { thread::sleep(SNAPSHOT_INTERVAL); db_clone.create_snapshot().expect("Failed to create a snapshot"); }); tracing::info!("Database server is ready on port {port}"); Server::builder() .add_service(DatabaseServer::new(db)) .serve(addr) .await .expect("Failed to start the database"); } fn configure() -> Command { let arg_dimension = arg!(--dim "Vector dimension") .required(true) .value_parser(clap::value_parser!(usize)) .allow_negative_numbers(false); // List optional arguments below. 
let arg_metric = arg!(--metric "Metric to calculate distance") .default_value(Metric::Euclidean.as_str()) .value_parser(clap::value_parser!(Metric)); let arg_density = arg!(--density "Density of the cluster") .default_value("256") .value_parser(clap::value_parser!(usize)) .allow_negative_numbers(false); Command::new("configure") .about("Configure the initial database parameters") .arg(arg_dimension) .arg(arg_metric) .arg(arg_density) } async fn configure_handler(args: &ArgMatches) { let dim = *args.get_one::("dim").unwrap(); let metric = *args.get_one::("metric").unwrap(); let density = *args.get_one::("density").unwrap(); let params = Parameters { dimension: dim, metric, density }; Database::configure(¶ms); } ================================================ FILE: src/protos.rs ================================================ #![allow(clippy::all)] #![allow(non_snake_case)] tonic::include_proto!("database"); ================================================ FILE: src/types/filter.rs ================================================ use super::*; /// Joined multiple filters operation with either AND or OR. /// /// At the moment, OasysDB only supports single-type join operations. This /// means that we can't use both AND and OR operations in the same filter. #[derive(Debug, Clone, PartialEq, PartialOrd)] pub enum Filters { None, And(Vec), Or(Vec), } impl Filters { /// Returns true if the record passes the filters. /// - metadata: Record metadata to check against the filters. /// /// Filters of NONE type will always return true. This is useful when /// no filters are provided and we want to include all records. 
pub fn apply(&self, metadata: &HashMap) -> bool { match self { Filters::None => true, Filters::And(filters) => filters.iter().all(|f| f.apply(metadata)), Filters::Or(filters) => filters.iter().any(|f| f.apply(metadata)), } } } impl TryFrom<&str> for Filters { type Error = Status; fn try_from(value: &str) -> Result { if value.is_empty() { return Ok(Filters::None); } const OR: &str = " OR "; const AND: &str = " AND "; // Check which join operator is used. let or_count = value.matches(OR).count(); let and_count = value.matches(AND).count(); if or_count > 0 && and_count > 0 { let message = "Mixing AND and OR join operators is not supported"; return Err(Status::invalid_argument(message)); } let join = if or_count > 0 { OR } else { AND }; let filters = value .split(join) .map(TryInto::try_into) .collect::>()?; let filters = match join { OR => Filters::Or(filters), _ => Filters::And(filters), }; Ok(filters) } } /// Record metadata filter. /// /// Using the filter operator, the record metadata can be compared against /// a specific value to determine if it should be included in the results. 
#[derive(Debug, Clone, PartialEq, PartialOrd)] pub struct Filter { key: String, value: Value, operator: Operator, } impl Filter { fn apply(&self, metadata: &HashMap) -> bool { let value = match metadata.get(&self.key) { Some(value) => value, None => return false, }; match (value, &self.value) { (Value::Text(a), Value::Text(b)) => self.filter_text(a, b), (Value::Number(a), Value::Number(b)) => self.filter_number(a, b), (Value::Boolean(a), Value::Boolean(b)) => self.filter_boolean(a, b), _ => false, } } fn filter_text(&self, a: impl AsRef, b: impl AsRef) -> bool { let (a, b) = (a.as_ref(), b.as_ref()); match self.operator { Operator::Equal => a == b, Operator::NotEqual => a != b, Operator::Contains => a.contains(b), _ => false, } } fn filter_number(&self, a: &f64, b: &f64) -> bool { match self.operator { Operator::Equal => a == b, Operator::NotEqual => a != b, Operator::GreaterThan => a > b, Operator::GreaterThanOrEqual => a >= b, Operator::LessThan => a < b, Operator::LessThanOrEqual => a <= b, _ => false, } } fn filter_boolean(&self, a: &bool, b: &bool) -> bool { match self.operator { Operator::Equal => a == b, Operator::NotEqual => a != b, _ => false, } } } impl TryFrom<&str> for Filter { type Error = Status; fn try_from(value: &str) -> Result { if value.is_empty() { let message = "Filter string cannot be empty"; return Err(Status::invalid_argument(message)); } // Split the filter string into EXACTLY 3 parts. 
let parts = value
    .splitn(3, ' ')
    .map(|token| token.trim())
    .collect::<Vec<_>>();

// Reject malformed filters such as "age" or "age >" up front. Indexing
// into `parts` below would otherwise panic on out-of-bounds access
// instead of returning a proper gRPC error to the client.
if parts.len() != 3 {
    let message = "Filter must be in the form: <key> <operator> <value>";
    return Err(Status::invalid_argument(message));
}

let key = parts[0].to_string();
let operator = Operator::try_from(parts[1])?;
let value = Value::from(parts[2]);

let filter = Filter { key, value, operator };
Ok(filter)
}
}

#[derive(Debug, Clone, Copy, Eq, PartialEq, PartialOrd)]
pub enum Operator {
    Equal,
    NotEqual,
    GreaterThan,
    GreaterThanOrEqual,
    LessThan,
    LessThanOrEqual,
    Contains,
}

impl TryFrom<&str> for Operator {
    type Error = Status;
    fn try_from(value: &str) -> Result<Self, Self::Error> {
        let operator = match value {
            "CONTAINS" => Operator::Contains,
            "=" => Operator::Equal,
            "!=" => Operator::NotEqual,
            ">" => Operator::GreaterThan,
            ">=" => Operator::GreaterThanOrEqual,
            "<" => Operator::LessThan,
            "<=" => Operator::LessThanOrEqual,
            _ => {
                let message = format!("Invalid filter operator: {value}");
                return Err(Status::invalid_argument(message));
            }
        };
        Ok(operator)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::error::Error;

    #[test]
    fn test_filters_from_string() {
        let filters = Filters::try_from("name CONTAINS Ada").unwrap();
        let expected = Filters::And(vec![Filter {
            key: "name".into(),
            value: "Ada".into(),
            operator: Operator::Contains,
        }]);
        assert_eq!(filters, expected);

        let filters = Filters::try_from("gpa >= 3.0 OR age < 21").unwrap();
        let expected = {
            let filter_gpa = Filter {
                key: "gpa".into(),
                value: Value::Number(3.0),
                operator: Operator::GreaterThanOrEqual,
            };

            let filter_age = Filter {
                key: "age".into(),
                value: Value::Number(21.0),
                operator: Operator::LessThan,
            };

            Filters::Or(vec![filter_gpa, filter_age])
        };

        assert_eq!(filters, expected);
    }

    #[test]
    fn test_filters_apply() -> Result<(), Box<dyn Error>> {
        let data = setup_metadata();

        let filters = Filters::try_from("name CONTAINS Alice")?;
        assert!(filters.apply(&data));

        let filters = Filters::try_from("name = Bob")?;
        assert!(!filters.apply(&data));

        let filters = Filters::try_from("age >= 20 AND gpa < 4.0")?;
        assert!(filters.apply(&data));

        let filters = Filters::try_from("age >= 20 AND gpa < 3.0")?;
assert!(!filters.apply(&data)); let filters = Filters::try_from("active = true")?; assert!(filters.apply(&data)); Ok(()) } fn setup_metadata() -> HashMap { let keys = vec!["name", "age", "gpa", "active"]; let values: Vec = vec![ "Alice".into(), Value::Number(20.0), Value::Number(3.5), Value::Boolean(true), ]; let mut data = HashMap::new(); for (key, value) in keys.into_iter().zip(values.into_iter()) { data.insert(key.into(), value); } data } } ================================================ FILE: src/types/metric.rs ================================================ use super::*; use simsimd::SpatialSimilarity; // Distance name constants. const EUCLIDEAN: &str = "euclidean"; const COSINE: &str = "cosine"; /// Distance formula for vector similarity calculations. /// /// ### Euclidean /// We use the squared Euclidean distance instead for a slight performance /// boost since we only use the distance for comparison. /// /// ### Cosine /// We use cosine distance instead of cosine similarity to be consistent with /// other distance metrics where a lower value indicates a closer match. #[allow(missing_docs)] #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)] pub enum Metric { Euclidean, Cosine, } impl Metric { /// Calculate the distance between two vectors. pub fn distance(&self, a: &Vector, b: &Vector) -> Option { let (a, b) = (a.as_slice(), b.as_slice()); match self { Metric::Euclidean => f32::sqeuclidean(a, b), Metric::Cosine => f32::cosine(a, b), } } /// Return the metric name as a string slice. 
pub fn as_str(&self) -> &str { match self { Metric::Euclidean => EUCLIDEAN, Metric::Cosine => COSINE, } } } impl From<&str> for Metric { fn from(value: &str) -> Self { let value = value.to_lowercase(); match value.as_str() { COSINE => Metric::Cosine, EUCLIDEAN => Metric::Euclidean, _ => panic!("Metric should be cosine or euclidean"), } } } impl From for Metric { fn from(value: String) -> Self { Metric::from(value.as_str()) } } #[cfg(test)] mod tests { use super::*; #[test] fn test_distance() { let a = Vector::from(vec![1.0, 2.0, 3.0]); let b = Vector::from(vec![4.0, 5.0, 6.0]); let euclidean = Metric::Euclidean.distance(&a, &b).unwrap(); let cosine = Metric::Cosine.distance(&a, &b).unwrap(); assert_eq!(euclidean, 27.0); assert_eq!(cosine.round(), 0.0); } } ================================================ FILE: src/types/mod.rs ================================================ // Initialize modules without publicizing them. mod filter; mod metric; mod record; mod vector; // Re-export types from the modules. pub use filter::*; pub use metric::*; pub use record::*; pub use vector::*; // Import common dependencies below. use crate::protos; use hashbrown::HashMap; use serde::{Deserialize, Serialize}; use tonic::Status; ================================================ FILE: src/types/record.rs ================================================ use super::*; use std::fmt; use std::str::FromStr; use uuid::Uuid; /// Record identifier. /// /// OasysDB should be able to deal with a lot of writes and deletes. Using UUID /// version 4 to allow us to generate a lot of IDs with very low probability /// of collision. #[derive(Debug, Serialize, Deserialize, Clone, Copy)] #[derive(PartialOrd, Ord, PartialEq, Eq, Hash)] pub struct RecordID(Uuid); impl RecordID { /// Generate a new random record ID using UUID v4. 
pub fn new() -> Self { RecordID(Uuid::new_v4()) } } impl fmt::Display for RecordID { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}", self.0) } } impl FromStr for RecordID { type Err = Status; fn from_str(s: &str) -> Result { Ok(RecordID(Uuid::try_parse(s).map_err(|_| { let message = "Record ID should be a string-encoded UUID"; Status::invalid_argument(message) })?)) } } /// Metadata value. /// /// OasysDB doesn't support nested objects in metadata for performance reasons. /// We only need to support primitive types for metadata. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, PartialOrd)] pub enum Value { Text(String), Number(f64), Boolean(bool), } impl From for Value { fn from(value: String) -> Self { Value::from(value.as_str()) } } impl From<&str> for Value { fn from(value: &str) -> Self { // Try to parse the value as a number. // This is must be prioritized over boolean parsing. if let Ok(float) = value.parse::() { return Value::Number(float); } if let Ok(boolean) = value.parse::() { return Value::Boolean(boolean); } // Remove quotes from the start and end of the string. // This ensures that we won't have to deal with quotes. 
let match_quotes = |c: char| c == '\"' || c == '\''; let value = value .trim_start_matches(match_quotes) .trim_end_matches(match_quotes) .to_string(); Value::Text(value) } } impl From for protos::Value { fn from(value: Value) -> Self { type ProtoValue = protos::value::Value; let value = match value { Value::Text(text) => ProtoValue::Text(text), Value::Number(number) => ProtoValue::Number(number), Value::Boolean(boolean) => ProtoValue::Boolean(boolean), }; protos::Value { value: Some(value) } } } impl TryFrom for Value { type Error = Status; fn try_from(value: protos::Value) -> Result { type ProtoValue = protos::value::Value; match value.value { Some(ProtoValue::Text(text)) => Ok(Value::Text(text)), Some(ProtoValue::Number(number)) => Ok(Value::Number(number)), Some(ProtoValue::Boolean(boolean)) => Ok(Value::Boolean(boolean)), None => Err(Status::invalid_argument("Metadata value is required")), } } } /// OasysDB vector record. /// /// This is the main data structure for OasysDB. It contains the vector data /// and metadata of the record. Metadata is a key-value store that can be used /// to store additional information about the vector. 
#[derive(Debug, Clone, Serialize, Deserialize)] pub struct Record { pub vector: Vector, pub metadata: HashMap, } impl From for protos::Record { fn from(value: Record) -> Self { let vector = value.vector.into(); let metadata = value .metadata .into_iter() .map(|(key, value)| (key, value.into())) .collect(); protos::Record { vector: Some(vector), metadata } } } impl TryFrom for Record { type Error = Status; fn try_from(value: protos::Record) -> Result { let vector = match value.vector { Some(vector) => Vector::try_from(vector)?, None => { let message = "Vector data should not be empty"; return Err(Status::invalid_argument(message)); } }; let metadata = value .metadata .into_iter() .map(|(k, v)| Ok((k, v.try_into()?))) .collect::, Self::Error>>()?; Ok(Record { vector, metadata }) } } #[cfg(test)] mod tests { use super::*; use rand::random; impl Value { pub fn random() -> Self { Value::Number(random::()) } } impl Record { pub fn random(dimension: usize) -> Self { let mut metadata = HashMap::new(); metadata.insert("key".to_string(), Value::random()); Record { vector: Vector::random(dimension), metadata } } } } ================================================ FILE: src/types/vector.rs ================================================ use super::*; /// Vector data structure. /// /// We use a boxed slice to store the vector data for a slight memory /// efficiency boost. The length of the vector is not checked, so a length /// validation should be performed before most operations. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, PartialOrd)] pub struct Vector(Box<[f32]>); impl Vector { /// Return the vector as a slice of floating-point numbers. pub fn as_slice(&self) -> &[f32] { self.0.as_ref() } /// Return as a vector of floating-point numbers. pub fn to_vec(&self) -> Vec { self.0.to_vec() } /// Return the length of the vector. pub fn len(&self) -> usize { self.0.len() } } // Vector conversion implementations. 
impl From> for Vector { fn from(value: Vec) -> Self { Vector(value.into_boxed_slice()) } } impl From for protos::Vector { fn from(value: Vector) -> Self { protos::Vector { data: value.to_vec() } } } impl TryFrom for Vector { type Error = Status; fn try_from(value: protos::Vector) -> Result { Ok(Vector(value.data.into_boxed_slice())) } } #[cfg(test)] mod tests { use super::*; #[test] fn test_random_vector() { let dim = 128; let vector = Vector::random(dim); assert_eq!(vector.len(), dim); } impl Vector { pub fn random(dimension: usize) -> Self { let vector = vec![0.0; dimension] .iter() .map(|_| rand::random::()) .collect::>(); Vector(vector.into_boxed_slice()) } } } ================================================ FILE: src/utils/kmeans.rs ================================================ use super::*; use rand::seq::SliceRandom; use rand::Rng; use std::cmp::min; use std::rc::Rc; type ClusterIndex = usize; /// A list of vectors. /// /// We use a reference-counted slice to store the vectors. This allows us to /// share the vectors around without having to actually clone the vectors. type Vectors<'v> = Rc<[&'v Vector]>; /// K-means clustering algorithm. /// /// The K-means algorithm is a clustering algorithm that partitions a dataset /// into K clusters by iteratively assigning data points to the nearest cluster /// centroids and recalculating these centroids until they are stable. #[derive(Debug)] pub struct KMeans { assignments: Vec, centroids: Vec, // Algorithm parameters. metric: Metric, n_clusters: usize, max_iter: usize, } impl KMeans { /// Initialize the K-means algorithm with default parameters. /// /// Default parameters: /// - metric: Euclidean /// - max_iter: 100 pub fn new(n_clusters: usize) -> Self { Self { n_clusters, metric: Metric::Euclidean, max_iter: 100, assignments: Vec::new(), centroids: Vec::with_capacity(n_clusters), } } /// Configure the metric used for distance calculations. 
pub fn with_metric(mut self, metric: Metric) -> Self { self.metric = metric; self } /// Configure the maximum number of iterations to run the algorithm. #[allow(dead_code)] pub fn with_max_iter(mut self, max_iter: usize) -> Self { self.max_iter = max_iter; self } /// Train the K-means algorithm with the given vectors. pub fn fit(&mut self, vectors: Vectors) -> Result<(), Box> { if self.n_clusters > vectors.len() { let message = "Dataset is smaller than cluster configuration."; return Err(message.into()); } self.centroids = self.initialize_centroids(vectors.clone()); self.assignments = vec![0; vectors.len()]; let mut no_improvement_count = 0; for _ in 0..self.max_iter { if no_improvement_count > 3 { break; } let assignments = self.assign_clusters(vectors.clone()); // Check at most 1000 assignments for convergence. // This prevents checking the entire dataset for large datasets. let end = min(1000, assignments.len()); match assignments[0..end] == self.assignments[0..end] { true => no_improvement_count += 1, false => no_improvement_count = 0, } self.assignments = assignments; self.centroids = self.update_centroids(vectors.clone()); } Ok(()) } fn initialize_centroids(&self, vectors: Vectors) -> Vec { let mut rng = rand::thread_rng(); let mut centroids = Vec::with_capacity(self.n_clusters); // Pick the first centroid randomly. let first_centroid = vectors.choose(&mut rng).cloned().unwrap(); centroids.push(first_centroid.to_owned()); for _ in 1..self.n_clusters { let nearest_centroid_distance = |vector: &&Vector| { centroids .iter() .map(|centroid| self.metric.distance(vector, centroid)) .min_by(|a, b| a.partial_cmp(b).unwrap()) .unwrap() .unwrap() }; let distances = vectors .par_iter() .map(nearest_centroid_distance) .collect::>(); // Choose the next centroid with probability proportional // to the squared distance. 
let threshold = rng.gen::() * distances.iter().sum::(); let mut cumulative_sum = 0.0; for (i, distance) in distances.iter().enumerate() { cumulative_sum += distance; if cumulative_sum >= threshold { centroids.push(vectors[i].clone()); break; } } } centroids } fn update_centroids(&self, vectors: Vectors) -> Vec { let dimension = vectors[0].len(); let mut centroids = vec![vec![0.0; dimension]; self.n_clusters]; let mut cluster_count = vec![0; self.n_clusters]; // Sum up vectors assigned to the cluster into the centroid. for (i, cluster_id) in self.assignments.iter().enumerate() { let cluster_id = *cluster_id; cluster_count[cluster_id] += 1; centroids[cluster_id] = centroids[cluster_id] .iter() .zip(vectors[i].as_slice().iter()) .map(|(a, b)| a + b) .collect(); } // Divide the sum by the number of vectors in the cluster. for i in 0..self.n_clusters { // If the cluster is empty, reinitialize the centroid. if cluster_count[i] == 0 { let mut rng = rand::thread_rng(); centroids[i] = vectors.choose(&mut rng).unwrap().to_vec(); continue; } centroids[i] = centroids[i] .iter() .map(|x| x / cluster_count[i] as f32) .collect(); } centroids.into_par_iter().map(|centroid| centroid.into()).collect() } /// Create cluster assignments for the vectors. fn assign_clusters(&self, vectors: Vectors) -> Vec { vectors .par_iter() .map(|vector| self.find_nearest_centroid(vector)) .collect() } /// Find the index of the nearest centroid from a vector. pub fn find_nearest_centroid(&self, vector: &Vector) -> ClusterIndex { self.centroids .par_iter() .enumerate() .map(|(i, centroid)| (i, self.metric.distance(vector, centroid))) .min_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap()) .map(|(id, _)| id) .unwrap() } /// Returns index-mapped cluster assignment for each data point. /// /// The index corresponds to the data point index and the value corresponds /// to the cluster index. 
For example, given the following assignments: /// /// ```text /// [0, 1, 0, 1, 2] /// ``` /// /// This means: /// - Point 0 and 2 are assigned to cluster 0. /// - Point 1 and 3 are assigned to cluster 1. /// - Point 4 is assigned to cluster 2. /// pub fn assignments(&self) -> &[ClusterIndex] { &self.assignments } /// Returns the centroids of each cluster. pub fn centroids(&self) -> &[Vector] { &self.centroids } } #[cfg(test)] mod tests { use super::*; #[test] fn test_kmeans_fit_1_to_1() { evaluate_kmeans(1, generate_vectors(1)); } #[test] fn test_kmeans_fit_10_to_5() { evaluate_kmeans(5, generate_vectors(10)); } #[test] fn test_kmeans_fit_100_to_10() { evaluate_kmeans(10, generate_vectors(100)); } fn evaluate_kmeans(n_cluster: usize, vectors: Vec) { let vectors: Vectors = { let vectors_ref: Vec<&Vector> = vectors.iter().collect(); Rc::from(vectors_ref.as_slice()) }; let mut kmeans = KMeans::new(n_cluster); kmeans.fit(vectors.clone()).unwrap(); assert_eq!(kmeans.centroids().len(), n_cluster); let mut correct_count = 0; for (i, clusted_id) in kmeans.assignments().iter().enumerate() { let vector = vectors[i]; let nearest_centroid = kmeans.find_nearest_centroid(vector); if clusted_id == &nearest_centroid { correct_count += 1; } } let accuracy = correct_count as f32 / vectors.len() as f32; assert!(accuracy > 0.99); } fn generate_vectors(n: usize) -> Vec { (0..n).map(|i| Vector::from(vec![i as f32; 3])).collect() } } ================================================ FILE: src/utils/mod.rs ================================================ pub mod kmeans; // Import common dependencies below. use crate::types::{Metric, Vector}; use rayon::prelude::*; use std::error::Error;