Repository: valeriansaliou/sonic Branch: master Commit: 75ec203693a3 Files: 153 Total size: 708.9 KB Directory structure: gitextract_z4cxjv9t/ ├── .dockerignore ├── .github/ │ ├── FUNDING.yml │ └── workflows/ │ ├── build.yml │ └── test.yml ├── .gitignore ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONFIGURATION.md ├── CONTRIBUTING.md ├── Cargo.toml ├── Dockerfile ├── INNER_WORKINGS.md ├── LICENSE.md ├── PACKAGING.md ├── PROTOCOL.md ├── README.md ├── config.cfg ├── debian/ │ ├── changelog │ ├── compat │ ├── control │ ├── copyright │ ├── rules │ ├── sonic.install │ ├── sonic.postinst │ ├── sonic.service │ └── source/ │ └── format ├── scripts/ │ ├── build_packages.sh │ ├── release_binaries.sh │ └── sign_binaries.sh ├── src/ │ ├── channel/ │ │ ├── command.rs │ │ ├── format.rs │ │ ├── handle.rs │ │ ├── listen.rs │ │ ├── macros.rs │ │ ├── message.rs │ │ ├── mod.rs │ │ ├── mode.rs │ │ └── statistics.rs │ ├── config/ │ │ ├── defaults.rs │ │ ├── env_var.rs │ │ ├── logger.rs │ │ ├── mod.rs │ │ ├── options.rs │ │ └── reader.rs │ ├── executor/ │ │ ├── count.rs │ │ ├── flushb.rs │ │ ├── flushc.rs │ │ ├── flusho.rs │ │ ├── list.rs │ │ ├── macros.rs │ │ ├── mod.rs │ │ ├── pop.rs │ │ ├── push.rs │ │ ├── search.rs │ │ └── suggest.rs │ ├── lexer/ │ │ ├── mod.rs │ │ ├── ranges.rs │ │ ├── stopwords.rs │ │ └── token.rs │ ├── main.rs │ ├── query/ │ │ ├── actions.rs │ │ ├── builder.rs │ │ ├── mod.rs │ │ └── types.rs │ ├── stopwords/ │ │ ├── afr.rs │ │ ├── aka.rs │ │ ├── amh.rs │ │ ├── ara.rs │ │ ├── aze.rs │ │ ├── bel.rs │ │ ├── ben.rs │ │ ├── bul.rs │ │ ├── cat.rs │ │ ├── ces.rs │ │ ├── cmn.rs │ │ ├── dan.rs │ │ ├── deu.rs │ │ ├── ell.rs │ │ ├── eng.rs │ │ ├── epo.rs │ │ ├── est.rs │ │ ├── fin.rs │ │ ├── fra.rs │ │ ├── guj.rs │ │ ├── heb.rs │ │ ├── hin.rs │ │ ├── hrv.rs │ │ ├── hun.rs │ │ ├── hye.rs │ │ ├── ind.rs │ │ ├── ita.rs │ │ ├── jav.rs │ │ ├── jpn.rs │ │ ├── kan.rs │ │ ├── kat.rs │ │ ├── khm.rs │ │ ├── kor.rs │ │ ├── lat.rs │ │ ├── lav.rs │ │ ├── lit.rs │ │ ├── mal.rs │ │ ├── 
mar.rs │ │ ├── mkd.rs │ │ ├── mod.rs │ │ ├── mya.rs │ │ ├── nep.rs │ │ ├── nld.rs │ │ ├── nob.rs │ │ ├── ori.rs │ │ ├── pan.rs │ │ ├── pes.rs │ │ ├── pol.rs │ │ ├── por.rs │ │ ├── ron.rs │ │ ├── rus.rs │ │ ├── sin.rs │ │ ├── slk.rs │ │ ├── slv.rs │ │ ├── sna.rs │ │ ├── spa.rs │ │ ├── srp.rs │ │ ├── swe.rs │ │ ├── tam.rs │ │ ├── tel.rs │ │ ├── tgl.rs │ │ ├── tha.rs │ │ ├── tuk.rs │ │ ├── tur.rs │ │ ├── ukr.rs │ │ ├── urd.rs │ │ ├── uzb.rs │ │ ├── vie.rs │ │ ├── yid.rs │ │ └── zul.rs │ ├── store/ │ │ ├── fst.rs │ │ ├── generic.rs │ │ ├── identifiers.rs │ │ ├── item.rs │ │ ├── keyer.rs │ │ ├── kv.rs │ │ ├── macros.rs │ │ ├── mod.rs │ │ └── operation.rs │ └── tasker/ │ ├── mod.rs │ ├── runtime.rs │ └── shutdown.rs └── tests/ └── integration/ ├── .gitignore ├── instance/ │ └── config.cfg ├── runner/ │ ├── package.json │ └── runner.js ├── scenarios/ │ ├── insert.js │ └── ping.js └── scripts/ └── run.sh ================================================ FILE CONTENTS ================================================ ================================================ FILE: .dockerignore ================================================ tests/* target/* data/* ================================================ FILE: .github/FUNDING.yml ================================================ # These are supported funding model platforms github: valeriansaliou ================================================ FILE: .github/workflows/build.yml ================================================ on: push: tags: - "v*.*.*" name: Build and Release jobs: build-releases: runs-on: ubuntu-22.04 steps: - name: Checkout code uses: actions/checkout@v2 - name: Cache build artifacts id: cache-cargo uses: actions/cache@v4 with: path: | ~/.cargo/bin ~/.cargo/registry ~/.cargo/git target key: build-${{ runner.os }}-cargo-any - name: Install Rust toolchain uses: actions-rs/toolchain@v1 with: toolchain: stable components: rustfmt override: true - name: Verify versions run: rustc --version && rustup --version && 
cargo --version - name: Get current tag id: current_tag uses: WyriHaximus/github-action-get-previous-tag@v1 - name: Release package run: cargo publish --no-verify --token ${CRATES_TOKEN} env: CRATES_TOKEN: ${{ secrets.CRATES_TOKEN }} - name: Release binaries run: ./scripts/release_binaries.sh --version=${{ steps.current_tag.outputs.tag }} - name: Release new version uses: softprops/action-gh-release@v1 with: tag_name: ${{ steps.current_tag.outputs.tag }} name: Sonic ${{ steps.current_tag.outputs.tag }} body: "⚠️ Changelog not yet provided." files: ./${{ steps.current_tag.outputs.tag }}-*.tar.gz env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} build-packages: needs: build-releases runs-on: ubuntu-latest steps: - name: Checkout code uses: actions/checkout@v2 - name: Build packages run: ./scripts/build_packages.sh - name: Push packages to Packagecloud uses: faucetsdn/action-packagecloud-upload-debian-packages@v1 with: path: ./packages repo: ${{ secrets.PACKAGECLOUD_REPO }} token: ${{ secrets.PACKAGECLOUD_TOKEN }} build-docker: runs-on: ubuntu-latest steps: - name: Checkout code uses: actions/checkout@v4 - name: Acquire Docker image metadata id: metadata uses: docker/metadata-action@v4 with: images: valeriansaliou/sonic - name: Login to Docker Hub uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - name: Build and push Docker image uses: docker/build-push-action@v4 id: build with: context: . 
tags: ${{ steps.metadata.outputs.tags }} labels: ${{ steps.metadata.outputs.labels }} push: true ================================================ FILE: .github/workflows/test.yml ================================================ on: [push, pull_request] name: Test and Build jobs: test: strategy: matrix: os: [ubuntu-latest] rust-toolchain: [stable] fail-fast: false runs-on: ${{ matrix.os }} steps: - name: Checkout code uses: actions/checkout@v2 - name: Cache build artifacts id: cache-cargo uses: actions/cache@v4 with: path: | ~/.cargo/registry ~/.cargo/git target key: test-${{ runner.os }}-cargo-${{ matrix.rust-toolchain }} - name: Cache integration artifacts id: cache-integration uses: actions/cache@v4 with: path: | tests/integration/runner/node_modules key: test-${{ runner.os }}-integration-${{ matrix.rust-toolchain }} - name: Install Rust toolchain uses: actions-rs/toolchain@v1 with: toolchain: ${{ matrix.rust-toolchain }} components: rustfmt override: true - name: Install NodeJS uses: actions/setup-node@v1 - name: Verify versions run: rustc --version && rustup --version && cargo --version && node --version && npm --version - name: Build code run: cargo build - name: Test code run: cargo test - name: Check code style run: cargo fmt -- --check - name: Run integration tests run: tests/integration/scripts/run.sh ================================================ FILE: .gitignore ================================================ target/* .DS_Store *~ *# .cargo data/store/fst/* data/store/kv/* ================================================ FILE: CHANGELOG.md ================================================ Sonic Changelog =============== ## 1.4.9 (2024-06-16) ### Changes * Update Rust code style to conform to new `rustc` requirements (preventing builds on `rustc 1.79.0` and further) [[@jaseemabid](https://github.com/jaseemabid), [#321](https://github.com/valeriansaliou/sonic/pull/321)]. 
## 1.4.8 (2023-12-14) ### Changes * Pull out the `arm64` platform from the Docker image, since it does not build in an acceptable time via GitHub Actions due to using QEMU emulation (waiting until GitHub Actions provides a native `arm64` runner) [[@valeriansaliou](https://github.com/valeriansaliou)]. ## 1.4.7 (2023-12-14) ### Bug Fixes * Fixed non-working `arm64` builds due to hardcoded `x86_64-unknown-linux-gnu` Rust target in the `Dockerfile` [[@valeriansaliou](https://github.com/valeriansaliou)]. ## 1.4.6 (2023-12-14) ### New Features * The Docker image is now also available for the `arm64` platform, in addition to `amd64` [[@PovilasID](https://github.com/PovilasID), [#310](https://github.com/valeriansaliou/sonic/pull/310)]. ## 1.4.5 (2023-12-11) ### Bug Fixes * Fixed an issue where the system clock can move back to the past on a virtualized system, resulting in client threads entering a crash loop due to mutex poisoning [[@valeriansaliou](https://github.com/valeriansaliou)]. ## 1.4.4 (2023-12-08) ### Bug Fixes * Fixed `rocksdb` not building due to a `rust-bindgen` version which was not compatible with `clang` version 16 [[@anthonyroussel](https://github.com/anthonyroussel), [#316](https://github.com/valeriansaliou/sonic/pull/316)]. ### Changes * Dependencies have been bumped to latest versions (namely: `rocksdb`, `toml`, `regex-syntax`, `hashbrown`, `lindera-core`, `lindera-dictionary`, `lindera-tokenizer`) [[@valeriansaliou](https://github.com/valeriansaliou)]. ## 1.4.3 (2023-09-04) ### Changes * Publish `.deb` packages for Debian 12 on `x86_64` architecture [[@valeriansaliou](https://github.com/valeriansaliou)]. ## 1.4.2 (2023-09-04) ### Changes * Produce `glibc` builds from GitHub Actions whenever a new Sonic version gets released [[@valeriansaliou](https://github.com/valeriansaliou)]. * Pull out `tokenizer-japanese` from the default features, as it makes the final binary about 10x larger [[@valeriansaliou](https://github.com/valeriansaliou)]. 
## 1.4.1 (2023-08-12) ### New Features * Added support for Japanese word segmentation in tokenizer (note that as this adds quite some size overhead to the final binary size, the feature `tokenizer-japanese` can be disabled when building Sonic) [[@nmkj-io](https://github.com/nmkj-io), [#311](https://github.com/valeriansaliou/sonic/pull/311)]. ## 1.4.0 (2022-10-20) ### Bug Fixes * Fixed typo in README abstract [[@remram44](https://github.com/remram44), [#295](https://github.com/valeriansaliou/sonic/pull/295)]. * Fixed typos in code and documentation [[@kianmeng](https://github.com/kianmeng), [#294](https://github.com/valeriansaliou/sonic/pull/294)]. ### Changes * Replaced Docker source image from Debian Slim to lighter Google distroless image [[@0x0x1](https://github.com/0x0x1), [#282](https://github.com/valeriansaliou/sonic/pull/282)]. ### New Features * Added an index enumeration `LIST` command to Sonic Channel [[@trkohler](https://github.com/trkohler), [#293](https://github.com/valeriansaliou/sonic/pull/293)]. ## 1.3.5 (2022-07-10) ### Bug Fixes * Rolled back `rocksdb` version, as the latest version does not link properly in `--release` mode [[@valeriansaliou](https://github.com/valeriansaliou)]. ## 1.3.4 (2022-07-10) ### Changes * Dependencies have been bumped to latest versions (namely: `rocksdb`, `clap`, `regex`) [[@valeriansaliou](https://github.com/valeriansaliou)]. ## 1.3.3 (2022-07-07) ### Changes * Dependencies have been bumped to latest versions (namely: `hashbrown`, `whatlang`, `regex`) [[@valeriansaliou](https://github.com/valeriansaliou)]. * Moved the release pipeline to GitHub Actions [[@valeriansaliou](https://github.com/valeriansaliou)]. ### New Features * The language detection system is now about 2x faster (due to the upgrade of `whatlang` past `v0.14.0`) [[@valeriansaliou](https://github.com/valeriansaliou)]. * Added Armenian stopwords [[@valeriansaliou](https://github.com/valeriansaliou)]. 
* Added Georgian stopwords [[@valeriansaliou](https://github.com/valeriansaliou)]. * Added Gujarati stopwords [[@valeriansaliou](https://github.com/valeriansaliou)]. * Added Tagalog stopwords [[@valeriansaliou](https://github.com/valeriansaliou)]. ## 1.3.2 (2021-11-09) ### Bug Fixes * Fixed Norwegian stopwords [[@valeriansaliou](https://github.com/valeriansaliou), [#239](https://github.com/valeriansaliou/sonic/issues/239)]. ### Changes * Code has been formatted according to `clippy` recommendations. This does not change the way Sonic behaves [[@pleshevskiy](https://github.com/pleshevskiy), [#233](https://github.com/valeriansaliou/sonic/pull/233)]. ### New Features * Added support for Chinese word segmentation in tokenizer (note that as this adds quite some size overhead to the final binary size, the feature `tokenizer-chinese` can be disabled when building Sonic) [[@vincascm](https://github.com/vincascm), [#209](https://github.com/valeriansaliou/sonic/pull/209)]. ## 1.3.1 (2021-11-02) ### Changes * Apple Silicon is now supported [[@valeriansaliou](https://github.com/valeriansaliou)]. * Added Norwegian stopwords [[@mikalv](https://github.com/mikalv), [#236](https://github.com/valeriansaliou/sonic/pull/236)]. * Added Catalan stopwords [[@coopanio](https://github.com/coopanio), [#227](https://github.com/valeriansaliou/sonic/pull/227)]. * Dependencies have been bumped to latest versions (namely: `rocksdb`, `fst-levenshtein`, `fst-regex`, `hashbrown`, `whatlang`, `byteorder`, `rand`) [[@valeriansaliou](https://github.com/valeriansaliou)]. ### Deprecations * A few rarely-used languages have been removed, following `whatlang` `v0.12.0` release, [see the notes here](https://github.com/greyblake/whatlang-rs/blob/master/CHANGELOG.md#v0120---2021-04-18) [[@valeriansaliou](https://github.com/valeriansaliou), [940d3c3](https://github.com/valeriansaliou/sonic/commit/940d3c3070e144a10f041fcfdf77d15548598eee)]. 
## 1.3.0 (2020-06-27) ### Changes * Added support for Slovak, which is now auto-detected from terms [[@valeriansaliou](https://github.com/valeriansaliou), [19412ce](https://github.com/valeriansaliou/sonic/commit/19412ce05a802ef1e6054b751faaef50cab5d36b)]. * Added Slovak stopwords [[@valeriansaliou](https://github.com/valeriansaliou), [19412ce](https://github.com/valeriansaliou/sonic/commit/19412ce05a802ef1e6054b751faaef50cab5d36b)]. * Dependencies have been bumped to latest versions (namely: `whatlang`) [[@valeriansaliou](https://github.com/valeriansaliou), [19412ce](https://github.com/valeriansaliou/sonic/commit/19412ce05a802ef1e6054b751faaef50cab5d36b)]. ## 1.2.4 (2020-06-25) ### Bug Fixes * Fixed multiple deadlocks, which were not noticed in practice when running Sonic at scale, but which are still theoretically possible [[@BurtonQin](https://github.com/BurtonQin), [#213](https://github.com/valeriansaliou/sonic/pull/213), [#211](https://github.com/valeriansaliou/sonic/pull/211)]. ### Changes * Added support for Latin, which is now auto-detected from terms [[@valeriansaliou](https://github.com/valeriansaliou), [e6c5621](https://github.com/valeriansaliou/sonic/commit/e6c5621ba0fabe83b8bc060824951006b373dc3f)]. * Added Latin stopwords [[@valeriansaliou](https://github.com/valeriansaliou), [e6c5621](https://github.com/valeriansaliou/sonic/commit/e6c5621ba0fabe83b8bc060824951006b373dc3f)]. * Dependencies have been bumped to latest versions (namely: `rocksdb`, `radix`, `hashbrown`, `whatlang`) [[@valeriansaliou](https://github.com/valeriansaliou)]. ### New Features * Added a release script, with cross-compilation capabilities (currently for the `x86_64` architecture, dynamically linked against GNU libraries) [[@valeriansaliou](https://github.com/valeriansaliou), [961bab9](https://github.com/valeriansaliou/sonic/commit/961bab92211295e99f1f6052577fa1aeff459d0c)]. 
## 1.2.3 (2019-10-14) ### Changes * RocksDB compression algorithm has been changed from LZ4 to Zstandard, for a slightly better compression ratio, and much better read/write performance; this will be used for new SST files only [[@valeriansaliou](https://github.com/valeriansaliou), [cd4cdfb](https://github.com/valeriansaliou/sonic/commit/cd4cdfb756ae9eccd43dc7e73d2c115b33297714)]. * Dependencies have been bumped to latest versions (namely: `rocksdb`) [[@valeriansaliou](https://github.com/valeriansaliou), [cd4cdfb](https://github.com/valeriansaliou/sonic/commit/cd4cdfb756ae9eccd43dc7e73d2c115b33297714)]. ## 1.2.2 (2019-07-12) ### Bug Fixes * Fixed a regression on optional configuration values not working anymore, due to an issue in the environment variable reading system introduced in `v1.2.1` [[@valeriansaliou](https://github.com/valeriansaliou), [#155](https://github.com/valeriansaliou/sonic/issues/155)]. ### Changes * Optimized some aspects of FST consolidation and pending operations management [[@valeriansaliou](https://github.com/valeriansaliou), [#156](https://github.com/valeriansaliou/sonic/issues/156)]. ## 1.2.1 (2019-07-08) ### Changes * FST graph consolidation is now able to ignore new words when the graph is over configured limits, which are set with the new `store.fst.graph.max_size` and `store.fst.graph.max_words` configuration variables [[@valeriansaliou](https://github.com/valeriansaliou), [53db9c1](https://github.com/valeriansaliou/sonic/commit/53db9c186630a6751c0a85e610cebabace1aee2b)]. * An integration testing infrastructure has been added to the Sonic automated test suite [[@vilunov](https://github.com/vilunov), [#154](https://github.com/valeriansaliou/sonic/pull/154)]. * Configuration values can now be sourced from environment variables, using the `${env.VARIABLE}` syntax in `config.cfg` [[@perzanko](https://github.com/perzanko), [#148](https://github.com/valeriansaliou/sonic/pull/148)]. 
* Dependencies have been bumped to latest versions (namely: `rand`, `radix` and `hashbrown`) [[@valeriansaliou](https://github.com/valeriansaliou), [c1b1f54](https://github.com/valeriansaliou/sonic/commit/c1b1f54ad836df553bec0cd14f041bb34058307c)]. ## 1.2.0 (2019-05-03) ### Bug Fixes * Fixed a rare deadlock occurring when 3 concurrent operations get executed on different threads for the same collection, in the following timely order: `PUSH` then `FLUSHB` then `PUSH` [[@valeriansaliou](https://github.com/valeriansaliou), [d96546b](https://github.com/valeriansaliou/sonic/commit/d96546bd9d8b79332df1106766377e4a4acebd50)]. ### Changes * Reworked the KV store manager to perform periodic memory flushes to disk, thus reducing startup time [[@valeriansaliou](https://github.com/valeriansaliou), [6713488](https://github.com/valeriansaliou/sonic/commit/6713488af3543bca33be6e772936f9668430ba86)]. * Stop accepting Sonic Channel commands when shutting down Sonic [[@valeriansaliou](https://github.com/valeriansaliou), [#131](https://github.com/valeriansaliou/sonic/issues/131)]. ### New Features * Introduced a server statistics `INFO` command to Sonic Channel [[@valeriansaliou](https://github.com/valeriansaliou), [#70](https://github.com/valeriansaliou/sonic/issues/70)]. * Added the ability to disable the lexer for a command with the command modifier `LANG(none)` [[@valeriansaliou](https://github.com/valeriansaliou), [#108](https://github.com/valeriansaliou/sonic/issues/108)]. * Added a backup and restore system for both KV and FST stores, which can be triggered over Sonic Channel with `TRIGGER backup` and `TRIGGER restore` [[@valeriansaliou](https://github.com/valeriansaliou), [#5](https://github.com/valeriansaliou/sonic/issues/5)]. 
* Added the ability to disable KV store WAL (Write-Ahead Log) with the `write_ahead_log` option, which helps limit write wear on heavily loaded SSD-backed servers [[@valeriansaliou](https://github.com/valeriansaliou), [#130](https://github.com/valeriansaliou/sonic/issues/130)]. ## 1.1.9 (2019-03-29) ### Bug Fixes * RocksDB has been bumped to `v5.18.3`, which fixes a dead-lock occurring in RocksDB at scale when a compaction task is run under heavy disk writes (ie. disk flushes). This dead-lock was causing Sonic to stop responding to any command issued for the frozen collection. This dead-lock was due to a bug in RocksDB internals (not originating from Sonic itself) [[@baptistejamin](https://github.com/baptistejamin), [19c4a10](https://github.com/baptistejamin/sonic/commit/19c4a104a6d6aaed1dd9beb2e51d2639627825cd)]. ### Changes * Reworked the `FLUSHB` command internals, which now use the atomic `delete_range()` operation provided by RocksDB `v5.18` [[@valeriansaliou](https://github.com/valeriansaliou), [660f8b7](https://github.com/valeriansaliou/sonic/commit/660f8b714d968400fb9f88a245752dca02249bf7)]. ### New Features * Added the `LANG()` command modifier for `QUERY` and `PUSH`, that lets a Sonic Channel client force a text locale (instead of letting the lexer system guess the text language) [[@valeriansaliou](https://github.com/valeriansaliou), [#75](https://github.com/valeriansaliou/sonic/issues/75)]. * The FST word lookup system, used by the `SUGGEST` command, now supports all scripts via a restricted Unicode range forward scan [[@valeriansaliou](https://github.com/valeriansaliou), [#64](https://github.com/valeriansaliou/sonic/issues/64)]. ## 1.1.8 (2019-03-27) ### Bug Fixes * A store acquire lock has been added to prevent 2 concurrent threads from opening the same collection at the same time [[@valeriansaliou](https://github.com/valeriansaliou), [2628077](https://github.com/valeriansaliou/sonic/commit/2628077ebe7e24155975962471e7653745a0add7)]. 
## 1.1.7 (2019-03-27) ### Bug Fixes * A superfluous mutex was removed from KV and FST store managers, in an attempt to solve a rare dead-lock occurring on high-traffic Sonic setups in the KV store [[@valeriansaliou](https://github.com/valeriansaliou), [60566d2](https://github.com/valeriansaliou/sonic/commit/60566d2f087fd6725dba4a60c3c5a3fef7e8399b)]. ## 1.1.6 (2019-03-27) ### Changes * Reverted changes made in `v1.1.5` regarding the open files `rlimit`, as this can be set from outside Sonic [[@valeriansaliou](https://github.com/valeriansaliou), [f6400c6](https://github.com/valeriansaliou/sonic/commit/f6400c61a9a956130ae0bdaa9a164f4955cd2a18)]. * Added Chinese Traditional stopwords [[@dsewnr](https://github.com/dsewnr), [#87](https://github.com/valeriansaliou/sonic/issues/87)]. ### Bug Fixes * Improved the way database locking is handled when calling a pool janitor; this prevents potential dead-locks under high load [[@valeriansaliou](https://github.com/valeriansaliou), [fa78372](https://github.com/valeriansaliou/sonic/commit/fa783728fd27a116b8dcf9a7180740d204b69aa4)]. ## 1.1.5 (2019-03-27) ### New Features * Added the `server.limit_open_files` configuration variable to allow configuring `rlimit` [[@valeriansaliou](https://github.com/valeriansaliou)]. ## 1.1.4 (2019-03-27) ### Changes * Added Kannada stopwords [[@dileepbapat](https://github.com/dileepbapat)]. * The Docker image is now much lighter [[@codeflows](https://github.com/codeflows)]. ### New Features * Automatically adjust `rlimit` for the process to the hard limit allowed by the system (allows opening more FSTs in parallel) [[@valeriansaliou](https://github.com/valeriansaliou)]. ## 1.1.3 (2019-03-25) ### Changes * Limit the size of words that can hit against the FST graph, as the FST gets slower for long words [[@valeriansaliou](https://github.com/valeriansaliou), [#81](https://github.com/valeriansaliou/sonic/issues/81)]. 
### Bug Fixes * Rework Sonic Channel buffer management using a VecDeque (Sonic should now work better in harsh network environments) [[@valeriansaliou](https://github.com/valeriansaliou), [1c2b9c8](https://github.com/valeriansaliou/sonic/commit/1c2b9c8fcd28b033a7cb80d678c388ce78ab989d)]. ## 1.1.2 (2019-03-24) ### Changes * FST graph consolidation locking strategy has been improved even further, based on issues with the previous rework we have noticed at scale in production (now, consolidation locking is done at a lower-priority relative to actual queries and pushes to the index) [[@valeriansaliou](https://github.com/valeriansaliou), [#68](https://github.com/valeriansaliou/sonic/issues/68)]. ## 1.1.1 (2019-03-24) ### Changes * FST graph consolidation locking strategy has been reworked as to allow queries to be executed lock-free when the FST consolidate task takes a lot of time (previously, queries were being deferred due to an ongoing FST consolidate task) [[@valeriansaliou](https://github.com/valeriansaliou), [#68](https://github.com/valeriansaliou/sonic/issues/68)]. * Removed special license clause introduced in `v1.0.2`, Sonic is full `MPL 2.0` now. [[@valeriansaliou](https://github.com/valeriansaliou)] ## 1.1.0 (2019-03-21) ### Breaking Changes * Change how buckets are stored in a KV-based collection (nest them in the same RocksDB database; this is much more efficient on setups with a large number of buckets - **`v1.1.0` is incompatible with the `v1.0.0` KV database format**) [[@valeriansaliou](https://github.com/valeriansaliou)]. ### Changes * Bump `jemallocator` to version `0.3` [[@valeriansaliou](https://github.com/valeriansaliou)]. ## 1.0.2 (2019-03-20) ### Changes * Re-license from `MPL 2.0` to `SOSSL 1.0` (Sonic has a special license clause) [[@valeriansaliou](https://github.com/valeriansaliou)]. 
## 1.0.1 (2019-03-19) ### Changes * Added automated benchmarks (can be run via `cargo bench --features benchmark`) [[@valeriansaliou](https://github.com/valeriansaliou)]. * Reduced the time to query the search index by 50% via optimizations (in multiple methods, eg. the lexer) [[@valeriansaliou](https://github.com/valeriansaliou)]. ## 1.0.0 (2019-03-18) ### New Features * Initial Sonic release [[@valeriansaliou](https://github.com/valeriansaliou)]. ================================================ FILE: CODE_OF_CONDUCT.md ================================================ # Contributor Covenant Code of Conduct ## Our Pledge In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. 
## Our Standards Examples of behavior that contributes to creating a positive environment include: * Using welcoming and inclusive language * Being respectful of differing viewpoints and experiences * Gracefully accepting constructive criticism * Focusing on what is best for the community * Showing empathy towards other community members Examples of unacceptable behavior by participants include: * The use of sexualized language or imagery and unwelcome sexual attention or advances * Trolling, insulting/derogatory comments, and personal or political attacks * Public or private harassment * Publishing others' private information, such as a physical or electronic address, without explicit permission * Other conduct which could reasonably be considered inappropriate in a professional setting ## Our Responsibilities Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. ## Scope This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. ## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at valerian@valeriansaliou.name. 
All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. ## Attribution This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html [homepage]: https://www.contributor-covenant.org For answers to common questions about this code of conduct, see https://www.contributor-covenant.org/faq ================================================ FILE: CONFIGURATION.md ================================================ Sonic Configuration =================== # File: config.cfg **All available configuration options are commented below, with allowed values:** **[server]** * `log_level` (type: _string_, allowed: `debug`, `info`, `warn`, `error`, default: `error`) — Verbosity of logging, set it to `error` in production **[channel]** * `inet` (type: _string_, allowed: IPv4 / IPv6 + port, default: `[::1]:1491`) — Host and TCP port Sonic Channel should listen on * `tcp_timeout` (type: _integer_, allowed: seconds, default: `300`) — Timeout of idle/dead client connections to Sonic Channel * `auth_password` (type: _string_, allowed: password values, default: none) — Authentication password required to connect to the channel (optional but recommended) **[channel.search]** * `query_limit_default` (type: _integer_, allowed: numbers, default: `10`) — Default search results limit for a query command (if the LIMIT command modifier is not used when issuing a QUERY command) * `query_limit_maximum` (type: _integer_, allowed: numbers, default: 
`100`) — Maximum search results limit for a query command (if the LIMIT command modifier is being used when issuing a QUERY command) * `query_alternates_try` (type: _integer_, allowed: numbers, default: `4`) — Number of alternate words that look like query word to try if there are not enough query results (if zero, no alternate will be tried; if too high there may be a noticeable performance penalty) * `suggest_limit_default` (type: _integer_, allowed: numbers, default: `5`) — Default suggested words limit for a suggest command (if the LIMIT command modifier is not used when issuing a SUGGEST command) * `suggest_limit_maximum` (type: _integer_, allowed: numbers, default: `20`) — Maximum suggested words limit for a suggest command (if the LIMIT command modifier is being used when issuing a SUGGEST command) * `list_limit_default` (type: _integer_, allowed: numbers, default: `100`) — Default listed words limit for a list command (if the LIMIT command modifier is not used when issuing a LIST command) * `list_limit_maximum` (type: _integer_, allowed: numbers, default: `500`) — Maximum listed words limit for a list command (if the LIMIT command modifier is being used when issuing a LIST command) **[store]** **[store.kv]** * `path` (type: _string_, allowed: UNIX path, default: `./data/store/kv/`) — Path to the Key-Value database store * `retain_word_objects` (type: _integer_, allowed: numbers, default: `1000`) — Maximum number of objects a given word in the index can be linked to (older objects are cleared using a sliding window) **[store.kv.pool]** * `inactive_after` (type: _integer_, allowed: seconds, default: `1800`) — Time after which a cached database is considered inactive and can be closed (if it is not used, ie. 
re-activated) **[store.kv.database]** * `flush_after` (type: _integer_, allowed: seconds, default: `900`) — Time after which pending database updates should be flushed from memory to disk (increase this delay if you encounter high-CPU usage issues when a flush task kicks-in; this value should be lower than `store.kv.pool.inactive_after`) * `compress` (type: _boolean_, allowed: `true`, `false`, default: `true`) — Whether to compress database or not (uses Zstandard) * `parallelism` (type: _integer_, allowed: numbers, default: `2`) — Limit on the number of compaction and flush threads that can run at the same time * `max_files` (type: _integer_, allowed: numbers, no default) — Maximum number of database files kept open at the same time per-database (if any; otherwise there are no limits) * `max_compactions` (type: _integer_, allowed: numbers, default: `1`) — Limit on the number of concurrent database compaction jobs * `max_flushes` (type: _integer_, allowed: numbers, default: `1`) — Limit on the number of concurrent database flush jobs * `write_buffer` (type: _integer_, allowed: numbers, default: `16384`) — Maximum size in KB of the database write buffer, after which data gets flushed to disk (ie. `16384` is `16MB`; the size should be a multiple of `1024`, eg. `128 * 1024 = 131072` for `128MB`) * `write_ahead_log` (type: _boolean_, allowed: `true`, `false`, default: `true`) — Whether to enable Write-Ahead Log or not (it avoids losing non-flushed data in case of server crash) **[store.fst]** * `path` (type: _string_, allowed: UNIX path, default: `./data/store/fst/`) — Path to the Finite-State Transducer database store **[store.fst.pool]** * `inactive_after` (type: _integer_, allowed: seconds, default: `300`) — Time after which a cached graph is considered inactive and can be closed (if it is not used, ie. 
re-activated) **[store.fst.graph]** * `consolidate_after` (type: _integer_, allowed: seconds, default: `180`) — Time after which a graph that has pending updates should be consolidated (increase this delay if you encounter high-CPU usage issues when a consolidation task kicks in; this value should be lower than `store.fst.pool.inactive_after`) * `max_size` (type: _integer_, allowed: numbers, default: `2048`) — Maximum size in KB of the graph file on disk, after which further words are not inserted anymore (ie. `2048` is `2MB`; the size should be a multiple of `1024`, eg. `8 * 1024 = 8192` for `8MB`; use this limit to prevent heavy graphs from consolidating forever; this limit is enforced together with `store.fst.graph.max_words`, whichever is reached first) * `max_words` (type: _integer_, allowed: numbers, default: `250000`) — Maximum number of words that can be held at the same time in the graph, after which further words are not inserted anymore (use this limit to prevent heavy graphs from consolidating forever; this limit is enforced together with `store.fst.graph.max_size`, whichever is reached first) # Command-Line: Environment variables You are allowed to use environment variables in the configuration file.
**You can provide them as follows:** ```toml [channel] auth_password = "${env.SECRET}" ``` **Then, you can run Sonic providing a defined environment variable:** ```bash SECRET=secretphrase ./sonic -c /path/to/config.cfg ``` _Note that this can only be used with string-like values._ ================================================ FILE: CONTRIBUTING.md ================================================ Sonic Contributing Guide ======================== # Get Started - First of all, fork and clone this repo; - Install Rust and Cargo (to build and test Sonic); - Install NPM (for integration tests); ## Build Sonic From the repository root, run: ```sh cargo build ``` ## Start Sonic From the repository root, run: ```sh cargo run ``` ## Run unit tests From the repository root, run: ```sh cargo test ``` ## Run integration tests From the directory: `/tests/integration/scripts/`, run: ```sh ./run.sh ``` # Report Issues & Request Features **If you encounter an issue with Sonic, or would like to request a feature to be implemented, please do [open an issue](https://github.com/valeriansaliou/sonic/issues/new).** Note that before opening an issue, you should always search for other similar issues as to avoid opening a duplicate issue. This makes the life of the project maintainer much easier. When writing your issue title and command, make sure to be as precise as possible, giving away the maximum amount of details (even if you have a feeling some details are useless, they might make debugging or understanding easier for us). # Submit Your Code **If you would like to contribute directly by writing code, you should fork this repository and edit it right away from your GitHub namespace.** Once you are done with your work, always ensure to format your Rust code according to guidelines, via the [rustfmt](https://github.com/rust-lang/rustfmt) utility: `rustfmt src/*.rs` When this is done, you may open a Pull Request (PR), then explain your changes and their purpose precisely. 
We will finally accept or comment on your Pull Request, if we need more changes done on your code. ================================================ FILE: Cargo.toml ================================================ [package] name = "sonic-server" version = "1.4.9" description = "Fast, lightweight and schema-less search backend." readme = "README.md" license = "MPL-2.0" edition = "2018" homepage = "https://github.com/valeriansaliou/sonic" repository = "https://github.com/valeriansaliou/sonic.git" keywords = ["search", "query", "server", "index"] categories = ["database-implementations", "web-programming"] authors = ["Valerian Saliou ", "Baptiste Jamin "] [[bin]] name = "sonic" path = "src/main.rs" doc = false [dependencies] log = "0.4" toml = "0.8" clap = { version = "3.2", features = ["std", "cargo"] } lazy_static = "1.4" serde = "1.0" serde_derive = "1.0" rand = "0.8" unicode-segmentation = "1.6" radix = "0.6" rocksdb = { version = "0.24", features = ["zstd"] } fst = "0.3" fst-levenshtein = "0.3" fst-regex = "0.3" regex-syntax = "0.8" twox-hash = "1.5" byteorder = "1.4" hashbrown = "0.14" linked_hash_set = "0.1" whatlang = "0.16" regex = "1.6" jieba-rs = { version = "0.7", optional = true } lindera-core = { version = "0.31", optional = true } lindera-dictionary = { version = "0.31", features = ["unidic"], optional = true } lindera-tokenizer = { version = "0.31", features = ["unidic"], optional = true } [target.'cfg(unix)'.dependencies] nix = { version = "0.31.1", features = ["signal"] } tikv-jemallocator = { version = "0.4", optional = true } [target.'cfg(windows)'.dependencies] winapi = { version = "0.3", features = ["minwindef", "consoleapi"] } [features] default = ["allocator-jemalloc", "tokenizer-chinese"] allocator-jemalloc = ["tikv-jemallocator"] tokenizer-chinese = ["jieba-rs"] tokenizer-japanese = ["lindera-core", "lindera-dictionary", "lindera-tokenizer"] benchmark = [] [profile.dev] opt-level = 0 debug = true debug-assertions = true [profile.release] 
opt-level = 3 lto = true debug = false debug-assertions = false strip = true [profile.bench] opt-level = 3 debug = false debug-assertions = false ================================================ FILE: Dockerfile ================================================ FROM rust:slim-bullseye AS build RUN apt-get update RUN apt-get install -y build-essential clang RUN rustup --version RUN rustup component add rustfmt RUN rustc --version && \ rustup --version && \ cargo --version WORKDIR /app COPY . /app RUN cargo clean && cargo build --release RUN strip ./target/release/sonic FROM gcr.io/distroless/cc WORKDIR /usr/src/sonic COPY --from=build /app/target/release/sonic /usr/local/bin/sonic CMD [ "sonic", "-c", "/etc/sonic.cfg" ] EXPOSE 1491 ================================================ FILE: INNER_WORKINGS.md ================================================ Sonic Inner Workings ==================== This document was written with the goal of explaining the inner workings of Sonic, as well as the whys of the design choices that were made while building Sonic. Anyone reading this documentation should quickly get more familiar in how such a search index can be built from scratch, to the point that they should be able to start building their own Sonic from scratch. _If you feel something is missing from this document, or if it did not help you understand a concept Sonic implements, please [open an issue](https://github.com/valeriansaliou/sonic/issues/new) and explain precisely which part you did not get and why you think you did not get it._ # The Building Blocks of a Search Index ## Basics of a search index A search index is nothing more than a specialized database. It should expose primitives such as: query the index, push text in the index, pop text from the index, flush parts of the index. The search index server is responsible for organizing the index data in a way that makes writes and reads efficient. 
It makes use of specialized data structures for some very specific operations like typo corrections. The overall goal of such a search index system is: speed, lightweightness and data compactness (ie. it should minimize the resulting output database size given a text input size). As to provide flexibility to organize indexed data, the search index is organized into collections that contain buckets. Buckets contain indexed objects. This means that you can organize your search index within a depth of 2 layers. Objects are actual search results; you could push an object `result_1` to collection `messages` within bucket `user_1`. This would index `messages` for `user_1` with result `result_1`. Later on, one could search for `messages` matching a given query for `user_1`. If the Sonic user use case does not require using buckets, the bucket value can still be set to a generic value, eg. `default`. Sonic, unlike many other search index systems, does not serve actual documents as search results. A strategic choice was made to store only identifiers referring to primary keys in an external database, which makes the data stored on-disk as compact as it can be. Users can still refer to their external database to fetch actual search result documents, using identifiers provided by Sonic. It is worth noting that any project initiated as of 2019 should make use of modern server hardware, which is mostly all about multi-core CPUs and SSDs. Also, Sonic should be very mindful of minimizing its resource requirements — _from a cold start to running under high load_ — as a lot of developers nowadays expect to run software on cheap VPS servers with limited CPU time, small disk space and little RAM. Those modern VPS are nonetheless powered by modern SSDs with fast random I/O. Last but not least, it would definitely be a plus if we could make software a bit greener. In order to address the above, Sonic is capable of running queries over multiple CPUs in parallel.
It leverages SSDs' fast random I/O by using RocksDB as its main key-value store. It also avoids eating all available RAM by storing most data on-disk (via memory mapping), which is not an issue anymore as of 2019, as SSDs have low I/O latency and can sustain an unlimited number of reads over their lifetimes. Though, as writes are the Achilles' heel of SSD disks, Sonic aims at minimizing writes and buffers a lot of those writes in RAM, which are committed to disk at periodic intervals. This should maximize the lifespan of the SSD disk under heavy index write load. Unfortunately, the side-effect of doing this is that in case of server power loss, non-committed writes will vanish. ## How do result objects get indexed? Sonic stores result objects in a key-value database (abbreviated KV), powered by RocksDB. When a text is pushed to Sonic, this text gets normalized, cleaned up and split into separate words. Each word is then associated to the pushed object result, and committed to the KV database as `word <-> object`. While cleaning the text, words that carry no search value are discarded. For instance, in the text `the lazy dog` there would be no point in indexing the word `the`, which is what is called a _stopword_. Sonic does not push stopwords to the index ([read more on stopwords](https://en.wikipedia.org/wiki/Stop_words)). When objects are pushed to the search index for a given bucket in a given collection, for instance object `session_77f2e05e-5a81-49f0-89e3-177e9e1d1f32`, Sonic converts this object to a compact 32 bits format, for instance `10292198`. We call the user-provided object identifier the OID, while the compact internal identifier is named the IID. The IID is mapped internally to indexed words, and is much more compact in terms of storage than the OID. You can think of OIDs and IIDs as basically the same thing, except that the IID is the compact version of an OID. OIDs are only used for user-facing input and output objects, while IIDs are only used for internal storage of those objects.
On very long indexed texts, this helps save **_a lot_** of disk space. The KV store has a simple schema, where we associate a binary key to binary data. The following types of keys exist: 1. **Meta-To-Value**: state data for the bucket, eg. stores the count increment of indexed objects (data is in arbitrary format) (_code: [StoreKeyerIdx::MetaToValue](https://github.com/valeriansaliou/sonic/blob/5320b81afc1598ac1cd2af938df0b2ef6cb96dc4/src/store/keyer.rs#L24)_); 2. **Term-To-IIDs**: maps a word (ie. term) to an internal identifier (ie. IID), which is essentially a word-to-result mapping (data is an array of 32 bits numbers encoded to binary as little-endian) (_code: [StoreKeyerIdx::TermToIIDs](https://github.com/valeriansaliou/sonic/blob/5320b81afc1598ac1cd2af938df0b2ef6cb96dc4/src/store/keyer.rs#L25)_); 3. **OID-To-IID**: maps an object identifier (ie. OID) to an internal identifier (ie. IID), which converts a user-provided object to a compact internal object (data is a 32 bits number encoded to binary as little-endian) (_code: [StoreKeyerIdx::OIDToIID](https://github.com/valeriansaliou/sonic/blob/5320b81afc1598ac1cd2af938df0b2ef6cb96dc4/src/store/keyer.rs#L26)_); 4. **IID-To-OID**: this is the reverse mapping of OID-To-IID, which lets Sonic convert an IID back to an OID (data is a variable-length UTF-8 string encoded in binary) (_code: [StoreKeyerIdx::IIDToOID](https://github.com/valeriansaliou/sonic/blob/5320b81afc1598ac1cd2af938df0b2ef6cb96dc4/src/store/keyer.rs#L27)_); 5. **IID-To-Terms**: this lists all words (ie. terms) associated to an internal identifier (ie.
IID) (data is an array of 32 bits numbers encoded to binary as little-endian) (_code: [StoreKeyerIdx::IIDToTerms](https://github.com/valeriansaliou/sonic/blob/5320b81afc1598ac1cd2af938df0b2ef6cb96dc4/src/store/keyer.rs#L28)_); A key is formatted as such, in binary: `[idx<1B> | bucket<4B> | route<4B>]` (_code: [StoreKeyerBuilder::build_key](https://github.com/valeriansaliou/sonic/blob/5320b81afc1598ac1cd2af938df0b2ef6cb96dc4/src/store/keyer.rs#L73)_), which makes it 9-bytes long. The index stands for the type of key, eg. Term-To-IIDs. The bucket and what we call the route are hashed as 32 bits numbers, and appended in little-endian binary format to the key. Both IIDs and terms are stored as 32 bits numbers in binary format. 64 bits numbers could have been used instead, increasing the total number of objects that can be indexed per-bucket. Though, storing such 64 bits numbers instead of 32 bits numbers would double required storage space. As they make up most of stored space, it was important to keep them as small as possible. Those 32 bits numbers are generated using a fast and low-collision hash family called [XxHash](http://www.xxhash.com), from the OID in the case of the IID, and from the word in the case of the term hash (_code: [StoreTermHash](https://github.com/valeriansaliou/sonic/blob/5320b81afc1598ac1cd2af938df0b2ef6cb96dc4/src/store/identifiers.rs#L32)_). ## How do word suggestion and user typo auto-correction work? When most users input text to a computer system using an actual keyboard, they make typos and mistakes. A nice property of a good search system should be that those typos can be forgiven and accurate search results still come up for the bogus user query. Sonic implements a data structure that lets it correct typos or autocomplete incomplete words. For instance, if our index has the word `english` but the user, for some reason, inputs `englich`, Sonic would still return results for `english`. Similarly, if the user inputs an incomplete word eg. 
`eng`, Sonic would expand this word to `english`, if there were no or not enough exact matches for `eng`. The store system responsible for such a feat is the FST ([Finite-State Transducer](https://en.wikipedia.org/wiki/Finite-state_transducer)). It can be grossly compared to a graph of characters, where nodes are characters and edges connect those characters to produce words. Sonic stores a single FST file per bucket. This FST file is memory-mapped, and read directly from the disk when Sonic needs to read it. The [fst](https://crates.io/crates/fst) crate is used to implement the FST data structure. One downside of the FST implementation that Sonic uses, is that once built, an FST is immutable. It means that in order to add a new word to the search index (for a given bucket), Sonic needs to re-build the entire FST (ie. iterate word-by-word on the existing FST and stream those words plus the added word to a new on-disk FST file). In order to do that in an efficient manner, Sonic implements an FST consolidation tasker, which stores FST changes in-memory and consolidates them to disk at periodic intervals (this interval can be configured) (_code: [StoreFSTPool::consolidate](https://github.com/valeriansaliou/sonic/blob/5320b81afc1598ac1cd2af938df0b2ef6cb96dc4/src/store/fst.rs#L173)_). ## How do texts get cleaned up? (via the lexer) Any text that gets pushed to Sonic needs to be normalized (eg. lower-cased) and cleaned up (eg. remove stopwords) before it can be added to the index. This task is handled by the lexer system, also called [tokenizer](https://en.wikipedia.org/wiki/Lexical_analysis#Tokenization). Sonic's tokenizer is built around an iterator pattern (_code: [Iterator->TokenLexer](https://github.com/valeriansaliou/sonic/blob/5320b81afc1598ac1cd2af938df0b2ef6cb96dc4/src/lexer/token.rs#L244)_), and yields lexed words one-by-one. 
Iteration can be stopped before the end of the text is reached, for instance if we did not get enough search results for the first words of the query. This ensures no extraneous lexing work is done. Given that stopwords depend on the text language, Sonic first needs to detect the language of the text that is being cleaned up. This is done using a hybrid method of either counting the number of stopwords that appear in the text for long-enough texts (which is faster) (_code: [TokenLexerBuilder::detect_lang_fast](https://github.com/valeriansaliou/sonic/blob/5320b81afc1598ac1cd2af938df0b2ef6cb96dc4/src/lexer/token.rs#L177)_), or performing an [n-gram](https://en.wikipedia.org/wiki/N-gram) pass on the text for smaller texts (which is **_an order of magnitude_** slower) (_code: [TokenLexerBuilder::detect_lang_slow](https://github.com/valeriansaliou/sonic/blob/5320b81afc1598ac1cd2af938df0b2ef6cb96dc4/src/lexer/token.rs#L126)_). As the n-gram method is better at guessing the language for small texts than the stopwords method is, we prefer it, although it is crazy slow in comparison to the stopwords method. For long-enough texts, the stopwords method becomes reliable enough, so we can use it. In either case, if the first chosen guessing method result is judged as non-reliable, Sonic falls back on the other method (_code: [!detector.is_reliable()](https://github.com/valeriansaliou/sonic/blob/5320b81afc1598ac1cd2af938df0b2ef6cb96dc4/src/lexer/token.rs#L148)_). By the way, Sonic builds up its own list of stopwords for all supported languages, [which can be found here](https://github.com/valeriansaliou/sonic/tree/master/src/stopwords) (languages are referred to via their ISO 639-3 codes). People are welcome to improve those lists of stopwords by [submitting a Pull Request](https://github.com/valeriansaliou/sonic/pulls). ## What is the purpose of the tasker system?
Looking at the source code of Sonic, you will find a module named `tasker` ([see here](https://github.com/valeriansaliou/sonic/tree/master/src/tasker)). This module performs background tasks, and is triggered periodically. **The tasker performs the following actions:** 1. **Janitor**: it closes cached collection and bucket stores that were not used recently, freeing up memory (_code: [Tasker::tick](https://github.com/valeriansaliou/sonic/blob/5320b81afc1598ac1cd2af938df0b2ef6cb96dc4/src/tasker/runtime.rs#L48)_); 2. **Consolidate**: it writes in-memory FST changes to the on-disk FST data structure (_code: [Tasker::tick](https://github.com/valeriansaliou/sonic/blob/5320b81afc1598ac1cd2af938df0b2ef6cb96dc4/src/tasker/runtime.rs#L48)_); As in all databases, a lot of locking is involved while the tasker is performing heavy-duty work on a KV or FST store. Thus, when the tasker system kicks in, stores may experience higher than expected latency for all consumers attempting to read or write to them. The tasker system has been optimized to minimize thread contention caused by locks, so the impact of those locks on Sonic consumers should be minimal. # On the Sonic Channel Protocol In order for a client to communicate with the search index system, one needs a protocol. Sonic uses the Sonic Channel protocol, which defines a way for clients to send commands (ie. requests) to a Sonic server over the network (via a raw TCP socket); and get responses from the Sonic server. For instance, a client may send a search query command such as `QUERY collection bucket "search query"` and get a response with search results such as `EVENT QUERY isgsHQYu result_1 result_2`. **On that Sonic Channel protocol, technical choices that may seem to go against common sense were made:** 1. **Sonic does not expose any HTTP API interface**, as it adds a network and processing overhead cost we do not want to bear; 2.
**Sonic only exposes a raw TCP socket** with which clients interact via the Sonic Channel protocol, which was designed to be simple, lightweight and extensible; 3. **Most Sonic Channel commands are synchronous**, for simplicity's sake (Redis does the same). You can still run multiple Sonic Channel connections in parallel, and enjoy increased parallelism, but on a given Sonic Channel connection, you must wait for the previous command to return before issuing the next one; 4. **Some Sonic Channel commands are asynchronous**, when a lot of commands may be issued in a short period of time, in a burst pattern. This is typical of read operations such as search queries, which should be submitted as jobs to a dedicated thread pool, which can be upsized and downsized at will. To handle this, a special eventing protocol format should be used; _The Sonic Channel protocol is specified in a separate document, which [you can read here](https://github.com/valeriansaliou/sonic/blob/master/PROTOCOL.md)._ # The Journey of a Search Query As always, examples are the way to go to explain any complex system. This section drafts the journey of a search query in Sonic, from receiving the search query command over Sonic Channel, to serving results to the Sonic Channel consumer. Given a collection `messages` and a bucket `acme_corp` (ie. indexed messages for Acme Corp), John Doe wants to find messages that match the query text `"The robber has stolen our corporate car"`. First off, John Doe would connect to Sonic over a Sonic Channel client, for instance [node-sonic-channel](https://github.com/valeriansaliou/node-sonic-channel). Using this client, he would issue the following query: `QUERY messages acme_corp "The robber has stolen our corporate car"` to find conversations that contain messages about a recent robbery at Acme Corp. **After receiving the raw command above, the Sonic server would, in order:** 1. 
Read the raw command from the Sonic Channel TCP stream buffer (_code: [Self::on_message](https://github.com/valeriansaliou/sonic/blob/5320b81afc1598ac1cd2af938df0b2ef6cb96dc4/src/channel/handle.rs#L163)_); 2. Route the unpacked command message to the proper command handler, which would be `ChannelCommandSearch::dispatch_query` (_code: [ChannelMessage::on](https://github.com/valeriansaliou/sonic/blob/5320b81afc1598ac1cd2af938df0b2ef6cb96dc4/src/channel/message.rs#L39)_); 3. Commit the search query for processing (_code: [ChannelCommandBase::commit_pending_operation](https://github.com/valeriansaliou/sonic/blob/5320b81afc1598ac1cd2af938df0b2ef6cb96dc4/src/channel/command.rs#L428)_); 4. Dispatch the search query to its executor (_code: [StoreOperationDispatch::dispatch](https://github.com/valeriansaliou/sonic/blob/5320b81afc1598ac1cd2af938df0b2ef6cb96dc4/src/channel/command.rs#L351)_); 5. Run the search executor (_code: [ExecutorSearch::search](https://github.com/valeriansaliou/sonic/blob/5320b81afc1598ac1cd2af938df0b2ef6cb96dc4/src/executor/search.rs#L21)_); 6. Open both the KV and FST stores for the collection `messages` and bucket `acme_corp` (_code: [StoreKVPool::acquire + StoreFSTPool::acquire](https://github.com/valeriansaliou/sonic/blob/5320b81afc1598ac1cd2af938df0b2ef6cb96dc4/src/executor/search.rs#L34)_); 7. Perform search query text lexing, and search word-by-word, which would yield in order: `robber`, `stolen`, `corporate`, `car` (_code: [lexer.next](https://github.com/valeriansaliou/sonic/blob/5320b81afc1598ac1cd2af938df0b2ef6cb96dc4/src/executor/search.rs#L50)_); 8. If not enough search results are found, tries to suggest other words eg. typos corrections (_code: [fst_action.suggest_words](https://github.com/valeriansaliou/sonic/blob/5320b81afc1598ac1cd2af938df0b2ef6cb96dc4/src/executor/search.rs#L81)_); 9. 
Perform paging on found OIDs from KV store to limit results (_code: [found_iids.iter](https://github.com/valeriansaliou/sonic/blob/5320b81afc1598ac1cd2af938df0b2ef6cb96dc4/src/executor/search.rs#L163)_); 10. Return found OIDs from the executor (_code: [result_oids](https://github.com/valeriansaliou/sonic/blob/5320b81afc1598ac1cd2af938df0b2ef6cb96dc4/src/executor/search.rs#L180)_); 11. Write back the final results to the TCP stream (_code: [response_args_groups](https://github.com/valeriansaliou/sonic/blob/5320b81afc1598ac1cd2af938df0b2ef6cb96dc4/src/channel/message.rs#L81)_); _This is it!_ John Doe would receive the following response from Sonic Channel: `EVENT QUERY isgsHQYu conversation_3459 conversation_29398`, which indicates that there are 2 conversations that contain messages matching the search text `"The robber has stolen our corporate car"`. ================================================ FILE: LICENSE.md ================================================ Mozilla Public License Version 2.0 ================================== 1. Definitions -------------- 1.1. "Contributor" means each individual or legal entity that creates, contributes to the creation of, or owns Covered Software. 1.2. "Contributor Version" means the combination of the Contributions of others (if any) used by a Contributor and that particular Contributor's Contribution. 1.3. "Contribution" means Covered Software of a particular Contributor. 1.4. "Covered Software" means Source Code Form to which the initial Contributor has attached the notice in Exhibit A, the Executable Form of such Source Code Form, and Modifications of such Source Code Form, in each case including portions thereof. 1.5. 
"Incompatible With Secondary Licenses" means (a) that the initial Contributor has attached the notice described in Exhibit B to the Covered Software; or (b) that the Covered Software was made available under the terms of version 1.1 or earlier of the License, but not also under the terms of a Secondary License. 1.6. "Executable Form" means any form of the work other than Source Code Form. 1.7. "Larger Work" means a work that combines Covered Software with other material, in a separate file or files, that is not Covered Software. 1.8. "License" means this document. 1.9. "Licensable" means having the right to grant, to the maximum extent possible, whether at the time of the initial grant or subsequently, any and all of the rights conveyed by this License. 1.10. "Modifications" means any of the following: (a) any file in Source Code Form that results from an addition to, deletion from, or modification of the contents of Covered Software; or (b) any new file in Source Code Form that contains any Covered Software. 1.11. "Patent Claims" of a Contributor means any patent claim(s), including without limitation, method, process, and apparatus claims, in any patent Licensable by such Contributor that would be infringed, but for the grant of the License, by the making, using, selling, offering for sale, having made, import, or transfer of either its Contributions or its Contributor Version. 1.12. "Secondary License" means either the GNU General Public License, Version 2.0, the GNU Lesser General Public License, Version 2.1, the GNU Affero General Public License, Version 3.0, or any later versions of those licenses. 1.13. "Source Code Form" means the form of the work preferred for making modifications. 1.14. "You" (or "Your") means an individual or a legal entity exercising rights under this License. For legal entities, "You" includes any entity that controls, is controlled by, or is under common control with You. 
For purposes of this definition, "control" means (a) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (b) ownership of more than fifty percent (50%) of the outstanding shares or beneficial ownership of such entity. 2. License Grants and Conditions -------------------------------- 2.1. Grants Each Contributor hereby grants You a world-wide, royalty-free, non-exclusive license: (a) under intellectual property rights (other than patent or trademark) Licensable by such Contributor to use, reproduce, make available, modify, display, perform, distribute, and otherwise exploit its Contributions, either on an unmodified basis, with Modifications, or as part of a Larger Work; and (b) under Patent Claims of such Contributor to make, use, sell, offer for sale, have made, import, and otherwise transfer either its Contributions or its Contributor Version. 2.2. Effective Date The licenses granted in Section 2.1 with respect to any Contribution become effective for each Contribution on the date the Contributor first distributes such Contribution. 2.3. Limitations on Grant Scope The licenses granted in this Section 2 are the only rights granted under this License. No additional rights or licenses will be implied from the distribution or licensing of Covered Software under this License. Notwithstanding Section 2.1(b) above, no patent license is granted by a Contributor: (a) for any code that a Contributor has removed from Covered Software; or (b) for infringements caused by: (i) Your and any other third party's modifications of Covered Software, or (ii) the combination of its Contributions with other software (except as part of its Contributor Version); or (c) under Patent Claims infringed by Covered Software in the absence of its Contributions. 
This License does not grant any rights in the trademarks, service marks, or logos of any Contributor (except as may be necessary to comply with the notice requirements in Section 3.4). 2.4. Subsequent Licenses No Contributor makes additional grants as a result of Your choice to distribute the Covered Software under a subsequent version of this License (see Section 10.2) or under the terms of a Secondary License (if permitted under the terms of Section 3.3). 2.5. Representation Each Contributor represents that the Contributor believes its Contributions are its original creation(s) or it has sufficient rights to grant the rights to its Contributions conveyed by this License. 2.6. Fair Use This License is not intended to limit any rights You have under applicable copyright doctrines of fair use, fair dealing, or other equivalents. 2.7. Conditions Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted in Section 2.1. 3. Responsibilities ------------------- 3.1. Distribution of Source Form All distribution of Covered Software in Source Code Form, including any Modifications that You create or to which You contribute, must be under the terms of this License. You must inform recipients that the Source Code Form of the Covered Software is governed by the terms of this License, and how they can obtain a copy of this License. You may not attempt to alter or restrict the recipients' rights in the Source Code Form. 3.2. 
Distribution of Executable Form If You distribute Covered Software in Executable Form then: (a) such Covered Software must also be made available in Source Code Form, as described in Section 3.1, and You must inform recipients of the Executable Form how they can obtain a copy of such Source Code Form by reasonable means in a timely manner, at a charge no more than the cost of distribution to the recipient; and (b) You may distribute such Executable Form under the terms of this License, or sublicense it under different terms, provided that the license for the Executable Form does not attempt to limit or alter the recipients' rights in the Source Code Form under this License. 3.3. Distribution of a Larger Work You may create and distribute a Larger Work under terms of Your choice, provided that You also comply with the requirements of this License for the Covered Software. If the Larger Work is a combination of Covered Software with a work governed by one or more Secondary Licenses, and the Covered Software is not Incompatible With Secondary Licenses, this License permits You to additionally distribute such Covered Software under the terms of such Secondary License(s), so that the recipient of the Larger Work may, at their option, further distribute the Covered Software under the terms of either this License or such Secondary License(s). 3.4. Notices You may not remove or alter the substance of any license notices (including copyright notices, patent notices, disclaimers of warranty, or limitations of liability) contained within the Source Code Form of the Covered Software, except that You may alter any license notices to the extent required to remedy known factual inaccuracies. 3.5. Application of Additional Terms You may choose to offer, and to charge a fee for, warranty, support, indemnity or liability obligations to one or more recipients of Covered Software. However, You may do so only on Your own behalf, and not on behalf of any Contributor. 
You must make it absolutely clear that any such warranty, support, indemnity, or liability obligation is offered by You alone, and You hereby agree to indemnify every Contributor for any liability incurred by such Contributor as a result of warranty, support, indemnity or liability terms You offer. You may include additional disclaimers of warranty and limitations of liability specific to any jurisdiction. 4. Inability to Comply Due to Statute or Regulation --------------------------------------------------- If it is impossible for You to comply with any of the terms of this License with respect to some or all of the Covered Software due to statute, judicial order, or regulation then You must: (a) comply with the terms of this License to the maximum extent possible; and (b) describe the limitations and the code they affect. Such description must be placed in a text file included with all distributions of the Covered Software under this License. Except to the extent prohibited by statute or regulation, such description must be sufficiently detailed for a recipient of ordinary skill to be able to understand it. 5. Termination -------------- 5.1. The rights granted under this License will terminate automatically if You fail to comply with any of its terms. However, if You become compliant, then the rights granted under this License from a particular Contributor are reinstated (a) provisionally, unless and until such Contributor explicitly and finally terminates Your grants, and (b) on an ongoing basis, if such Contributor fails to notify You of the non-compliance by some reasonable means prior to 60 days after You have come back into compliance. 
Moreover, Your grants from a particular Contributor are reinstated on an ongoing basis if such Contributor notifies You of the non-compliance by some reasonable means, this is the first time You have received notice of non-compliance with this License from such Contributor, and You become compliant prior to 30 days after Your receipt of the notice. 5.2. If You initiate litigation against any entity by asserting a patent infringement claim (excluding declaratory judgment actions, counter-claims, and cross-claims) alleging that a Contributor Version directly or indirectly infringes any patent, then the rights granted to You by any and all Contributors for the Covered Software under Section 2.1 of this License shall terminate. 5.3. In the event of termination under Sections 5.1 or 5.2 above, all end user license agreements (excluding distributors and resellers) which have been validly granted by You or Your distributors under this License prior to termination shall survive termination. 6. Disclaimer of Warranty ------------------------- > Covered Software is provided under this License on an "as is" > basis, without warranty of any kind, either expressed, implied, or > statutory, including, without limitation, warranties that the > Covered Software is free of defects, merchantable, fit for a > particular purpose or non-infringing. The entire risk as to the > quality and performance of the Covered Software is with You. > Should any Covered Software prove defective in any respect, You > (not any Contributor) assume the cost of any necessary servicing, > repair, or correction. This disclaimer of warranty constitutes an > essential part of this License. No use of any Covered Software is > authorized under this License except under this disclaimer. 7. 
Limitation of Liability -------------------------- > Under no circumstances and under no legal theory, whether tort > (including negligence), contract, or otherwise, shall any > Contributor, or anyone who distributes Covered Software as > permitted above, be liable to You for any direct, indirect, > special, incidental, or consequential damages of any character > including, without limitation, damages for lost profits, loss of > goodwill, work stoppage, computer failure or malfunction, or any > and all other commercial damages or losses, even if such party > shall have been informed of the possibility of such damages. This > limitation of liability shall not apply to liability for death or > personal injury resulting from such party's negligence to the > extent applicable law prohibits such limitation. Some > jurisdictions do not allow the exclusion or limitation of > incidental or consequential damages, so this exclusion and > limitation may not apply to You. 8. Litigation ------------- Any litigation relating to this License may be brought only in the courts of a jurisdiction where the defendant maintains its principal place of business and such litigation shall be governed by laws of that jurisdiction, without reference to its conflict-of-law provisions. Nothing in this Section shall prevent a party's ability to bring cross-claims or counter-claims. 9. Miscellaneous ---------------- This License represents the complete agreement concerning the subject matter hereof. If any provision of this License is held to be unenforceable, such provision shall be reformed only to the extent necessary to make it enforceable. Any law or regulation which provides that the language of a contract shall be construed against the drafter shall not be used to construe this License against a Contributor. 10. Versions of the License --------------------------- 10.1. New Versions Mozilla Foundation is the license steward. 
Except as provided in Section 10.3, no one other than the license steward has the right to modify or publish new versions of this License. Each version will be given a distinguishing version number. 10.2. Effect of New Versions You may distribute the Covered Software under the terms of the version of the License under which You originally received the Covered Software, or under the terms of any subsequent version published by the license steward. 10.3. Modified Versions If you create software not governed by this License, and you want to create a new license for such software, you may create and use a modified version of this License if you rename the license and remove any references to the name of the license steward (except to note that such modified license differs from this License). 10.4. Distributing Source Code Form that is Incompatible With Secondary Licenses If You choose to distribute Source Code Form that is Incompatible With Secondary Licenses under the terms of this version of the License, the notice described in Exhibit B of this License must be attached. Exhibit A - Source Code Form License Notice ------------------------------------------- This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/. If it is not possible or desirable to put the notice in a particular file, then You may include the notice in a location (such as a LICENSE file in a relevant directory) where a recipient would be likely to look for such a notice. You may add additional accurate notices of copyright ownership. Exhibit B - "Incompatible With Secondary Licenses" Notice --------------------------------------------------------- This Source Code Form is "Incompatible With Secondary Licenses", as defined by the Mozilla Public License, v. 2.0. 
================================================ FILE: PACKAGING.md ================================================ Packaging ========= This file contains quick reminders and notes on how to package Sonic. We consider here the packaging flow of Sonic version `1.0.0` for Linux. 1. **How to bump Sonic version before a release:** 1. Bump version in `Cargo.toml` to `1.0.0` 2. Execute `cargo update` to bump `Cargo.lock` 3. Bump Debian package version in `debian/rules` to `1.0.0` 2. **How to build Sonic, package it and release it on Crates, GitHub, Docker Hub and Packagecloud (multiple architectures):** 1. Tag the latest Git commit corresponding to the release with tag `v1.0.0`, and push the tag 2. Wait for all release jobs to complete on the [actions](https://github.com/valeriansaliou/sonic/actions) page on GitHub 3. Download all release archives, and sign them locally using: `./scripts/sign_binaries.sh --version=1.0.0` 4. Publish a changelog and upload all the built archives, as well as their signatures on the [releases](https://github.com/valeriansaliou/sonic/releases) page on GitHub ================================================ FILE: PROTOCOL.md ================================================ Sonic Protocol ============== # ⚡️ Sonic Channel **Sonic Channel is the protocol used to perform searches and ingest index data. You can also use it for Sonic administration operations. Sonic listens on TCP port 1491 by default.** This document specifies the Sonic Channel protocol. Use it if you are looking to build your own Sonic Channel library, or if you are looking to debug Sonic using eg. `telnet` in command-line. To start a `telnet` session with your local Sonic instance, execute: `telnet ::1 1491` _Refer to sections below to interact with Sonic._ --- ### 1️⃣ Before you start **Please consider the following upon integrating the Sonic Channel protocol:** 1. Each command sent must be terminated with a new line character (`\n`) as to commit the command to the server; 2. 
Upon starting a Sonic Channel session, your library should read the `buffer(20000)` parameter in the `STARTED` response, and use this value (in bytes) to know when command data should be truncated and split into multiple sub-commands (to avoid buffer overflows, ie. sending too much data in a single command); --- ### 2️⃣ Sonic Channel (uninitialized) * `START <mode> <password>`: select mode to use for connection (either: `search`, `ingest` or `control`). The password is found in the `config.cfg` file at `channel.auth_password`. _Issuing any other command — eg. `QUIT` — in this mode will abort the TCP connection, effectively resulting in a `QUIT` with the `ENDED not_recognized` response._ --- ### 3️⃣ Sonic Channel (Search mode) _The Sonic Channel Search mode is used for querying the search index. Once in this mode, you cannot switch to other modes or gain access to commands from other modes._ **➡️ Available commands:** * `QUERY`: query database (syntax: `QUERY <collection> <bucket> "<terms>" [LIMIT(<count>)]? [OFFSET(<count>)]? [LANG(<locale>)]?`; time complexity: `O(1)` if enough exact word matches or `O(N)` if not enough exact matches where `N` is the number of alternate words tried, in practice it approaches `O(1)`) * `SUGGEST`: auto-completes word (syntax: `SUGGEST <collection> <bucket> "<word>" [LIMIT(<count>)]?`; time complexity: `O(1)`) * `LIST`: enumerates all words in an index (syntax: `LIST <collection> <bucket> [LIMIT(<count>)]? [OFFSET(<count>)]?`; time complexity: `O(N)` where `N` is the number of words enumerated, within provided limits) * `PING`: ping server (syntax: `PING`; time complexity: `O(1)`) * `HELP`: show help (syntax: `HELP [<manual>]?`; time complexity: `O(1)`) * `QUIT`: stop connection (syntax: `QUIT`; time complexity: `O(1)`) **⏩ Syntax terminology:** * `<collection>`: index collection (ie. what you search in, eg. `messages`, `products`, etc.); * `<bucket>`: index bucket name (ie. user-specific search classifier in the collection if you have any eg. `user-1, user-2, ..`, otherwise use a common bucket name eg.
`generic, default, common, ..`); * `<terms>`: text for search terms (between quotes); * `<count>`: a positive integer number; set within allowed maximum & minimum limits; * `<locale>`: an ISO 639-3 locale code eg. `eng` for English (if set, the locale must be a valid ISO 639-3 code; if set to `none`, lexing will be disabled; if not set, the locale will be guessed from text); * `<manual>`: help manual to be shown (available manuals: `commands`); _Notice: the `bucket` terminology may confuse some Sonic users. As we are well-aware Sonic may be used in an environment where end-users may each hold their own search index in a given `collection`, we made it possible to manage per-end-user search indexes with `bucket`. If you only have a single index per `collection` (most Sonic users will), we advise you use a static generic name for your `bucket`, for instance: `default`._ **⬇️ Search flow example (via `telnet`):** ```bash T1: telnet sonic.local 1491 T2: Trying ::1... T3: Connected to sonic.local. T4: Escape character is '^]'. T5: CONNECTED T6: START search SecretPassword T7: STARTED search protocol(1) buffer(20000) T8: QUERY messages user:0dcde3a6 "valerian saliou" LIMIT(10) T9: PENDING Bt2m2gYa T10: EVENT QUERY Bt2m2gYa conversation:71f3d63b conversation:6501e83a T11: QUERY helpdesk user:0dcde3a6 "gdpr" LIMIT(50) T12: PENDING y57KaB2d T13: QUERY helpdesk user:0dcde3a6 "law" LIMIT(50) OFFSET(200) T14: PENDING CjPvE5t9 T15: PING T16: PONG T17: EVENT QUERY CjPvE5t9 T18: EVENT QUERY y57KaB2d article:28d79959 T19: SUGGEST messages user:0dcde3a6 "val" T20: PENDING z98uDE0f T21: EVENT SUGGEST z98uDE0f valerian valala T22: QUIT T23: ENDED quit T24: Connection closed by foreign host.
``` _Notes on what happens:_ * **T6:** we enter `search` mode (this is required to enable `search` commands); * **T8:** we query collection `messages`, in bucket for platform user `user:0dcde3a6` with search terms `valerian saliou` and a limit of `10` on returned results; * **T9:** Sonic received the query and stacked it for processing with marker `Bt2m2gYa` (the marker is used to track the asynchronous response); * **T10:** Sonic processed search query of T8 with marker `Bt2m2gYa` and sends 2 search results (those are conversation identifiers, that refer to a primary key in an external database); * **T11 + T13:** we query collection `helpdesk` twice (in the example, this one is heavy, so processing of results takes more time); * **T17 + T18:** we receive search results for search queries of T11 + T13 (this took a while!); --- ### 4️⃣ Sonic Channel (Ingest mode) _The Sonic Channel Ingest mode is used for altering the search index (push, pop and flush). Once in this mode, you cannot switch to other modes or gain access to commands from other modes._ **➡️ Available commands:** * `PUSH`: Push search data in the index (syntax: `PUSH <collection> <bucket> <object> "<text>" [LANG(<locale>)]?`; time complexity: `O(1)`) * `POP`: Pop search data from the index (syntax: `POP <collection> <bucket> <object> "<text>"`; time complexity: `O(1)`) * `COUNT`: Count indexed search data (syntax: `COUNT <collection> [<bucket> [<object>]?]?`; time complexity: `O(1)`) * `FLUSHC`: Flush all indexed data from a collection (syntax: `FLUSHC <collection>`; time complexity: `O(1)`) * `FLUSHB`: Flush all indexed data from a bucket in a collection (syntax: `FLUSHB <collection> <bucket>`; time complexity: `O(N)` where `N` is the number of bucket objects) * `FLUSHO`: Flush all indexed data from an object in a bucket in a collection (syntax: `FLUSHO <collection> <bucket> <object>`; time complexity: `O(1)`) * `PING`: ping server (syntax: `PING`; time complexity: `O(1)`) * `HELP`: show help (syntax: `HELP [<manual>]?`; time complexity: `O(1)`) * `QUIT`: stop connection (syntax: `QUIT`; time complexity: `O(1)`) **⏩ Syntax terminology:** * `<collection>`: index collection (ie.
what you search in, eg. `messages`, `products`, etc.); * `<bucket>`: index bucket name (ie. user-specific search classifier in the collection if you have any eg. `user-1, user-2, ..`, otherwise use a common bucket name eg. `generic, default, common, ..`); * `<object>`: object identifier that refers to an entity in an external database, where the searched object is stored (eg. you use Sonic to index CRM contacts by name; full CRM contact data is stored in a MySQL database; in this case the object identifier in Sonic will be the MySQL primary key for the CRM contact); * `<text>`: search text to be indexed (can be a single word, or a longer text; within maximum length safety limits; should be quoted using `"` quotes; internal quotes should be escaped using `\"`); * `<locale>`: an ISO 639-3 locale code eg. `eng` for English (if set, the locale must be a valid ISO 639-3 code; if set to `none`, lexing will be disabled; if not set, the locale will be guessed from text); * `<manual>`: help manual to be shown (available manuals: `commands`); _Notice: the `bucket` terminology may confuse some Sonic users. As we are well-aware Sonic may be used in an environment where end-users may each hold their own search index in a given `collection`, we made it possible to manage per-end-user search indexes with `bucket`. If you only have a single index per `collection` (most Sonic users will), we advise you use a static generic name for your `bucket`, for instance: `default`._ **⬇️ Ingest flow example (via `telnet`):** ```bash T1: telnet sonic.local 1491 T2: Trying ::1... T3: Connected to sonic.local. T4: Escape character is '^]'. T5: CONNECTED T6: START ingest SecretPassword T7: STARTED ingest protocol(1) buffer(20000) T8: PUSH messages user:0dcde3a6 conversation:71f3d63b Hey Valerian T9: ERR invalid_format(PUSH <collection> <bucket> <object> "<text>") T10: PUSH messages user:0dcde3a6 conversation:71f3d63b "Hello Valerian Saliou, how are you today?"
T11: OK T12: COUNT messages user:0dcde3a6 T13: RESULT 43 T14: COUNT messages user:0dcde3a6 conversation:71f3d63b T15: RESULT 1 T16: FLUSHO messages user:0dcde3a6 conversation:71f3d63b T17: RESULT 1 T18: FLUSHB messages user:0dcde3a6 T19: RESULT 42 T20: PING T21: PONG T22: QUIT T23: ENDED quit T24: Connection closed by foreign host. ``` _Notes on what happens:_ * **T6:** we enter `ingest` mode (this is required to enable `ingest` commands); * **T8:** we try to push text `Hey Valerian` to the index, in collection `messages`, bucket `user:0dcde3a6` and object `conversation:71f3d63b` (the syntax that was used is invalid); * **T9:** Sonic refuses the command we issued in T8, and provides us with the correct command format (notice that `<text>` should be quoted); * **T10:** we attempt to push another text in the same collection, bucket and object as in T8; * **T11:** this time, our push command in T10 was valid (Sonic acknowledges the push commit to the search index); * **T12:** we count the number of indexed terms in collection `messages` and bucket `user:0dcde3a6`; * **T13:** there are 43 terms (ie. words) in index for query in T12; * **T18:** we flush all index data from collection `messages` and bucket `user:0dcde3a6`; * **T19:** 42 terms have been flushed from index for command in T18; --- ### 5️⃣ Sonic Channel (Control mode) _The Sonic Channel Control mode is used for administration purposes. Once in this mode, you cannot switch to other modes or gain access to commands from other modes._ **➡️ Available commands:** * `TRIGGER`: trigger an action (syntax: `TRIGGER [<action>]?
[<data>]?`; time complexity: `O(1)`) * `INFO`: get server information (syntax: `INFO`; time complexity: `O(1)`) * `PING`: ping server (syntax: `PING`; time complexity: `O(1)`) * `HELP`: show help (syntax: `HELP [<manual>]?`; time complexity: `O(1)`) * `QUIT`: stop connection (syntax: `QUIT`; time complexity: `O(1)`) **⏩ Syntax terminology:** * `<action>`: action to be triggered (available actions: `consolidate`, `backup`, `restore`); * `<data>`: additional data to provide to the action (required for: `backup`, `restore`); * `<manual>`: help manual to be shown (available manuals: `commands`); **⬇️ Control flow example (via `telnet`):** ```bash T1: telnet sonic.local 1491 T2: Trying ::1... T3: Connected to sonic.local. T4: Escape character is '^]'. T5: CONNECTED T6: START control SecretPassword T7: STARTED control protocol(1) buffer(20000) T8: TRIGGER consolidate T9: OK T10: PING T11: PONG T12: QUIT T13: ENDED quit T14: Connection closed by foreign host. ``` _Notes on what happens:_ * **T6:** we enter `control` mode (this is required to enable `control` commands); * **T8:** we trigger a database consolidation (instead of waiting for the next automated consolidation tick); ================================================ FILE: README.md ================================================ Sonic ===== [![Test and Build](https://github.com/valeriansaliou/sonic/workflows/Test%20and%20Build/badge.svg?branch=master)](https://github.com/valeriansaliou/sonic/actions?query=workflow%3A%22Test+and+Build%22) [![Build and Release](https://github.com/valeriansaliou/sonic/workflows/Build%20and%20Release/badge.svg)](https://github.com/valeriansaliou/sonic/actions?query=workflow%3A%22Build+and+Release%22) [![dependency status](https://deps.rs/repo/github/valeriansaliou/sonic/status.svg)](https://deps.rs/repo/github/valeriansaliou/sonic) [![Buy Me A Coffee](https://img.shields.io/badge/buy%20me%20a%20coffee-donate-yellow.svg)](https://www.buymeacoffee.com/valeriansaliou) **Sonic is a fast, lightweight and schema-less search
backend. It ingests search texts and identifier tuples that can then be queried against in a microsecond's time.** Sonic can be used as a simple alternative to super-heavy and full-featured search backends such as Elasticsearch in some use-cases. It is capable of normalizing natural language search queries, auto-completing a search query and providing the most relevant results for a query. Sonic is an identifier index, rather than a document index; when queried, it returns IDs that can then be used to refer to the matched documents in an external database. A strong attention to performance and code cleanliness has been given when designing Sonic. It aims at being crash-free, super-fast and puts minimum strain on server resources (our measurements have shown that Sonic - when under load - responds to search queries in the μs range, eats ~30MB RAM and has a low CPU footprint; [see our benchmarks](https://github.com/valeriansaliou/sonic#how-fast--lightweight-is-it)). _Tested at Rust version: `rustc 1.74.1 (a28077b28 2023-12-04)`_ **🇫🇷 Crafted in Nantes, France.** **:newspaper: The Sonic project was initially announced in [a post on my personal journal](https://journal.valeriansaliou.name/announcing-sonic-a-super-light-alternative-to-elasticsearch/).** ![Sonic](https://valeriansaliou.github.io/sonic/images/banner.jpg) > _« Sonic » is the mascot of the Sonic project. I drew it to look like a psychedelic hipster hedgehog._ ## Who uses it?
Crisp Scrumpy
_👋 You use Sonic and you want to be listed there? [Contact me](https://valeriansaliou.name/)._ ## Demo Sonic is integrated in all Crisp search products on the [Crisp](https://crisp.chat/) platform. It is used to index half a billion objects on a $5/mth 1-vCPU SSD cloud server (as of 2019). Crisp users use it to search in their messages, conversations, contacts, helpdesk articles and more. **You can test Sonic live on: [Crisp Helpdesk](https://help.crisp.chat/), and get an idea of the speed and relevance of Sonic search results. You can also test search suggestions from there: start typing at least 2 characters for a word, and get suggested a full word (press the tab key to expand suggestion). _Both search and suggestions are powered by Sonic._** ![Demo on Crisp Helpdesk search](https://valeriansaliou.github.io/sonic/images/crisp-search-demo.gif) > _Sonic fuzzy search in helpdesk articles at its best. Lookup for any word or group of terms, get results instantly._ ## Features * **Search terms are stored in collections, organized in buckets**; you may use a single bucket, or a bucket per user on your platform if you need to search in separate indexes. * **Search results return object identifiers**, that can be resolved from an external database if you need to enrich the search results. This makes Sonic a simple word index, that points to identifier results. Sonic doesn't store any direct textual data in its index, but it still holds a word graph for auto-completion and typo corrections. * **Search query typos are corrected** if there are not enough exact-match results for a given word in a search query, Sonic tries to correct the word and tries against alternate words. You're allowed to make mistakes when searching. * **Insert and remove items in the index**; index-altering operations are light and can be committed to the server while it is running. 
A background tasker handles the job of consolidating the index so that the entries you have pushed or popped are quickly made available for search. * **Auto-complete any word** in real-time via the suggest operation. This helps build a snappy word suggestion feature in your end-user search interface. * **Full Unicode compatibility** on 80+ most spoken languages in the world. Sonic removes useless stop words from any text (eg. 'the' in English), after guessing the text language. This ensures any searched or ingested text is clean before it hits the index; [see languages](https://github.com/valeriansaliou/sonic#which-text-languages-are-supported). * **Simple protocol (Sonic Channel)**, that lets you search your index, manage data ingestion (push in the index, pop from the index, flush a collection, flush a bucket, etc.) and perform administrative actions. Sonic Channel was designed to be lightweight on resources and simple to integrate with; [read protocol specification](https://github.com/valeriansaliou/sonic/blob/master/PROTOCOL.md). * **Easy-to-use libraries**, that let you connect to Sonic from your apps; [see libraries](https://github.com/valeriansaliou/sonic#-sonic-channel-libraries). ## How to use it? ### Installation Sonic is built in Rust. To install it, either download a version from the [Sonic releases](https://github.com/valeriansaliou/sonic/releases) page, use `cargo install` or pull the source code from `master`. 👉 _Each release binary comes with an `.asc` signature file, which can be verified using [@valeriansaliou](https://github.com/valeriansaliou) GPG public key: [:key:valeriansaliou.gpg.pub.asc](https://valeriansaliou.name/files/keys/valeriansaliou.gpg.pub.asc)._ **👉 Install from packages:** Sonic provides [pre-built packages](https://packagecloud.io/valeriansaliou/sonic) for Debian-based systems (Debian, Ubuntu, etc.). **Important: Sonic only provides 64-bit packages targeting Debian 12 for now (codename: `bookworm`).
You might still be able to use them on other Debian versions, as well as Ubuntu (although they rely on a specific `glibc` version that might not be available on older or newer systems).** First, add the Sonic APT repository (eg. for Debian `bookworm`): ```bash echo "deb [signed-by=/usr/share/keyrings/valeriansaliou_sonic.gpg] https://packagecloud.io/valeriansaliou/sonic/debian/ bookworm main" > /etc/apt/sources.list.d/valeriansaliou_sonic.list ``` ```bash curl -fsSL https://packagecloud.io/valeriansaliou/sonic/gpgkey | gpg --dearmor -o /usr/share/keyrings/valeriansaliou_sonic.gpg ``` ```bash apt-get update ``` Then, install the Sonic package: ```bash apt-get install sonic ``` Then, edit the pre-filled Sonic configuration file: ```bash nano /etc/sonic.cfg ``` Finally, restart Sonic: ``` service sonic restart ``` **👉 Install from source:** If you pulled the source code from Git, you can build it using `cargo`: ```bash cargo build --release ``` You can find the built binaries in the `./target/release` directory. _Install `build-essential`, `clang`, `libclang-dev`, `libc6-dev`, `g++` and `llvm-dev` to be able to compile the required RocksDB dependency._ Note that the following optional features can be enabled upon building Sonic: `allocator-jemalloc`, `tokenizer-chinese` and `tokenizer-japanese` (some might be already enabled by default). **👉 Install from Cargo:** You can install Sonic directly with `cargo install`: ```bash cargo install sonic-server ``` Ensure that your `$PATH` is properly configured to source the Crates binaries, and then run Sonic using the `sonic` command. _Install `build-essential`, `clang`, `libclang-dev`, `libc6-dev`, `g++` and `llvm-dev` to be able to compile the required RocksDB dependency._ **👉 Install from Docker Hub:** You might find it convenient to run Sonic via Docker. You can find the pre-built Sonic image on Docker Hub as [valeriansaliou/sonic](https://hub.docker.com/r/valeriansaliou/sonic/). 
First, pull the `valeriansaliou/sonic` image: ```bash docker pull valeriansaliou/sonic:v1.4.9 ``` Then, seed it with a configuration file and run it (replace `/path/to/your/sonic/config.cfg` with the path to your configuration file): ```bash docker run -p 1491:1491 -v /path/to/your/sonic/config.cfg:/etc/sonic.cfg -v /path/to/your/sonic/store/:/var/lib/sonic/store/ valeriansaliou/sonic:v1.4.9 ``` In the configuration file, ensure that: * `channel.inet` is set to `0.0.0.0:1491` (this lets Sonic be reached from outside the container) * `store.kv.path` is set to `/var/lib/sonic/store/kv/` (this lets the external KV store directory be reached by Sonic) * `store.fst.path` is set to `/var/lib/sonic/store/fst/` (this lets the external FST store directory be reached by Sonic) Sonic will be reachable from `tcp://localhost:1491`. **👉 Install from another source (non-official):** Other installation sources are available: * **Homebrew (macOS)**: `brew install sonic` ([see formula](https://formulae.brew.sh/formula/sonic)) _Note that those sources are non-official, meaning that they are not owned nor maintained by the Sonic project owners. The latest Sonic version available on those sources might be outdated, in comparison to the latest version available through the Sonic project._ ### Configuration Use the sample [config.cfg](https://github.com/valeriansaliou/sonic/blob/master/config.cfg) configuration file and adjust it to your own environment. _If you are looking to fine-tune your configuration, you may read our [detailed configuration documentation](https://github.com/valeriansaliou/sonic/blob/master/CONFIGURATION.md)._ ### Run Sonic Sonic can be run as such: `./sonic -c /path/to/config.cfg` ## Perform searches and manage objects Both searches and object management (i.e. data ingestion) are handled via the Sonic Channel protocol only.
As we want to keep things simple with Sonic (similarly to how Redis does it), Sonic does not offer an HTTP endpoint or similar; connecting via Sonic Channel is the way to go when you need to interact with the Sonic search database. Sonic distributes official libraries, that let you integrate Sonic into your apps easily. Click on a library below to see library integration documentation and code. _If you are looking for details on the raw Sonic Channel TCP-based protocol, you can read our [detailed protocol documentation](https://github.com/valeriansaliou/sonic/blob/master/PROTOCOL.md). It can prove handy if you are looking to code your own Sonic Channel library._ ### 📦 Sonic Channel Libraries #### 1️⃣ Official Libraries Sonic distributes official Sonic integration libraries for your programming language (official means that those libraries have been reviewed and validated by a core maintainer): * **NodeJS**: * **[node-sonic-channel](https://www.npmjs.com/package/sonic-channel)** by [@valeriansaliou](https://github.com/valeriansaliou) * **PHP**: * **[psonic](https://github.com/ppshobi/psonic)** by [@ppshobi](https://github.com/ppshobi) * **Rust**: * **[sonic-channel](https://github.com/pleshevskiy/sonic-channel)** by [@pleshevskiy](https://github.com/pleshevskiy) #### 2️⃣ Community Libraries You can find below a list of Sonic integrations provided by the community (many thanks to them!): * **Rust**: * **[sonic_client](https://github.com/FrontMage/sonic_client)** by [@FrontMage](https://github.com/FrontMage) * **Python**: * **[asonic](https://github.com/moshe/asonic)** by [@moshe](https://github.com/moshe) * **[python-sonic-client](https://github.com/xmonader/python-sonic-client)** by [@xmonader](https://github.com/xmonader) * **[pysonic-channel](https://github.com/AlongWY/pysonic)** by [@AlongWY](https://github.com/AlongWY) * **Ruby**: * **[sonic-ruby](https://github.com/atipugin/sonic-ruby)** by [@atipugin](https://github.com/atipugin) * **Go**: *
**[go-sonic](https://github.com/expectedsh/go-sonic)** by [@alexisvisco](https://github.com/alexisvisco) * **[go-sonic](https://github.com/OGKevin/go-sonic)** by [@OGKevin](https://github.com/OGKevin) * **PHP**: * **[php-sonic](https://github.com/php-sonic/php-sonic)** by [@touhonoob](https://github.com/touhonoob) * **[laravel-scout-sonic](https://github.com/james2doyle/laravel-scout-sonic)** by [@james2doyle](https://github.com/james2doyle) * **Java**: * **[java-sonic](https://github.com/twohou/java-sonic)** by [@touhonoob](https://github.com/touhonoob) * **[jsonic](https://github.com/alohaking/jsonic)** by [@alohaking](https://github.com/alohaking) * **Deno**: * **[deno-sonic](https://github.com/erfanium/deno_sonic)** by [@erfanium](https://github.com/erfanium) * **Bun**: * **[sonic-bun](https://github.com/emilianscheel/sonic-bun)** by [@emilianscheel](https://github.com/emilianscheel) * **Elixir**: * **[sonix](https://github.com/imerkle/sonix)** by [@imerkle](https://github.com/imerkle) * **Crystal**: * **[sonic-crystal](https://github.com/babelian/sonic-crystal)** by [@babelian](https://github.com/babelian) * **Nim**: * **[nim-sonic-client](https://github.com/xmonader/nim-sonic-client)** by [@xmonader](https://github.com/xmonader) * **.NET**: * **[nsonic](https://github.com/spikensbror-dotnet/nsonic)** by [@spikensbror](https://github.com/spikensbror) _ℹ️ Cannot find the library for your programming language? Build your own and be referenced here! ([contact me](https://valeriansaliou.name/))_ ## Which text languages are supported? Sonic supports a wide range of languages in its lexing system. If a language is not in this list, you will still be able to push this language to the search index, but stop-words will not be removed, which could lead to lower-quality search results.
**The languages supported by the lexing system are:** * 🇿🇦 Afrikaans * 🇸🇦 Arabic * 🇦🇲 Armenian * 🇦🇿 Azerbaijani * 🇧🇩 Bengali * 🇧🇬 Bulgarian * 🇲🇲 Burmese * 🏳 Catalan * 🇨🇳 Chinese (Simplified) * 🇹🇼 Chinese (Traditional) * 🇭🇷 Croatian * 🇨🇿 Czech * 🇩🇰 Danish * 🇳🇱 Dutch * 🇬🇧 English * 🏳 Esperanto * 🇪🇪 Estonian * 🇫🇮 Finnish * 🇫🇷 French * 🇬🇪 Georgian * 🇩🇪 German * 🇬🇷 Greek * 🇮🇳 Gujarati * 🇮🇱 Hebrew * 🇮🇳 Hindi * 🇭🇺 Hungarian * 🇮🇩 Indonesian * 🇮🇹 Italian * 🇯🇵 Japanese * 🇮🇳 Kannada * 🇰🇭 Khmer * 🇰🇷 Korean * 🏳 Latin * 🇱🇻 Latvian * 🇱🇹 Lithuanian * 🇮🇳 Malayalam * 🇮🇳 Marathi * 🇳🇵 Nepali * 🇮🇷 Persian * 🇵🇱 Polish * 🇵🇹 Portuguese * 🇮🇳 Punjabi * 🇷🇺 Russian * 🇷🇸 Serbian * 🇸🇰 Slovak * 🇸🇮 Slovene * 🇪🇸 Spanish * 🇸🇪 Swedish * 🇵🇭 Tagalog * 🇮🇳 Tamil * 🇹🇭 Thai * 🇹🇷 Turkish * 🇺🇦 Ukrainian * 🇵🇰 Urdu * 🇻🇳 Vietnamese * 🇮🇱 Yiddish * 🇿🇦 Zulu ## How fast & lightweight is it? Sonic was built for [Crisp](https://crisp.chat/) from the start. As Crisp was growing and indexing more and more search data into a full-text search SQL database, we decided it was time to switch to a proper search backend system. When reviewing Elasticsearch (ELS) and others, we found those were full-featured heavyweight systems that did not scale well with Crisp's freemium-based cost structure. At the end, we decided to build our own search backend, designed to be simple and lightweight on resources. You can run function-level benchmarks with the command: `cargo bench --features benchmark` ### 👩‍🔬 Benchmark #1 #### ➡️ Scenario We performed an extract of all messages from the Crisp team used for [Crisp](https://crisp.chat/) own customer support. We want to import all those messages into a clean Sonic instance, and then perform searches on the index we built. We will measure the time that Sonic spent executing each operation (ie. each `PUSH` and `QUERY` commands over Sonic Channel), and group results per 1,000 operations (this outputs a mean time per 1,000 operations). 
#### ➡️ Context **Our benchmark was run on the following computer:** * **Device**: MacBook Pro (Retina, 15-inch, Mid 2014) * **OS**: macOS 10.14.3 * **Disk**: 512GB SSD (formatted under the APFS file system) * **CPU**: 2.5 GHz Intel Core i7 * **RAM**: 16 GB 1600 MHz DDR3 **Sonic is compiled as follows:** * **Sonic version**: 1.0.1 * **Rustc version**: `rustc 1.35.0-nightly (719b0d984 2019-03-13)` * **Compiler flags**: `release` profile (`-O3` with `lto`) **Our dataset is as such:** * **Number of objects**: ~1,000,000 messages * **Total size**: ~100MB of raw message text (this does not account for identifiers and other metadata) #### ➡️ Scripts **The scripts we used to perform the benchmark are:** 1. **PUSH script**: [sonic-benchmark_batch-push.js](https://gist.github.com/valeriansaliou/e5ab737b28601ebd70483f904d21aa09) 2. **QUERY script**: [sonic-benchmark_batch-query.js](https://gist.github.com/valeriansaliou/3ef8315d7282bd173c2cb9eba64fa739) #### ⏬ Results **Our findings:** * We imported ~1,000,000 messages of dynamic length (some very long, eg. 
emails); * Once imported, the search index weighs 20MB (KV) + 1.4MB (FST) on disk; * CPU usage during import averaged 75% of a single CPU core; * RAM usage for the Sonic process peaked at 28MB during our benchmark; * We used a single Sonic Channel TCP connection, which limits the import to a single thread (we could have load-balanced this across as many Sonic Channel connections as there are CPUs); * We get an import RPS approaching 4,000 operations per second (per thread); * We get a search query RPS approaching 1,000 operations per second (per thread); * On the hyper-threaded 4-core CPU used, we could have parallelized operations to 8 virtual cores, thus theoretically increasing the import RPS to 32,000 operations / second, while the search query RPS would be increased to 8,000 operations / second (we may be SSD-bound at some point though); **Compared results per operation (on a single object):** We took a sample of 8 results from our batched operations, which produced a total of 1,000 results (1,000,000 items, with 1,000 items batched per measurement report). _This is not very scientific, but it should give you a clear idea of Sonic's performance._ **Time spent per operation:** Operation | Average | Best | Worst --------- | ------- | ----- | ----- PUSH | 275μs | 190μs | 363μs QUERY | 880μs | 852μs | 1ms **Batch PUSH results as seen from our terminal (from initial index of: 0 objects):** ![Batch PUSH benchmark](https://valeriansaliou.github.io/sonic/images/benchmark-batch-push.png) **Batch QUERY results as seen from our terminal (on index of: 1,000,000 objects):** ![Batch QUERY benchmark](https://valeriansaliou.github.io/sonic/images/benchmark-batch-query.png) ## Limitations * **Indexed data limits**: Sonic is designed for large search indexes split over thousands of search buckets per collection. An IID (ie. Internal-ID) is stored in the index as a 32-bit number, which theoretically allows up to ~4.2 billion objects to be indexed (ie. OID) per bucket. 
We've observed storage savings of 30% to 40%, which justifies the trade-off on large databases (versus Sonic using 64-bit IIDs). Also, Sonic only keeps the N most recently pushed results for a given word, in a sliding-window fashion (the sliding window width can be configured). * **Search query limits**: Sonic's Natural Language Processing system (NLP) does not work at the sentence-level, for storage compactness reasons (we keep the FST graph shallow so as to reduce time and space complexity). It works at the word-level, and is thus able to search per-word and can predict a word based on user input, though it is unable to predict the next word in a sentence. * **Real-time limits**: the FST needs to be rebuilt every time a word is pushed to or popped from the bucket graph. As this is quite heavy, Sonic batches rebuild cycles. If you have just pushed a new word to the index and you are not seeing it in the `SUGGEST` command yet, wait for the next rebuild cycle to kick in, or force it with `TRIGGER consolidate` in a `control` channel. * **Interoperability limits**: The Sonic Channel protocol is the only way to read and write search entries to the Sonic search index. Sonic does not expose any HTTP API. Sonic Channel has been designed with performance and minimal network footprint in mind. If you need to access Sonic from an unsupported programming language, you can either [open an issue](https://github.com/valeriansaliou/sonic/issues/new) or look at the reference [node-sonic-channel](https://github.com/valeriansaliou/node-sonic-channel) implementation and build it in your target programming language. * **Hardware limits**: Sonic performs the search on the file-system directly; ie. it does not fit the index in RAM. A search query results in a lot of random accesses on the disk, which means that it will be quite slow on old-school HDDs and super-fast on newer SSDs. Do store the Sonic database on SSD-backed file systems only. 
## :fire: Report A Vulnerability If you find a vulnerability in Sonic, you are more than welcome to report it directly to [@valeriansaliou](https://github.com/valeriansaliou) by sending an encrypted email to [valerian@valeriansaliou.name](mailto:valerian@valeriansaliou.name). Do not report vulnerabilities in public GitHub issues, as they may be exploited by malicious people to target production servers running an unpatched Sonic instance. **:warning: You must encrypt your email using [@valeriansaliou](https://github.com/valeriansaliou) GPG public key: [:key:valeriansaliou.gpg.pub.asc](https://valeriansaliou.name/files/keys/valeriansaliou.gpg.pub.asc).** ================================================ FILE: config.cfg ================================================ # Sonic # Fast, lightweight and schema-less search backend # Configuration file # Example: https://github.com/valeriansaliou/sonic/blob/master/config.cfg [server] log_level = "debug" [channel] inet = "[::1]:1491" tcp_timeout = 300 auth_password = "SecretPassword" [channel.search] query_limit_default = 10 query_limit_maximum = 100 query_alternates_try = 4 suggest_limit_default = 5 suggest_limit_maximum = 20 list_limit_default = 100 list_limit_maximum = 500 [store] [store.kv] path = "./data/store/kv/" retain_word_objects = 1000 [store.kv.pool] inactive_after = 1800 [store.kv.database] flush_after = 900 compress = true parallelism = 2 max_files = 100 max_compactions = 1 max_flushes = 1 write_buffer = 16384 write_ahead_log = true [store.fst] path = "./data/store/fst/" [store.fst.pool] inactive_after = 300 [store.fst.graph] consolidate_after = 180 max_size = 2048 max_words = 250000 ================================================ FILE: debian/changelog ================================================ sonic (0.0.0-1) UNRELEASED; urgency=medium * Initial release. 
-- Valerian Saliou Tue, 31 Aug 2023 12:00:00 +0000 ================================================ FILE: debian/compat ================================================ 10 ================================================ FILE: debian/control ================================================ Source: sonic Section: net Priority: ext Maintainer: Valerian Saliou Standards-Version: 3.9.4 Build-Depends: wget, ca-certificates Homepage: https://github.com/valeriansaliou/sonic Package: sonic Architecture: any Depends: adduser Provides: sonic Description: Fast, lightweight & schema-less search backend. An alternative to Elasticsearch that runs on a few MBs of RAM. ================================================ FILE: debian/copyright ================================================ Format: http://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ Upstream-Name: sonic Upstream-Contact: Valerian Saliou Source: https://github.com/valeriansaliou/sonic Files: * Copyright: 2023 Valerian Saliou License: MPL-2 License: MPL-2 This Source Code Form is subject to the terms of the Mozilla Public License, v. 2.0. If a copy of the MPL was not distributed with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
================================================ FILE: debian/rules ================================================ #!/usr/bin/make -f DISTRIBUTION = $(shell lsb_release -sr) VERSION = 1.4.9 PACKAGEVERSION = $(VERSION)-0~$(DISTRIBUTION)0 URL = https://github.com/valeriansaliou/sonic/releases/download/v$(VERSION)/ %: dh $@ --with systemd override_dh_auto_clean: override_dh_auto_test: override_dh_auto_build: override_dh_auto_install: $(eval ENV_ARCH := $(shell dpkg --print-architecture)) $(eval ENV_ISA := $(shell if [ "$(ENV_ARCH)" = "amd64" ]; then echo "x86_64"; else echo "$(ENV_ARCH)"; fi)) $(eval ENV_TARBALL := v$(VERSION)-$(ENV_ISA)-gnu.tar.gz) echo "Architecture: $(ENV_ARCH)" echo "Instruction Set: $(ENV_ISA)" echo "Target: $(URL)$(ENV_TARBALL)" wget -N --progress=dot:mega $(URL)$(ENV_TARBALL) tar -xf $(ENV_TARBALL) strip sonic/sonic mv sonic/config.cfg sonic/sonic.cfg mkdir sonic/store/ sed -i 's/path = ".\/data\/store\//path = "\/var\/lib\/sonic\/store\//g' sonic/sonic.cfg override_dh_gencontrol: dh_gencontrol -- -v$(PACKAGEVERSION) ================================================ FILE: debian/sonic.install ================================================ sonic/sonic usr/bin/ sonic/sonic.cfg etc/ sonic/store/ var/lib/sonic/ ================================================ FILE: debian/sonic.postinst ================================================ #!/bin/sh set -e case "$1" in configure) adduser --system --disabled-password --disabled-login --home /var/empty \ --no-create-home --quiet --group sonic && \ chown sonic:sonic -R /var/lib/sonic/ ;; esac #DEBHELPER# exit 0 ================================================ FILE: debian/sonic.service ================================================ [Unit] Description=Sonic Search Index After=network.target [Service] Type=simple User=sonic Group=sonic ExecStart=/usr/bin/sonic -c /etc/sonic.cfg Restart=on-failure LimitNOFILE=infinity [Install] WantedBy=multi-user.target ================================================ 
FILE: debian/source/format ================================================ 3.0 (quilt) ================================================ FILE: scripts/build_packages.sh ================================================ #!/bin/bash ## # Sonic # # Fast, lightweight and schema-less search backend # Copyright: 2023, Valerian Saliou # License: Mozilla Public License v2.0 (MPL v2.0) ## # Define build pipeline function build_for_target { OS="$2" DIST="$3" ARCH="$1" ./packpack/packpack release_result=$? if [ $release_result -eq 0 ]; then mkdir -p "./packages/$2_$3/" mv ./build/*$4 "./packages/$2_$3/" echo "Result: Packaged architecture: $1 for OS: $2:$3 (*$4)" fi return $release_result } # Run release tasks ABSPATH=$(cd "$(dirname "$0")"; pwd) BASE_DIR="$ABSPATH/../" rc=0 pushd "$BASE_DIR" > /dev/null echo "Executing packages build steps for Sonic..." # Initialize `packpack` rm -rf ./packpack && \ git clone https://github.com/packpack/packpack.git packpack rc=$? # Proceed build for each target? if [ $rc -eq 0 ]; then build_for_target "x86_64" "debian" "bookworm" ".deb" rc=$? 
fi # Cleanup environment rm -rf ./build ./packpack if [ $rc -eq 0 ]; then echo "Success: Done executing packages build steps for Sonic" else echo "Error: Failed executing packages build steps for Sonic" fi popd > /dev/null exit $rc ================================================ FILE: scripts/release_binaries.sh ================================================ #!/bin/bash ## # Sonic # # Fast, lightweight and schema-less search backend # Copyright: 2023, Valerian Saliou # License: Mozilla Public License v2.0 (MPL v2.0) ## # Read arguments while [ "$1" != "" ]; do argument_key=`echo $1 | awk -F= '{print $1}'` argument_value=`echo $1 | awk -F= '{print $2}'` case $argument_key in -v | --version) # Notice: strip any leading 'v' to the version number SONIC_VERSION="${argument_value/v}" ;; *) echo "Unknown argument received: '$argument_key'" exit 1 ;; esac shift done # Ensure release version is provided if [ -z "$SONIC_VERSION" ]; then echo "No Sonic release version was provided, please provide it using '--version'" exit 1 fi # Define release pipeline function release_for_architecture { final_tar="v$SONIC_VERSION-$1-$2.tar.gz" rm -rf ./sonic/ && \ cargo build --target "$3" --release && \ mkdir ./sonic && \ cp -p "target/$3/release/sonic" ./sonic/ && \ cp -r ./config.cfg sonic/ && \ tar --owner=0 --group=0 -czvf "$final_tar" ./sonic && \ rm -r ./sonic/ release_result=$? if [ $release_result -eq 0 ]; then echo "Result: Packed architecture: $1 ($2) to file: $final_tar" fi return $release_result } # Run release tasks ABSPATH=$(cd "$(dirname "$0")"; pwd) BASE_DIR="$ABSPATH/../" rc=0 pushd "$BASE_DIR" > /dev/null echo "Executing release steps for Sonic v$SONIC_VERSION..." release_for_architecture "x86_64" "gnu" "x86_64-unknown-linux-gnu" rc=$? 
if [ $rc -eq 0 ]; then echo "Success: Done executing release steps for Sonic v$SONIC_VERSION" else echo "Error: Failed executing release steps for Sonic v$SONIC_VERSION" fi popd > /dev/null exit $rc ================================================ FILE: scripts/sign_binaries.sh ================================================ #!/bin/bash ## # Sonic # # Fast, lightweight and schema-less search backend # Copyright: 2023, Valerian Saliou # License: Mozilla Public License v2.0 (MPL v2.0) ## # Read arguments while [ "$1" != "" ]; do argument_key=`echo $1 | awk -F= '{print $1}'` argument_value=`echo $1 | awk -F= '{print $2}'` case $argument_key in -v | --version) # Notice: strip any leading 'v' to the version number SONIC_VERSION="${argument_value/v}" ;; *) echo "Unknown argument received: '$argument_key'" exit 1 ;; esac shift done # Ensure release version is provided if [ -z "$SONIC_VERSION" ]; then echo "No Sonic release version was provided, please provide it using '--version'" exit 1 fi # Define sign pipeline function sign_for_architecture { final_tar="v$SONIC_VERSION-$1-$2.tar.gz" gpg_signer="valerian@valeriansaliou.name" gpg -u "$gpg_signer" --armor --detach-sign "$final_tar" sign_result=$? if [ $sign_result -eq 0 ]; then echo "Result: Signed architecture: $1 ($2) for file: $final_tar" fi return $sign_result } # Run sign tasks ABSPATH=$(cd "$(dirname "$0")"; pwd) BASE_DIR="$ABSPATH/../" rc=0 pushd "$BASE_DIR" > /dev/null echo "Executing sign steps for Sonic v$SONIC_VERSION..." sign_for_architecture "x86_64" "gnu" rc=$? 
if [ $rc -eq 0 ]; then echo "Success: Done executing sign steps for Sonic v$SONIC_VERSION" else echo "Error: Failed executing sign steps for Sonic v$SONIC_VERSION" fi popd > /dev/null exit $rc ================================================ FILE: src/channel/command.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) use hashbrown::HashMap; use rand::distributions::Alphanumeric; use rand::{thread_rng, Rng}; use std::fmt; use std::path::Path; use std::str::{self, SplitWhitespace}; use std::vec::Vec; use super::format::unescape; use super::statistics::ChannelStatistics; use crate::query::builder::{QueryBuilder, QueryBuilderResult}; use crate::query::types::{ ListMetaData, QueryGenericLang, QueryMetaData, QuerySearchLimit, QuerySearchOffset, }; use crate::store::fst::StoreFSTPool; use crate::store::kv::StoreKVPool; use crate::store::operation::StoreOperationDispatch; use crate::APP_CONF; #[derive(PartialEq)] pub enum ChannelCommandError { UnknownCommand, NotFound, QueryError, InternalError, ShuttingDown, PolicyReject(&'static str), InvalidFormat(&'static str), InvalidMetaKey((String, String)), InvalidMetaValue((String, String)), } #[derive(PartialEq)] pub enum ChannelCommandResponse { Void, Ok, Pong, Pending(String), Result(String), Event(&'static str, String, String), Ended(&'static str), Err(ChannelCommandError), } pub struct ChannelCommandBase; pub struct ChannelCommandSearch; pub struct ChannelCommandIngest; pub struct ChannelCommandControl; pub type ChannelCommandResponseArgs = (&'static str, Option>); type ChannelResult = Result, ChannelCommandError>; type MetaPartsResult<'a> = Result<(&'a str, &'a str), (&'a str, &'a str)>; pub const EVENT_ID_SIZE: usize = 8; const TEXT_PART_BOUNDARY: char = '"'; const TEXT_PART_ESCAPE: char = '\\'; const META_PART_GROUP_OPEN: char = '('; const META_PART_GROUP_CLOSE: char = ')'; 
static BACKUP_KV_PATH: &str = "kv"; static BACKUP_FST_PATH: &str = "fst"; lazy_static! { pub static ref COMMANDS_MODE_SEARCH: Vec<&'static str> = vec!["QUERY", "SUGGEST", "LIST", "PING", "HELP", "QUIT"]; pub static ref COMMANDS_MODE_INGEST: Vec<&'static str> = vec!["PUSH", "POP", "COUNT", "FLUSHC", "FLUSHB", "FLUSHO", "PING", "HELP", "QUIT"]; pub static ref COMMANDS_MODE_CONTROL: Vec<&'static str> = vec!["TRIGGER", "INFO", "PING", "HELP", "QUIT"]; pub static ref CONTROL_TRIGGER_ACTIONS: Vec<&'static str> = vec!["consolidate", "backup", "restore"]; static ref MANUAL_MODE_SEARCH: HashMap<&'static str, &'static Vec<&'static str>> = [("commands", &*COMMANDS_MODE_SEARCH)] .iter() .cloned() .collect(); static ref MANUAL_MODE_INGEST: HashMap<&'static str, &'static Vec<&'static str>> = [("commands", &*COMMANDS_MODE_INGEST)] .iter() .cloned() .collect(); static ref MANUAL_MODE_CONTROL: HashMap<&'static str, &'static Vec<&'static str>> = [("commands", &*COMMANDS_MODE_CONTROL)] .iter() .cloned() .collect(); } impl ChannelCommandResponse { pub fn to_args(&self) -> ChannelCommandResponseArgs { // Convert internal response to channel response arguments; this either gives 'RESPONSE' \ // or 'RESPONSE <..>' whether there are values or not. 
match *self { ChannelCommandResponse::Void => ("", None), ChannelCommandResponse::Ok => ("OK", None), ChannelCommandResponse::Pong => ("PONG", None), ChannelCommandResponse::Pending(ref id) => ("PENDING", Some(vec![id.to_owned()])), ChannelCommandResponse::Result(ref id) => ("RESULT", Some(vec![id.to_owned()])), ChannelCommandResponse::Event(ref query, ref id, ref payload) => ( "EVENT", Some(vec![query.to_string(), id.to_owned(), payload.to_owned()]), ), ChannelCommandResponse::Ended(reason) => ("ENDED", Some(vec![reason.to_owned()])), ChannelCommandResponse::Err(ref reason) => ("ERR", Some(vec![reason.to_string()])), } } } impl ChannelCommandBase { pub fn dispatch_ping(mut parts: SplitWhitespace) -> ChannelResult { match parts.next() { None => Ok(vec![ChannelCommandResponse::Pong]), _ => Err(ChannelCommandError::InvalidFormat("PING")), } } pub fn dispatch_quit(mut parts: SplitWhitespace) -> ChannelResult { match parts.next() { None => Ok(vec![ChannelCommandResponse::Ended("quit")]), _ => Err(ChannelCommandError::InvalidFormat("QUIT")), } } pub fn generic_dispatch_help( mut parts: SplitWhitespace, manuals: &HashMap<&str, &Vec<&str>>, ) -> ChannelResult { match (parts.next(), parts.next()) { (None, _) => { let manual_list = manuals.keys().map(|k| k.to_owned()).collect::>(); Ok(vec![ChannelCommandResponse::Result(format!( "manuals({})", manual_list.join(", ") ))]) } (Some(manual_key), next_part) => { if next_part.is_none() { if let Some(manual_data) = manuals.get(manual_key) { Ok(vec![ChannelCommandResponse::Result(format!( "{}({})", manual_key, manual_data.join(", ") ))]) } else { Err(ChannelCommandError::NotFound) } } else { Err(ChannelCommandError::InvalidFormat("HELP []?")) } } } } pub fn parse_text_parts(parts: &mut SplitWhitespace) -> Option { // Parse text parts and nest them together let mut text_raw = String::new(); for text_part in parts { if !text_raw.is_empty() { text_raw.push(' '); } text_raw.push_str(text_part); // End reached? (ie. 
got boundary character) let text_part_bytes = text_part.as_bytes(); let text_part_bound = text_part_bytes.len(); if text_raw.len() > 1 && text_part_bytes[text_part_bound - 1] as char == TEXT_PART_BOUNDARY { // Count the total amount of escape characters before escape (check if escape \ // characters are also being escaped, or not) let mut count_escapes = 0; if text_part_bound > 1 { for index in (0..text_part_bound - 1).rev() { if text_part_bytes[index] as char != TEXT_PART_ESCAPE { break; } count_escapes += 1 } } // Boundary is not escaped, we can stop there. if count_escapes == 0 || (count_escapes % 2 == 0) { break; } } } // Ensure parsed text parts are valid let text_bytes = text_raw.as_bytes(); let text_bytes_len = text_bytes.len(); if text_raw.is_empty() || text_bytes_len < 2 || text_bytes[0] as char != TEXT_PART_BOUNDARY || text_bytes[text_bytes_len - 1] as char != TEXT_PART_BOUNDARY { info!("could not properly parse text parts: {}", text_raw); None } else { debug!( "parsed text parts (still needs post-processing): {}", text_raw ); // Return inner text (without boundary characters) match str::from_utf8(&text_bytes[1..text_bytes_len - 1]) { Ok(text_inner) => { let text_inner_string = unescape(text_inner.trim()); debug!("parsed text parts (post-processed): {}", text_inner_string); // Text must not be empty if !text_inner_string.is_empty() { Some(text_inner_string) } else { None } } Err(err) => { info!( "could not type-cast post-processed text parts: {} because: {}", text_raw, err ); None } } } } pub fn parse_next_meta_parts<'a>( parts: &'a mut SplitWhitespace, ) -> Option> { if let Some(part) = parts.next() { // Parse meta (with format: 'KEY(VALUE)'; no '(' or ')' is allowed in KEY and VALUE) if !part.is_empty() { if let Some(index_open) = part.find(META_PART_GROUP_OPEN) { let (key_bound_start, key_bound_end) = (0, index_open); let (value_bound_start, value_bound_end) = (index_open + 1, part.len() - 1); if part.as_bytes()[value_bound_end] as char == 
META_PART_GROUP_CLOSE { let (key, value) = ( &part[key_bound_start..key_bound_end], &part[value_bound_start..value_bound_end], ); // Ensure final key and value do not contain reserved syntax characters return if !key.contains(META_PART_GROUP_OPEN) && !key.contains(META_PART_GROUP_CLOSE) && !value.contains(META_PART_GROUP_OPEN) && !value.contains(META_PART_GROUP_CLOSE) { debug!("parsed meta part as: {} = {}", key, value); Some(Ok((key, value))) } else { info!( "parsed meta part, but it contains reserved characters: {} = {}", key, value ); Some(Err((key, value))) }; } } } info!("could not parse meta part: {}", part); Some(Err(("?", part))) } else { None } } pub fn make_error_invalid_meta_key(meta_key: &str, meta_value: &str) -> ChannelCommandError { ChannelCommandError::InvalidMetaKey((meta_key.to_owned(), meta_value.to_owned())) } pub fn make_error_invalid_meta_value(meta_key: &str, meta_value: &str) -> ChannelCommandError { ChannelCommandError::InvalidMetaValue((meta_key.to_owned(), meta_value.to_owned())) } pub fn commit_ok_operation(query_builder: QueryBuilderResult) -> ChannelResult { query_builder .and_then(StoreOperationDispatch::dispatch) .map(|_| vec![ChannelCommandResponse::Ok]) .or(Err(ChannelCommandError::QueryError)) } pub fn commit_result_operation(query_builder: QueryBuilderResult) -> ChannelResult { query_builder .and_then(StoreOperationDispatch::dispatch) .or(Err(ChannelCommandError::QueryError)) .and_then(|result| { if let Some(result_inner) = result { Ok(vec![ChannelCommandResponse::Result(result_inner)]) } else { Err(ChannelCommandError::InternalError) } }) } pub fn commit_pending_operation( query_type: &'static str, query_id: &str, query_builder: QueryBuilderResult, ) -> ChannelResult { // Idea: this could be made asynchronous in the future, if there are some latency issues \ // on large Sonic deployments. The idea would be to have a number of worker threads for \ // the whole running daemon, and channel threads dispatching work to those threads. 
This \ // way Sonic can be up-scaled to N CPUs instead of 1 CPU per channel connection. Now on, \ // the only way to scale Sonic executors to multiple CPUs is opening multiple parallel \ // Sonic Channel connections and dispatching work evenly to each connection. It does not \ // prevent scaling Sonic vertically, but could be made simpler for the Sonic Channel \ // consumer via a worker thread pool. query_builder .and_then(StoreOperationDispatch::dispatch) .map(|results| { vec![ ChannelCommandResponse::Pending(query_id.to_string()), ChannelCommandResponse::Event( query_type, query_id.to_string(), results.unwrap_or_default(), ), ] }) .or(Err(ChannelCommandError::QueryError)) } pub fn generate_event_id() -> String { thread_rng() .sample_iter(&Alphanumeric) .take(EVENT_ID_SIZE) .map(|value| value as char) .collect() } } impl ChannelCommandSearch { pub fn dispatch_query(mut parts: SplitWhitespace) -> ChannelResult { match ( parts.next(), parts.next(), ChannelCommandBase::parse_text_parts(&mut parts), ) { (Some(collection), Some(bucket), Some(text)) => { // Generate command identifier let event_id = ChannelCommandBase::generate_event_id(); debug!( "dispatching search query #{} on collection: {} and bucket: {}", event_id, collection, bucket ); // Define query parameters let (mut query_limit, mut query_offset, mut query_lang) = (APP_CONF.channel.search.query_limit_default, 0, None); // Parse meta parts (meta comes after text; extract meta parts second) let mut last_meta_err = None; while let Some(meta_result) = ChannelCommandBase::parse_next_meta_parts(&mut parts) { match Self::handle_query_meta(meta_result) { Ok((Some(query_limit_parsed), None, None)) => { query_limit = query_limit_parsed } Ok((None, Some(query_offset_parsed), None)) => { query_offset = query_offset_parsed } Ok((None, None, Some(query_lang_parsed))) => { query_lang = Some(query_lang_parsed) } Err(parse_err) => last_meta_err = Some(parse_err), _ => {} } } if let Some(err) = last_meta_err { Err(err) } else 
if query_limit < 1 || query_limit > APP_CONF.channel.search.query_limit_maximum { Err(ChannelCommandError::PolicyReject( "LIMIT out of minimum/maximum bounds", )) } else { debug!( "will search for #{} with text: {}, limit: {}, offset: {}, locale: <{:?}>", event_id, text, query_limit, query_offset, query_lang ); // Commit 'search' query ChannelCommandBase::commit_pending_operation( "QUERY", &event_id, QueryBuilder::search( &event_id, collection, bucket, &text, query_limit, query_offset, query_lang, ), ) } } _ => Err(ChannelCommandError::InvalidFormat( "QUERY \"\" [LIMIT()]? [OFFSET()]? \ [LANG()]?", )), } } pub fn dispatch_suggest(mut parts: SplitWhitespace) -> ChannelResult { match ( parts.next(), parts.next(), ChannelCommandBase::parse_text_parts(&mut parts), ) { (Some(collection), Some(bucket), Some(text)) => { // Generate command identifier let event_id = ChannelCommandBase::generate_event_id(); debug!( "dispatching search suggest #{} on collection: {} and bucket: {}", event_id, collection, bucket ); // Define suggest parameters let mut suggest_limit = APP_CONF.channel.search.suggest_limit_default; // Parse meta parts (meta comes after text; extract meta parts second) let mut last_meta_err = None; while let Some(meta_result) = ChannelCommandBase::parse_next_meta_parts(&mut parts) { match Self::handle_suggest_meta(meta_result) { Ok(Some(suggest_limit_parsed)) => suggest_limit = suggest_limit_parsed, Err(parse_err) => last_meta_err = Some(parse_err), _ => {} } } if let Some(err) = last_meta_err { Err(err) } else if suggest_limit < 1 || suggest_limit > APP_CONF.channel.search.suggest_limit_maximum { Err(ChannelCommandError::PolicyReject( "LIMIT out of minimum/maximum bounds", )) } else { debug!( "will suggest for #{} with text: {}, limit: {}", event_id, text, suggest_limit ); // Commit 'suggest' query ChannelCommandBase::commit_pending_operation( "SUGGEST", &event_id, QueryBuilder::suggest(&event_id, collection, bucket, &text, suggest_limit), ) } } _ => 
Err(ChannelCommandError::InvalidFormat( "SUGGEST \"\" [LIMIT()]?", )), } } pub fn dispatch_list(mut parts: SplitWhitespace) -> ChannelResult { match (parts.next(), parts.next()) { (Some(collection), Some(bucket)) => { // Generate command identifier let event_id = ChannelCommandBase::generate_event_id(); debug!( "dispatching search list #{} on collection: {} and bucket: {}", event_id, collection, bucket ); // Define list parameters let (mut list_limit, mut list_offset) = (APP_CONF.channel.search.list_limit_default, 0); // Parse meta parts (meta comes last; extract meta parts second) let mut last_meta_err = None; while let Some(meta_result) = ChannelCommandBase::parse_next_meta_parts(&mut parts) { match Self::handle_list_meta(meta_result) { Ok(metadata) => match metadata { (Some(list_limit_parsed), None) => list_limit = list_limit_parsed, (None, Some(list_offset_parsed)) => list_offset = list_offset_parsed, _ => {} }, Err(parse_err) => last_meta_err = Some(parse_err), } } if let Some(err) = last_meta_err { Err(err) } else if list_limit < 1 || list_limit > APP_CONF.channel.search.list_limit_maximum { Err(ChannelCommandError::PolicyReject( "LIMIT out of minimum/maximum bounds", )) } else { // Commit 'list' query ChannelCommandBase::commit_pending_operation( "LIST", &event_id, QueryBuilder::list(&event_id, collection, bucket, list_limit, list_offset), ) } } _ => Err(ChannelCommandError::InvalidFormat( "LIST [LIMIT()]? 
[OFFSET()]?", )), } } pub fn dispatch_help(parts: SplitWhitespace) -> ChannelResult { ChannelCommandBase::generic_dispatch_help(parts, &*MANUAL_MODE_SEARCH) } fn handle_query_meta( meta_result: MetaPartsResult, ) -> Result { match meta_result { Ok((meta_key, meta_value)) => { debug!("handle query meta: {} = {}", meta_key, meta_value); match meta_key { "LIMIT" => { // 'LIMIT()' where 0 <= < 2^16 if let Ok(query_limit_parsed) = meta_value.parse::() { Ok((Some(query_limit_parsed), None, None)) } else { Err(ChannelCommandBase::make_error_invalid_meta_value( meta_key, meta_value, )) } } "OFFSET" => { // 'OFFSET()' where 0 <= < 2^32 if let Ok(query_offset_parsed) = meta_value.parse::() { Ok((None, Some(query_offset_parsed), None)) } else { Err(ChannelCommandBase::make_error_invalid_meta_value( meta_key, meta_value, )) } } "LANG" => { // 'LANG()' where ∈ ISO 639-3 if let Some(query_lang_parsed) = QueryGenericLang::from_value(meta_value) { Ok((None, None, Some(query_lang_parsed))) } else { Err(ChannelCommandBase::make_error_invalid_meta_value( meta_key, meta_value, )) } } _ => Err(ChannelCommandBase::make_error_invalid_meta_key( meta_key, meta_value, )), } } Err(err) => Err(ChannelCommandBase::make_error_invalid_meta_key( err.0, err.1, )), } } fn handle_suggest_meta( meta_result: MetaPartsResult, ) -> Result, ChannelCommandError> { match meta_result { Ok((meta_key, meta_value)) => { debug!("handle suggest meta: {} = {}", meta_key, meta_value); match meta_key { "LIMIT" => { // 'LIMIT()' where 0 <= < 2^16 if let Ok(suggest_limit_parsed) = meta_value.parse::() { Ok(Some(suggest_limit_parsed)) } else { Err(ChannelCommandBase::make_error_invalid_meta_value( meta_key, meta_value, )) } } _ => Err(ChannelCommandBase::make_error_invalid_meta_key( meta_key, meta_value, )), } } Err(err) => Err(ChannelCommandBase::make_error_invalid_meta_key( err.0, err.1, )), } } fn handle_list_meta(meta_result: MetaPartsResult) -> Result { match meta_result { Ok((meta_key, meta_value)) => { 
debug!("handle list meta: {} = {}", meta_key, meta_value); match meta_key { "LIMIT" => { // 'LIMIT()' where 0 <= < 2^16 if let Ok(list_limit_parsed) = meta_value.parse::() { Ok((Some(list_limit_parsed), None)) } else { Err(ChannelCommandBase::make_error_invalid_meta_value( meta_key, meta_value, )) } } "OFFSET" => { // 'OFFSET()' where 0 <= < 2^32 if let Ok(list_offset_parsed) = meta_value.parse::() { Ok((None, Some(list_offset_parsed))) } else { Err(ChannelCommandBase::make_error_invalid_meta_value( meta_key, meta_value, )) } } _ => Err(ChannelCommandBase::make_error_invalid_meta_key( meta_key, meta_value, )), } } Err(err) => Err(ChannelCommandBase::make_error_invalid_meta_key( err.0, err.1, )), } } } impl ChannelCommandIngest { pub fn dispatch_push(mut parts: SplitWhitespace) -> ChannelResult { match ( parts.next(), parts.next(), parts.next(), ChannelCommandBase::parse_text_parts(&mut parts), ) { (Some(collection), Some(bucket), Some(object), Some(text)) => { debug!( "dispatching ingest push in collection: {}, bucket: {} and object: {}", collection, bucket, object ); debug!("ingest push has text: {}", text); // Define push parameters let mut push_lang = None; // Parse meta parts (meta comes after text; extract meta parts second) let mut last_meta_err = None; while let Some(meta_result) = ChannelCommandBase::parse_next_meta_parts(&mut parts) { match Self::handle_push_meta(meta_result) { Ok(Some(push_lang_parsed)) => push_lang = Some(push_lang_parsed), Err(parse_err) => last_meta_err = Some(parse_err), _ => {} } } if let Some(err) = last_meta_err { Err(err) } else { debug!( "will push for text: {} with hinted locale: <{:?}>", text, push_lang ); // Commit 'push' query ChannelCommandBase::commit_ok_operation(QueryBuilder::push( collection, bucket, object, &text, push_lang, )) } } _ => Err(ChannelCommandError::InvalidFormat( "PUSH \"\" [LANG()]?", )), } } pub fn dispatch_pop(mut parts: SplitWhitespace) -> ChannelResult { match ( parts.next(), parts.next(), 
parts.next(), ChannelCommandBase::parse_text_parts(&mut parts), parts.next(), ) { (Some(collection), Some(bucket), Some(object), Some(text), None) => { debug!( "dispatching ingest pop in collection: {}, bucket: {} and object: {}", collection, bucket, object ); debug!("ingest pop has text: {}", text); // Make 'pop' query ChannelCommandBase::commit_result_operation(QueryBuilder::pop( collection, bucket, object, &text, )) } _ => Err(ChannelCommandError::InvalidFormat( "POP \"\"", )), } } pub fn dispatch_count(mut parts: SplitWhitespace) -> ChannelResult { match (parts.next(), parts.next(), parts.next(), parts.next()) { (Some(collection), bucket_part, object_part, None) => { debug!("dispatching ingest count in collection: {}", collection); // Make 'count' query ChannelCommandBase::commit_result_operation(QueryBuilder::count( collection, bucket_part, object_part, )) } _ => Err(ChannelCommandError::InvalidFormat( "COUNT [ []?]?", )), } } pub fn dispatch_flushc(mut parts: SplitWhitespace) -> ChannelResult { match (parts.next(), parts.next()) { (Some(collection), None) => { debug!( "dispatching ingest flush collection in collection: {}", collection ); // Make 'flushc' query ChannelCommandBase::commit_result_operation(QueryBuilder::flushc(collection)) } _ => Err(ChannelCommandError::InvalidFormat("FLUSHC ")), } } pub fn dispatch_flushb(mut parts: SplitWhitespace) -> ChannelResult { match (parts.next(), parts.next(), parts.next()) { (Some(collection), Some(bucket), None) => { debug!( "dispatching ingest flush bucket in collection: {}, bucket: {}", collection, bucket ); // Make 'flushb' query ChannelCommandBase::commit_result_operation(QueryBuilder::flushb( collection, bucket, )) } _ => Err(ChannelCommandError::InvalidFormat( "FLUSHB ", )), } } pub fn dispatch_flusho(mut parts: SplitWhitespace) -> ChannelResult { match (parts.next(), parts.next(), parts.next(), parts.next()) { (Some(collection), Some(bucket), Some(object), None) => { debug!( "dispatching ingest flush object 
in collection: {}, bucket: {}, object: {}", collection, bucket, object ); // Make 'flusho' query ChannelCommandBase::commit_result_operation(QueryBuilder::flusho( collection, bucket, object, )) } _ => Err(ChannelCommandError::InvalidFormat( "FLUSHO ", )), } } pub fn dispatch_help(parts: SplitWhitespace) -> ChannelResult { ChannelCommandBase::generic_dispatch_help(parts, &*MANUAL_MODE_INGEST) } fn handle_push_meta( meta_result: MetaPartsResult, ) -> Result, ChannelCommandError> { match meta_result { Ok((meta_key, meta_value)) => { debug!("handle push meta: {} = {}", meta_key, meta_value); match meta_key { "LANG" => { // 'LANG()' where ∈ ISO 639-3 if let Some(query_lang_parsed) = QueryGenericLang::from_value(meta_value) { Ok(Some(query_lang_parsed)) } else { Err(ChannelCommandBase::make_error_invalid_meta_value( meta_key, meta_value, )) } } _ => Err(ChannelCommandBase::make_error_invalid_meta_key( meta_key, meta_value, )), } } Err(err) => Err(ChannelCommandBase::make_error_invalid_meta_key( err.0, err.1, )), } } } impl ChannelCommandControl { pub fn dispatch_trigger(mut parts: SplitWhitespace) -> ChannelResult { match (parts.next(), parts.next(), parts.next()) { (None, _, _) => Ok(vec![ChannelCommandResponse::Result(format!( "actions({})", CONTROL_TRIGGER_ACTIONS.join(", ") ))]), (Some(action_key), data_part, last_part) => { let action_key_lower = action_key.to_lowercase(); match action_key_lower.as_str() { "consolidate" => { if data_part.is_none() { // Force a FST consolidate StoreFSTPool::consolidate(true); Ok(vec![ChannelCommandResponse::Ok]) } else { Err(ChannelCommandError::InvalidFormat("TRIGGER consolidate")) } } "backup" => { match (data_part, last_part) { (Some(path), None) => { // Proceed KV + FST backup let path = Path::new(path); if StoreKVPool::backup(&path.join(BACKUP_KV_PATH)).is_ok() && StoreFSTPool::backup(&path.join(BACKUP_FST_PATH)).is_ok() { Ok(vec![ChannelCommandResponse::Ok]) } else { Err(ChannelCommandError::InternalError) } } _ => 
Err(ChannelCommandError::InvalidFormat("TRIGGER backup ")), } } "restore" => { match (data_part, last_part) { (Some(path), None) => { // Proceed KV + FST restore let path = Path::new(path); if StoreKVPool::restore(&path.join(BACKUP_KV_PATH)).is_ok() && StoreFSTPool::restore(&path.join(BACKUP_FST_PATH)).is_ok() { Ok(vec![ChannelCommandResponse::Ok]) } else { Err(ChannelCommandError::InternalError) } } _ => Err(ChannelCommandError::InvalidFormat("TRIGGER restore ")), } } _ => Err(ChannelCommandError::NotFound), } } } } pub fn dispatch_info(mut parts: SplitWhitespace) -> ChannelResult { match parts.next() { None => { let statistics = ChannelStatistics::gather(); Ok(vec![ChannelCommandResponse::Result(format!( "uptime({}) clients_connected({}) commands_total({}) \ command_latency_best({}) command_latency_worst({}) \ kv_open_count({}) fst_open_count({}) fst_consolidate_count({})", statistics.uptime, statistics.clients_connected, statistics.commands_total, statistics.command_latency_best, statistics.command_latency_worst, statistics.kv_open_count, statistics.fst_open_count, statistics.fst_consolidate_count ))]) } _ => Err(ChannelCommandError::InvalidFormat("INFO")), } } pub fn dispatch_help(parts: SplitWhitespace) -> ChannelResult { ChannelCommandBase::generic_dispatch_help(parts, &*MANUAL_MODE_CONTROL) } } impl fmt::Display for ChannelCommandError { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> { match self { ChannelCommandError::UnknownCommand => write!(f, "unknown_command"), ChannelCommandError::NotFound => write!(f, "not_found"), ChannelCommandError::QueryError => write!(f, "query_error"), ChannelCommandError::InternalError => write!(f, "internal_error"), ChannelCommandError::ShuttingDown => write!(f, "shutting_down"), ChannelCommandError::PolicyReject(reason) => write!(f, "policy_reject({})", reason), ChannelCommandError::InvalidFormat(format) => write!(f, "invalid_format({})", format), ChannelCommandError::InvalidMetaKey(ref data) => { 
write!(f, "invalid_meta_key({}[{}])", data.0, data.1)
            }
            ChannelCommandError::InvalidMetaValue(ref data) => {
                write!(f, "invalid_meta_value({}[{}])", data.0, data.1)
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn it_matches_command_response_string() {
        assert_eq!(ChannelCommandResponse::Ok.to_args().0, "OK");
        assert_eq!(ChannelCommandResponse::Pong.to_args().0, "PONG");
        assert_eq!(ChannelCommandResponse::Ended("").to_args().0, "ENDED");
        assert_eq!(
            ChannelCommandResponse::Err(ChannelCommandError::UnknownCommand)
                .to_args()
                .0,
            "ERR"
        );
    }
}

================================================
FILE: src/channel/format.rs
================================================
// Sonic
//
// Fast, lightweight and schema-less search backend
// Copyright: 2019, Valerian Saliou
// License: Mozilla Public License v2.0 (MPL v2.0)

/// Unescapes the text part of a channel command.
///
/// Recognized escape sequences are `\n` (line feed), `\"` (double quote) and `\\` (backslash). \
/// Any other sequence is copied through unchanged (both the backslash and the character that \
/// follows it), so that no input character is ever silently lost. A trailing lone backslash is \
/// kept as-is.
pub fn unescape(text: &str) -> String {
    // Pre-reserve a byte-aware required capacity as to avoid heap resizes (30% performance \
    //   gain relative to initializing this with a zero-capacity). The output is never longer \
    //   than the input, hence the input byte length is a safe upper bound.
    let mut unescaped = String::with_capacity(text.len());
    let mut characters = text.chars();

    while let Some(character) = characters.next() {
        if character != '\\' {
            unescaped.push(character);

            continue;
        }

        // Found escape marker, look at the escaped character
        match characters.next() {
            Some('n') => unescaped.push('\n'),
            Some('"') => unescaped.push('"'),
            Some('\\') => unescaped.push('\\'),
            Some(other) => {
                // Unknown escape sequence: keep it verbatim rather than dropping `other`
                unescaped.push('\\');
                unescaped.push(other);
            }
            None => unescaped.push('\\'),
        }
    }

    unescaped
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn it_unescapes_command_text() {
        assert_eq!(unescape(r#"hello world!"#), r#"hello world!"#.to_string());
        assert_eq!(
            unescape(r#"i'm so good at this"#),
            r#"i'm so good at this"#.to_string()
        );
        assert_eq!(
            unescape(r#"look at \\\\"\\\" me i'm \\"\"trying to hack you\""#),
            r#"look at \\"\" me i'm \""trying to hack you""#.to_string()
        );
    }

    #[test]
    fn it_keeps_unknown_escape_sequences() {
        assert_eq!(unescape(r#"line\nfeed"#), "line\nfeed".to_string());
        assert_eq!(unescape(r#"tab\tbed"#), r#"tab\tbed"#.to_string());
        assert_eq!(unescape(r#"trailing\"#), r#"trailing\"#.to_string());
        assert_eq!(unescape(""), String::new());
    }
}

#[cfg(all(feature = "benchmark", test))]
mod benches {
    extern crate test;

    use super::*;
    use test::Bencher;

    #[bench]
    fn bench_unescape_command_text(b: &mut Bencher) {
        b.iter(|| unescape(r#"i'm \\"\"trying to hack you\""#));
    }
}

================================================
FILE: src/channel/handle.rs
================================================
// Sonic
//
// Fast, lightweight and schema-less search backend
// Copyright: 2019, Valerian Saliou
// License: Mozilla Public License v2.0 (MPL v2.0)

use std::collections::VecDeque;
use std::io::{ErrorKind, Read, Write};
use std::net::TcpStream;
use std::result::Result;
use std::str;
use std::time::Duration;

use super::message::{
    ChannelMessage, ChannelMessageModeControl, ChannelMessageModeIngest, ChannelMessageModeSearch,
    ChannelMessageResult,
};
use super::mode::ChannelMode;
use super::statistics::CLIENTS_CONNECTED;
use crate::APP_CONF;
use crate::LINE_FEED;

pub struct ChannelHandle;

/// Reasons for which a channel gets ended before it reaches the started state.
enum ChannelHandleError {
    Closed,
    InvalidMode,
    AuthenticationRequired,
    AuthenticationFailed,
    NotRecognized,
    TimedOut,
    ConnectionAborted,
    Interrupted,
    Unknown,
}

/// Keeps the connected clients counter accurate for the lifetime of a client.
///
/// The counter is incremented when the guard is registered, and decremented when the guard is \
/// dropped. As drops also run when the client thread unwinds from a panic (eg. on a read \
/// error, a timeout or a buffer overflow), the counter cannot drift upward anymore.
struct ClientConnectedGuard;

const LINE_END_GAP: usize = 1;
const BUFFER_SIZE: usize = 20000;
const MAX_LINE_SIZE: usize = BUFFER_SIZE + LINE_END_GAP + 1;
const TCP_TIMEOUT_NON_ESTABLISHED: u64 = 10;
const PROTOCOL_REVISION: u8 = 1;
const BUFFER_LINE_SEPARATOR: u8 = b'\n';

lazy_static! {
    static ref CONNECTED_BANNER: String = format!(
        "CONNECTED <{} v{}>",
        env!("CARGO_PKG_NAME"),
        env!("CARGO_PKG_VERSION")
    );
}

impl ChannelHandleError {
    /// Returns the reason code sent to the client in the `ENDED` message.
    pub fn to_str(&self) -> &'static str {
        match *self {
            ChannelHandleError::Closed => "closed",
            ChannelHandleError::InvalidMode => "invalid_mode",
            ChannelHandleError::AuthenticationRequired => "authentication_required",
            ChannelHandleError::AuthenticationFailed => "authentication_failed",
            ChannelHandleError::NotRecognized => "not_recognized",
            ChannelHandleError::TimedOut => "timed_out",
            ChannelHandleError::ConnectionAborted => "connection_aborted",
            ChannelHandleError::Interrupted => "interrupted",
            ChannelHandleError::Unknown => "unknown",
        }
    }
}

impl ClientConnectedGuard {
    /// Increments the connected clients count, and returns the guard that will decrement it.
    fn register() -> ClientConnectedGuard {
        *CLIENTS_CONNECTED.write().unwrap() += 1;

        ClientConnectedGuard
    }
}

impl Drop for ClientConnectedGuard {
    fn drop(&mut self) {
        // Notice: this may run while the thread is unwinding, thus never panic from there \
        //   (a poisoned lock is recovered, and the counter cannot underflow).
        let mut clients_connected = CLIENTS_CONNECTED
            .write()
            .unwrap_or_else(|poisoned| poisoned.into_inner());

        *clients_connected = clients_connected.saturating_sub(1);
    }
}

impl ChannelHandle {
    /// Serves a freshly-accepted client connection, until it gets closed.
    ///
    /// Sends the connected banner, waits for a valid `START` command, then processes incoming \
    /// messages for the negotiated mode. If the start negotiation fails, an `ENDED` message \
    /// with the reason is sent instead.
    pub fn client(mut stream: TcpStream) {
        // Configure stream (non-established)
        ChannelHandle::configure_stream(&stream, false);

        // Send connected banner
        write!(stream, "{}{}", *CONNECTED_BANNER, LINE_FEED).expect("write failed");

        // Increment connected clients count (decremented whenever this function is left, \
        //   including when the stream handler panics)
        let _connected_guard = ClientConnectedGuard::register();

        // Ensure channel mode is set
        match Self::ensure_start(&stream) {
            Ok(mode) => {
                // Configure stream (established)
                ChannelHandle::configure_stream(&stream, true);

                // Send started acknowledgement (with environment variables)
                write!(
                    stream,
                    "STARTED {} protocol({}) buffer({}){}",
                    mode.to_str(),
                    PROTOCOL_REVISION,
                    BUFFER_SIZE,
                    LINE_FEED
                )
                .expect("write failed");

                Self::handle_stream(mode, stream);
            }
            Err(err) => {
                write!(stream, "ENDED {}{}", err.to_str(), LINE_FEED).expect("write failed");
            }
        }
    }

    /// Applies no-delay and read/write timeouts on the stream.
    ///
    /// Established channels use the configured TCP timeout, while channels that did not start \
    /// yet use a shorter fixed timeout.
    fn configure_stream(stream: &TcpStream, is_established: bool) {
        let tcp_timeout = if is_established {
            APP_CONF.channel.tcp_timeout
        } else {
            TCP_TIMEOUT_NON_ESTABLISHED
        };

        assert!(stream.set_nodelay(true).is_ok());

        assert!(stream
            .set_read_timeout(Some(Duration::new(tcp_timeout, 0)))
            .is_ok());
        assert!(stream
            .set_write_timeout(Some(Duration::new(tcp_timeout, 0)))
            .is_ok());
    }

    /// Reads the stream and dispatches each full line as a message, until the channel closes.
    ///
    /// Panics (terminating the client thread) on read errors, and when the client sends more \
    /// pending data than a single line is allowed to hold.
    fn handle_stream(mode: ChannelMode, mut stream: TcpStream) {
        // Initialize packet buffer
        let mut buffer: VecDeque<u8> = VecDeque::with_capacity(MAX_LINE_SIZE);

        // Initialize read chunk (re-used across reads, as only the read bytes are consumed)
        let mut read = [0; MAX_LINE_SIZE];

        // Wait for incoming messages
        'handler: loop {
            match stream.read(&mut read) {
                Ok(n) => {
                    // Should close?
                    if n == 0 {
                        break;
                    }

                    // Buffer overflow?
                    let buffer_len = n + buffer.len();

                    if buffer_len > MAX_LINE_SIZE {
                        // Do not continue, as there is too much pending data in the buffer. \
                        //   Most likely the client does not implement a proper back-pressure \
                        //   management system, thus we terminate it.
                        error!("closing channel thread because of buffer overflow");

                        panic!("buffer overflow ({}/{} bytes)", buffer_len, MAX_LINE_SIZE);
                    }

                    // Add chunk to buffer
                    buffer.extend(&read[0..n]);

                    // Handle full lines from buffer (keep the last incomplete line in buffer)
                    let mut processed_line = Vec::with_capacity(MAX_LINE_SIZE);

                    while let Some(byte) = buffer.pop_front() {
                        // Commit line and start a new one?
                        if byte == BUFFER_LINE_SEPARATOR {
                            if Self::on_message(&mode, &stream, &processed_line)
                                == ChannelMessageResult::Close
                            {
                                // Should close?
                                break 'handler;
                            }

                            // Important: clear the contents of the line, as it has just been \
                            //   processed.
                            processed_line.clear();
                        } else {
                            // Append current byte to processed line
                            processed_line.push(byte);
                        }
                    }

                    // Incomplete line remaining? Put it back in buffer.
                    if !processed_line.is_empty() {
                        buffer.extend(processed_line);
                    }
                }
                Err(err) => {
                    error!("closing channel thread with traceback: {}", err);

                    panic!("closing channel");
                }
            }
        }
    }

    /// Waits for the `START` command, and returns the channel mode that was requested.
    ///
    /// The expected payload is `START <mode> [<password>]`. The password is required and \
    /// checked only if one is set in the configuration.
    fn ensure_start(mut stream: &TcpStream) -> Result<ChannelMode, ChannelHandleError> {
        let mut read = [0; MAX_LINE_SIZE];

        let n = stream.read(&mut read).map_err(|err| match err.kind() {
            // Notice: a read timeout is reported as 'WouldBlock' on Unix platforms, while \
            //   others (eg. Windows) may report 'TimedOut'.
            ErrorKind::WouldBlock | ErrorKind::TimedOut => ChannelHandleError::TimedOut,
            ErrorKind::ConnectionAborted => ChannelHandleError::ConnectionAborted,
            ErrorKind::Interrupted => ChannelHandleError::Interrupted,
            _ => ChannelHandleError::Unknown,
        })?;

        if n == 0 {
            return Err(ChannelHandleError::Closed);
        }

        let mut parts = str::from_utf8(&read[0..n]).unwrap_or("").split_whitespace();

        if parts.next().unwrap_or("").to_uppercase().as_str() != "START" {
            return Err(ChannelHandleError::NotRecognized);
        }

        // Extract mode
        let res_mode = match parts.next() {
            Some(res_mode) => res_mode,
            None => return Err(ChannelHandleError::InvalidMode),
        };

        debug!("got mode response: {}", res_mode);

        let mode = ChannelMode::from_str(res_mode).map_err(|_| ChannelHandleError::InvalidMode)?;

        // Check if authenticated?
        if let Some(ref auth_password) = APP_CONF.channel.auth_password {
            match parts.next() {
                Some(provided_auth) => {
                    // Compare provided password with configured password
                    if !Self::is_password_valid(provided_auth, auth_password) {
                        info!("password provided, but does not match");

                        return Err(ChannelHandleError::AuthenticationFailed);
                    }
                }
                None => {
                    info!("no password provided, but one required");

                    // No password was provided, but we require one
                    return Err(ChannelHandleError::AuthenticationRequired);
                }
            }
        }

        Ok(mode)
    }

    /// Checks the provided password against the expected one.
    ///
    /// Every byte of the expected password is always visited, rather than returning on the \
    /// first mismatch, as not to leak how many leading characters were guessed right.
    fn is_password_valid(provided: &str, expected: &str) -> bool {
        let (provided_bytes, expected_bytes) = (provided.as_bytes(), expected.as_bytes());

        let mut difference = provided_bytes.len() ^ expected_bytes.len();

        for (index, expected_byte) in expected_bytes.iter().enumerate() {
            let provided_byte = provided_bytes.get(index).copied().unwrap_or(0);

            difference |= usize::from(provided_byte ^ *expected_byte);
        }

        difference == 0
    }

    /// Routes a received line to the message handler of the channel mode.
    fn on_message(
        mode: &ChannelMode,
        stream: &TcpStream,
        message_slice: &[u8],
    ) -> ChannelMessageResult {
        match mode {
            ChannelMode::Search => {
                ChannelMessage::on::<ChannelMessageModeSearch>(stream, message_slice)
            }
            ChannelMode::Ingest => {
                ChannelMessage::on::<ChannelMessageModeIngest>(stream, message_slice)
            }
            ChannelMode::Control => {
                ChannelMessage::on::<ChannelMessageModeControl>(stream, message_slice)
            }
        }
    }
}

================================================
FILE:
src/channel/listen.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) use std::net::TcpListener; use std::process; use std::sync::RwLock; use std::thread; use super::handle::ChannelHandle; use crate::{APP_CONF, THREAD_NAME_CHANNEL_CLIENT}; pub struct ChannelListenBuilder; pub struct ChannelListen; lazy_static! { pub static ref CHANNEL_AVAILABLE: RwLock = RwLock::new(true); } impl ChannelListenBuilder { pub fn build() -> ChannelListen { ChannelListen {} } } impl ChannelListen { pub fn run(&self) { match TcpListener::bind(APP_CONF.channel.inet) { Ok(listener) => { info!("listening on tcp://{}", APP_CONF.channel.inet); for stream in listener.incoming() { match stream { Ok(stream) => { thread::Builder::new() .name(THREAD_NAME_CHANNEL_CLIENT.to_string()) .spawn(move || { if let Ok(peer_addr) = stream.peer_addr() { debug!("channel client connecting: {}", peer_addr); } // Create client ChannelHandle::client(stream); }) .ok(); } Err(err) => { warn!("error handling stream: {}", err); } } } } Err(err) => { error!("error binding channel listener: {}", err); // Exit Sonic process::exit(1); } } } pub fn teardown() { // Channel cannot be used anymore *CHANNEL_AVAILABLE.write().unwrap() = false; } } ================================================ FILE: src/channel/macros.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) #[macro_export] macro_rules! 
gen_channel_message_mode_handle { ($message:ident, $commands:ident, { $($external:expr => $internal:expr),+, }) => {{ let (command, parts) = ChannelMessage::extract($message); if command.is_empty() == true || $commands.contains(&command.as_str()) == true { match command.as_str() { "" => Ok(vec![ChannelCommandResponse::Void]), $( $external => $internal(parts), )+ "PING" => ChannelCommandBase::dispatch_ping(parts), "QUIT" => ChannelCommandBase::dispatch_quit(parts), _ => Ok(vec![ChannelCommandResponse::Err( ChannelCommandError::InternalError, )]), } } else { Ok(vec![ChannelCommandResponse::Err( ChannelCommandError::UnknownCommand, )]) } }}; } ================================================ FILE: src/channel/message.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) use std::io::Write; use std::net::TcpStream; use std::str::{self, SplitWhitespace}; use std::time::Instant; use super::command::{ ChannelCommandBase, ChannelCommandControl, ChannelCommandError, ChannelCommandIngest, ChannelCommandResponse, ChannelCommandResponseArgs, ChannelCommandSearch, COMMANDS_MODE_CONTROL, COMMANDS_MODE_INGEST, COMMANDS_MODE_SEARCH, }; use super::listen::CHANNEL_AVAILABLE; use super::statistics::{COMMANDS_TOTAL, COMMAND_LATENCY_BEST, COMMAND_LATENCY_WORST}; use crate::LINE_FEED; pub struct ChannelMessage; pub struct ChannelMessageModeSearch; pub struct ChannelMessageModeIngest; pub struct ChannelMessageModeControl; const COMMAND_ELAPSED_MILLIS_SLOW_WARN: u128 = 50; #[derive(PartialEq)] pub enum ChannelMessageResult { Continue, Close, } pub trait ChannelMessageMode { fn handle(message: &str) -> Result, ChannelCommandError>; } impl ChannelMessage { pub fn on( mut stream: &TcpStream, message_slice: &[u8], ) -> ChannelMessageResult { let message = str::from_utf8(message_slice).unwrap_or(""); debug!("got channel message: {}", message); let 
command_start = Instant::now(); let mut result = ChannelMessageResult::Continue; // Process response for issued command let response_args_groups: Vec; if !(*CHANNEL_AVAILABLE.read().unwrap()) { // Server going down, reject command response_args_groups = vec![ChannelCommandResponse::Err(ChannelCommandError::ShuttingDown).to_args()]; } else { // Handle response arguments to issued command response_args_groups = match M::handle(message) { Ok(resp_groups) => resp_groups .iter() .map(|resp| match resp { ChannelCommandResponse::Ok | ChannelCommandResponse::Pong | ChannelCommandResponse::Pending(_) | ChannelCommandResponse::Result(_) | ChannelCommandResponse::Event(_, _, _) | ChannelCommandResponse::Void | ChannelCommandResponse::Err(_) => resp.to_args(), ChannelCommandResponse::Ended(_) => { result = ChannelMessageResult::Close; resp.to_args() } }) .collect(), Err(reason) => vec![ChannelCommandResponse::Err(reason).to_args()], }; } // Serve response messages on socket for response_args in response_args_groups { if !response_args.0.is_empty() { if let Some(ref values) = response_args.1 { let values_string = values.join(" "); write!(stream, "{} {}{}", response_args.0, values_string, LINE_FEED) .expect("write failed"); debug!( "wrote response with values: {} ({})", response_args.0, values_string ); } else { write!(stream, "{}{}", response_args.0, LINE_FEED).expect("write failed"); debug!("wrote response with no values: {}", response_args.0); } } } // Measure and log time it took to execute command // Notice: this is critical as to raise developer awareness on the performance bits when \ // altering commands-related code, or when making changes to underlying store executors. 
let command_took = command_start.elapsed(); if command_took.as_millis() >= COMMAND_ELAPSED_MILLIS_SLOW_WARN { warn!( "took a lot of time: {}ms to process channel message", command_took.as_millis(), ); } else { info!( "took {}ms/{}us/{}ns to process channel message", command_took.as_millis(), command_took.as_micros(), command_took.as_nanos(), ); } // Update command statistics { // Update performance measures // Notice: commands that take 0ms are not accounted for there (ie. those are usually \ // commands that do no work or I/O; they would make statistics less accurate) // Important: acquire write locks instead of read + write locks, as to prevent \ // deadlocks (explained here: https://github.com/valeriansaliou/sonic/pull/211) let command_took_millis = command_took.as_millis() as u32; { let mut worst = COMMAND_LATENCY_WORST.write().unwrap(); if command_took_millis > *worst { *worst = command_took_millis; } } { let mut best = COMMAND_LATENCY_BEST.write().unwrap(); if command_took_millis > 0 && (*best == 0 || command_took_millis < *best) { *best = command_took_millis; } } // Increment total commands *COMMANDS_TOTAL.write().unwrap() += 1; } result } fn extract(message: &str) -> (String, SplitWhitespace<'_>) { // Extract command name and arguments let mut parts = message.split_whitespace(); let command = parts.next().unwrap_or("").to_uppercase(); debug!("will dispatch search command: {}", command); (command, parts) } } impl ChannelMessageMode for ChannelMessageModeSearch { fn handle(message: &str) -> Result, ChannelCommandError> { gen_channel_message_mode_handle!(message, COMMANDS_MODE_SEARCH, { "QUERY" => ChannelCommandSearch::dispatch_query, "SUGGEST" => ChannelCommandSearch::dispatch_suggest, "LIST" => ChannelCommandSearch::dispatch_list, "HELP" => ChannelCommandSearch::dispatch_help, }) } } impl ChannelMessageMode for ChannelMessageModeIngest { fn handle(message: &str) -> Result, ChannelCommandError> { gen_channel_message_mode_handle!(message, COMMANDS_MODE_INGEST, 
{ "PUSH" => ChannelCommandIngest::dispatch_push, "POP" => ChannelCommandIngest::dispatch_pop, "COUNT" => ChannelCommandIngest::dispatch_count, "FLUSHC" => ChannelCommandIngest::dispatch_flushc, "FLUSHB" => ChannelCommandIngest::dispatch_flushb, "FLUSHO" => ChannelCommandIngest::dispatch_flusho, "HELP" => ChannelCommandIngest::dispatch_help, }) } } impl ChannelMessageMode for ChannelMessageModeControl { fn handle(message: &str) -> Result, ChannelCommandError> { gen_channel_message_mode_handle!(message, COMMANDS_MODE_CONTROL, { "TRIGGER" => ChannelCommandControl::dispatch_trigger, "INFO" => ChannelCommandControl::dispatch_info, "HELP" => ChannelCommandControl::dispatch_help, }) } } ================================================ FILE: src/channel/mod.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) #[macro_use] mod macros; mod command; mod format; mod handle; mod message; mod mode; pub mod listen; pub mod statistics; ================================================ FILE: src/channel/mode.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub enum ChannelMode { Search, Ingest, Control, } impl ChannelMode { pub fn from_str(value: &str) -> Result { match value { "search" => Ok(ChannelMode::Search), "ingest" => Ok(ChannelMode::Ingest), "control" => Ok(ChannelMode::Control), _ => Err(()), } } pub fn to_str(&self) -> &'static str { match *self { ChannelMode::Search => "search", ChannelMode::Ingest => "ingest", ChannelMode::Control => "control", } } } ================================================ FILE: src/channel/statistics.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian 
Saliou // License: Mozilla Public License v2.0 (MPL v2.0) use std::ops::Deref; use std::sync::RwLock; use std::time::Instant; use crate::store::fst::StoreFSTPool; use crate::store::kv::StoreKVPool; lazy_static! { static ref START_TIME: Instant = Instant::now(); pub static ref CLIENTS_CONNECTED: RwLock = RwLock::new(0); pub static ref COMMANDS_TOTAL: RwLock = RwLock::new(0); pub static ref COMMAND_LATENCY_BEST: RwLock = RwLock::new(0); pub static ref COMMAND_LATENCY_WORST: RwLock = RwLock::new(0); } #[derive(Default)] pub struct ChannelStatistics { pub uptime: u64, pub clients_connected: u32, pub commands_total: u64, pub command_latency_best: u32, pub command_latency_worst: u32, pub kv_open_count: usize, pub fst_open_count: usize, pub fst_consolidate_count: usize, } pub fn ensure_states() { // Ensure all statics are initialized (a `deref` is enough to lazily initialize them) let (_, _, _, _, _) = ( START_TIME.deref(), CLIENTS_CONNECTED.deref(), COMMANDS_TOTAL.deref(), COMMAND_LATENCY_BEST.deref(), COMMAND_LATENCY_WORST.deref(), ); } impl ChannelStatistics { pub fn gather() -> ChannelStatistics { let (kv_count, fst_count) = (StoreKVPool::count(), StoreFSTPool::count()); ChannelStatistics { uptime: START_TIME.elapsed().as_secs(), clients_connected: *CLIENTS_CONNECTED.read().unwrap(), commands_total: *COMMANDS_TOTAL.read().unwrap(), command_latency_best: *COMMAND_LATENCY_BEST.read().unwrap(), command_latency_worst: *COMMAND_LATENCY_WORST.read().unwrap(), kv_open_count: kv_count, fst_open_count: fst_count.0, fst_consolidate_count: fst_count.1, } } } ================================================ FILE: src/config/defaults.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) use std::net::SocketAddr; use std::path::PathBuf; pub fn server_log_level() -> String { "error".to_string() } pub fn channel_inet() -> SocketAddr { 
"[::1]:1491".parse().unwrap() } pub fn channel_tcp_timeout() -> u64 { 300 } pub fn channel_search_query_limit_default() -> u16 { 10 } pub fn channel_search_query_limit_maximum() -> u16 { 100 } pub fn channel_search_query_alternates_try() -> usize { 4 } pub fn channel_search_suggest_limit_default() -> u16 { 5 } pub fn channel_search_suggest_limit_maximum() -> u16 { 20 } pub fn channel_search_list_limit_default() -> u16 { 100 } pub fn channel_search_list_limit_maximum() -> u16 { 500 } pub fn store_kv_path() -> PathBuf { PathBuf::from("./data/store/kv/") } pub fn store_kv_retain_word_objects() -> usize { 1000 } pub fn store_kv_pool_inactive_after() -> u64 { 1800 } pub fn store_kv_database_flush_after() -> u64 { 900 } pub fn store_kv_database_compress() -> bool { true } pub fn store_kv_database_parallelism() -> u16 { 2 } pub fn store_kv_database_max_compactions() -> u16 { 1 } pub fn store_kv_database_max_flushes() -> u16 { 1 } pub fn store_kv_database_write_buffer() -> usize { 16384 } pub fn store_kv_database_write_ahead_log() -> bool { true } pub fn store_fst_path() -> PathBuf { PathBuf::from("./data/store/fst/") } pub fn store_fst_pool_inactive_after() -> u64 { 300 } pub fn store_fst_graph_consolidate_after() -> u64 { 180 } pub fn store_fst_graph_max_size() -> usize { 2048 } pub fn store_fst_graph_max_words() -> usize { 250000 } ================================================ FILE: src/config/env_var.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) use regex::Regex; use serde::{Deserialize, Deserializer}; use std::net::SocketAddr; use std::path::PathBuf; #[derive(Deserialize, PartialEq)] struct WrappedString(String); pub fn str<'de, D>(deserializer: D) -> Result where D: Deserializer<'de>, { let value = String::deserialize(deserializer)?; match is_env_var(&value) { true => Ok(get_env_var(&value)), false => 
Ok(value), } } pub fn opt_str<'de, D>(deserializer: D) -> Result, D::Error> where D: Deserializer<'de>, { Option::::deserialize(deserializer).map(|option: Option| { option.map(|wrapped: WrappedString| { let value = wrapped.0; match is_env_var(&value) { true => get_env_var(&value), false => value, } }) }) } pub fn socket_addr<'de, D>(deserializer: D) -> Result where D: Deserializer<'de>, { let value = String::deserialize(deserializer)?; match is_env_var(&value) { true => Ok(get_env_var(&value).parse().unwrap()), false => Ok(value.parse().unwrap()), } } pub fn path_buf<'de, D>(deserializer: D) -> Result where D: Deserializer<'de>, { let value = String::deserialize(deserializer)?; match is_env_var(&value) { true => Ok(PathBuf::from(get_env_var(&value))), false => Ok(PathBuf::from(value)), } } fn is_env_var(value: &str) -> bool { Regex::new(r"^\$\{env\.\w+\}$") .expect("env_var: regex is invalid") .is_match(value) } fn get_env_var(wrapped_key: &str) -> String { let key: String = String::from(wrapped_key) .drain(6..(wrapped_key.len() - 1)) .collect(); std::env::var(key.clone()).unwrap_or_else(|_| panic!("env_var: variable '{}' is not set", key)) } #[cfg(test)] mod tests { use super::*; #[test] fn it_checks_environment_variable_patterns() { assert!(is_env_var("${env.XXX}")); assert!(!is_env_var("${env.XXX")); assert!(!is_env_var("${env.XXX}a")); assert!(!is_env_var("a${env.XXX}")); assert!(!is_env_var("{env.XXX}")); assert!(!is_env_var("$env.XXX}")); assert!(!is_env_var("${envXXX}")); assert!(!is_env_var("${.XXX}")); assert!(!is_env_var("${XXX}")); } #[test] fn it_gets_environment_variable() { std::env::set_var("TEST", "test"); assert_eq!(get_env_var("${env.TEST}"), "test"); std::env::remove_var("TEST"); } } ================================================ FILE: src/config/logger.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 
(MPL v2.0) use log::{Level, LevelFilter, Metadata, Record, SetLoggerError}; pub struct ConfigLogger; impl log::Log for ConfigLogger { fn enabled(&self, metadata: &Metadata) -> bool { metadata.level() <= Level::Debug } fn log(&self, record: &Record) { if self.enabled(record.metadata()) { println!("({}) - {}", record.level(), record.args()); } } fn flush(&self) {} } impl ConfigLogger { pub fn init(level: LevelFilter) -> Result<(), SetLoggerError> { log::set_max_level(level); log::set_logger(&ConfigLogger) } } ================================================ FILE: src/config/mod.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) mod defaults; mod env_var; pub mod logger; pub mod options; pub mod reader; ================================================ FILE: src/config/options.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) use std::net::SocketAddr; use std::path::PathBuf; use super::defaults; use super::env_var; #[derive(Deserialize)] pub struct Config { pub server: ConfigServer, pub channel: ConfigChannel, pub store: ConfigStore, } #[derive(Deserialize)] pub struct ConfigServer { #[serde( default = "defaults::server_log_level", deserialize_with = "env_var::str" )] pub log_level: String, } #[derive(Deserialize)] pub struct ConfigChannel { #[serde( default = "defaults::channel_inet", deserialize_with = "env_var::socket_addr" )] pub inet: SocketAddr, #[serde(default = "defaults::channel_tcp_timeout")] pub tcp_timeout: u64, #[serde(default, deserialize_with = "env_var::opt_str")] pub auth_password: Option, pub search: ConfigChannelSearch, } #[derive(Deserialize)] pub struct ConfigChannelSearch { #[serde(default = "defaults::channel_search_query_limit_default")] pub 
query_limit_default: u16, #[serde(default = "defaults::channel_search_query_limit_maximum")] pub query_limit_maximum: u16, #[serde(default = "defaults::channel_search_query_alternates_try")] pub query_alternates_try: usize, #[serde(default = "defaults::channel_search_suggest_limit_default")] pub suggest_limit_default: u16, #[serde(default = "defaults::channel_search_suggest_limit_maximum")] pub suggest_limit_maximum: u16, #[serde(default = "defaults::channel_search_list_limit_default")] pub list_limit_default: u16, #[serde(default = "defaults::channel_search_list_limit_maximum")] pub list_limit_maximum: u16, } #[derive(Deserialize)] pub struct ConfigStore { pub kv: ConfigStoreKV, pub fst: ConfigStoreFST, } #[derive(Deserialize)] pub struct ConfigStoreKV { #[serde( default = "defaults::store_kv_path", deserialize_with = "env_var::path_buf" )] pub path: PathBuf, #[serde(default = "defaults::store_kv_retain_word_objects")] pub retain_word_objects: usize, pub pool: ConfigStoreKVPool, pub database: ConfigStoreKVDatabase, } #[derive(Deserialize)] pub struct ConfigStoreKVPool { #[serde(default = "defaults::store_kv_pool_inactive_after")] pub inactive_after: u64, } #[derive(Deserialize)] pub struct ConfigStoreKVDatabase { #[serde(default = "defaults::store_kv_database_flush_after")] pub flush_after: u64, #[serde(default = "defaults::store_kv_database_compress")] pub compress: bool, #[serde(default = "defaults::store_kv_database_parallelism")] pub parallelism: u16, pub max_files: Option, #[serde(default = "defaults::store_kv_database_max_compactions")] pub max_compactions: u16, #[serde(default = "defaults::store_kv_database_max_flushes")] pub max_flushes: u16, #[serde(default = "defaults::store_kv_database_write_buffer")] pub write_buffer: usize, #[serde(default = "defaults::store_kv_database_write_ahead_log")] pub write_ahead_log: bool, } #[derive(Deserialize)] pub struct ConfigStoreFST { #[serde( default = "defaults::store_fst_path", deserialize_with = "env_var::path_buf" 
)] pub path: PathBuf, pub pool: ConfigStoreFSTPool, pub graph: ConfigStoreFSTGraph, }

/// `pool` section of the FST store configuration.
#[derive(Deserialize)]
pub struct ConfigStoreFSTPool {
    #[serde(default = "defaults::store_fst_pool_inactive_after")]
    pub inactive_after: u64,
}

/// `graph` section of the FST store configuration.
#[derive(Deserialize)]
pub struct ConfigStoreFSTGraph {
    #[serde(default = "defaults::store_fst_graph_consolidate_after")]
    pub consolidate_after: u64,

    #[serde(default = "defaults::store_fst_graph_max_size")]
    pub max_size: usize,

    #[serde(default = "defaults::store_fst_graph_max_words")]
    pub max_words: usize,
}

================================================
FILE: src/config/reader.rs
================================================
// Sonic
//
// Fast, lightweight and schema-less search backend
// Copyright: 2019, Valerian Saliou
// License: Mozilla Public License v2.0 (MPL v2.0)

use std::fs::File;
use std::io::Read;

use super::options::Config;
use crate::APP_ARGS;

pub struct ConfigReader;

impl ConfigReader {
    /// Reads the file at `APP_ARGS.config`, parses it as TOML and validates it.
    ///
    /// # Panics
    ///
    /// Panics when the file cannot be opened or read, when it does not parse
    /// into a `Config`, or when `validate` rejects the parsed values.
    pub fn make() -> Config {
        let config_path = &APP_ARGS.config;

        debug!("reading config file: {}", config_path);

        let mut conf = String::new();

        File::open(config_path)
            .expect("cannot find config file")
            .read_to_string(&mut conf)
            .expect("cannot read config file");

        debug!("read config file: {}", config_path);

        // Parse first, then reject inconsistent values before handing it out
        let config: Config = toml::from_str(&conf).expect("syntax error in config file");

        Self::validate(&config);

        config
    }

    /// Checks cross-field constraints of a parsed configuration.
    ///
    /// # Panics
    ///
    /// Panics on the first violated constraint.
    fn validate(config: &Config) {
        let (kv, fst) = (&config.store.kv, &config.store.fst);

        // KV: 'write_buffer' cannot be zero
        if kv.database.write_buffer == 0 {
            panic!("write_buffer for kv must not be zero");
        }

        // KV: 'flush_after' has to stay below the pool 'inactive_after'
        if kv.pool.inactive_after <= kv.database.flush_after {
            panic!("flush_after for kv must be strictly lower than inactive_after");
        }

        // FST: 'consolidate_after' has to stay below the pool 'inactive_after'
        if fst.pool.inactive_after <= fst.graph.consolidate_after {
            panic!("consolidate_after for fst must be strictly lower than inactive_after");
        }
    }
}
================================================ FILE: src/executor/count.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) use crate::store::fst::StoreFSTPool; use crate::store::fst::{StoreFSTActionBuilder, StoreFSTMisc}; use crate::store::item::StoreItem; use crate::store::kv::StoreKVActionBuilder; use crate::store::kv::{StoreKVAcquireMode, StoreKVPool}; pub struct ExecutorCount; impl ExecutorCount { pub fn execute(store: StoreItem) -> Result { match store { // Count terms in (collection, bucket, object) from KV StoreItem(collection, Some(bucket), Some(object)) => { // Important: acquire database access read lock, and reference it in context. This \ // prevents the database from being erased while using it in this block. general_kv_access_lock_read!(); if let Ok(kv_store) = StoreKVPool::acquire(StoreKVAcquireMode::OpenOnly, collection) { // Important: acquire bucket store read lock executor_kv_lock_read!(kv_store); let kv_action = StoreKVActionBuilder::access(bucket, kv_store); // Try to resolve existing OID to IID let oid = object.as_str(); kv_action .get_oid_to_iid(oid) .unwrap_or(None) .map(|iid| { // List terms for IID if let Some(terms) = kv_action.get_iid_to_terms(iid).unwrap_or(None) { terms.len() as u32 } else { 0 } }) .ok_or(()) .or(Ok(0)) } else { Err(()) } } // Count terms in (collection, bucket) from FST StoreItem(collection, Some(bucket), None) => { // Important: acquire graph access read lock, and reference it in context. This \ // prevents the graph from being erased while using it in this block. 
general_fst_access_lock_read!(); if let Ok(fst_store) = StoreFSTPool::acquire(collection, bucket) { let fst_action = StoreFSTActionBuilder::access(fst_store); Ok(fst_action.count_words() as u32) } else { Err(()) } } // Count buckets in (collection) from FS StoreItem(collection, None, None) => { StoreFSTMisc::count_collection_buckets(collection).map(|count| count as u32) } _ => Err(()), } } } ================================================ FILE: src/executor/flushb.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) use crate::store::fst::StoreFSTActionBuilder; use crate::store::item::StoreItem; use crate::store::kv::{StoreKVAcquireMode, StoreKVActionBuilder, StoreKVPool}; pub struct ExecutorFlushB; impl ExecutorFlushB { pub fn execute(store: StoreItem) -> Result { if let StoreItem(collection, Some(bucket), None) = store { // Important: acquire database access read lock, and reference it in context. This \ // prevents the database from being erased while using it in this block. // Notice: acquire FST lock in write mode, as we will erase it. general_kv_access_lock_read!(); general_fst_access_lock_write!(); if let Ok(kv_store) = StoreKVPool::acquire(StoreKVAcquireMode::OpenOnly, collection) { // Important: acquire bucket store write lock executor_kv_lock_write!(kv_store); if kv_store.is_some() { // Store exists, proceed erasure. debug!( "collection store exists, erasing: {} from {}", bucket.as_str(), collection.as_str() ); let kv_action = StoreKVActionBuilder::access(bucket, kv_store); // Notice: we cannot use the provided KV bucket erasure helper there, as \ // erasing a bucket requires a database lock, which would incur a dead-lock, \ // thus we need to perform the erasure from there. 
if let Ok(erase_count) = kv_action.batch_erase_bucket() { if StoreFSTActionBuilder::erase(collection, Some(bucket)).is_ok() { debug!("done with bucket erasure"); return Ok(erase_count); } } } else { // Store does not exist, consider as already erased. debug!( "collection store does not exist, consider {} from {} already erased", bucket.as_str(), collection.as_str() ); return Ok(0); } } } Err(()) } } ================================================ FILE: src/executor/flushc.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) use crate::store::fst::StoreFSTActionBuilder; use crate::store::item::StoreItem; use crate::store::kv::StoreKVActionBuilder; pub struct ExecutorFlushC; impl ExecutorFlushC { pub fn execute(store: StoreItem) -> Result { // Important: do not acquire the store from there, as otherwise it will remain open \ // even if dropped in the inner function, as this caller would still own a reference to \ // it. if let StoreItem(collection, None, None) = store { // Acquire KV + FST locks in write mode, as we will erase them, we need to prevent any \ // other consumer to use them. 
general_kv_access_lock_write!(); general_fst_access_lock_write!(); match ( StoreKVActionBuilder::erase(collection, None), StoreFSTActionBuilder::erase(collection, None), ) { (Ok(erase_count), Ok(_)) => Ok(erase_count), _ => Err(()), } } else { Err(()) } } } ================================================ FILE: src/executor/flusho.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) use crate::store::item::StoreItem; use crate::store::kv::{StoreKVAcquireMode, StoreKVActionBuilder, StoreKVPool}; pub struct ExecutorFlushO; impl ExecutorFlushO { pub fn execute(store: StoreItem) -> Result { if let StoreItem(collection, Some(bucket), Some(object)) = store { // Important: acquire database access read lock, and reference it in context. This \ // prevents the database from being erased while using it in this block. general_kv_access_lock_read!(); if let Ok(kv_store) = StoreKVPool::acquire(StoreKVAcquireMode::OpenOnly, collection) { // Important: acquire bucket store write lock executor_kv_lock_write!(kv_store); let kv_action = StoreKVActionBuilder::access(bucket, kv_store); // Try to resolve existing OID to IID (if it does not exist, there is nothing to \ // be flushed) let oid = object.as_str(); if let Ok(iid_value) = kv_action.get_oid_to_iid(oid) { let mut count_flushed = 0; if let Some(iid) = iid_value { // Resolve terms associated to IID let iid_terms = { if let Ok(iid_terms_value) = kv_action.get_iid_to_terms(iid) { iid_terms_value.unwrap_or_default() } else { error!("failed getting flusho executor iid-to-terms"); Vec::new() } }; // Flush bucket (batch operation, as it is shared w/ other executors) if let Ok(batch_count) = kv_action.batch_flush_bucket(iid, oid, &iid_terms) { count_flushed += batch_count; } else { error!("failed executing batch-flush-bucket in flusho executor"); } } return Ok(count_flushed); } else { 
error!("failed getting flusho executor oid-to-iid"); } } } Err(()) } } ================================================ FILE: src/executor/list.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2022, Troy Kohler // License: Mozilla Public License v2.0 (MPL v2.0) use crate::query::types::{QuerySearchID, QuerySearchLimit, QuerySearchOffset}; use crate::store::fst::StoreFSTActionBuilder; use crate::store::fst::StoreFSTPool; use crate::store::item::StoreItem; pub struct ExecutorList; impl ExecutorList { pub fn execute( store: StoreItem, _event_id: QuerySearchID, limit: QuerySearchLimit, offset: QuerySearchOffset, ) -> Result, ()> { if let StoreItem(collection, Some(bucket), None) = store { // Important: acquire graph access read lock, and reference it in context. This \ // prevents the graph from being erased while using it in this block. general_fst_access_lock_read!(); if let Ok(fst_store) = StoreFSTPool::acquire(collection, bucket) { let fst_action = StoreFSTActionBuilder::access(fst_store); debug!("running list"); return fst_action.list_words(limit as usize, offset as usize); } } Err(()) } } ================================================ FILE: src/executor/macros.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) #[macro_export] macro_rules! executor_ensure_op { ($operation:expr) => { match $operation { Ok(_) => {} Err(err) => error!("executor operation failed: {:?}", err), } }; } #[macro_export] macro_rules! executor_kv_lock_read { ($store:ident) => { let kv_store_reference = $store.clone(); let _kv_store_lock = kv_store_reference .as_ref() .map(|inner| inner.lock.read().unwrap()); }; } #[macro_export] macro_rules! 
executor_kv_lock_write { ($store:ident) => { let kv_store_reference = $store.clone(); let _kv_store_lock = kv_store_reference .as_ref() .map(|inner| inner.lock.write().unwrap()); }; } #[macro_export] macro_rules! general_kv_access_lock_read { () => { use crate::store::kv::STORE_ACCESS_LOCK; let _kv_access = STORE_ACCESS_LOCK.read().unwrap(); }; } #[macro_export] macro_rules! general_kv_access_lock_write { () => { use crate::store::kv::STORE_ACCESS_LOCK; let _kv_access = STORE_ACCESS_LOCK.write().unwrap(); }; } #[macro_export] macro_rules! general_fst_access_lock_read { () => { use crate::store::fst::GRAPH_ACCESS_LOCK; let _fst_access = GRAPH_ACCESS_LOCK.read().unwrap(); }; } #[macro_export] macro_rules! general_fst_access_lock_write { () => { use crate::store::fst::GRAPH_ACCESS_LOCK; let _fst_access = GRAPH_ACCESS_LOCK.write().unwrap(); }; } ================================================ FILE: src/executor/mod.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) #[macro_use] mod macros; pub mod count; pub mod flushb; pub mod flushc; pub mod flusho; pub mod list; pub mod pop; pub mod push; pub mod search; pub mod suggest; ================================================ FILE: src/executor/pop.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) use linked_hash_set::LinkedHashSet; use std::iter::FromIterator; use crate::lexer::token::TokenLexer; use crate::store::fst::{StoreFSTActionBuilder, StoreFSTPool}; use crate::store::identifiers::StoreTermHashed; use crate::store::item::StoreItem; use crate::store::kv::{StoreKVAcquireMode, StoreKVActionBuilder, StoreKVPool}; pub struct ExecutorPop; impl ExecutorPop { pub fn execute<'a>(store: StoreItem<'a>, lexer: TokenLexer<'a>) 
-> Result { if let StoreItem(collection, Some(bucket), Some(object)) = store { // Important: acquire database access read lock, and reference it in context. This \ // prevents the database from being erased while using it in this block. general_kv_access_lock_read!(); general_fst_access_lock_read!(); if let (Ok(kv_store), Ok(fst_store)) = ( StoreKVPool::acquire(StoreKVAcquireMode::OpenOnly, collection), StoreFSTPool::acquire(collection, bucket), ) { // Important: acquire bucket store write lock executor_kv_lock_write!(kv_store); let (kv_action, fst_action) = ( StoreKVActionBuilder::access(bucket, kv_store), StoreFSTActionBuilder::access(fst_store), ); // Try to resolve existing OID to IID (if it does not exist, there is nothing to \ // be flushed) let oid = object.as_str(); if let Ok(iid_value) = kv_action.get_oid_to_iid(oid) { let mut count_popped = 0; if let Some(iid) = iid_value { // Try to resolve existing search terms from IID, and perform an algebraic \ // AND on all popped terms to generate a list of terms to be cleaned up. 
if let Ok(Some(iid_terms_hashed_vec)) = kv_action.get_iid_to_terms(iid) { info!( "got pop executor stored iid-to-terms: {:?}", iid_terms_hashed_vec ); let pop_terms: Vec<(String, StoreTermHashed)> = lexer.collect(); let iid_terms_hashed: LinkedHashSet = LinkedHashSet::from_iter(iid_terms_hashed_vec.iter().copied()); let remaining_terms: LinkedHashSet = iid_terms_hashed .difference(&LinkedHashSet::from_iter( pop_terms.iter().map(|item| item.1), )) .copied() .collect(); debug!( "got pop executor terms remaining terms: {:?} for iid: {}", remaining_terms, iid ); count_popped = (iid_terms_hashed.len() - remaining_terms.len()) as u32; if count_popped > 0 { if remaining_terms.is_empty() { info!("nuke whole bucket for pop executor"); // Flush bucket (batch operation, as it is shared w/ other \ // executors) executor_ensure_op!(kv_action.batch_flush_bucket( iid, oid, &iid_terms_hashed_vec )); } else { info!("nuke only certain terms for pop executor"); // Nuke IID in Term-to-IIDs list for (pop_term, pop_term_hashed) in &pop_terms { // Check that term is linked to IID (and should be removed) if iid_terms_hashed.contains(pop_term_hashed) { if let Ok(Some(mut pop_term_iids)) = kv_action.get_term_to_iids(*pop_term_hashed) { // Remove IID from list of IIDs to be popped pop_term_iids.retain(|cur_iid| cur_iid != &iid); if pop_term_iids.is_empty() { // IIDs list was empty, delete whole key executor_ensure_op!(kv_action .delete_term_to_iids(*pop_term_hashed)); // Pop from FST graph (does not exist anymore) if fst_action.pop_word(pop_term) { debug!( "pop term hash nuked from graph: {}", pop_term_hashed ); } } else { // Re-build IIDs list w/o current IID executor_ensure_op!(kv_action .set_term_to_iids( *pop_term_hashed, &pop_term_iids, )); } } else { error!( "failed getting term-to-iids in pop executor" ); } } } // Bump IID-to-Terms list let remaining_terms_vec: Vec = Vec::from_iter(remaining_terms.into_iter()); executor_ensure_op!( kv_action.set_iid_to_terms(iid, &remaining_terms_vec) 
); } } } else { error!("failed getting iid-to-terms in pop executor"); } } return Ok(count_popped); } } } Err(()) } } ================================================ FILE: src/executor/push.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) use linked_hash_set::LinkedHashSet; use std::iter::FromIterator; use crate::lexer::token::TokenLexer; use crate::store::fst::{StoreFSTActionBuilder, StoreFSTPool}; use crate::store::identifiers::{StoreMetaKey, StoreMetaValue, StoreTermHashed}; use crate::store::item::StoreItem; use crate::store::kv::{StoreKVAcquireMode, StoreKVActionBuilder, StoreKVPool}; use crate::APP_CONF; pub struct ExecutorPush; impl ExecutorPush { pub fn execute<'a>(store: StoreItem<'a>, lexer: TokenLexer<'a>) -> Result<(), ()> { if let StoreItem(collection, Some(bucket), Some(object)) = store { // Important: acquire database access read lock, and reference it in context. This \ // prevents the database from being erased while using it in this block. 
general_kv_access_lock_read!(); general_fst_access_lock_read!(); if let (Ok(kv_store), Ok(fst_store)) = ( StoreKVPool::acquire(StoreKVAcquireMode::Any, collection), StoreFSTPool::acquire(collection, bucket), ) { // Important: acquire bucket store write lock executor_kv_lock_write!(kv_store); let (kv_action, fst_action) = ( StoreKVActionBuilder::access(bucket, kv_store), StoreFSTActionBuilder::access(fst_store), ); // Try to resolve existing OID to IID, otherwise initialize IID (store the \ // bi-directional relationship) let oid = object.as_str(); let iid = kv_action.get_oid_to_iid(oid).unwrap_or(None).or_else(|| { info!("must initialize push executor oid-to-iid and iid-to-oid"); if let Ok(iid_incr) = kv_action.get_meta_to_value(StoreMetaKey::IIDIncr) { let iid_incr = if let Some(iid_incr) = iid_incr { match iid_incr { StoreMetaValue::IIDIncr(iid_incr) => iid_incr + 1, } } else { 0 }; // Bump last stored increment if kv_action .set_meta_to_value( StoreMetaKey::IIDIncr, StoreMetaValue::IIDIncr(iid_incr), ) .is_ok() { // Associate OID <> IID (bidirectional) executor_ensure_op!(kv_action.set_oid_to_iid(oid, iid_incr)); executor_ensure_op!(kv_action.set_iid_to_oid(iid_incr, oid)); Some(iid_incr) } else { error!("failed updating push executor meta-to-value iid increment"); None } } else { error!("failed getting push executor meta-to-value iid increment"); None } }); if let Some(iid) = iid { let mut has_commits = false; // Acquire list of terms for IID let mut iid_terms_hashed: LinkedHashSet = LinkedHashSet::from_iter( kv_action .get_iid_to_terms(iid) .unwrap_or(None) .unwrap_or_default(), ); info!( "got push executor stored iid-to-terms: {:?}", iid_terms_hashed ); for (term, term_hashed) in lexer { // Check that term is not already linked to IID if !iid_terms_hashed.contains(&term_hashed) { if let Ok(term_iids) = kv_action.get_term_to_iids(term_hashed) { has_commits = true; // Add IID in first position in list for terms let mut term_iids = term_iids.unwrap_or_default(); 
// Remove IID from list of IIDs to be popped before inserting in \ // first position? if term_iids.contains(&iid) { term_iids.retain(|cur_iid| cur_iid != &iid); } info!("has push executor term-to-iids: {}", iid); term_iids.insert(0, iid); // Truncate IIDs linked to term? (ie. storage is too long) let truncate_limit = APP_CONF.store.kv.retain_word_objects; if term_iids.len() > truncate_limit { info!( "push executor term-to-iids object too long (limit: {})", truncate_limit ); // Drain overflowing IIDs (ie. oldest ones that overflow) let term_iids_drain = term_iids.drain(truncate_limit..); executor_ensure_op!(kv_action .batch_truncate_object(term_hashed, term_iids_drain)); } executor_ensure_op!( kv_action.set_term_to_iids(term_hashed, &term_iids) ); // Insert term into IID to terms map iid_terms_hashed.insert(term_hashed); } else { error!("failed getting push executor term-to-iids"); } } // Push to FST graph? (this consumes the term; to avoid sub-clones) if fst_action.push_word(&term) { debug!("push term committed to graph: {}", term); } } // Commit updated list of terms for IID? 
(if any commit made) if has_commits { let collected_iids: Vec = iid_terms_hashed.into_iter().collect(); info!( "has push executor iid-to-terms commits: {:?}", collected_iids ); executor_ensure_op!(kv_action.set_iid_to_terms(iid, &collected_iids)); } return Ok(()); } } } Err(()) } } ================================================ FILE: src/executor/search.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) use linked_hash_set::LinkedHashSet; use std::iter::FromIterator; use crate::lexer::token::TokenLexer; use crate::query::types::{QuerySearchID, QuerySearchLimit, QuerySearchOffset}; use crate::store::fst::{StoreFSTActionBuilder, StoreFSTPool}; use crate::store::identifiers::{StoreObjectIID, StoreTermHash}; use crate::store::item::StoreItem; use crate::store::kv::{StoreKVAcquireMode, StoreKVActionBuilder, StoreKVPool}; use crate::APP_CONF; pub struct ExecutorSearch; impl ExecutorSearch { pub fn execute<'a>( store: StoreItem<'a>, _event_id: QuerySearchID, lexer: TokenLexer<'a>, limit: QuerySearchLimit, offset: QuerySearchOffset, ) -> Result>, ()> { if let StoreItem(collection, Some(bucket), None) = store { // Important: acquire database access read lock, and reference it in context. This \ // prevents the database from being erased while using it in this block. general_kv_access_lock_read!(); general_fst_access_lock_read!(); if let (Ok(kv_store), Ok(fst_store)) = ( StoreKVPool::acquire(StoreKVAcquireMode::OpenOnly, collection), StoreFSTPool::acquire(collection, bucket), ) { // Important: acquire bucket store read lock executor_kv_lock_read!(kv_store); let (kv_action, fst_action) = ( StoreKVActionBuilder::access(bucket, kv_store), StoreFSTActionBuilder::access(fst_store), ); // Try to resolve existing search terms to IIDs, and perform an algebraic AND on \ // all resulting IIDs for each given term. 
let mut found_iids: LinkedHashSet = LinkedHashSet::new(); 'lexing: for (term, term_hashed) in lexer { let mut iids = LinkedHashSet::from_iter( kv_action .get_term_to_iids(term_hashed) .unwrap_or(None) .unwrap_or_default() .into_iter(), ); // No IIDs? Try to complete with a suggested alternate word // Notice: this may sound dirty to try generating as many results as the \ // 'retain_word_objects' value, but as we do not know if another lexed word \ // comes next we need to exhaust all search space as to intersect it with \ // the (likely) upcoming word. let (higher_limit, alternates_try) = ( APP_CONF.store.kv.retain_word_objects, APP_CONF.channel.search.query_alternates_try, ); if iids.len() < higher_limit && alternates_try > 0 { debug!( "not enough iids were found ({}/{}), completing for term: {}", iids.len(), higher_limit, term ); // Suggest N words, in case the first one is found in FST as an exact \ // match of term, we can pick next ones to complete search even further. // Notice: we add '1' to the 'alternates_try' number as to account for \ // exact match suggestion that comes as first result and is to be ignored. if let Some(suggested_words) = fst_action.suggest_words(&term, alternates_try + 1, Some(1)) { let mut iids_new_len = iids.len(); // This loop will be broken early if we get enough results at some \ // iteration 'suggestions: for suggested_word in suggested_words { // Do not load base results twice for same term as base term if suggested_word == term { continue 'suggestions; } debug!("got completed word: {} for term: {}", suggested_word, term); if let Some(suggested_iids) = kv_action .get_term_to_iids(StoreTermHash::from(&suggested_word)) .unwrap_or(None) { for suggested_iid in suggested_iids { // Do not append the same IID twice (can happen a lot \ // when completing from suggested results that point \ // to the same end-OID) if !iids.contains(&suggested_iid) { iids.insert(suggested_iid); iids_new_len += 1; // Higher limit now reached? 
Stop acquiring new \ // suggested IIDs now. if iids_new_len >= higher_limit { debug!( "got enough completed results for term: {}", term ); break 'suggestions; } } } } } debug!( "done completing results for term: {}, now {} results", term, iids_new_len ); } else { debug!("did not get any completed word for term: {}", term); } } debug!("got search executor iids: {:?} for term: {}", iids, term); // Intersect found IIDs with previous batch if found_iids.is_empty() { found_iids = iids; } else { found_iids = found_iids.intersection(&iids).copied().collect(); } debug!( "got search executor iid intersection: {:?} for term: {}", found_iids, term ); // No IID found? (stop there) if found_iids.is_empty() { info!( "stop search executor as no iid was found in common for term: {}", term ); break 'lexing; } } // Resolve OIDs from IIDs // Notice: we also proceed paging from there let (limit_usize, offset_usize) = (limit as usize, offset as usize); let mut result_oids = Vec::with_capacity(limit_usize); 'paging: for (index, found_iid) in found_iids.iter().skip(offset_usize).enumerate() { // Stop there? 
if index >= limit_usize { break 'paging; } // Read IID-to-OID for this found IID if let Ok(Some(oid)) = kv_action.get_iid_to_oid(*found_iid) { result_oids.push(oid); } else { error!("failed getting search executor iid-to-oid"); } } info!("got search executor final oids: {:?}", result_oids); return Ok(if !result_oids.is_empty() { Some(result_oids) } else { None }); } } Err(()) } } ================================================ FILE: src/executor/suggest.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) use crate::lexer::token::TokenLexer; use crate::query::types::{QuerySearchID, QuerySearchLimit}; use crate::store::fst::{StoreFSTActionBuilder, StoreFSTPool}; use crate::store::item::StoreItem; pub struct ExecutorSuggest; impl ExecutorSuggest { pub fn execute<'a>( store: StoreItem<'a>, _event_id: QuerySearchID, mut lexer: TokenLexer<'a>, limit: QuerySearchLimit, ) -> Result>, ()> { if let StoreItem(collection, Some(bucket), None) = store { // Important: acquire graph access read lock, and reference it in context. This \ // prevents the graph from being erased while using it in this block. 
general_fst_access_lock_read!(); if let Ok(fst_store) = StoreFSTPool::acquire(collection, bucket) { let fst_action = StoreFSTActionBuilder::access(fst_store); if let (Some(word), None) = (lexer.next(), lexer.next()) { debug!("running suggest on word: {}", word.0); return Ok(fst_action.suggest_words(&word.0, limit as usize, None)); } } } Err(()) } } ================================================ FILE: src/lexer/mod.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) mod stopwords; pub mod ranges; pub mod token; ================================================ FILE: src/lexer/ranges.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) use std::fmt; use whatlang::{detect_script, Script}; struct LexerRange; #[derive(PartialEq, Debug)] pub struct LexerRegexRange(&'static [(char, char)]); const RANGE_LATIN: &[(char, char)] = &[('\u{0000}', '\u{024F}')]; const RANGE_CYRILLIC: &[(char, char)] = &[('\u{0400}', '\u{052F}')]; const RANGE_ARABIC: &[(char, char)] = &[('\u{0600}', '\u{06FF}'), ('\u{0750}', '\u{077F}')]; const RANGE_ARMENIAN: &[(char, char)] = &[('\u{0530}', '\u{058F}')]; const RANGE_DEVANAGARI: &[(char, char)] = &[('\u{0900}', '\u{097F}')]; const RANGE_HIRAGANA: &[(char, char)] = &[('\u{3040}', '\u{309F}')]; const RANGE_KATAKANA: &[(char, char)] = &[('\u{30A0}', '\u{30FF}'), ('\u{31F0}', '\u{31FF}')]; const RANGE_ETHIOPIC: &[(char, char)] = &[('\u{1200}', '\u{139F}'), ('\u{2D80}', '\u{2DDF}')]; const RANGE_HEBREW: &[(char, char)] = &[('\u{0590}', '\u{05FF}')]; const RANGE_BENGALI: &[(char, char)] = &[('\u{0980}', '\u{09FF}')]; const RANGE_GEORGIAN: &[(char, char)] = &[('\u{10A0}', '\u{10FF}'), ('\u{2D00}', '\u{2D2F}')]; const RANGE_MANDARIN: &[(char, char)] = &[ 
('\u{4E00}', '\u{9FFF}'), ('\u{3400}', '\u{4DBF}'), ('\u{20000}', '\u{2A6DF}'), ('\u{2A700}', '\u{2CEAF}'),
];
const RANGE_HANGUL: &[(char, char)] = &[('\u{1100}', '\u{11FF}'), ('\u{3130}', '\u{318F}')];
const RANGE_GREEK: &[(char, char)] = &[('\u{0370}', '\u{03FF}'), ('\u{1F00}', '\u{1FFF}')];
const RANGE_KANNADA: &[(char, char)] = &[('\u{0C80}', '\u{0CFF}')];
const RANGE_TAMIL: &[(char, char)] = &[('\u{0B80}', '\u{0BFF}')];
const RANGE_THAI: &[(char, char)] = &[('\u{0E00}', '\u{0E7F}')];
const RANGE_GUJARATI: &[(char, char)] = &[('\u{0A80}', '\u{0AFF}')];
const RANGE_GURMUKHI: &[(char, char)] = &[('\u{0A00}', '\u{0A7F}')];
const RANGE_TELUGU: &[(char, char)] = &[('\u{0C00}', '\u{0C7F}')];
const RANGE_MALAYALAM: &[(char, char)] = &[('\u{0D00}', '\u{0D7F}')];
const RANGE_ORIYA: &[(char, char)] = &[('\u{0B00}', '\u{0B7F}')];
const RANGE_MYANMAR: &[(char, char)] = &[('\u{1000}', '\u{109F}')];
const RANGE_SINHALA: &[(char, char)] = &[('\u{0D80}', '\u{0DFF}')];
const RANGE_KHMER: &[(char, char)] = &[('\u{1780}', '\u{17FF}'), ('\u{19E0}', '\u{19FF}')];

impl LexerRange {
    /// Detects the script of `text` and returns the static list of inclusive code-point \
    /// ranges associated with that script, or `None` when no script could be detected.
    pub fn from(text: &str) -> Option<&'static [(char, char)]> {
        detect_script(text).map(|script| match script {
            Script::Latin => RANGE_LATIN,
            Script::Cyrillic => RANGE_CYRILLIC,
            Script::Arabic => RANGE_ARABIC,
            Script::Armenian => RANGE_ARMENIAN,
            Script::Devanagari => RANGE_DEVANAGARI,
            Script::Hiragana => RANGE_HIRAGANA,
            Script::Katakana => RANGE_KATAKANA,
            Script::Ethiopic => RANGE_ETHIOPIC,
            Script::Hebrew => RANGE_HEBREW,
            Script::Bengali => RANGE_BENGALI,
            Script::Georgian => RANGE_GEORGIAN,
            Script::Mandarin => RANGE_MANDARIN,
            Script::Hangul => RANGE_HANGUL,
            Script::Greek => RANGE_GREEK,
            Script::Kannada => RANGE_KANNADA,
            Script::Tamil => RANGE_TAMIL,
            Script::Thai => RANGE_THAI,
            Script::Gujarati => RANGE_GUJARATI,
            Script::Gurmukhi => RANGE_GURMUKHI,
            Script::Telugu => RANGE_TELUGU,
            Script::Malayalam => RANGE_MALAYALAM,
            Script::Oriya => RANGE_ORIYA,
            Script::Myanmar => RANGE_MYANMAR,
            Script::Sinhala => RANGE_SINHALA,
            Script::Khmer => RANGE_KHMER,
        })
    }
}

impl LexerRegexRange {
    /// Builds a regex range wrapper from the script detected in `text`.
    ///
    /// Returns `None` when `LexerRange::from` cannot detect a script.
    pub fn from(text: &str) -> Option<Self> {
        LexerRange::from(text).map(LexerRegexRange)
    }

    /// Writes the wrapped ranges to `formatter` as a single regex character class.
    ///
    /// Each `(start, end)` pair is emitted as `\x{START}-\x{END}` (upper-case hexadecimal \
    /// code points), all pairs being concatenated between one `[` and one `]`.
    /// Any error reported by the underlying writer is propagated.
    pub fn write_to<W: fmt::Write>(&self, formatter: &mut W) -> Result<(), fmt::Error> {
        // Format range to regex range
        formatter.write_char('[')?;

        for range in self.0 {
            write!(
                formatter,
                "\\x{{{:X}}}-\\x{{{:X}}}",
                range.0 as u32, range.1 as u32
            )?;
        }

        formatter.write_char(']')?;

        Ok(())
    }
}

impl Default for LexerRegexRange {
    /// Falls back on the Latin range.
    fn default() -> Self {
        LexerRegexRange(RANGE_LATIN)
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn it_gives_ranges() {
        assert_eq!(LexerRange::from("fox"), Some(RANGE_LATIN));
        assert_eq!(LexerRange::from("快狐跨懒狗"), Some(RANGE_MANDARIN));
        assert_eq!(LexerRange::from("Доброе утро."), Some(RANGE_CYRILLIC));
    }

    #[test]
    fn it_gives_regex_range() {
        assert_eq!(
            LexerRegexRange::from("fox"),
            Some(LexerRegexRange(RANGE_LATIN))
        );
    }
}

#[cfg(all(feature = "benchmark", test))]
mod benches {
    extern crate test;

    use super::*;
    use test::Bencher;

    #[bench]
    fn bench_give_ranges_latin(b: &mut Bencher) {
        b.iter(|| LexerRange::from("fox"));
    }

    #[bench]
    fn bench_give_ranges_mandarin(b: &mut Bencher) {
        b.iter(|| LexerRange::from("快狐跨懒狗"));
    }

    #[bench]
    fn bench_give_ranges_cyrillic(b: &mut Bencher) {
        b.iter(|| LexerRange::from("Доброе утро."));
    }

    #[bench]
    fn bench_give_regex_range_latin(b: &mut Bencher) {
        b.iter(|| LexerRegexRange::from("fox"));
    }
}

================================================
FILE: src/lexer/stopwords.rs
================================================
// Sonic
//
// Fast, lightweight and schema-less search backend
// Copyright: 2019, Valerian Saliou
// License: Mozilla Public License v2.0 (MPL v2.0)

use hashbrown::HashSet;
use whatlang::{Lang, Script};

use crate::stopwords::*;

pub struct LexerStopWord;

// Recursion group #1 (10 items)
lazy_static!
{
    static ref STOPWORDS_EPO: HashSet<&'static str> = make(epo::STOPWORDS_EPO);
    static ref STOPWORDS_ENG: HashSet<&'static str> = make(eng::STOPWORDS_ENG);
    static ref STOPWORDS_RUS: HashSet<&'static str> = make(rus::STOPWORDS_RUS);
    static ref STOPWORDS_CMN: HashSet<&'static str> = make(cmn::STOPWORDS_CMN);
    static ref STOPWORDS_SPA: HashSet<&'static str> = make(spa::STOPWORDS_SPA);
    static ref STOPWORDS_POR: HashSet<&'static str> = make(por::STOPWORDS_POR);
    static ref STOPWORDS_ITA: HashSet<&'static str> = make(ita::STOPWORDS_ITA);
    static ref STOPWORDS_BEN: HashSet<&'static str> = make(ben::STOPWORDS_BEN);
    static ref STOPWORDS_FRA: HashSet<&'static str> = make(fra::STOPWORDS_FRA);
    static ref STOPWORDS_DEU: HashSet<&'static str> = make(deu::STOPWORDS_DEU);
}

// Recursion group #2 (10 items)
lazy_static! {
    static ref STOPWORDS_UKR: HashSet<&'static str> = make(ukr::STOPWORDS_UKR);
    static ref STOPWORDS_KAT: HashSet<&'static str> = make(kat::STOPWORDS_KAT);
    static ref STOPWORDS_ARA: HashSet<&'static str> = make(ara::STOPWORDS_ARA);
    static ref STOPWORDS_HIN: HashSet<&'static str> = make(hin::STOPWORDS_HIN);
    static ref STOPWORDS_JPN: HashSet<&'static str> = make(jpn::STOPWORDS_JPN);
    static ref STOPWORDS_HEB: HashSet<&'static str> = make(heb::STOPWORDS_HEB);
    static ref STOPWORDS_YID: HashSet<&'static str> = make(yid::STOPWORDS_YID);
    static ref STOPWORDS_POL: HashSet<&'static str> = make(pol::STOPWORDS_POL);
    static ref STOPWORDS_AMH: HashSet<&'static str> = make(amh::STOPWORDS_AMH);
    static ref STOPWORDS_JAV: HashSet<&'static str> = make(jav::STOPWORDS_JAV);
}

// Recursion group #3 (10 items)
lazy_static! {
    static ref STOPWORDS_KOR: HashSet<&'static str> = make(kor::STOPWORDS_KOR);
    static ref STOPWORDS_NOB: HashSet<&'static str> = make(nob::STOPWORDS_NOB);
    static ref STOPWORDS_DAN: HashSet<&'static str> = make(dan::STOPWORDS_DAN);
    static ref STOPWORDS_SWE: HashSet<&'static str> = make(swe::STOPWORDS_SWE);
    static ref STOPWORDS_FIN: HashSet<&'static str> = make(fin::STOPWORDS_FIN);
    static ref STOPWORDS_TUR: HashSet<&'static str> = make(tur::STOPWORDS_TUR);
    static ref STOPWORDS_NLD: HashSet<&'static str> = make(nld::STOPWORDS_NLD);
    static ref STOPWORDS_HUN: HashSet<&'static str> = make(hun::STOPWORDS_HUN);
    static ref STOPWORDS_CES: HashSet<&'static str> = make(ces::STOPWORDS_CES);
    static ref STOPWORDS_ELL: HashSet<&'static str> = make(ell::STOPWORDS_ELL);
}

// Recursion group #4 (10 items)
lazy_static! {
    static ref STOPWORDS_BUL: HashSet<&'static str> = make(bul::STOPWORDS_BUL);
    static ref STOPWORDS_BEL: HashSet<&'static str> = make(bel::STOPWORDS_BEL);
    static ref STOPWORDS_MAR: HashSet<&'static str> = make(mar::STOPWORDS_MAR);
    static ref STOPWORDS_KAN: HashSet<&'static str> = make(kan::STOPWORDS_KAN);
    static ref STOPWORDS_RON: HashSet<&'static str> = make(ron::STOPWORDS_RON);
    static ref STOPWORDS_SLV: HashSet<&'static str> = make(slv::STOPWORDS_SLV);
    static ref STOPWORDS_HRV: HashSet<&'static str> = make(hrv::STOPWORDS_HRV);
    static ref STOPWORDS_SRP: HashSet<&'static str> = make(srp::STOPWORDS_SRP);
    static ref STOPWORDS_MKD: HashSet<&'static str> = make(mkd::STOPWORDS_MKD);
    static ref STOPWORDS_LIT: HashSet<&'static str> = make(lit::STOPWORDS_LIT);
}

// Recursion group #5 (10 items)
lazy_static! {
    static ref STOPWORDS_LAV: HashSet<&'static str> = make(lav::STOPWORDS_LAV);
    static ref STOPWORDS_EST: HashSet<&'static str> = make(est::STOPWORDS_EST);
    static ref STOPWORDS_TAM: HashSet<&'static str> = make(tam::STOPWORDS_TAM);
    static ref STOPWORDS_VIE: HashSet<&'static str> = make(vie::STOPWORDS_VIE);
    static ref STOPWORDS_URD: HashSet<&'static str> = make(urd::STOPWORDS_URD);
    static ref STOPWORDS_THA: HashSet<&'static str> = make(tha::STOPWORDS_THA);
    static ref STOPWORDS_GUJ: HashSet<&'static str> = make(guj::STOPWORDS_GUJ);
    static ref STOPWORDS_UZB: HashSet<&'static str> = make(uzb::STOPWORDS_UZB);
    static ref STOPWORDS_PAN: HashSet<&'static str> = make(pan::STOPWORDS_PAN);
    static ref STOPWORDS_AZE: HashSet<&'static str> = make(aze::STOPWORDS_AZE);
}

// Recursion group #6 (10 items)
lazy_static! {
    static ref STOPWORDS_IND: HashSet<&'static str> = make(ind::STOPWORDS_IND);
    static ref STOPWORDS_TEL: HashSet<&'static str> = make(tel::STOPWORDS_TEL);
    static ref STOPWORDS_PES: HashSet<&'static str> = make(pes::STOPWORDS_PES);
    static ref STOPWORDS_MAL: HashSet<&'static str> = make(mal::STOPWORDS_MAL);
    static ref STOPWORDS_ORI: HashSet<&'static str> = make(ori::STOPWORDS_ORI);
    static ref STOPWORDS_MYA: HashSet<&'static str> = make(mya::STOPWORDS_MYA);
    static ref STOPWORDS_NEP: HashSet<&'static str> = make(nep::STOPWORDS_NEP);
    static ref STOPWORDS_SIN: HashSet<&'static str> = make(sin::STOPWORDS_SIN);
    static ref STOPWORDS_KHM: HashSet<&'static str> = make(khm::STOPWORDS_KHM);
    static ref STOPWORDS_TUK: HashSet<&'static str> = make(tuk::STOPWORDS_TUK);
}

// Recursion group #7 (9 items)
lazy_static! {
    static ref STOPWORDS_AKA: HashSet<&'static str> = make(aka::STOPWORDS_AKA);
    static ref STOPWORDS_ZUL: HashSet<&'static str> = make(zul::STOPWORDS_ZUL);
    static ref STOPWORDS_SNA: HashSet<&'static str> = make(sna::STOPWORDS_SNA);
    static ref STOPWORDS_AFR: HashSet<&'static str> = make(afr::STOPWORDS_AFR);
    static ref STOPWORDS_LAT: HashSet<&'static str> = make(lat::STOPWORDS_LAT);
    static ref STOPWORDS_SLK: HashSet<&'static str> = make(slk::STOPWORDS_SLK);
    static ref STOPWORDS_CAT: HashSet<&'static str> = make(cat::STOPWORDS_CAT);
    static ref STOPWORDS_TGL: HashSet<&'static str> = make(tgl::STOPWORDS_TGL);
    static ref STOPWORDS_HYE: HashSet<&'static str> = make(hye::STOPWORDS_HYE);
}

/// Collects a slice of stopwords into a hash set for constant-time membership checks.
fn make<'a>(words: &[&'a str]) -> HashSet<&'a str> {
    words.iter().copied().collect()
}

impl LexerStopWord {
    /// Tells whether `word` is a stopword for the given `locale`.
    ///
    /// Always returns `false` when no locale is provided. The lookup is an exact match \
    /// against the locale stopword set; `word` is not normalized here.
    pub fn is(word: &str, locale: Option<Lang>) -> bool {
        if let Some(locale) = locale {
            // Word is a stopword (given locale)
            if Self::lang_stopwords(locale).contains(word) {
                return true;
            }
        }

        // Not a stopword, or may not be (default)
        false
    }

    /// Guesses the language of `text` by counting, for every language mapped to `script`, \
    /// how many whitespace-separated words of `text` are stopwords of that language.
    ///
    /// Returns the language with the strictly highest count (the first one listed for the \
    /// script wins on ties), or `None` if no stopword of any candidate language was found.
    pub fn guess_lang(text: &str, script: Script) -> Option<Lang> {
        debug!(
            "guessing locale from stopwords for script: {} and text: {}",
            script, text
        );

        let script_langs = Self::script_langs(script);

        // Count found stop-words in text for each language
        let (mut likely_count, mut likely_lang) = (0, None);

        // Split the text and consume the iterator
        // Notice: this may seem dirty as we allocate memory, but there may be a lot of \
        //   'script_langs' to iterate over (plus, we need to exhaust the whole list as we \
        //   cannot break early by design). We have noticed a 65% performance increase on \
        //   texts of ~100 characters when collecting the iterator there, with a very low memory \
        //   cost as the strings are references and thus there should be no heap allocation. We \
        //   expect this gain to increase even further for longer texts.
        let text_split = text.split_whitespace().collect::<Vec<&str>>();

        for script_lang in script_langs {
            let lang_stopwords = Self::lang_stopwords(*script_lang);

            if !lang_stopwords.is_empty() {
                let mut lang_count = 0;

                // This is a simple split, that does not take into account uppercase letters and \
                //   punctuation, as to prevent memory allocations and other heavy operations. \
                //   Trade-offs are made as this is a best-effort last-resort check.
                for word in &text_split {
                    if lang_stopwords.contains(word) {
                        lang_count += 1;
                    }
                }

                // Found stopwords for this locale in text?
                if lang_count > 0 {
                    debug!(
                        "got {} common stopwords in guess for locale: {}",
                        lang_count, script_lang
                    );

                    if lang_count > likely_count {
                        likely_count = lang_count;
                        likely_lang = Some(*script_lang);
                    }
                }
            }
        }

        // Return most likely locale (if any)
        likely_lang
    }

    /// Maps a language to its lazily-built static stopword set.
    fn lang_stopwords(lang: Lang) -> &'static HashSet<&'static str> {
        match lang {
            Lang::Epo => &*STOPWORDS_EPO,
            Lang::Eng => &*STOPWORDS_ENG,
            Lang::Rus => &*STOPWORDS_RUS,
            Lang::Cmn => &*STOPWORDS_CMN,
            Lang::Spa => &*STOPWORDS_SPA,
            Lang::Por => &*STOPWORDS_POR,
            Lang::Ita => &*STOPWORDS_ITA,
            Lang::Ben => &*STOPWORDS_BEN,
            Lang::Fra => &*STOPWORDS_FRA,
            Lang::Deu => &*STOPWORDS_DEU,
            Lang::Ukr => &*STOPWORDS_UKR,
            Lang::Kat => &*STOPWORDS_KAT,
            Lang::Ara => &*STOPWORDS_ARA,
            Lang::Hin => &*STOPWORDS_HIN,
            Lang::Jpn => &*STOPWORDS_JPN,
            Lang::Heb => &*STOPWORDS_HEB,
            Lang::Yid => &*STOPWORDS_YID,
            Lang::Pol => &*STOPWORDS_POL,
            Lang::Amh => &*STOPWORDS_AMH,
            Lang::Jav => &*STOPWORDS_JAV,
            Lang::Kor => &*STOPWORDS_KOR,
            Lang::Nob => &*STOPWORDS_NOB,
            Lang::Dan => &*STOPWORDS_DAN,
            Lang::Swe => &*STOPWORDS_SWE,
            Lang::Fin => &*STOPWORDS_FIN,
            Lang::Tur => &*STOPWORDS_TUR,
            Lang::Nld => &*STOPWORDS_NLD,
            Lang::Hun => &*STOPWORDS_HUN,
            Lang::Ces => &*STOPWORDS_CES,
            Lang::Ell => &*STOPWORDS_ELL,
            Lang::Bul => &*STOPWORDS_BUL,
            Lang::Bel => &*STOPWORDS_BEL,
            Lang::Mar => &*STOPWORDS_MAR,
            Lang::Kan => &*STOPWORDS_KAN,
            Lang::Ron => &*STOPWORDS_RON,
            Lang::Slv => &*STOPWORDS_SLV,
            Lang::Hrv => &*STOPWORDS_HRV,
            Lang::Srp => &*STOPWORDS_SRP,
            Lang::Mkd => &*STOPWORDS_MKD,
            Lang::Lit => &*STOPWORDS_LIT,
            Lang::Lav => &*STOPWORDS_LAV,
            Lang::Est => &*STOPWORDS_EST,
            Lang::Tam => &*STOPWORDS_TAM,
            Lang::Vie => &*STOPWORDS_VIE,
            Lang::Urd => &*STOPWORDS_URD,
            Lang::Tha => &*STOPWORDS_THA,
            Lang::Guj => &*STOPWORDS_GUJ,
            Lang::Uzb => &*STOPWORDS_UZB,
            Lang::Pan => &*STOPWORDS_PAN,
            Lang::Aze => &*STOPWORDS_AZE,
            Lang::Ind => &*STOPWORDS_IND,
            Lang::Tel => &*STOPWORDS_TEL,
            Lang::Pes => &*STOPWORDS_PES,
            Lang::Mal => &*STOPWORDS_MAL,
            Lang::Ori => &*STOPWORDS_ORI,
            Lang::Mya => &*STOPWORDS_MYA,
            Lang::Nep => &*STOPWORDS_NEP,
            Lang::Sin => &*STOPWORDS_SIN,
            Lang::Khm => &*STOPWORDS_KHM,
            Lang::Tuk => &*STOPWORDS_TUK,
            Lang::Aka => &*STOPWORDS_AKA,
            Lang::Zul => &*STOPWORDS_ZUL,
            Lang::Sna => &*STOPWORDS_SNA,
            Lang::Afr => &*STOPWORDS_AFR,
            Lang::Lat => &*STOPWORDS_LAT,
            Lang::Slk => &*STOPWORDS_SLK,
            Lang::Cat => &*STOPWORDS_CAT,
            Lang::Tgl => &*STOPWORDS_TGL,
            Lang::Hye => &*STOPWORDS_HYE,
        }
    }

    /// Lists the candidate languages that `guess_lang` evaluates for a given script.
    ///
    /// The order matters: on equal stopword counts, the language listed first is kept.
    fn script_langs(script: Script) -> &'static [Lang] {
        match script {
            Script::Latin => &[
                Lang::Spa,
                Lang::Eng,
                Lang::Por,
                Lang::Ind,
                Lang::Fra,
                Lang::Deu,
                Lang::Jav,
                Lang::Vie,
                Lang::Ita,
                Lang::Tur,
                Lang::Pol,
                Lang::Ron,
                Lang::Hrv,
                Lang::Nld,
                Lang::Uzb,
                Lang::Hun,
                Lang::Aze,
                Lang::Ces,
                Lang::Zul,
                Lang::Swe,
                Lang::Aka,
                Lang::Sna,
                Lang::Afr,
                Lang::Fin,
                Lang::Tuk,
                Lang::Dan,
                Lang::Nob,
                Lang::Lit,
                Lang::Slv,
                Lang::Epo,
                Lang::Lav,
                Lang::Est,
                Lang::Lat,
                Lang::Slk,
                Lang::Cat,
                Lang::Tgl,
            ],
            Script::Cyrillic => &[
                Lang::Rus,
                Lang::Ukr,
                Lang::Srp,
                Lang::Aze,
                Lang::Bel,
                Lang::Bul,
                Lang::Tuk,
                Lang::Mkd,
            ],
            Script::Arabic => &[Lang::Ara, Lang::Urd, Lang::Pes],
            Script::Armenian => &[Lang::Hye],
            Script::Devanagari => &[Lang::Hin, Lang::Mar, Lang::Nep],
            Script::Ethiopic => &[Lang::Amh],
            Script::Hebrew => &[Lang::Heb, Lang::Yid],
            Script::Mandarin => &[Lang::Cmn],
            Script::Bengali => &[Lang::Ben],
            Script::Hangul => &[Lang::Kor],
            Script::Georgian => &[Lang::Kat],
            Script::Greek => &[Lang::Ell],
            Script::Kannada => &[Lang::Kan],
            Script::Tamil => &[Lang::Tam],
            Script::Thai => &[Lang::Tha],
            Script::Gujarati => &[Lang::Guj],
            Script::Gurmukhi => &[Lang::Pan],
            Script::Telugu => &[Lang::Tel],
            Script::Malayalam => &[Lang::Mal],
            Script::Oriya => &[Lang::Ori],
            Script::Myanmar => &[Lang::Mya],
            Script::Sinhala => &[Lang::Sin],
            Script::Khmer => &[Lang::Khm],
            Script::Katakana | Script::Hiragana => &[Lang::Jpn],
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn it_detects_stopwords() {
        assert!(!LexerStopWord::is("the", None));
        assert!(LexerStopWord::is("the", Some(Lang::Eng)));
        assert!(!LexerStopWord::is("fox", Some(Lang::Eng)));
        assert!(!LexerStopWord::is("bonjour", Some(Lang::Fra)));
        assert!(LexerStopWord::is("ici", Some(Lang::Fra)));
        assert!(LexerStopWord::is("adéu", Some(Lang::Cat)));
    }

    #[test]
    fn it_guesses_language() {
        assert_eq!(
            LexerStopWord::guess_lang(
                "I believe there is an extremely simple way to whip climate change.",
                Script::Latin
            ),
            Some(Lang::Eng)
        );
        assert_eq!(
            LexerStopWord::guess_lang(
                "permettre aux pharmaciens de délivrer sous certaines conditions des médicaments",
                Script::Latin
            ),
            Some(Lang::Fra)
        );
        assert_eq!(
            LexerStopWord::guess_lang(
                "Tarlós István főpolgármester utasítása alapján a Főváros a Budapest Portálon",
                Script::Latin
            ),
            Some(Lang::Hun)
        );
        assert_eq!(
            LexerStopWord::guess_lang(
                "Tots els éssers humans neixen lliures i iguals en dignitat i en drets.",
                Script::Latin
            ),
            Some(Lang::Cat)
        );
        assert_eq!(
            LexerStopWord::guess_lang("aux", Script::Latin),
            Some(Lang::Fra)
        );
        assert_eq!(
            LexerStopWord::guess_lang("feefeffd zd", Script::Latin),
            None
        );
    }
}

#[cfg(all(feature = "benchmark", test))]
mod benches {
    extern crate test;

    use super::*;
    use test::Bencher;

    #[bench]
    fn bench_detect_stopwords_not_found(b: &mut Bencher) {
        b.iter(|| LexerStopWord::is("fox", Some(Lang::Eng)));
    }

    #[bench]
    fn bench_detect_stopwords_found(b: &mut Bencher) {
        b.iter(|| LexerStopWord::is("the", Some(Lang::Eng)));
    }

    #[bench]
    fn bench_guess_language_latin(b: &mut Bencher) {
        b.iter(|| {
LexerStopWord::guess_lang(
                "I believe there is an extremely simple way to whip climate change.",
                Script::Latin,
            )
        });
    }

    #[bench]
    fn bench_guess_language_mandarin(b: &mut Bencher) {
        b.iter(|| LexerStopWord::guess_lang("快狐跨懒狗", Script::Mandarin));
    }
}

================================================
FILE: src/lexer/token.rs
================================================
// Sonic
//
// Fast, lightweight and schema-less search backend
// Copyright: 2019, Valerian Saliou
// License: Mozilla Public License v2.0 (MPL v2.0)

use hashbrown::HashSet;
use std::time::Instant;
use unicode_segmentation::{UnicodeSegmentation, UnicodeWords};
use whatlang::{
    detect as lang_detect_all, detect_lang as lang_detect, detect_script as script_detect, Lang,
};

// Notice: 'IntoIter' is used by both the 'JieBa' and the 'Lindera' word iterators, hence \
//   it must be imported when either tokenizer feature is enabled.
#[cfg(any(feature = "tokenizer-chinese", feature = "tokenizer-japanese"))]
use std::vec::IntoIter;

use super::stopwords::LexerStopWord;
use crate::query::types::QueryGenericLang;
use crate::store::identifiers::{StoreTermHash, StoreTermHashed};

pub struct TokenLexerBuilder;

/// Iterator over the unique, lower-cased words of a text, paired with their term hash.
pub struct TokenLexer<'a> {
    mode: TokenLexerMode,
    locale: Option<Lang>,
    words: TokenLexerWords<'a>,
    // Hashes of the terms that were already yielded (used to yield each word once)
    yields: HashSet<StoreTermHashed>,
}

#[derive(PartialEq)]
pub enum TokenLexerMode {
    /// Normalize words and drop stopwords, using the hinted language if any, or a \
    /// detected one otherwise.
    NormalizeAndCleanup(Option<Lang>),
    /// Normalize words only; no locale is detected and no stopword is dropped.
    NormalizeOnly,
}

enum TokenLexerWords<'a> {
    UAX29(UnicodeWords<'a>),

    #[cfg(feature = "tokenizer-chinese")]
    JieBa(IntoIter<&'a str>),

    // NOTE(review): the element type was reconstructed from its usage below \
    //   ('inner.text' is yielded as '&'a str'); confirm the exact path against the \
    //   'lindera_tokenizer' version pinned in Cargo.toml.
    #[cfg(feature = "tokenizer-japanese")]
    Lindera(IntoIter<lindera_tokenizer::token::Token<'a>>),
}

const TEXT_LANG_TRUNCATE_OVER_CHARS: usize = 200;
const TEXT_LANG_DETECT_PROCEED_OVER_CHARS: usize = 20;
const TEXT_LANG_DETECT_NGRAM_UNDER_CHARS: usize = 60;

#[cfg(feature = "tokenizer-chinese")]
lazy_static! {
    static ref TOKENIZER_JIEBA: jieba_rs::Jieba = jieba_rs::Jieba::new();
}

#[cfg(feature = "tokenizer-japanese")]
lazy_static! {
    static ref TOKENIZER_LINDERA: lindera_tokenizer::tokenizer::Tokenizer =
        lindera_tokenizer::tokenizer::Tokenizer::from_config(
            lindera_tokenizer::tokenizer::TokenizerConfig {
                dictionary: lindera_dictionary::DictionaryConfig {
                    kind: Some(lindera_dictionary::DictionaryKind::UniDic),
                    path: None
                },
                user_dictionary: None,
                mode: lindera_core::mode::Mode::Normal,
            }
        )
        .expect("unable to initialize japanese tokenizer");
}

impl TokenLexerBuilder {
    /// Builds a token lexer over `text` for the given `mode`.
    ///
    /// The locale is detected from the text, taken from the hint carried by the mode, or \
    /// left unset, depending on `mode`. This currently always returns `Ok`.
    pub fn from(mode: TokenLexerMode, text: &str) -> Result<TokenLexer<'_>, ()> {
        let locale = match mode {
            TokenLexerMode::NormalizeAndCleanup(None) => {
                // Detect text language (current lexer mode asks for a cleanup)
                debug!("detecting locale from lexer text: {}", text);

                Self::detect_lang(text)
            }
            TokenLexerMode::NormalizeAndCleanup(Some(lang)) => {
                // Use hinted language (current lexer mode asks for a cleanup)
                debug!("using hinted locale: {} from lexer text: {}", lang, text);

                Some(lang)
            }
            TokenLexerMode::NormalizeOnly => {
                debug!("not detecting locale from lexer text: {}", text);

                // May be 'NormalizeOnly' mode; no need to perform a locale detection
                None
            }
        };

        // Build final token builder iterator
        Ok(TokenLexer::new(mode, text, locale))
    }

    /// Detects the language of `text`, or returns `None` if the text is too short or if \
    /// no language could be detected.
    ///
    /// Length thresholds are compared against the byte length of the text (not its \
    /// character count); only the truncation itself is done on a character boundary.
    fn detect_lang(text: &str) -> Option<Lang> {
        // Detect only if text is long-enough to allow the text locale detection system to \
        //   function properly
        if text.len() < TEXT_LANG_DETECT_PROCEED_OVER_CHARS {
            return None;
        }

        // Truncate text if necessary, as to avoid the ngram or stopwords detector to be \
        //   ran on more words than those that are enough to reliably detect a locale.
        let safe_text = if text.len() > TEXT_LANG_TRUNCATE_OVER_CHARS {
            debug!(
                "lexer text needs to be truncated, as it is too long ({}/{}): {}",
                text.len(),
                TEXT_LANG_TRUNCATE_OVER_CHARS,
                text
            );

            // Perform an UTF-8 aware truncation
            // Notice: the 'len()' check above was not UTF-8 aware, but is better than \
            //   nothing as it avoids entering the below iterator for small strings.
            // Notice: we fallback on text if the result is 'None'; as if it is 'None' there \
            //   were fewer characters than the truncate limit in the UTF-8 parsed text. With \
            //   this unwrap-way, we avoid doing a 'text.chars().count()' every time, which is \
            //   a O(N) operation, and rather guard this block with a 'text.len()' which is \
            //   a O(1) operation but which is not 100% reliable when approaching the truncate \
            //   limit. This is a trade-off, which saves quite a lot of CPU cycles at scale.
            text.char_indices()
                .nth(TEXT_LANG_TRUNCATE_OVER_CHARS)
                .map(|(end_index, _)| &text[0..end_index])
                .unwrap_or(text)
        } else {
            text
        };

        debug!("will detect locale for lexer safe text: {}", safe_text);

        // Attempt to detect the locale from text using an hybrid method that maximizes both \
        //   accuracy and performance.
        // Notice: as the 'ngram' method is almost 10x slower than the 'stopwords' method, we \
        //   prefer using the 'stopwords' method on long texts where we can be sure to see quite \
        //   a lot of stopwords which will produce a reliable result. However, for shorter texts \
        //   there are not enough (or no) stopwords, thus we use the slower 'ngram' method as \
        //   an attempt to extract the locale using trigrams. Still, if either of these methods \
        //   fails at detecting a locale it will try using the other method in fallback as to \
        //   produce the most reliable result while minimizing CPU cycles.
        if safe_text.len() < TEXT_LANG_DETECT_NGRAM_UNDER_CHARS {
            debug!(
                "lexer text is shorter than {} characters, using the slow method",
                TEXT_LANG_DETECT_NGRAM_UNDER_CHARS
            );

            Self::detect_lang_slow(safe_text)
        } else {
            debug!(
                "lexer text is equal or longer than {} characters, using the fast method",
                TEXT_LANG_DETECT_NGRAM_UNDER_CHARS
            );

            Self::detect_lang_fast(safe_text)
        }
    }

    /// Detects the language using the ngram detector first, falling back on the \
    /// stopwords guesser when the ngram result is flagged as not reliable.
    fn detect_lang_slow(safe_text: &str) -> Option<Lang> {
        let ngram_start = Instant::now();

        match lang_detect_all(safe_text) {
            Some(detector) => {
                let ngram_took = ngram_start.elapsed();

                let mut locale = detector.lang();

                info!(
                    "[slow lexer] locale detected from text: {} ({} from {} at {}/1; {}s + {}ms)",
                    safe_text,
                    locale,
                    detector.script(),
                    detector.confidence(),
                    ngram_took.as_secs(),
                    ngram_took.subsec_millis()
                );

                // Confidence is low, try to detect locale from stop-words.
                // Notice: this is a fallback but should not be too reliable for short \
                //   texts.
                if !detector.is_reliable() {
                    debug!("[slow lexer] trying to detect locale from stopwords instead");

                    // Better alternate locale found?
                    if let Some(alternate_locale) =
                        LexerStopWord::guess_lang(safe_text, detector.script())
                    {
                        info!(
                            "[slow lexer] detected more accurate locale from stopwords: {}",
                            alternate_locale
                        );

                        locale = alternate_locale;
                    }
                }

                Some(locale)
            }
            None => {
                info!(
                    "[slow lexer] no locale could be detected from text: {}",
                    safe_text
                );

                None
            }
        }
    }

    /// Detects the language using the stopwords guesser first (after detecting the \
    /// script), falling back on the ngram detector when no stopword matched.
    fn detect_lang_fast(safe_text: &str) -> Option<Lang> {
        let stopwords_start = Instant::now();

        match script_detect(safe_text) {
            Some(script) => {
                // Locale found?
                if let Some(locale) = LexerStopWord::guess_lang(safe_text, script) {
                    let stopwords_took = stopwords_start.elapsed();

                    info!(
                        "[fast lexer] locale detected from text: {} ({}; {}s + {}ms)",
                        safe_text,
                        locale,
                        stopwords_took.as_secs(),
                        stopwords_took.subsec_millis()
                    );

                    Some(locale)
                } else {
                    debug!("[fast lexer] trying to detect locale from fallback ngram instead");

                    // No locale found, fallback on slow ngram.
                    lang_detect(safe_text)
                }
            }
            None => {
                info!(
                    "[fast lexer] no script could be detected from text: {}",
                    safe_text
                );

                None
            }
        }
    }
}

impl<'a> TokenLexer<'a> {
    /// Creates the lexer, picking the word splitter from the locale: a dedicated \
    /// tokenizer for Chinese or Japanese when the matching feature is enabled, or \
    /// Unicode word segmentation otherwise.
    fn new(mode: TokenLexerMode, text: &'a str, locale: Option<Lang>) -> TokenLexer<'a> {
        // Tokenize words (depending on the locale)
        let words = match locale {
            #[cfg(feature = "tokenizer-chinese")]
            Some(Lang::Cmn) => TokenLexerWords::JieBa(TOKENIZER_JIEBA.cut(text, false).into_iter()),
            #[cfg(feature = "tokenizer-japanese")]
            Some(Lang::Jpn) => match TOKENIZER_LINDERA.tokenize(text) {
                Ok(tokens) => TokenLexerWords::Lindera(tokens.into_iter()),
                Err(err) => {
                    warn!("unable to tokenize japanese, falling back: {}", err);

                    TokenLexerWords::UAX29(text.unicode_words())
                }
            },
            _ => TokenLexerWords::UAX29(text.unicode_words()),
        };

        TokenLexer {
            mode,
            locale,
            words,
            yields: HashSet::new(),
        }
    }
}

impl TokenLexerMode {
    /// Converts an optional query language setting into a lexer mode.
    pub fn from_query_lang(lang: Option<QueryGenericLang>) -> TokenLexerMode {
        match lang {
            Some(QueryGenericLang::Enabled(lang)) => {
                // Cleanup with provided language
                TokenLexerMode::NormalizeAndCleanup(Some(lang))
            }
            Some(QueryGenericLang::Disabled) => {
                // Normalize only (language purposefully set to 'none')
                TokenLexerMode::NormalizeOnly
            }
            None => {
                // Auto-detect language and cleanup (this is the default behavior)
                TokenLexerMode::NormalizeAndCleanup(None)
            }
        }
    }
}

impl<'a> Iterator for TokenLexer<'a> {
    type Item = (String, StoreTermHashed);

    // Guarantees provided by the lexer on the output: \
    //   - Text is split per-word in a script-aware way \
    //   - Words are normalized (ie. lower-case) \
    //   - Gibberish words are removed (ie. words that may just be junk) \
    //   - Stop-words are removed
    fn next(&mut self) -> Option<Self::Item> {
        for word in &mut self.words {
            // Lower-case word
            // Notice: unfortunately, as Rust is unicode-aware, we need to convert the str slice \
            //   to a heap-indexed String; as lower-cased characters may change in bit size.
            let word = word.to_lowercase();

            // Check if normalized word is a stop-word? (if should normalize and cleanup)
            if self.mode == TokenLexerMode::NormalizeOnly
                || !LexerStopWord::is(&word, self.locale)
            {
                // Hash the term (this is used by all iterator consumers, as well as internally \
                //   in the iterator to keep track of already-yielded words in a space-optimized \
                //   manner, ie. by using 32-bit unsigned integer hashes)
                let term_hash = StoreTermHash::from(&word);

                // Check if word was not already yielded? (we return unique words)
                // Notice: 'insert' returns 'true' only if the hash was not yet in the set, \
                //   which saves a second lookup compared to a 'contains' + 'insert' pair.
                if self.yields.insert(term_hash) {
                    debug!("lexer yielded word: {}", word);

                    return Some((word, term_hash));
                } else {
                    debug!(
                        "lexer did not yield word: {} because: word already yielded",
                        word
                    );
                }
            } else {
                debug!(
                    "lexer did not yield word: {} because: word is a stop-word",
                    word
                );
            }
        }

        None
    }
}

impl<'a> Iterator for TokenLexerWords<'a> {
    type Item = &'a str;

    // Forwards to the underlying word iterator of the active tokenizer
    fn next(&mut self) -> Option<Self::Item> {
        match self {
            TokenLexerWords::UAX29(token) => token.next(),

            #[cfg(feature = "tokenizer-chinese")]
            TokenLexerWords::JieBa(token) => token.next(),

            #[cfg(feature = "tokenizer-japanese")]
            TokenLexerWords::Lindera(token) => token.next().map(|inner| inner.text),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn it_cleans_token_english() {
        let mut token_cleaner = TokenLexerBuilder::from(
            TokenLexerMode::NormalizeAndCleanup(None),
            "The quick brown fox jumps over the lazy dog!",
        )
        .unwrap();

        assert_eq!(token_cleaner.locale, Some(Lang::Eng));
        assert_eq!(
            token_cleaner.next(),
            Some(("quick".to_string(), 4179131656))
        );
        assert_eq!(
            token_cleaner.next(),
            Some(("brown".to_string(), 1268820067))
        );
        assert_eq!(token_cleaner.next(), Some(("fox".to_string(), 667256324)));
        assert_eq!(token_cleaner.next(), Some(("jumps".to_string(), 633865164)));
        assert_eq!(token_cleaner.next(), Some(("lazy".to_string(), 4130433347)));
        assert_eq!(token_cleaner.next(), Some(("dog".to_string(), 2044924251)));
        assert_eq!(token_cleaner.next(), None);
    }

    #[test]
    fn it_cleans_token_french() {
        let mut
token_cleaner = TokenLexerBuilder::from( TokenLexerMode::NormalizeAndCleanup(None), "Le vif renard brun saute par dessus le chien paresseux.", ) .unwrap(); assert_eq!(token_cleaner.locale, Some(Lang::Fra)); assert_eq!( token_cleaner.next(), Some(("renard".to_string(), 1635186311)) ); assert_eq!(token_cleaner.next(), Some(("brun".to_string(), 2763604928))); assert_eq!( token_cleaner.next(), Some(("saute".to_string(), 1918158211)) ); assert_eq!( token_cleaner.next(), Some(("chien".to_string(), 2177818351)) ); assert_eq!( token_cleaner.next(), Some(("paresseux".to_string(), 1678693110)) ); assert_eq!(token_cleaner.next(), None); } #[cfg(feature = "tokenizer-chinese")] #[test] fn it_cleans_token_chinese_jieba() { let mut token_cleaner = TokenLexerBuilder::from( TokenLexerMode::NormalizeAndCleanup(None), "我们中出了一个叛徒", ) .unwrap(); assert_eq!(token_cleaner.locale, Some(Lang::Cmn)); assert_eq!(token_cleaner.next(), Some(("出".to_string(), 241978070))); assert_eq!(token_cleaner.next(), Some(("一个".to_string(), 2596274530))); assert_eq!(token_cleaner.next(), Some(("叛徒".to_string(), 3244183759))); assert_eq!(token_cleaner.next(), None); } #[cfg(not(feature = "tokenizer-chinese"))] #[test] fn it_cleans_token_chinese_naive() { let mut token_cleaner = TokenLexerBuilder::from( TokenLexerMode::NormalizeAndCleanup(None), "快狐跨懒狗快狐跨懒狗", ) .unwrap(); assert_eq!(token_cleaner.locale, Some(Lang::Cmn)); assert_eq!(token_cleaner.next(), Some(("快".to_string(), 126546256))); assert_eq!(token_cleaner.next(), Some(("狐".to_string(), 2879689662))); assert_eq!(token_cleaner.next(), Some(("跨".to_string(), 2913342670))); assert_eq!(token_cleaner.next(), Some(("懒".to_string(), 3199935961))); assert_eq!(token_cleaner.next(), Some(("狗".to_string(), 3360772096))); assert_eq!(token_cleaner.next(), None); } #[cfg(feature = "tokenizer-japanese")] #[test] fn it_cleans_token_japanese_lindera_product() { let mut token_cleaner = TokenLexerBuilder::from( TokenLexerMode::NormalizeAndCleanup(None), 
"関西国際空港限定トートバッグ", ) .unwrap(); assert_eq!(token_cleaner.locale, Some(Lang::Jpn)); assert_eq!(token_cleaner.next(), Some(("関西".to_string(), 1283572620))); assert_eq!(token_cleaner.next(), Some(("国際".to_string(), 2132457693))); assert_eq!(token_cleaner.next(), Some(("空港".to_string(), 865668138))); assert_eq!(token_cleaner.next(), Some(("限定".to_string(), 3708465176))); assert_eq!( token_cleaner.next(), Some(("トート".to_string(), 881444746)) ); assert_eq!( token_cleaner.next(), Some(("バッグ".to_string(), 3515727814)) ); assert_eq!(token_cleaner.next(), None); } #[cfg(feature = "tokenizer-japanese")] #[test] fn it_cleans_token_japanese_lindera_food() { let token_cleaner = TokenLexerBuilder::from(TokenLexerMode::NormalizeAndCleanup(None), "𠮷野家").unwrap(); assert_eq!(token_cleaner.locale, None); let token_cleaner = TokenLexerBuilder::from(TokenLexerMode::NormalizeAndCleanup(None), "ヱビスビール") .unwrap(); assert_eq!(token_cleaner.locale, None); } #[cfg(feature = "tokenizer-japanese")] #[test] fn it_cleans_token_japanese_lindera_sentence() { let mut token_cleaner = TokenLexerBuilder::from( TokenLexerMode::NormalizeAndCleanup(None), "𠮷野家でヱビスビールを飲んだ", ) .unwrap(); assert_eq!(token_cleaner.locale, Some(Lang::Jpn)); assert_eq!(token_cleaner.next(), Some(("𠮷".to_string(), 2866455824))); assert_eq!(token_cleaner.next(), Some(("野家".to_string(), 1324395598))); assert_eq!( token_cleaner.next(), Some(("ヱビス".to_string(), 1696836208)) ); assert_eq!( token_cleaner.next(), Some(("ビール".to_string(), 3421909800)) ); assert_eq!(token_cleaner.next(), Some(("飲ん".to_string(), 3196735184))); assert_eq!(token_cleaner.next(), None); } #[test] fn it_cleans_token_emojis() { let mut token_cleaner = TokenLexerBuilder::from(TokenLexerMode::NormalizeAndCleanup(None), "🚀 🙋‍♂️🙋‍♂️🙋‍♂️") .unwrap(); assert_eq!(token_cleaner.locale, None); assert_eq!(token_cleaner.next(), None); } #[test] fn it_cleans_token_lang_hinted() { let mut token_cleaner_right = TokenLexerBuilder::from( 
TokenLexerMode::NormalizeAndCleanup(Some(Lang::Eng)), "This will be cleaned properly, as English was hinted rightfully so.", ) .unwrap(); let mut token_cleaner_wrong = TokenLexerBuilder::from( TokenLexerMode::NormalizeAndCleanup(Some(Lang::Fra)), "This will not be cleaned properly, as French was hinted but this is English.", ) .unwrap(); assert_eq!(token_cleaner_right.locale, Some(Lang::Eng)); assert_eq!(token_cleaner_wrong.locale, Some(Lang::Fra)); assert_eq!( token_cleaner_right.next(), Some(("cleaned".to_string(), 3550382624)) ); assert_eq!( token_cleaner_wrong.next(), Some(("this".to_string(), 493303710)) ); } #[test] fn it_detects_lang_english_regular() { assert_eq!( TokenLexerBuilder::detect_lang("The quick brown fox jumps over the lazy dog!"), Some(Lang::Eng) ); } #[test] fn it_detects_lang_english_long() { assert_eq!( TokenLexerBuilder::detect_lang( r#"Running an electrical current through water splits it into oxygen and hydrogen, the latter of which can be used as a reliable, zero-emission fuel source. 
In the past, the process of purifying water beforehand was too energy intensive for this process to be useful — but now scientists have figured out how to skip the process altogether and convert seawater into usable hydrogen"# ), Some(Lang::Eng) ); } #[test] fn it_doesnt_detect_lang_english_tiny() { assert_eq!(TokenLexerBuilder::detect_lang("The quick"), None); } } #[cfg(all(feature = "benchmark", test))] mod benches { extern crate test; use super::*; use test::Bencher; #[bench] fn bench_normalize_token_french_build(b: &mut Bencher) { b.iter(|| { TokenLexerBuilder::from( TokenLexerMode::NormalizeOnly, "Le vif renard brun saute par dessus le chien paresseux.", ) }); } #[bench] fn bench_normalize_token_french_exhaust(b: &mut Bencher) { b.iter(|| { let token_cleaner = TokenLexerBuilder::from( TokenLexerMode::NormalizeOnly, "Le vif renard brun saute par dessus le chien paresseux.", ) .unwrap(); token_cleaner.map(|value| value.1).collect::>() }); } #[bench] fn bench_clean_token_english_regular_build(b: &mut Bencher) { b.iter(|| { TokenLexerBuilder::from( TokenLexerMode::NormalizeAndCleanup(None), "The quick brown fox jumps over the lazy dog!", ) }); } #[bench] fn bench_clean_token_english_regular_exhaust(b: &mut Bencher) { b.iter(|| { let token_cleaner = TokenLexerBuilder::from( TokenLexerMode::NormalizeAndCleanup(None), "The quick brown fox jumps over the lazy dog!", ) .unwrap(); token_cleaner.map(|value| value.1).collect::>() }); } #[bench] fn bench_clean_token_english_long_exhaust(b: &mut Bencher) { b.iter(|| { let token_cleaner = TokenLexerBuilder::from( TokenLexerMode::NormalizeAndCleanup(None), r#"Running an electrical current through water splits it into oxygen and hydrogen, the latter of which can be used as a reliable, zero-emission fuel source. 
In the past, the process of purifying water beforehand was too energy intensive for this process to be useful — but now scientists have figured out how to skip the process altogether and convert seawater into usable hydrogen"#, ) .unwrap(); token_cleaner.map(|value| value.1).collect::>() }); } #[bench] fn bench_clean_token_english_hinted_build(b: &mut Bencher) { b.iter(|| { TokenLexerBuilder::from( TokenLexerMode::NormalizeAndCleanup(Some(Lang::Eng)), "The quick brown fox jumps over the lazy dog!", ) }); } #[bench] fn bench_clean_token_english_hinted_exhaust(b: &mut Bencher) { b.iter(|| { let token_cleaner = TokenLexerBuilder::from( TokenLexerMode::NormalizeAndCleanup(Some(Lang::Eng)), "The quick brown fox jumps over the lazy dog!", ) .unwrap(); token_cleaner.map(|value| value.1).collect::>() }); } #[bench] fn bench_clean_token_chinese_build(b: &mut Bencher) { b.iter(|| { TokenLexerBuilder::from( TokenLexerMode::NormalizeAndCleanup(None), "我们中出了一个叛徒", ) }); } #[bench] fn bench_clean_token_chinese_exhaust(b: &mut Bencher) { b.iter(|| { let token_cleaner = TokenLexerBuilder::from( TokenLexerMode::NormalizeAndCleanup(None), "我们中出了一个叛徒", ) .unwrap(); token_cleaner.map(|value| value.1).collect::>() }); } #[bench] fn bench_clean_token_japanese_build(b: &mut Bencher) { b.iter(|| { TokenLexerBuilder::from( TokenLexerMode::NormalizeAndCleanup(None), "関西国際空港限定トートバッグ", ) }); } #[bench] fn bench_clean_token_japanese_exhaust(b: &mut Bencher) { b.iter(|| { let token_cleaner = TokenLexerBuilder::from( TokenLexerMode::NormalizeAndCleanup(None), "関西国際空港限定トートバッグ", ) .unwrap(); token_cleaner.map(|value| value.1).collect::>() }); } #[bench] fn bench_detect_lang_english_short(b: &mut Bencher) { b.iter(|| TokenLexerBuilder::detect_lang("The quick brown fox.")); } #[bench] fn bench_detect_lang_english_regular(b: &mut Bencher) { b.iter(|| TokenLexerBuilder::detect_lang("The quick brown fox jumps over the lazy dog!")); } #[bench] fn bench_detect_lang_english_long(b: &mut Bencher) { 
b.iter(|| { TokenLexerBuilder::detect_lang( r#"Running an electrical current through water splits it into oxygen and hydrogen, the latter of which can be used as a reliable, zero-emission fuel source. In the past, the process of purifying water beforehand was too energy intensive for this process to be useful — but now scientists have figured out how to skip the process altogether and convert seawater into usable hydrogen"#, ) }); } #[bench] fn bench_dont_detect_lang_english_tiny(b: &mut Bencher) { b.iter(|| TokenLexerBuilder::detect_lang("The quick")); } } ================================================ FILE: src/main.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) #![cfg_attr(feature = "benchmark", feature(test))] #![deny(unstable_features, unused_imports, unused_qualifications, clippy::all)] #[macro_use] extern crate log; #[macro_use] extern crate lazy_static; #[macro_use] extern crate serde_derive; mod channel; mod config; mod executor; mod lexer; mod query; mod stopwords; mod store; mod tasker; use std::ops::Deref; use std::str::FromStr; use std::thread; use std::time::Duration; use clap::{App, Arg}; use log::LevelFilter; use channel::listen::{ChannelListen, ChannelListenBuilder}; use channel::statistics::ensure_states as ensure_states_channel_statistics; use config::logger::ConfigLogger; use config::options::Config; use config::reader::ConfigReader; use store::fst::StoreFSTPool; use store::kv::StoreKVPool; use tasker::runtime::TaskerBuilder; use tasker::shutdown::ShutdownSignal; struct AppArgs { config: String, } #[cfg(unix)] #[cfg(feature = "allocator-jemalloc")] #[global_allocator] static ALLOC: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; pub static LINE_FEED: &str = "\r\n"; pub static THREAD_NAME_CHANNEL_MASTER: &str = "sonic-channel-master"; pub static THREAD_NAME_CHANNEL_CLIENT: &str = 
"sonic-channel-client";
pub static THREAD_NAME_TASKER: &str = "sonic-tasker";

// Generates `fn $method()`, a blocking supervisor for one managed worker.
//
// - `$name`: human-readable worker name, used in log messages only
// - `$method`: identifier of the generated supervisor function
// - `$thread_name`: static holding the OS thread name given to the worker
// - `$managed_fn`: builder type; the worker body is `$managed_fn::build().run()`
//
// The supervisor spawns the worker, joins it, and returns once the worker exits without \
//   panicking. If the spawn fails or the worker panics, it logs, waits one second and starts \
//   a new worker.
macro_rules! gen_spawn_managed {
    ($name:expr, $method:ident, $thread_name:ident, $managed_fn:ident) => {
        fn $method() {
            // Restart in a loop rather than by calling `$method()` recursively: recursion \
            //   would add one stack frame per crash for the whole lifetime of the process.
            loop {
                debug!("spawn managed thread: {}", $name);

                let worker = thread::Builder::new()
                    .name($thread_name.to_string())
                    .spawn(|| $managed_fn::build().run());

                // Block on worker thread (join it); a failed spawn counts as a crash
                let has_error = match worker {
                    Ok(worker_thread) => worker_thread.join().is_err(),
                    Err(_) => true,
                };

                // Worker thread exited cleanly? Then we are done supervising.
                if !has_error {
                    break;
                }

                error!("managed thread crashed ({}), setting it up again", $name);

                // Prevents thread start loop floods
                thread::sleep(Duration::from_secs(1));
            }
        }
    };
}

lazy_static! {
    static ref APP_ARGS: AppArgs = make_app_args();
    static ref APP_CONF: Config = ConfigReader::make();
}

gen_spawn_managed!(
    "channel",
    spawn_channel,
    THREAD_NAME_CHANNEL_MASTER,
    ChannelListenBuilder
);
gen_spawn_managed!("tasker", spawn_tasker, THREAD_NAME_TASKER, TaskerBuilder);

fn make_app_args() -> AppArgs {
    let matches = App::new(clap::crate_name!())
        .version(clap::crate_version!())
        .author(clap::crate_authors!())
        .about(clap::crate_description!())
        .arg(
            Arg::new("config")
                .short('c')
                .long("config")
                .help("Path to configuration file")
                .default_value("./config.cfg")
                .takes_value(true),
        )
        .get_matches();

    // Generate owned app arguments
    AppArgs {
        config: String::from(matches.value_of("config").expect("invalid config value")),
    }
}

fn ensure_states() {
    // Ensure all statics are valid (a `deref` is enough to lazily initialize them)
    let (_, _) = (APP_ARGS.deref(), APP_CONF.deref());

    // Ensure per-module states
    ensure_states_channel_statistics();
}

fn main() {
    let _logger = ConfigLogger::init(
        LevelFilter::from_str(&APP_CONF.server.log_level).expect("invalid log level"),
    );

    let shutdown_signal = ShutdownSignal::new();

    info!("starting up");

    // Ensure all states are bound
    ensure_states();

    // Spawn tasker (background thread)
    thread::spawn(spawn_tasker);

    // Spawn channel
(foreground thread) thread::spawn(spawn_channel); info!("started"); shutdown_signal.at_exit(move |signal| { info!("stopping gracefully (got signal: {})", signal); // Teardown Sonic Channel ChannelListen::teardown(); // Perform a KV flush (ensures all in-memory changes are synced on-disk before shutdown) StoreKVPool::flush(true); // Perform a FST consolidation (ensures all in-memory items are synced on-disk before \ // shutdown; otherwise we would lose all non-consolidated FST changes) StoreFSTPool::consolidate(true); info!("stopped"); }); } ================================================ FILE: src/query/actions.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) use super::types::*; use crate::lexer::token::TokenLexer; use crate::store::item::StoreItem; pub enum Query<'a> { Search( StoreItem<'a>, QuerySearchID<'a>, TokenLexer<'a>, QuerySearchLimit, QuerySearchOffset, ), Suggest( StoreItem<'a>, QuerySearchID<'a>, TokenLexer<'a>, QuerySearchLimit, ), List( StoreItem<'a>, QuerySearchID<'a>, QuerySearchLimit, QuerySearchOffset, ), Push(StoreItem<'a>, TokenLexer<'a>), Pop(StoreItem<'a>, TokenLexer<'a>), Count(StoreItem<'a>), FlushC(StoreItem<'a>), FlushB(StoreItem<'a>), FlushO(StoreItem<'a>), } ================================================ FILE: src/query/builder.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) use super::actions::Query; use super::types::{QueryGenericLang, QuerySearchLimit, QuerySearchOffset}; use crate::lexer::token::{TokenLexerBuilder, TokenLexerMode}; use crate::store::item::StoreItemBuilder; pub struct QueryBuilder; pub type QueryBuilderResult<'a> = Result, ()>; impl QueryBuilder { pub fn search<'a>( query_id: &'a str, collection: &'a str, 
bucket: &'a str, terms: &'a str, limit: QuerySearchLimit, offset: QuerySearchOffset, lang: Option, ) -> QueryBuilderResult<'a> { match ( StoreItemBuilder::from_depth_2(collection, bucket), TokenLexerBuilder::from(TokenLexerMode::from_query_lang(lang), terms), ) { (Ok(store), Ok(text_lexed)) => { Ok(Query::Search(store, query_id, text_lexed, limit, offset)) } _ => Err(()), } } pub fn suggest<'a>( query_id: &'a str, collection: &'a str, bucket: &'a str, terms: &'a str, limit: QuerySearchLimit, ) -> QueryBuilderResult<'a> { match ( StoreItemBuilder::from_depth_2(collection, bucket), TokenLexerBuilder::from(TokenLexerMode::NormalizeOnly, terms), ) { (Ok(store), Ok(text_lexed)) => Ok(Query::Suggest(store, query_id, text_lexed, limit)), _ => Err(()), } } pub fn list<'a>( query_id: &'a str, collection: &'a str, bucket: &'a str, limit: QuerySearchLimit, offset: QuerySearchOffset, ) -> QueryBuilderResult<'a> { match StoreItemBuilder::from_depth_2(collection, bucket) { Ok(store) => Ok(Query::List(store, query_id, limit, offset)), _ => Err(()), } } pub fn push<'a>( collection: &'a str, bucket: &'a str, object: &'a str, text: &'a str, lang: Option, ) -> QueryBuilderResult<'a> { match ( StoreItemBuilder::from_depth_3(collection, bucket, object), TokenLexerBuilder::from(TokenLexerMode::from_query_lang(lang), text), ) { (Ok(store), Ok(text_lexed)) => Ok(Query::Push(store, text_lexed)), _ => Err(()), } } pub fn pop<'a>( collection: &'a str, bucket: &'a str, object: &'a str, text: &'a str, ) -> QueryBuilderResult<'a> { match ( StoreItemBuilder::from_depth_3(collection, bucket, object), TokenLexerBuilder::from(TokenLexerMode::NormalizeOnly, text), ) { (Ok(store), Ok(text_lexed)) => Ok(Query::Pop(store, text_lexed)), _ => Err(()), } } pub fn count<'a>( collection: &'a str, bucket: Option<&'a str>, object: Option<&'a str>, ) -> QueryBuilderResult<'a> { let store_result = match (bucket, object) { (Some(bucket_inner), Some(object_inner)) => { StoreItemBuilder::from_depth_3(collection, 
bucket_inner, object_inner) } (Some(bucket_inner), None) => StoreItemBuilder::from_depth_2(collection, bucket_inner), _ => StoreItemBuilder::from_depth_1(collection), }; match store_result { Ok(store) => Ok(Query::Count(store)), _ => Err(()), } } pub fn flushc(collection: &str) -> QueryBuilderResult<'_> { match StoreItemBuilder::from_depth_1(collection) { Ok(store) => Ok(Query::FlushC(store)), _ => Err(()), } } pub fn flushb<'a>(collection: &'a str, bucket: &'a str) -> QueryBuilderResult<'a> { match StoreItemBuilder::from_depth_2(collection, bucket) { Ok(store) => Ok(Query::FlushB(store)), _ => Err(()), } } pub fn flusho<'a>( collection: &'a str, bucket: &'a str, object: &'a str, ) -> QueryBuilderResult<'a> { match StoreItemBuilder::from_depth_3(collection, bucket, object) { Ok(store) => Ok(Query::FlushO(store)), _ => Err(()), } } } #[cfg(test)] mod tests { use super::*; #[test] fn it_builds_search_query() { assert!( QueryBuilder::search("id1", "c:test:1", "b:test:1", "Michael Dake", 10, 20, None) .is_ok() ); assert!(QueryBuilder::search("id2", "c:test:1", "", "Michael Dake", 1, 0, None).is_err()); } #[test] fn it_builds_suggest_query() { assert!(QueryBuilder::suggest("id1", "c:test:2", "b:test:2", "Micha", 5).is_ok()); assert!(QueryBuilder::suggest("id2", "c:test:2", "", "Micha", 1).is_err()); } #[test] fn it_builds_list_query() { assert!(QueryBuilder::list("id1", "c:test:2", "b:test:2", 100, 0).is_ok()); assert!(QueryBuilder::list("id2", "c:test:2", "", 10, 0).is_err()); } #[test] fn it_builds_push_query() { assert!(QueryBuilder::push( "c:test:3", "b:test:3", "o:test:3", "My name is Michael Dake. 
I'm ordering in the US.", None ) .is_ok()); assert!( QueryBuilder::push("c:test:3", "", "o:test:3", "My name is Michael Dake.", None) .is_err() ); } #[test] fn it_builds_pop_query() { assert!(QueryBuilder::pop("c:test:4", "b:test:4", "o:test:4", "ordering US").is_ok()); assert!(QueryBuilder::pop("c:test:4", "", "o:test:4", "ordering US").is_err()); } #[test] fn it_builds_count_query() { assert!(QueryBuilder::count("c:test:5", None, None).is_ok()); assert!(QueryBuilder::count("c:test:5", Some("b:test:5"), None).is_ok()); assert!(QueryBuilder::count("c:test:5", Some("b:test:5"), Some("o:test:5")).is_ok()); assert!(QueryBuilder::count("c:test:5", Some(""), Some("o:test:5")).is_err()); } #[test] fn it_builds_flushc_query() { assert!(QueryBuilder::flushc("c:test:6").is_ok()); assert!(QueryBuilder::flushc("").is_err()); } #[test] fn it_builds_flushb_query() { assert!(QueryBuilder::flushb("c:test:7", "b:test:7").is_ok()); assert!(QueryBuilder::flushb("c:test:7", "").is_err()); } #[test] fn it_builds_flusho_query() { assert!(QueryBuilder::flusho("c:test:8", "b:test:8", "o:test:8").is_ok()); assert!(QueryBuilder::flusho("c:test:8", "b:test:8", "").is_err()); } } ================================================ FILE: src/query/mod.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub mod actions; pub mod builder; pub mod types; ================================================ FILE: src/query/types.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) use whatlang::Lang; #[derive(Debug, PartialEq)] pub enum QueryGenericLang { Enabled(Lang), Disabled, } pub type QuerySearchID<'a> = &'a str; pub type QuerySearchLimit = u16; pub type QuerySearchOffset = u32; pub type QueryMetaData = 
(
    // NOTE(review): the element types were stripped from this text and are restored here as \
    //   (limit, offset, lang), from the aliases declared in this file; confirm against the \
    //   channel command parser that produces these tuples.
    Option<QuerySearchLimit>,
    Option<QuerySearchOffset>,
    Option<QueryGenericLang>,
);
pub type ListMetaData = (Option<QuerySearchLimit>, Option<QuerySearchOffset>);

impl QueryGenericLang {
    // Parses a language value: the literal "none" maps to `Disabled`, any code recognized \
    //   by `Lang::from_code` maps to `Enabled`, and anything else yields `None`.
    pub fn from_value(value: &str) -> Option<QueryGenericLang> {
        if value == "none" {
            Some(QueryGenericLang::Disabled)
        } else {
            Lang::from_code(value).map(QueryGenericLang::Enabled)
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn it_parses_generic_lang_from_value() {
        assert_eq!(
            QueryGenericLang::from_value("none"),
            Some(QueryGenericLang::Disabled)
        );
        assert_eq!(
            QueryGenericLang::from_value("fra"),
            Some(QueryGenericLang::Enabled(Lang::Fra))
        );
        assert_eq!(QueryGenericLang::from_value("xxx"), None);
    }
}

================================================
FILE: src/stopwords/afr.rs
================================================
// Sonic
//
// Fast, lightweight and schema-less search backend
// Copyright: 2019, Valerian Saliou
// License: Mozilla Public License v2.0 (MPL v2.0)

pub static STOPWORDS_AFR: &[&str] = &[
    "'n", "aan", "af", "al", "as", "baie", "by", "daar", "dag", "dat", "die", "dit", "een",
    "ek", "en", "gaan", "gesê", "haar", "het", "hom", "hulle", "hy", "in", "is", "jou", "jy",
    "kan", "kom", "ma", "maar", "met", "my", "na", "nie", "om", "ons", "op", "saam", "sal",
    "se", "sien", "so", "sy", "te", "toe", "uit", "van", "vir", "was", "wat", "ʼn",
];

================================================
FILE: src/stopwords/aka.rs
================================================
// Sonic
//
// Fast, lightweight and schema-less search backend
// Copyright: 2019, Valerian Saliou
// License: Mozilla Public License v2.0 (MPL v2.0)

// Notice: we do not have stopwords for this language yet.
pub static STOPWORDS_AKA: &[&str] = &[];

================================================
FILE: src/stopwords/amh.rs
================================================
// Sonic
//
// Fast, lightweight and schema-less search backend
// Copyright: 2019, Valerian Saliou
// License: Mozilla Public License v2.0 (MPL v2.0)

// Notice: we do not have stopwords for this language yet.
pub static STOPWORDS_AMH: &[&str] = &[]; ================================================ FILE: src/stopwords/ara.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_ARA: &[&str] = &[ "،", "آض", "آمينَ", "آه", "آهاً", "آي", "أ", "أب", "أجل", "أجمع", "أخ", "أخذ", "أصبح", "أضحى", "أقبل", "أقل", "أكثر", "ألا", "أم", "أما", "أمامك", "أمامكَ", "أمسى", "أمّا", "أن", "أنا", "أنت", "أنتم", "أنتما", "أنتن", "أنتِ", "أنشأ", "أنّى", "أو", "أوشك", "أولئك", "أولئكم", "أولاء", "أولالك", "أوّهْ", "أي", "أيا", "أين", "أينما", "أيّ", "أَنَّ", "أََيُّ", "أُفٍّ", "إذ", "إذا", "إذاً", "إذما", "إذن", "إلى", "إليكم", "إليكما", "إليكنّ", "إليكَ", "إلَيْكَ", "إلّا", "إمّا", "إن", "إنّما", "إي", "إياك", "إياكم", "إياكما", "إياكن", "إيانا", "إياه", "إياها", "إياهم", "إياهما", "إياهن", "إياي", "إيهٍ", "إِنَّ", "ا", "ابتدأ", "اثر", "اجل", "احد", "اخرى", "اخلولق", "اذا", "اربعة", "ارتدّ", "استحال", "اطار", "اعادة", "اعلنت", "اف", "اكثر", "اكد", "الألاء", "الألى", "الا", "الاخيرة", "الان", "الاول", "الاولى", "التى", "التي", "الثاني", "الثانية", "الذاتي", "الذى", "الذي", "الذين", "السابق", "الف", "اللائي", "اللاتي", "اللتان", "اللتيا", "اللتين", "اللذان", "اللذين", "اللواتي", "الماضي", "المقبل", "الوقت", "الى", "اليوم", "اما", "امام", "امس", "ان", "انبرى", "انقلب", "انه", "انها", "او", "اول", "اي", "ايار", "ايام", "ايضا", "ب", "بات", "باسم", "بان", "بخٍ", "برس", "بسبب", "بسّ", "بشكل", "بضع", "بطآن", "بعد", "بعض", "بك", "بكم", "بكما", "بكن", "بل", "بلى", "بما", "بماذا", "بمن", "بن", "بنا", "به", "بها", "بي", "بيد", "بين", "بَسْ", "بَلْهَ", "بِئْسَ", "تانِ", "تانِك", "تبدّل", "تجاه", "تحوّل", "تلقاء", "تلك", "تلكم", "تلكما", "تم", "تينك", "تَيْنِ", "تِه", "تِي", "ثلاثة", "ثم", "ثمّ", "ثمّة", "ثُمَّ", "جعل", "جلل", "جميع", "جير", "حار", "حاشا", "حاليا", "حاي", "حتى", "حرى", "حسب", "حم", "حوالى", "حول", "حيث", "حيثما", "حين", 
"حيَّ", "حَبَّذَا", "حَتَّى", "حَذارِ", "خلا", "خلال", "دون", "دونك", "ذا", "ذات", "ذاك", "ذانك", "ذانِ", "ذلك", "ذلكم", "ذلكما", "ذلكن", "ذو", "ذوا", "ذواتا", "ذواتي", "ذيت", "ذينك", "ذَيْنِ", "ذِه", "ذِي", "راح", "رجع", "رويدك", "ريث", "رُبَّ", "زيارة", "سبحان", "سرعان", "سنة", "سنوات", "سوف", "سوى", "سَاءَ", "سَاءَمَا", "شبه", "شخصا", "شرع", "شَتَّانَ", "صار", "صباح", "صفر", "صهٍ", "صهْ", "ضد", "ضمن", "طاق", "طالما", "طفق", "طَق", "ظلّ", "عاد", "عام", "عاما", "عامة", "عدا", "عدة", "عدد", "عدم", "عسى", "عشر", "عشرة", "علق", "على", "عليك", "عليه", "عليها", "علًّ", "عن", "عند", "عندما", "عوض", "عين", "عَدَسْ", "عَمَّا", "غدا", "غير", "ـ", "ف", "فان", "فلان", "فو", "فى", "في", "فيم", "فيما", "فيه", "فيها", "قال", "قام", "قبل", "قد", "قطّ", "قلما", "قوة", "كأنّما", "كأين", "كأيّ", "كأيّن", "كاد", "كان", "كانت", "كذا", "كذلك", "كرب", "كل", "كلا", "كلاهما", "كلتا", "كلم", "كليكما", "كليهما", "كلّما", "كلَّا", "كم", "كما", "كي", "كيت", "كيف", "كيفما", "كَأَنَّ", "كِخ", "لئن", "لا", "لات", "لاسيما", "لدن", "لدى", "لعمر", "لقاء", "لك", "لكم", "لكما", "لكن", "لكنَّما", "لكي", "لكيلا", "للامم", "لم", "لما", "لمّا", "لن", "لنا", "له", "لها", "لو", "لوكالة", "لولا", "لوما", "لي", "لَسْتَ", "لَسْتُ", "لَسْتُم", "لَسْتُمَا", "لَسْتُنَّ", "لَسْتِ", "لَسْنَ", "لَعَلَّ", "لَكِنَّ", "لَيْتَ", "لَيْسَ", "لَيْسَا", "لَيْسَتَا", "لَيْسَتْ", "لَيْسُوا", "لَِسْنَا", "ما", "ماانفك", "مابرح", "مادام", "ماذا", "مازال", "مافتئ", "مايو", "متى", "مثل", "مذ", "مساء", "مع", "معاذ", "مقابل", "مكانكم", "مكانكما", "مكانكنّ", "مكانَك", "مليار", "مليون", "مما", "ممن", "من", "منذ", "منها", "مه", "مهما", "مَنْ", "مِن", "نحن", "نحو", "نعم", "نفس", "نفسه", "نهاية", "نَخْ", "نِعِمّا", "نِعْمَ", "ها", "هاؤم", "هاكَ", "هاهنا", "هبّ", "هذا", "هذه", "هكذا", "هل", "هلمَّ", "هلّا", "هم", "هما", "هن", "هنا", "هناك", "هنالك", "هو", "هي", "هيا", "هيت", "هيّا", "هَؤلاء", "هَاتانِ", "هَاتَيْنِ", "هَاتِه", "هَاتِي", "هَجْ", "هَذا", "هَذانِ", "هَذَيْنِ", "هَذِه", "هَذِي", "هَيْهَاتَ", "و", "و6", "وا", "واحد", 
"واضاف", "واضافت", "واكد", "وان", "واهاً", "واوضح", "وراءَك", "وفي", "وقال", "وقالت", "وقد", "وقف", "وكان", "وكانت", "ولا", "ولم", "ومن", "وهو", "وهي", "ويكأنّ", "وَيْ", "وُشْكَانََ", "يكون", "يمكن", "يوم", "ّأيّان", ]; ================================================ FILE: src/stopwords/aze.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_AZE: &[&str] = &[ "a", "ad", "altı", "altmış", "amma", "arasında", "artıq", "ay", "az", "bax", "belə", "bəli", "bəlkə", "beş", "bəy", "bəzən", "bəzi", "bilər", "bir", "biraz", "biri", "birşey", "biz", "bizim", "bizlər", "bu", "buna", "bundan", "bunların", "bunu", "bunun", "buradan", "bütün", "ci", "cı", "çox", "cu", "cü", "çünki", "da", "daha", "də", "dedi", "dək", "dən", "dəqiqə", "deyil", "dir", "doqquz", "doqsan", "dörd", "düz", "ə", "edən", "edir", "əgər", "əlbəttə", "elə", "əlli", "ən", "əslində", "et", "etdi", "etmə", "etmək", "faiz", "gilə", "görə", "ha", "haqqında", "harada", "hə", "heç", "həm", "həmin", "həmişə", "hər", "ı", "idi", "iki", "il", "ildə", "ilə", "ilk", "in", "indi", "isə", "istifadə", "iyirmi", "ki", "kim", "kimə", "kimi", "lakin", "lap", "məhz", "mən", "mənə", "mirşey", "nə", "nəhayət", "niyə", "o", "obirisi", "of", "olan", "olar", "olaraq", "oldu", "olduğu", "olmadı", "olmaz", "olmuşdur", "olsun", "olur", "on", "ona", "ondan", "onlar", "onlardan", "onların ", "onsuzda", "onu", "onun", "oradan", "otuz", "öz", "özü", "qarşı", "qədər", "qırx", "saat", "sadəcə", "saniyə", "səhv", "səkkiz", "səksən", "sən", "sənə", "sənin", "siz", "sizin", "sizlər", "sonra", "təəssüf", "ü", "üç", "üçün", "var", "və", "xan", "xanım", "xeyr", "ya", "yalnız", "yaxşı", "yeddi", "yenə", "yəni", "yetmiş", "yox", "yoxdur", "yoxsa", "yüz", "zaman", ]; ================================================ FILE: src/stopwords/bel.rs 
================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) // Notice: we do not have stopwords for this language yet. pub static STOPWORDS_BEL: &[&str] = &[]; ================================================ FILE: src/stopwords/ben.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_BEN: &[&str] = &[ "অতএব", "অথচ", "অথবা", "অনুযায়ী", "অনেক", "অনেকে", "অনেকেই", "অন্তত", "অন্য", "অবধি", "অবশ্য", "অর্থাত", "আই", "আগামী", "আগে", "আগেই", "আছে", "আজ", "আদ্যভাগে", "আপনার", "আপনি", "আবার", "আমরা", "আমাকে", "আমাদের", "আমার", "আমি", "আর", "আরও", "ই", "ইত্যাদি", "ইহা", "উচিত", "উত্তর", "উনি", "উপর", "উপরে", "এ", "এঁদের", "এঁরা", "এই", "একই", "একটি", "একবার", "একে", "এক্", "এখন", "এখনও", "এখানে", "এখানেই", "এটা", "এটাই", "এটি", "এত", "এতটাই", "এতে", "এদের", "এব", "এবং", "এবার", "এমন", "এমনকী", "এমনি", "এর", "এরা", "এল", "এস", "এসে", "ঐ", "ও", "ওঁদের", "ওঁর", "ওঁরা", "ওই", "ওকে", "ওখানে", "ওদের", "ওর", "ওরা", "কখনও", "কত", "কবে", "কমনে", "কয়েক", "কয়েকটি", "করছে", "করছেন", "করতে", "করবে", "করবেন", "করলে", "করলেন", "করা", "করাই", "করায়", "করার", "করি", "করিতে", "করিয়া", "করিয়ে", "করে", "করেই", "করেছিলেন", "করেছে", "করেছেন", "করেন", "কাউকে", "কাছ", "কাছে", "কাজ", "কাজে", "কারও", "কারণ", "কি", "কিংবা", "কিছু", "কিছুই", "কিন্তু", "কী", "কে", "কেউ", "কেউই", "কেখা", "কেন", "কোটি", "কোন", "কোনও", "কোনো", "ক্ষেত্রে", "কয়েক", "খুব", "গিয়ে", "গিয়েছে", "গিয়ে", "গুলি", "গেছে", "গেল", "গেলে", "গোটা", "চলে", "চান", "চায়", "চার", "চালু", "চেয়ে", "চেষ্টা", "ছাড়া", "ছাড়াও", "ছিল", "ছিলেন", "জন", "জনকে", "জনের", "জন্য", "জন্যওজে", "জানতে", "জানা", "জানানো", "জানায়", "জানিয়ে", "জানিয়েছে", "জে", "জ্নজন", "টি", "ঠিক", "তখন", "তত", "তথা", "তবু", "তবে", "তা", "তাঁকে", "তাঁদের", 
"তাঁর", "তাঁরা", "তাঁাহারা", "তাই", "তাও", "তাকে", "তাতে", "তাদের", "তার", "তারপর", "তারা", "তারৈ", "তাহলে", "তাহা", "তাহাতে", "তাহার", "তিনঐ", "তিনি", "তিনিও", "তুমি", "তুলে", "তেমন", "তো", "তোমার", "থাকবে", "থাকবেন", "থাকা", "থাকায়", "থাকে", "থাকেন", "থেকে", "থেকেই", "থেকেও", "দিকে", "দিতে", "দিন", "দিয়ে", "দিয়েছে", "দিয়েছেন", "দিলেন", "দু", "দুই", "দুটি", "দুটো", "দেওয়া", "দেওয়ার", "দেওয়া", "দেখতে", "দেখা", "দেখে", "দেন", "দেয়", "দ্বারা", "ধরা", "ধরে", "ধামার", "নতুন", "নয়", "না", "নাই", "নাকি", "নাগাদ", "নানা", "নিজে", "নিজেই", "নিজেদের", "নিজের", "নিতে", "নিয়ে", "নিয়ে", "নেই", "নেওয়া", "নেওয়ার", "নেওয়া", "নয়", "পক্ষে", "পর", "পরে", "পরেই", "পরেও", "পর্যন্ত", "পাওয়া", "পাচ", "পারি", "পারে", "পারেন", "পি", "পেয়ে", "পেয়্র্", "প্রতি", "প্রথম", "প্রভৃতি", "প্রযন্ত", "প্রাথমিক", "প্রায়", "প্রায়", "ফলে", "ফিরে", "ফের", "বক্তব্য", "বদলে", "বন", "বরং", "বলতে", "বলল", "বললেন", "বলা", "বলে", "বলেছেন", "বলেন", "বসে", "বহু", "বা", "বাদে", "বার", "বি", "বিনা", "বিভিন্ন", "বিশেষ", "বিষয়টি", "বেশ", "বেশি", "ব্যবহার", "ব্যাপারে", "ভাবে", "ভাবেই", "মতো", "মতোই", "মধ্যভাগে", "মধ্যে", "মধ্যেই", "মধ্যেও", "মনে", "মাত্র", "মাধ্যমে", "মোট", "মোটেই", "যখন", "যত", "যতটা", "যথেষ্ট", "যদি", "যদিও", "যা", "যাঁর", "যাঁরা", "যাওয়া", "যাওয়ার", "যাওয়া", "যাকে", "যাচ্ছে", "যাতে", "যাদের", "যান", "যাবে", "যায়", "যার", "যারা", "যিনি", "যে", "যেখানে", "যেতে", "যেন", "যেমন", "র", "রকম", "রয়েছে", "রাখা", "রেখে", "লক্ষ", "শুধু", "শুরু", "সঙ্গে", "সঙ্গেও", "সব", "সবার", "সমস্ত", "সম্প্রতি", "সহ", "সহিত", "সাধারণ", "সামনে", "সি", "সুতরাং", "সে", "সেই", "সেখান", "সেখানে", "সেটা", "সেটাই", "সেটাও", "সেটি", "স্পষ্ট", "স্বয়ং", "হইতে", "হইবে", "হইয়া", "হওয়া", "হওয়ায়", "হওয়ার", "হচ্ছে", "হত", "হতে", "হতেই", "হন", "হবে", "হবেন", "হয়", "হয়তো", "হয়নি", "হয়ে", "হয়েই", "হয়েছিল", "হয়েছে", "হয়েছেন", "হল", "হলে", "হলেই", "হলেও", "হলো", "হাজার", "হিসাবে", "হৈলে", "হোক", "হয়", ]; ================================================ FILE: src/stopwords/bul.rs 
================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_BUL: &[&str] = &[ "ð°", "ð°ð²ñ‚ðµð½ñ‚ð¸ñ‡ðµð½", "ð°ð·", "ð°ðºð¾", "ð°ð»ð°", "ð±ðµ", "ð±ðµð·", "ð±ðµñˆðµ", "ð±ð¸", "ð±ð¸ð²ñˆ", "ð±ð¸ð²ñˆð°", "ð±ð¸ð²ñˆð¾", "ð±ð¸ð»", "ð±ð¸ð»ð°", "ð±ð¸ð»ð¸", "ð±ð¸ð»ð¾", "ð±ð»ð°ð³ð¾ð´ð°ñ€ñ", "ð±ð»ð¸ð·ð¾", "ð±ññ…ð°", "ð±ñšð´ð°ñ‚", "ð±ñšð´ðµ", "ð²", "ð²ð°ñ", "ð²ð°ñˆ", "ð²ð°ñˆð°", "ð²ðµñ‡ðµ", "ð²ðµñ€ð¾ññ‚ð½ð¾", "ð²ð·ðµð¼ð°", "ð²ð¸", "ð²ð¸ðµ", "ð²ð¸ð½ð°ð³ð¸", "ð²ð½ð¸ð¼ð°ð²ð°", "ð²ñðµ", "ð²ñðµðºð¸", "ð²ñð¸ñ‡ðºð¸", "ð²ñð¸ñ‡ðºð¾", "ð²ññðºð°", "ð²ñšð²", "ð²ñšð¿ñ€ðµðºð¸", "ð²ñšñ€ñ…ñƒ", "ð²ñ€ðµð¼ðµ", "ð³", "ð³ð¸", "ð³ð»ð°ð²ðµð½", "ð³ð»ð°ð²ð½ð°", "ð³ð»ð°ð²ð½ð¾", "ð³ð»ð°ñ", "ð³ð¾", "ð³ð¾ð´ð¸ð½ð°", "ð³ð¾ð´ð¸ð½ð¸", "ð³ð¾ð´ð¸ñˆðµð½", "ð´", "ð´ð°", "ð´ð°ð»ð¸", "ð´ð²ð°", "ð´ð²ð°ð¼ð°", "ð´ð²ð°ð¼ð°ñ‚ð°", "ð´ð²ðµ", "ð´ð²ðµñ‚ðµ", "ð´ðµð½", "ð´ð½ðµñ", "ð´ð½ð¸", "ð´ð¾", "ð´ð¾ð±ñšñ€", "ð´ð¾ð±ñ€ð°", "ð´ð¾ð±ñ€ðµ", "ð´ð¾ð±ñ€ð¾", "ð´ð¾ðºð°ñ‚ð¾", "ð´ð¾ðºð¾ð³ð°", "ð´ð¾ñðµð³ð°", "ð´ð¾ññ‚ð°", "ð´ð¾ñ€ð¸", "ð´ñ€ñƒð³", "ð´ñ€ñƒð³ð°", "ð´ñ€ñƒð³ð¸", "ðµ", "ðµð²ñ‚ð¸ð½", "ðµð´ð²ð°", "ðµð´ð¸ð½", "ðµð´ð½ð°", "ðµð´ð½ð°ðºð²ð°", "ðµð´ð½ð°ðºð²ð¸", "ðµð´ð½ð°ðºñšð²", "ðµð´ð½ð¾", "ðµðºð¸ð¿", "ðµñ‚ð¾", "ð¶ð¸ð²ð¾ñ‚", "ð·ð°", "ð·ð°ð±ð°ð²ñð¼", "ð·ð°ð´", "ð·ð°ðµð´ð½ð¾", "ð·ð°ñðµð³ð°", "ð·ð°ñð¿ð°ð»", "ð·ð°ñ‚ð¾ð²ð°", "ð·ð°ñ‰ð¾", "ð·ð°ñ‰ð¾ñ‚ð¾", "ð·ð°ñ€ð°ð´ð¸", "ð¸", "ð¸ð·", "ð¸ð»ð¸", "ð¸ð¼", "ð¸ð¼ð°", "ð¸ð¼ð°ñ‚", "ð¸ñðºð°", "ð¹", "ðºð°ð·ð°", "ðºð°ðº", "ðºð°ðºð²ð°", "ðºð°ðºð²ð¾", "ðºð°ðºñšð²", "ðºð°ðºñ‚ð¾", "ðºð°ñ‚ð¾", "ðºð¾ð³ð°", "ðºð¾ð³ð°ñ‚ð¾", "ðºð¾ðµñ‚ð¾", "ðºð¾ð¸ñ‚ð¾", "ðºð¾ð¹", "ðºð¾ð¹ñ‚ð¾", "ðºð¾ð»ðºð¾", "ðºð¾ññ‚ð¾", "ðºñšð´ðµ", "ðºñšð´ðµñ‚ð¾", "ðºñšð¼", "ð»ðµñðµð½", "ð»ðµñð½ð¾", "ð»ð¸", "ð»ð¾ñˆ", "ð¼", "ð¼ð°ð¹", "ð¼ð°ð»ðºð¾", "ð¼ðµ", "ð¼ðµð¶ð´ñƒ", "ð¼ðµðº", "ð¼ðµð½", "ð¼ðµñðµñ†", "ð¼ð¸", "ð¼ð½ð¾ð³ð¾", "ð¼ð½ð¾ð·ð¸ð½ð°", "ð¼ð¾ð³ð°", "ð¼ð¾ð³ð°ñ‚", 
"ð¼ð¾ð¶ðµ", "ð¼ð¾ðºñšñ€", "ð¼ð¾ð»ñ", "ð¼ð¾ð¼ðµð½ñ‚ð°", "ð¼ñƒ", "ð½", "ð½ð°", "ð½ð°ð´", "ð½ð°ð·ð°ð´", "ð½ð°ð¹", "ð½ð°ð¿ñ€ð°ð²ð¸", "ð½ð°ð¿ñ€ðµð´", "ð½ð°ð¿ñ€ð¸ð¼ðµñ€", "ð½ð°ñ", "ð½ðµ", "ð½ðµð³ð¾", "ð½ðµñ", "ð½ðµñ‰ð¾", "ð½ð¸", "ð½ð¸ðµ", "ð½ð¸ðºð¾ð¹", "ð½ð¸ñ‚ð¾", "ð½ð¸ñ‰ð¾", "ð½ð¾", "ð½ð¾ð²", "ð½ð¾ð²ð°", "ð½ð¾ð²ð¸", "ð½ð¾ð²ð¸ð½ð°", "ð½ñðºð¾ð¸", "ð½ñðºð¾ð¹", "ð½ñðºð¾ð»ðºð¾", "ð½ñð¼ð°", "ð¾ð±ð°ñ‡ðµ", "ð¾ðºð¾ð»ð¾", "ð¾ñð²ðµð½", "ð¾ñð¾ð±ðµð½ð¾", "ð¾ñ‚", "ð¾ñ‚ð³ð¾ñ€ðµ", "ð¾ñ‚ð½ð¾ð²ð¾", "ð¾ñ‰ðµ", "ð¿ð°ðº", "ð¿ð¾", "ð¿ð¾ð²ðµñ‡ðµ", "ð¿ð¾ð²ðµñ‡ðµñ‚ð¾", "ð¿ð¾ð´", "ð¿ð¾ð½ðµ", "ð¿ð¾ñð»ðµ", "ð¿ð¾ñ‡ñ‚ð¸", "ð¿ð¾ñ€ð°ð´ð¸", "ð¿ñšðº", "ð¿ñšñ‚ð¸", "ð¿ñšñ€ð²ð°ñ‚ð°", "ð¿ñšñ€ð²ð¸", "ð¿ñšñ€ð²ð¾", "ð¿ñ€ð°ð²ð¸", "ð¿ñ€ðµð´", "ð¿ñ€ðµð´ð¸", "ð¿ñ€ðµð·", "ð¿ñ€ð¸", "ñ", "ñð°", "ñð°ð¼", "ñð°ð¼ð¾", "ñðµ", "ñðµð³ð°", "ñð¸", "ñð¸ð½", "ñðºð¾ñ€ð¾", "ñð»ðµð´", "ñð»ðµð´ð²ð°ñ‰", "ñð¼ðµ", "ñð¼ññ…", "ñð¿ð¾ñ€ðµð´", "ññšð¼", "ññšñ", "ññšñ‰ð¾", "ññ‚ðµ", "ññ€ðµð´", "ññ€ðµñ‰ñƒ", "ñ", "ñðº", "ñžð¼ñ€ñƒðº", "ñƒ", "ñƒñ‚ñ€ðµ", "ñ‚", "ñ‚.ð½.", "ñ‚ð°ð·ð¸", "ñ‚ð°ðºð°", "ñ‚ð°ðºð¸ð²ð°", "ñ‚ð°ðºñšð²", "ñ‚ð°ð¼", "ñ‚ð²ð¾ð¹", "ñ‚ðµ", "ñ‚ðµð·ð¸", "ñ‚ð¸", "ñ‚ð¾", "ñ‚ð¾ð²ð°", "ñ‚ð¾ð³ð°ð²ð°", "ñ‚ð¾ð·ð¸", "ñ‚ð¾ð¹", "ñ‚ð¾ð»ðºð¾ð²ð°", "ñ‚ð¾ñ‡ð½ð¾", "ñ‚ñ", "ñ‚ññ…", "ñ‚ñšð¹", "ñ‚ñƒðº", "ñ‚ñ€ð¸", "ñ‚ñ€ñð±ð²ð°", "ñ‡", "ñ‡ð°ñð°", "ñ‡ðµ", "ñ‡ðµññ‚ð¾", "ñ‡ñ€ðµð·", "ñ…ð°ñ€ðµñð²ð°", "ñ…ð¸ð»ñð´ð¸", "ñ‰ðµ", "ñ‰ð¾ð¼", "ñ€ð°ð²ðµð½", "ñ€ð°ð²ð½ð°", "а", "автентичен", "аз", "ако", "ала", "бе", "без", "беше", "би", "бивш", "бивша", "бившо", "бил", "била", "били", "било", "благодаря", "близо", "бъдат", "бъде", "бяха", "в", "вас", "ваш", "ваша", "вероятно", "вече", "взема", "ви", "вие", "винаги", "внимава", "време", "все", "всеки", "всички", "всичко", "всяка", "във", "въпреки", "върху", "г", "ги", "главен", "главна", "главно", "глас", "го", "година", "години", "годишен", "д", "да", "дали", "два", "двама", "двамата", "две", "двете", "ден", "днес", "дни", "до", "добра", "добре", "добро", "добър", "докато", "докога", 
"дори", "досега", "доста", "друг", "друга", "други", "е", "евтин", "едва", "един", "една", "еднаква", "еднакви", "еднакъв", "едно", "екип", "ето", "живот", "за", "забавям", "зад", "заедно", "заради", "засега", "заспал", "затова", "защо", "защото", "и", "из", "или", "им", "има", "имат", "иска", "й", "каза", "как", "каква", "какво", "както", "какъв", "като", "кога", "когато", "което", "които", "кой", "който", "колко", "която", "къде", "където", "към", "лесен", "лесно", "ли", "лош", "м", "май", "малко", "ме", "между", "мек", "мен", "месец", "ми", "много", "мнозина", "мога", "могат", "може", "мокър", "моля", "момента", "му", "н", "на", "над", "назад", "най", "направи", "напред", "например", "нас", "не", "него", "нещо", "нея", "ни", "ние", "никой", "нито", "нищо", "но", "нов", "нова", "нови", "новина", "някои", "някой", "няколко", "няма", "обаче", "около", "освен", "особено", "от", "отгоре", "отново", "още", "пак", "по", "повече", "повечето", "под", "поне", "поради", "после", "почти", "прави", "пред", "преди", "през", "при", "пък", "първата", "първи", "първо", "пъти", "равен", "равна", "с", "са", "сам", "само", "се", "сега", "си", "син", "скоро", "след", "следващ", "сме", "смях", "според", "сред", "срещу", "сте", "съм", "със", "също", "т", "т.н.", "тази", "така", "такива", "такъв", "там", "твой", "те", "тези", "ти", "то", "това", "тогава", "този", "той", "толкова", "точно", "три", "трябва", "тук", "тъй", "тя", "тях", "у", "утре", "харесва", "хиляди", "ч", "часа", "че", "често", "чрез", "ще", "щом", "юмрук", "я", "як", ]; ================================================ FILE: src/stopwords/cat.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2020, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) // Stopwords list's original author: Lluís de Yzaguirre i Maura, Laboratori de Tecnologies \ // Lingüístiques de l'IULA-UPF (Institut de Lingüística Aplicada de la Universitat 
Pompeu Fabra) pub static STOPWORDS_CAT: &[&str] = &[ "a", "abans", "abans-d'ahir", "abintestat", "ací", "adesiara", "adés", "adéu", "adàgio", "ah", "ahir", "ai", "aitambé", "aitampoc", "aitan", "aitant", "aitantost", "aixà", "això", "així", "aleshores", "algun", "alguna", "algunes", "alguns", "algú", "alhora", "allà", "allèn", "allò", "allí", "almenys", "alto", "altra", "altre", "altres", "altresí", "altri", "alça", "al·legro", "amargament", "amb", "ambdues", "ambdós", "amunt", "amén", "anc", "andante", "andantino", "anit", "ans", "antany", "apa", "aprés", "aqueix", "aqueixa", "aqueixes", "aqueixos", "aqueixs", "aquell", "aquella", "aquelles", "aquells", "aquest", "aquesta", "aquestes", "aquests", "aquèn", "aquí", "ara", "arran", "arrera", "arrere", "arreu", "arri", "arruix", "atxim", "au", "avall", "avant", "aviat", "avui", "açò", "bah", "baix", "baldament", "ballmanetes", "banzim-banzam", "bastant", "bastants", "ben", "bis", "bitllo-bitllo", "bo", "bé", "ca", "cada", "cal", "cap", "car", "caram", "catorze", "cent", "centes", "cents", "cerca", "cert", "certa", "certes", "certs", "cinc", "cinquanta", "cinquena", "cinquenes", "cinquens", "cinquè", "com", "comsevulla", "contra", "cordons", "corrents", "cric-crac", "d", "daixonses", "daixò", "dallonses", "dallò", "dalt", "daltabaix", "damunt", "darrera", "darrere", "davall", "davant", "de", "debades", "dedins", "defora", "dejorn", "dejús", "dellà", "dementre", "dempeus", "demés", "demà", "des", "desena", "desenes", "desens", "després", "dessobre", "dessota", "dessús", "desè", "deu", "devers", "devora", "deçà", "diferents", "dinou", "dins", "dintre", "disset", "divers", "diversa", "diverses", "diversos", "divuit", "doncs", "dos", "dotze", "dues", "durant", "ecs", "eh", "el", "ela", "elis", "ell", "ella", "elles", "ells", "els", "em", "emperò", "en", "enans", "enant", "encara", "encontinent", "endalt", "endarrera", "endarrere", "endavant", "endebades", "endemig", "endemés", "endemà", "endins", "endintre", "enfora", 
"engir", "enguany", "enguanyasses", "enjús", "enlaire", "enlloc", "enllà", "enrera", "enrere", "ens", "ensems", "ensota", "ensús", "entorn", "entre", "entremig", "entretant", "entrò", "envers", "envides", "environs", "enviró", "ençà", "ep", "ep", "era", "eren", "eres", "ergo", "es", "escar", "essent", "esser", "est", "esta", "estada", "estades", "estan", "estant", "estar", "estaran", "estarem", "estareu", "estaria", "estarien", "estaries", "estaré", "estarà", "estaràs", "estaríem", "estaríeu", "estat", "estats", "estava", "estaven", "estaves", "estem", "estes", "esteu", "estic", "estiguem", "estigueren", "estigueres", "estigues", "estiguessis", "estigueu", "estigui", "estiguin", "estiguis", "estigué", "estiguérem", "estiguéreu", "estigués", "estiguí", "estos", "està", "estàs", "estàvem", "estàveu", "et", "etc", "etcètera", "ets", "excepte", "fins", "fora", "foren", "fores", "força", "fos", "fossin", "fossis", "fou", "fra", "fui", "fóra", "fórem", "fóreu", "fóreu", "fóssim", "fóssiu", "gaire", "gairebé", "gaires", "gens", "girientorn", "gratis", "ha", "hagi", "hagin", "hagis", "haguda", "hagudes", "hagueren", "hagueres", "haguessin", "haguessis", "hagut", "haguts", "hagué", "haguérem", "haguéreu", "hagués", "haguéssim", "haguéssiu", "haguí", "hala", "han", "has", "hauran", "haurem", "haureu", "hauria", "haurien", "hauries", "hauré", "haurà", "hauràs", "hauríem", "hauríeu", "havem", "havent", "haver", "haveu", "havia", "havien", "havies", "havíem", "havíeu", "he", "hem", "heu", "hi", "ho", "hom", "hui", "hàgim", "hàgiu", "i", "igual", "iguals", "inclusive", "ja", "jamai", "jo", "l", "la", "leri-leri", "les", "li", "lla", "llavors", "llevat", "lluny", "llur", "llurs", "lo", "los", "ls", "m", "ma", "mai", "mal", "malament", "malgrat", "manco", "mant", "manta", "mantes", "mantinent", "mants", "massa", "mateix", "mateixa", "mateixes", "mateixos", "me", "mentre", "mentrestant", "menys", "mes", "meu", "meua", "meues", "meus", "meva", "meves", "mi", "mig", "mil", "mitges", 
"mitja", "mitjançant", "mitjos", "moixoni", "molt", "molta", "moltes", "molts", "mon", "mos", "més", "n", "na", "ne", "ni", "ningú", "no", "nogensmenys", "només", "noranta", "nos", "nosaltres", "nostra", "nostre", "nostres", "nou", "novena", "novenes", "novens", "novè", "ns", "nòs", "nós", "o", "oh", "oi", "oidà", "on", "onsevulga", "onsevulla", "onze", "pas", "pengim-penjam", "per", "perquè", "pertot", "però", "piano", "pla", "poc", "poca", "pocs", "poques", "potser", "prest", "primer", "primera", "primeres", "primers", "pro", "prompte", "prop", "prou", "puix", "pus", "pàssim", "qual", "quals", "qualsevol", "qualsevulla", "qualssevol", "qualssevulla", "quan", "quant", "quanta", "quantes", "quants", "quaranta", "quart", "quarta", "quartes", "quarts", "quasi", "quatre", "que", "quelcom", "qui", "quin", "quina", "quines", "quins", "quinze", "quisvulla", "què", "ran", "re", "rebé", "renoi", "rera", "rere", "res", "retruc", "s", "sa", "salvament", "salvant", "salvat", "se", "segon", "segona", "segones", "segons", "seguida", "seixanta", "sempre", "sengles", "sens", "sense", "ser", "seran", "serem", "sereu", "seria", "serien", "series", "seré", "serà", "seràs", "seríem", "seríeu", "ses", "set", "setanta", "setena", "setenes", "setens", "setze", "setè", "seu", "seua", "seues", "seus", "seva", "seves", "si", "sia", "siau", "sic", "siguem", "sigues", "sigueu", "sigui", "siguin", "siguis", "sinó", "sis", "sisena", "sisenes", "sisens", "sisè", "sobre", "sobretot", "sol", "sola", "solament", "soles", "sols", "som", "son", "sos", "sota", "sots", "sou", "sovint", "suara", "sí", "sóc", "són", "t", "ta", "tal", "tals", "també", "tampoc", "tan", "tanmateix", "tant", "tanta", "tantes", "tantost", "tants", "te", "tercer", "tercera", "terceres", "tercers", "tes", "teu", "teua", "teues", "teus", "teva", "teves", "ton", "tos", "tost", "tostemps", "tot", "tota", "total", "totes", "tothom", "tothora", "tots", "trenta", "tres", "tret", "tretze", "tu", "tururut", "u", "uf", "ui", "uix", 
"ultra", "un", "una", "unes", "uns", "up", "upa", "us", "va", "vagi", "vagin", "vagis", "vaig", "vair", "vam", "van", "vares", "vas", "vau", "vem", "verbigràcia", "vers", "vet", "veu", "vint", "vora", "vos", "vosaltres", "vostra", "vostre", "vostres", "vostè", "vostès", "vuit", "vuitanta", "vuitena", "vuitenes", "vuitens", "vuitè", "vés", "vàreig", "vàrem", "vàreu", "vós", "xano-xano", "xau-xau", "xec", "érem", "éreu", "és", "ésser", "àdhuc", "àlies", "ça", "ço", "òlim", "ídem", "últim", "última", "últimes", "últims", "únic", "única", "únics", "úniques", ]; ================================================ FILE: src/stopwords/ces.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_CES: &[&str] = &[ "a", "aby", "ahoj", "aj", "ale", "anebo", "ani", "aniž", "ano", "asi", "aspoåˆ", "aspoň", "atd", "atp", "az", "aäkoli", "ačkoli", "až", "bez", "beze", "blã\u{AD}zko", "blízko", "bohuå¾el", "bohužel", "brzo", "bude", "budem", "budeme", "budes", "budete", "budeå¡", "budeš", "budou", "budu", "by", "byl", "byla", "byli", "bylo", "byly", "bys", "byt", "bä›hem", "být", "během", "chce", "chceme", "chcete", "chceå¡", "chceš", "chci", "chtã\u{AD}t", "chtä›jã\u{AD}", "chtít", "chtějí", "chut'", "chuti", "ci", "clanek", "clanku", "clanky", "co", "coz", "což", "cz", "daleko", "dalsi", "další", "den", "deset", "design", "devatenáct", "devatenã¡ct", "devä›t", "devět", "dnes", "do", "dobrã½", "dobrý", "docela", "dva", "dvacet", "dvanáct", "dvanã¡ct", "dvä›", "dvě", "dál", "dále", "dã¡l", "dã¡le", "dä›kovat", "dä›kujeme", "dä›kuji", "děkovat", "děkujeme", "děkuji", "email", "ho", "hodnä›", "hodně", "i", "jak", "jakmile", "jako", "jakož", "jde", "je", "jeden", "jedenáct", "jedenã¡ct", "jedna", "jedno", "jednou", "jedou", "jeho", "jehož", "jej", "jeji", "jejich", "jejã\u{AD}", "její", "jelikož", "jemu", "jen", "jenom", 
"jenž", "jeste", "jestli", "jestliå¾e", "jestliže", "jeå¡tä›", "ještě", "jež", "ji", "jich", "jimi", "jinak", "jine", "jiné", "jiz", "již", "jsem", "jses", "jseš", "jsi", "jsme", "jsou", "jste", "já", "jã¡", "jã\u{AD}", "jã\u{AD}m", "jí", "jím", "jíž", "jšte", "k", "kam", "každý", "kde", "kdo", "kdy", "kdyz", "kdyå¾", "když", "ke", "kolik", "kromä›", "kromě", "ktera", "ktere", "kteri", "kterou", "ktery", "která", "kterã¡", "kterã©", "kterã½", "které", "který", "kteå™ã\u{AD}", "kteři", "kteří", "ku", "kvå¯li", "kvůli", "ma", "majã\u{AD}", "mají", "mate", "me", "mezi", "mi", "mit", "mne", "mnou", "mnä›", "mně", "moc", "mohl", "mohou", "moje", "moji", "moå¾nã¡", "možná", "muj", "musã\u{AD}", "musí", "muze", "my", "má", "málo", "mám", "máme", "máte", "máš", "mã¡", "mã¡lo", "mã¡m", "mã¡me", "mã¡te", "mã¡å¡", "mã©", "mã\u{AD}", "mã\u{AD}t", "mä›", "må¯j", "må¯å¾e", "mé", "mí", "mít", "mě", "můj", "může", "na", "nad", "nade", "nam", "napiste", "napište", "naproti", "nas", "nasi", "naå¡e", "naå¡i", "načež", "naše", "naši", "ne", "nebo", "nebyl", "nebyla", "nebyli", "nebyly", "nechť", "nedä›lajã\u{AD}", "nedä›lã¡", "nedä›lã¡m", "nedä›lã¡me", "nedä›lã¡te", "nedä›lã¡å¡", "nedělají", "nedělá", "nedělám", "neděláme", "neděláte", "neděláš", "neg", "nejsi", "nejsou", "nemajã\u{AD}", "nemají", "nemáme", "nemáte", "nemã¡me", "nemã¡te", "nemä›l", "neměl", "neni", "nenã\u{AD}", "není", "nestaäã\u{AD}", "nestačí", "nevadã\u{AD}", "nevadí", "nez", "neå¾", "než", "nic", "nich", "nimi", "nove", "novy", "nové", "nový", "nula", "ná", "nám", "námi", "nás", "náš", "nã¡m", "nã¡mi", "nã¡s", "nã¡å¡", "nã\u{AD}m", "nä›", "nä›co", "nä›jak", "nä›kde", "nä›kdo", "nä›mu", "ní", "ním", "ně", "něco", "nějak", "někde", "někdo", "němu", "němuž", "o", "od", "ode", "on", "ona", "oni", "ono", "ony", "osm", "osmnáct", "osmnã¡ct", "pak", "patnáct", "patnã¡ct", "po", "pod", "podle", "pokud", "potom", "pouze", "pozdä›", "pozdě", "poå™ã¡d", "pořád", "prave", "pravé", "pred", "pres", "pri", "pro", "proc", 
"prostä›", "prostě", "prosã\u{AD}m", "prosím", "proti", "proto", "protoze", "protoå¾e", "protože", "proä", "proč", "prvni", "první", "práve", "pta", "pä›t", "på™ed", "på™es", "på™ese", "pět", "před", "přede", "přes", "přese", "při", "přičemž", "re", "rovnä›", "rovně", "s", "se", "sedm", "sedmnáct", "sedmnã¡ct", "si", "sice", "skoro", "smã\u{AD}", "smä›jã\u{AD}", "smí", "smějí", "snad", "spolu", "sta", "sto", "strana", "stã©", "sté", "sve", "svych", "svym", "svymi", "své", "svých", "svým", "svými", "svůj", "ta", "tady", "tak", "take", "takhle", "taky", "takze", "také", "takže", "tam", "tamhle", "tamhleto", "tamto", "tato", "te", "tebe", "tebou", "ted'", "tedy", "tema", "ten", "tento", "teto", "ti", "tim", "timto", "tipy", "tisã\u{AD}c", "tisã\u{AD}ce", "tisíc", "tisíce", "to", "tobä›", "tobě", "tohle", "toho", "tohoto", "tom", "tomto", "tomu", "tomuto", "toto", "troå¡ku", "trošku", "tu", "tuto", "tvoje", "tvá", "tvã¡", "tvã©", "två¯j", "tvé", "tvůj", "ty", "tyto", "tä›", "tå™eba", "tå™i", "tå™inã¡ct", "téma", "této", "tím", "tímto", "tě", "těm", "těma", "těmu", "třeba", "tři", "třináct", "u", "uräitä›", "určitě", "uz", "uå¾", "už", "v", "vam", "vas", "vase", "vaå¡e", "vaå¡i", "vaše", "vaši", "ve", "vedle", "veäer", "večer", "vice", "vlastnä›", "vlastně", "vsak", "vy", "vám", "vámi", "vás", "váš", "vã¡m", "vã¡mi", "vã¡s", "vã¡å¡", "vå¡echno", "vå¡ichni", "vå¯bec", "vå¾dy", "více", "však", "všechen", "všechno", "všichni", "vůbec", "vždy", "z", "za", "zatã\u{AD}mco", "zatímco", "zaä", "zač", "zda", "zde", "ze", "zpet", "zpravy", "zprávy", "zpět", "äau", "ätrnã¡ct", "ätyå™i", "å¡est", "å¡estnã¡ct", "å¾e", "čau", "či", "článek", "článku", "články", "čtrnáct", "čtyři", "šest", "šestnáct", "že", ]; ================================================ FILE: src/stopwords/cmn.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL 
v2.0) pub static STOPWORDS_CMN: &[&str] = &[ "、", "。", "〈", "〉", "《", "》", "一", "一些", "一何", "一切", "一则", "一方面", "一旦", "一来", "一样", "一般", "一转眼", "七", "万一", "三", "上", "上下", "下", "不", "不仅", "不但", "不光", "不单", "不只", "不外乎", "不如", "不妨", "不尽", "不尽然", "不得", "不怕", "不惟", "不成", "不拘", "不料", "不是", "不比", "不然", "不特", "不独", "不管", "不至于", "不若", "不论", "不过", "不问", "与", "与其", "与其说", "与否", "与此同时", "且", "且不说", "且说", "两者", "个", "个别", "中", "临", "为", "为了", "为什么", "为何", "为止", "为此", "为着", "乃", "乃至", "乃至于", "么", "之", "之一", "之所以", "之类", "乌乎", "乎", "乘", "九", "也", "也好", "也罢", "了", "二", "二来", "于", "于是", "于是乎", "云云", "云尔", "五", "些", "亦", "人", "人们", "人家", "什", "什么", "什么样", "今", "介于", "仍", "仍旧", "从", "从此", "从而", "他", "他人", "他们", "他们们", "以", "以上", "以为", "以便", "以免", "以及", "以故", "以期", "以来", "以至", "以至于", "以致", "们", "任", "任何", "任凭", "会", "似的", "但", "但凡", "但是", "何", "何以", "何况", "何处", "何时", "余外", "作为", "你", "你们", "使", "使得", "例如", "依", "依据", "依照", "便于", "俺", "俺们", "倘", "倘使", "倘或", "倘然", "倘若", "借", "借傥然", "假使", "假如", "假若", "做", "像", "儿", "先不先", "光是", "全体", "全部", "八", "六", "兮", "共", "关于", "关于具体地说", "其", "其一", "其中", "其二", "其他", "其余", "其它", "其次", "具体地说", "具体说来", "兼之", "内", "再", "再其次", "再则", "再有", "再者", "再者说", "再说", "冒", "冲", "况且", "几", "几时", "凡", "凡是", "凭", "凭借", "出于", "出来", "分", "分别", "则", "则甚", "别", "别人", "别处", "别是", "别的", "别管", "别说", "到", "前后", "前此", "前者", "加之", "加以", "即", "即令", "即使", "即便", "即如", "即或", "即若", "却", "去", "又", "又及", "及", "及其", "及至", "反之", "反而", "反过来", "反过来说", "受到", "另", "另一方面", "另外", "另悉", "只", "只当", "只怕", "只是", "只有", "只消", "只要", "只限", "叫", "叮咚", "可", "可以", "可是", "可见", "各", "各个", "各位", "各种", "各自", "同", "同时", "后", "后者", "向", "向使", "向着", "吓", "吗", "否则", "吧", "吧哒", "含", "吱", "呀", "呃", "呕", "呗", "呜", "呜呼", "呢", "呵", "呵呵", "呸", "呼哧", "咋", "和", "咚", "咦", "咧", "咱", "咱们", "咳", "哇", "哈", "哈哈", "哉", "哎", "哎呀", "哎哟", "哗", "哟", "哦", "哩", "哪", "哪个", "哪些", "哪儿", "哪天", "哪年", "哪怕", "哪样", "哪边", "哪里", "哼", "哼唷", "唉", "唯有", "啊", "啐", "啥", "啦", "啪达", "啷当", "喂", "喏", "喔唷", "喽", "嗡", "嗡嗡", "嗬", "嗯", "嗳", "嘎", "嘎登", "嘘", 
"嘛", "嘻", "嘿", "嘿嘿", "四", "因", "因为", "因了", "因此", "因着", "因而", "固然", "在", "在下", "在于", "地", "基于", "处在", "多", "多么", "多少", "大", "大家", "她", "她们", "好", "如", "如上", "如上所述", "如下", "如何", "如其", "如同", "如是", "如果", "如此", "如若", "始而", "孰料", "孰知", "宁", "宁可", "宁愿", "宁肯", "它", "它们", "对", "对于", "对待", "对方", "对比", "将", "小", "尔", "尔后", "尔尔", "尚且", "就", "就是", "就是了", "就是说", "就算", "就要", "尽", "尽管", "尽管如此", "岂但", "己", "已", "已矣", "巴", "巴巴", "年", "并", "并且", "庶乎", "庶几", "开外", "开始", "归", "归齐", "当", "当地", "当然", "当着", "彼", "彼时", "彼此", "往", "待", "很", "得", "得了", "怎", "怎么", "怎么办", "怎么样", "怎奈", "怎样", "总之", "总的来看", "总的来说", "总的说来", "总而言之", "恰恰相反", "您", "惟其", "慢说", "我", "我们", "或", "或则", "或是", "或曰", "或者", "截至", "所", "所以", "所在", "所幸", "所有", "才", "才能", "打", "打从", "把", "抑或", "拿", "按", "按照", "换句话说", "换言之", "据", "据此", "接着", "故", "故此", "故而", "旁人", "无", "无宁", "无论", "既", "既往", "既是", "既然", "日", "时", "时候", "是", "是以", "是的", "更", "曾", "替", "替代", "最", "月", "有", "有些", "有关", "有及", "有时", "有的", "望", "朝", "朝着", "本", "本人", "本地", "本着", "本身", "来", "来着", "来自", "来说", "极了", "果然", "果真", "某", "某个", "某些", "某某", "根据", "欤", "正值", "正如", "正巧", "正是", "此", "此地", "此处", "此外", "此时", "此次", "此间", "毋宁", "每", "每当", "比", "比及", "比如", "比方", "没奈何", "沿", "沿着", "漫说", "焉", "然则", "然后", "然而", "照", "照着", "犹且", "犹自", "甚且", "甚么", "甚或", "甚而", "甚至", "甚至于", "用", "用来", "由", "由于", "由是", "由此", "由此可见", "的", "的确", "的话", "直到", "相对而言", "省得", "看", "眨眼", "着", "着呢", "矣", "矣乎", "矣哉", "离", "秒", "竟而", "第", "等", "等到", "等等", "简言之", "管", "类如", "紧接着", "纵", "纵令", "纵使", "纵然", "经", "经过", "结果", "给", "继之", "继后", "继而", "综上所述", "罢了", "者", "而", "而且", "而况", "而后", "而外", "而已", "而是", "而言", "能", "能否", "腾", "自", "自个儿", "自从", "自各儿", "自后", "自家", "自己", "自打", "自身", "至", "至于", "至今", "至若", "致", "般的", "若", "若夫", "若是", "若果", "若非", "莫不然", "莫如", "莫若", "虽", "虽则", "虽然", "虽说", "被", "要", "要不", "要不是", "要不然", "要么", "要是", "譬喻", "譬如", "让", "许多", "论", "设使", "设或", "设若", "诚如", "诚然", "该", "说", "说来", "请", "诸", "诸位", "诸如", "谁", "谁人", "谁料", "谁知", "贼死", "赖以", "赶", "起", "起见", "趁", "趁着", "越是", "距", "跟", "较", "较之", "边", 
"过", "还", "还是", "还有", "还要", "这", "这一来", "这个", "这么", "这么些", "这么样", "这么点儿", "这些", "这会儿", "这儿", "这就是说", "这时", "这样", "这次", "这般", "这边", "这里", "进而", "连", "连同", "逐步", "通过", "遵循", "遵照", "那", "那个", "那么", "那么些", "那么样", "那些", "那会儿", "那儿", "那时", "那样", "那般", "那边", "那里", "都", "鄙人", "鉴于", "针对", "阿", "除", "除了", "除外", "除开", "除此之外", "除非", "随", "随后", "随时", "随着", "难道说", "零", "非", "非但", "非徒", "非特", "非独", "靠", "顺", "顺着", "首先", "︿", "!", "#", "$", "%", "&", "(", ")", "*", "+", ",", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ":", ";", "<", ">", "?", "@", "[", "]", "{", "|", "}", "~", "¥", "一則", "一來", "一樣", "一轉眼", "萬一", "不僅", "不單", "不盡", "不盡然", "不獨", "不至於", "不論", "不過", "不問", "與", "與其", "與其說", "與否", "與此同時", "且不說", "且說", "兩者", "個", "個別", "臨", "爲", "爲了", "爲什麼", "爲何", "爲止", "爲此", "爲著", "乃至於", "麼", "之類", "烏乎", "也罷", "二來", "於", "於是", "於是乎", "云爾", "人們", "什麼", "什麼樣", "介於", "仍舊", "從", "從此", "從而", "他們", "他們們", "以爲", "以來", "以至於", "們", "任憑", "會", "何況", "何處", "何時", "餘外", "作爲", "你們", "依據", "便於", "俺們", "借儻然", "兒", "全體", "關於", "關於具體地說", "其餘", "具體地說", "具體說來", "內", "再則", "再者說", "再說", "衝", "況且", "幾", "幾時", "憑", "憑藉", "出於", "出來", "分別", "則", "則甚", "別", "別人", "別處", "別是", "別的", "別管", "別說", "前後", "卻", "反過來", "反過來說", "只當", "可見", "各個", "各種", "同時", "後", "後者", "嚇", "嗎", "否則", "吧噠", "嘔", "唄", "嗚", "嗚呼", "咱們", "哎喲", "譁", "喲", "哪個", "哪兒", "哪樣", "哪邊", "哪裡", "啪達", "啷噹", "嘍", "噯", "噓", "因爲", "在於", "基於", "處在", "多麼", "她們", "寧", "寧可", "寧願", "寧肯", "它們", "對", "對於", "對待", "對方", "對比", "將", "爾", "爾後", "爾爾", "就是說", "盡", "儘管", "儘管如此", "豈但", "並", "並且", "庶幾", "開外", "開始", "歸", "歸齊", "當", "當地", "當然", "當著", "彼時", "怎麼", "怎麼辦", "怎麼樣", "怎樣", "總之", "總的來看", "總的來說", "總的說來", "總而言之", "慢說", "我們", "或則", "打從", "換句話說", "換言之", "據", "據此", "無", "無寧", "無論", "時", "時候", "有關", "有時", "來", "來著", "來自", "來說", "極了", "某個", "根據", "歟", "此處", "此時", "此間", "毋寧", "每當", "沒奈何", "漫說", "然則", "然後", "猶且", "猶自", "甚麼", "甚至於", "用來", "由於", "由此可見", "的確", "的話", "相對而言", "離", "簡言之", "類如", "緊接著", "縱", "縱令", "縱使", "縱然", "經", "經過", "結果", "給", "繼之", "繼後", "繼而", "綜上所述", "罷了", 
"而況", "而後", "騰", "自個兒", "自從", "自各兒", "自後", "至於", "雖", "雖則", "雖然", "雖說", "要麼", "讓", "許多", "論", "設使", "設或", "設若", "誠如", "誠然", "該", "說", "說來", "請", "諸", "諸位", "諸如", "誰", "誰人", "誰料", "誰知", "賊死", "賴以", "趕", "起見", "較", "較之", "邊", "過", "還", "還是", "還有", "還要", "這", "這一來", "這個", "這麼", "這麼些", "這麼樣", "這麼點兒", "這些", "這會兒", "這兒", "這就是說", "這時", "這樣", "這次", "這般", "這邊", "這裡", "進而", "連", "連同", "通過", "那個", "那麼", "那麼些", "那麼樣", "那會兒", "那兒", "那時", "那樣", "那邊", "那裡", "鑑於", "針對", "除開", "隨", "隨後", "隨時", "隨著", "難道說", "非獨", "順", "順著", ]; ================================================ FILE: src/stopwords/dan.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_DAN: &[&str] = &[ "ad", "af", "aldrig", "alle", "alt", "anden", "andet", "andre", "at", "bare", "begge", "blev", "blive", "bliver", "da", "de", "dem", "den", "denne", "der", "deres", "det", "dette", "dig", "din", "dine", "disse", "dit", "dog", "du", "efter", "ej", "eller", "en", "end", "ene", "eneste", "enhver", "er", "et", "far", "fem", "fik", "fire", "flere", "fleste", "for", "fordi", "forrige", "fra", "få", "får", "før", "god", "godt", "ham", "han", "hans", "har", "havde", "have", "hej", "helt", "hende", "hendes", "her", "hos", "hun", "hvad", "hvem", "hver", "hvilken", "hvis", "hvor", "hvordan", "hvorfor", "hvornår", "i", "ikke", "ind", "ingen", "intet", "ja", "jeg", "jer", "jeres", "jo", "kan", "kom", "komme", "kommer", "kun", "kunne", "lad", "lav", "lidt", "lige", "lille", "man", "mand", "mange", "med", "meget", "men", "mens", "mere", "mig", "min", "mine", "mit", "mod", "må", "ned", "nej", "ni", "nogen", "noget", "nogle", "nu", "ny", "nyt", "når", "nær", "næste", "næsten", "og", "også", "okay", "om", "op", "os", "otte", "over", "på", "se", "seks", "selv", "ser", "ses", "sig", "sige", "sin", "sine", "sit", "skal", "skulle", "som", "stor", "store", "syv", "så", 
"sådan", "tag", "tage", "thi", "ti", "til", "to", "tre", "ud", "under", "var", "ved", "vi", "vil", "ville", "vor", "vores", "være", "været", ]; ================================================ FILE: src/stopwords/deu.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_DEU: &[&str] = &[ "a", "ab", "aber", "ach", "acht", "achte", "achten", "achter", "achtes", "ag", "alle", "allein", "allem", "allen", "aller", "allerdings", "alles", "allgemeinen", "als", "also", "am", "an", "ander", "andere", "anderem", "anderen", "anderer", "anderes", "anderm", "andern", "anderr", "anders", "au", "auch", "auf", "aus", "ausser", "ausserdem", "außer", "außerdem", "b", "bald", "bei", "beide", "beiden", "beim", "beispiel", "bekannt", "bereits", "besonders", "besser", "besten", "bin", "bis", "bisher", "bist", "c", "d", "d.h", "da", "dabei", "dadurch", "dafür", "dagegen", "daher", "dahin", "dahinter", "damals", "damit", "danach", "daneben", "dank", "dann", "daran", "darauf", "daraus", "darf", "darfst", "darin", "darum", "darunter", "darüber", "das", "dasein", "daselbst", "dass", "dasselbe", "davon", "davor", "dazu", "dazwischen", "daß", "dein", "deine", "deinem", "deinen", "deiner", "deines", "dem", "dementsprechend", "demgegenüber", "demgemäss", "demgemäß", "demselben", "demzufolge", "den", "denen", "denn", "denselben", "der", "deren", "derer", "derjenige", "derjenigen", "dermassen", "dermaßen", "derselbe", "derselben", "des", "deshalb", "desselben", "dessen", "deswegen", "dich", "die", "diejenige", "diejenigen", "dies", "diese", "dieselbe", "dieselben", "diesem", "diesen", "dieser", "dieses", "dir", "doch", "dort", "drei", "drin", "dritte", "dritten", "dritter", "drittes", "du", "durch", "durchaus", "durfte", "durften", "dürfen", "dürft", "e", "eben", "ebenso", "ehrlich", "ei", "ei,", "eigen", "eigene", "eigenen", 
"eigener", "eigenes", "ein", "einander", "eine", "einem", "einen", "einer", "eines", "einig", "einige", "einigem", "einigen", "einiger", "einiges", "einmal", "eins", "elf", "en", "ende", "endlich", "entweder", "er", "ernst", "erst", "erste", "ersten", "erster", "erstes", "es", "etwa", "etwas", "euch", "euer", "eure", "eurem", "euren", "eurer", "eures", "f", "folgende", "früher", "fünf", "fünfte", "fünften", "fünfter", "fünftes", "für", "g", "gab", "ganz", "ganze", "ganzen", "ganzer", "ganzes", "gar", "gedurft", "gegen", "gegenüber", "gehabt", "gehen", "geht", "gekannt", "gekonnt", "gemacht", "gemocht", "gemusst", "genug", "gerade", "gern", "gesagt", "geschweige", "gewesen", "gewollt", "geworden", "gibt", "ging", "gleich", "gott", "gross", "grosse", "grossen", "grosser", "grosses", "groß", "große", "großen", "großer", "großes", "gut", "gute", "guter", "gutes", "h", "hab", "habe", "haben", "habt", "hast", "hat", "hatte", "hatten", "hattest", "hattet", "heisst", "her", "heute", "hier", "hin", "hinter", "hoch", "hätte", "hätten", "i", "ich", "ihm", "ihn", "ihnen", "ihr", "ihre", "ihrem", "ihren", "ihrer", "ihres", "im", "immer", "in", "indem", "infolgedessen", "ins", "irgend", "ist", "j", "ja", "jahr", "jahre", "jahren", "je", "jede", "jedem", "jeden", "jeder", "jedermann", "jedermanns", "jedes", "jedoch", "jemand", "jemandem", "jemanden", "jene", "jenem", "jenen", "jener", "jenes", "jetzt", "k", "kam", "kann", "kannst", "kaum", "kein", "keine", "keinem", "keinen", "keiner", "keines", "kleine", "kleinen", "kleiner", "kleines", "kommen", "kommt", "konnte", "konnten", "kurz", "können", "könnt", "könnte", "l", "lang", "lange", "leicht", "leide", "lieber", "los", "m", "machen", "macht", "machte", "mag", "magst", "mahn", "mal", "man", "manche", "manchem", "manchen", "mancher", "manches", "mann", "mehr", "mein", "meine", "meinem", "meinen", "meiner", "meines", "mensch", "menschen", "mich", "mir", "mit", "mittel", "mochte", "mochten", "morgen", "muss", "musst", "musste", 
"mussten", "muß", "mußt", "möchte", "mögen", "möglich", "mögt", "müssen", "müsst", "müßt", "n", "na", "nach", "nachdem", "nahm", "natürlich", "neben", "nein", "neue", "neuen", "neun", "neunte", "neunten", "neunter", "neuntes", "nicht", "nichts", "nie", "niemand", "niemandem", "niemanden", "noch", "nun", "nur", "o", "ob", "oben", "oder", "offen", "oft", "ohne", "ordnung", "p", "q", "r", "recht", "rechte", "rechten", "rechter", "rechtes", "richtig", "rund", "s", "sa", "sache", "sagt", "sagte", "sah", "satt", "schlecht", "schluss", "schon", "sechs", "sechste", "sechsten", "sechster", "sechstes", "sehr", "sei", "seid", "seien", "sein", "seine", "seinem", "seinen", "seiner", "seines", "seit", "seitdem", "selbst", "sich", "sie", "sieben", "siebente", "siebenten", "siebenter", "siebentes", "sind", "so", "solang", "solche", "solchem", "solchen", "solcher", "solches", "soll", "sollen", "sollst", "sollt", "sollte", "sollten", "sondern", "sonst", "soweit", "sowie", "später", "startseite", "statt", "steht", "suche", "t", "tag", "tage", "tagen", "tat", "teil", "tel", "tritt", "trotzdem", "tun", "u", "uhr", "um", "und", "und?", "uns", "unse", "unsem", "unsen", "unser", "unsere", "unserer", "unses", "unter", "v", "vergangenen", "viel", "viele", "vielem", "vielen", "vielleicht", "vier", "vierte", "vierten", "vierter", "viertes", "vom", "von", "vor", "w", "wahr?", "wann", "war", "waren", "warst", "wart", "warum", "was", "weg", "wegen", "weil", "weit", "weiter", "weitere", "weiteren", "weiteres", "welche", "welchem", "welchen", "welcher", "welches", "wem", "wen", "wenig", "wenige", "weniger", "weniges", "wenigstens", "wenn", "wer", "werde", "werden", "werdet", "weshalb", "wessen", "wie", "wieder", "wieso", "will", "willst", "wir", "wird", "wirklich", "wirst", "wissen", "wo", "woher", "wohin", "wohl", "wollen", "wollt", "wollte", "wollten", "worden", "wurde", "wurden", "während", "währenddem", "währenddessen", "wäre", "würde", "würden", "x", "y", "z", "z.b", "zehn", "zehnte", 
"zehnten", "zehnter", "zehntes", "zeit", "zu", "zuerst", "zugleich", "zum", "zunächst", "zur", "zurück", "zusammen", "zwanzig", "zwar", "zwei", "zweite", "zweiten", "zweiter", "zweites", "zwischen", "zwölf", "über", "überhaupt", "übrigens", ]; ================================================ FILE: src/stopwords/ell.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_ELL: &[&str] = &[ "αλλα", "αν", "αντι", "απο", "αυτα", "αυτεσ", "αυτη", "αυτο", "αυτοι", "αυτοσ", "αυτουσ", "αυτων", "αἱ", "αἳ", "αἵ", "αὐτόσ", "αὐτὸς", "αὖ", "γάρ", "γα", "γα^", "γε", "για", "γοῦν", "γὰρ", "δ'", "δέ", "δή", "δαί", "δαίσ", "δαὶ", "δαὶς", "δε", "δεν", "δι'", "διά", "διὰ", "δὲ", "δὴ", "δ’", "εαν", "ειμαι", "ειμαστε", "ειναι", "εισαι", "ειστε", "εκεινα", "εκεινεσ", "εκεινη", "εκεινο", "εκεινοι", "εκεινοσ", "εκεινουσ", "εκεινων", "ενω", "επ", "επι", "εἰ", "εἰμί", "εἰμὶ", "εἰς", "εἰσ", "εἴ", "εἴμι", "εἴτε", "η", "θα", "ισωσ", "κ", "καί", "καίτοι", "καθ", "και", "κατ", "κατά", "κατα", "κατὰ", "καὶ", "κι", "κἀν", "κἂν", "μέν", "μή", "μήτε", "μα", "με", "μεθ", "μετ", "μετά", "μετα", "μετὰ", "μη", "μην", "μἐν", "μὲν", "μὴ", "μὴν", "να", "ο", "οι", "ομωσ", "οπωσ", "οσο", "οτι", "οἱ", "οἳ", "οἷς", "οὐ", "οὐδ", "οὐδέ", "οὐδείσ", "οὐδεὶς", "οὐδὲ", "οὐδὲν", "οὐκ", "οὐχ", "οὐχὶ", "οὓς", "οὔτε", "οὕτω", "οὕτως", "οὕτωσ", "οὖν", "οὗ", "οὗτος", "οὗτοσ", "παρ", "παρά", "παρα", "παρὰ", "περί", "περὶ", "ποια", "ποιεσ", "ποιο", "ποιοι", "ποιοσ", "ποιουσ", "ποιων", "ποτε", "που", "ποῦ", "προ", "προσ", "πρόσ", "πρὸ", "πρὸς", "πως", "πωσ", "σε", "στη", "στην", "στο", "στον", "σόσ", "σύ", "σύν", "σὸς", "σὺ", "σὺν", "τά", "τήν", "τί", "τίς", "τίσ", "τα", "ταῖς", "τε", "την", "τησ", "τι", "τινα", "τις", "τισ", "το", "τοί", "τοι", "τοιοῦτος", "τοιοῦτοσ", "τον", "τοτε", "του", "τούσ", "τοὺς", "τοῖς", "τοῦ", "των", "τό", "τόν", "τότε", "τὰ", "τὰς", 
"τὴν", "τὸ", "τὸν", "τῆς", "τῆσ", "τῇ", "τῶν", "τῷ", "ωσ", "ἀλλ'", "ἀλλά", "ἀλλὰ", "ἀλλ’", "ἀπ", "ἀπό", "ἀπὸ", "ἀφ", "ἂν", "ἃ", "ἄλλος", "ἄλλοσ", "ἄν", "ἄρα", "ἅμα", "ἐάν", "ἐγώ", "ἐγὼ", "ἐκ", "ἐμόσ", "ἐμὸς", "ἐν", "ἐξ", "ἐπί", "ἐπεὶ", "ἐπὶ", "ἐστι", "ἐφ", "ἐὰν", "ἑαυτοῦ", "ἔτι", "ἡ", "ἢ", "ἣ", "ἤ", "ἥ", "ἧς", "ἵνα", "ὁ", "ὃ", "ὃν", "ὃς", "ὅ", "ὅδε", "ὅθεν", "ὅπερ", "ὅς", "ὅσ", "ὅστις", "ὅστισ", "ὅτε", "ὅτι", "ὑμόσ", "ὑπ", "ὑπέρ", "ὑπό", "ὑπὲρ", "ὑπὸ", "ὡς", "ὡσ", "ὥς", "ὥστε", "ὦ", "ᾧ", ]; ================================================ FILE: src/stopwords/eng.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_ENG: &[&str] = &[ "'ll", "'tis", "'twas", "'ve", "10", "39", "a", "a's", "able", "ableabout", "about", "above", "abroad", "abst", "accordance", "according", "accordingly", "across", "act", "actually", "ad", "added", "adj", "adopted", "ae", "af", "affected", "affecting", "affects", "after", "afterwards", "ag", "again", "against", "ago", "ah", "ahead", "ai", "ain't", "aint", "al", "all", "allow", "allows", "almost", "alone", "along", "alongside", "already", "also", "although", "always", "am", "amid", "amidst", "among", "amongst", "amoungst", "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "ao", "apart", "apparently", "appear", "appreciate", "appropriate", "approximately", "aq", "ar", "are", "area", "areas", "aren", "aren't", "arent", "arise", "around", "arpa", "as", "aside", "ask", "asked", "asking", "asks", "associated", "at", "au", "auth", "available", "aw", "away", "awfully", "az", "b", "ba", "back", "backed", "backing", "backs", "backward", "backwards", "bb", "bd", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "began", "begin", "beginning", 
"beginnings", "begins", "behind", "being", "beings", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "bf", "bg", "bh", "bi", "big", "bill", "billion", "biol", "bj", "bm", "bn", "bo", "both", "bottom", "br", "brief", "briefly", "bs", "bt", "but", "buy", "bv", "bw", "by", "bz", "c", "c'mon", "c's", "ca", "call", "came", "can", "can't", "cannot", "cant", "caption", "case", "cases", "cause", "causes", "cc", "cd", "certain", "certainly", "cf", "cg", "ch", "changes", "ci", "ck", "cl", "clear", "clearly", "click", "cm", "cmon", "cn", "co", "co.", "com", "come", "comes", "computer", "con", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "copy", "corresponding", "could", "could've", "couldn", "couldn't", "couldnt", "course", "cr", "cry", "cs", "cu", "currently", "cv", "cx", "cy", "cz", "d", "dare", "daren't", "darent", "date", "de", "dear", "definitely", "describe", "described", "despite", "detail", "did", "didn", "didn't", "didnt", "differ", "different", "differently", "directly", "dj", "dk", "dm", "do", "does", "doesn", "doesn't", "doesnt", "doing", "don", "don't", "done", "dont", "doubtful", "down", "downed", "downing", "downs", "downwards", "due", "during", "dz", "e", "each", "early", "ec", "ed", "edu", "ee", "effect", "eg", "eh", "eight", "eighty", "either", "eleven", "else", "elsewhere", "empty", "end", "ended", "ending", "ends", "enough", "entirely", "er", "es", "especially", "et", "et-al", "etc", "even", "evenly", "ever", "evermore", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "f", "face", "faces", "fact", "facts", "fairly", "far", "farther", "felt", "few", "fewer", "ff", "fi", "fifteen", "fifth", "fifty", "fify", "fill", "find", "finds", "fire", "first", "five", "fix", "fj", "fk", "fm", "fo", "followed", "following", "follows", "for", "forever", "former", "formerly", "forth", "forty", "forward", "found", "four", "fr", "free", "from", 
"front", "full", "fully", "further", "furthered", "furthering", "furthermore", "furthers", "fx", "g", "ga", "gave", "gb", "gd", "ge", "general", "generally", "get", "gets", "getting", "gf", "gg", "gh", "gi", "give", "given", "gives", "giving", "gl", "gm", "gmt", "gn", "go", "goes", "going", "gone", "good", "goods", "got", "gotten", "gov", "gp", "gq", "gr", "great", "greater", "greatest", "greetings", "group", "grouped", "grouping", "groups", "gs", "gt", "gu", "gw", "gy", "h", "had", "hadn't", "hadnt", "half", "happens", "hardly", "has", "hasn", "hasn't", "hasnt", "have", "haven", "haven't", "havent", "having", "he", "he'd", "he'll", "he's", "hed", "hell", "hello", "help", "hence", "her", "here", "here's", "hereafter", "hereby", "herein", "heres", "hereupon", "hers", "herself", "herse”", "hes", "hi", "hid", "high", "higher", "highest", "him", "himself", "himse”", "his", "hither", "hk", "hm", "hn", "home", "homepage", "hopefully", "how", "how'd", "how'll", "how's", "howbeit", "however", "hr", "ht", "htm", "html", "http", "hu", "hundred", "i", "i'd", "i'll", "i'm", "i've", "i.e.", "id", "ie", "if", "ignored", "ii", "il", "ill", "im", "immediate", "immediately", "importance", "important", "in", "inasmuch", "inc", "inc.", "indeed", "index", "indicate", "indicated", "indicates", "information", "inner", "inside", "insofar", "instead", "int", "interest", "interested", "interesting", "interests", "into", "invention", "inward", "io", "iq", "ir", "is", "isn", "isn't", "isnt", "it", "it'd", "it'll", "it's", "itd", "itll", "its", "itself", "itse”", "ive", "j", "je", "jm", "jo", "join", "jp", "just", "k", "ke", "keep", "keeps", "kept", "keys", "kg", "kh", "ki", "kind", "km", "kn", "knew", "know", "known", "knows", "kp", "kr", "kw", "ky", "kz", "l", "la", "large", "largely", "last", "lately", "later", "latest", "latter", "latterly", "lb", "lc", "least", "length", "less", "lest", "let", "let's", "lets", "li", "like", "liked", "likely", "likewise", "line", "little", "lk", "ll", 
"long", "longer", "longest", "look", "looking", "looks", "low", "lower", "lr", "ls", "lt", "ltd", "lu", "lv", "ly", "m", "ma", "made", "mainly", "make", "makes", "making", "man", "many", "may", "maybe", "mayn't", "maynt", "mc", "md", "me", "mean", "means", "meantime", "meanwhile", "member", "members", "men", "merely", "mg", "mh", "microsoft", "might", "might've", "mightn't", "mightnt", "mil", "mill", "million", "mine", "minus", "miss", "mk", "ml", "mm", "mn", "mo", "more", "moreover", "most", "mostly", "move", "mp", "mq", "mr", "mrs", "ms", "msie", "mt", "mu", "much", "mug", "must", "must've", "mustn't", "mustnt", "mv", "mw", "mx", "my", "myself", "myse”", "mz", "n", "na", "name", "namely", "nay", "nc", "nd", "ne", "near", "nearly", "necessarily", "necessary", "need", "needed", "needing", "needn't", "neednt", "needs", "neither", "net", "netscape", "never", "neverf", "neverless", "nevertheless", "new", "newer", "newest", "next", "nf", "ng", "ni", "nine", "ninety", "nl", "no", "no-one", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "notwithstanding", "novel", "now", "nowhere", "np", "nr", "nu", "null", "number", "numbers", "nz", "o", "obtain", "obtained", "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "older", "oldest", "om", "omitted", "on", "once", "one", "one's", "ones", "only", "onto", "open", "opened", "opening", "opens", "opposite", "or", "ord", "order", "ordered", "ordering", "orders", "org", "other", "others", "otherwise", "ought", "oughtn't", "oughtnt", "our", "ours", "ourselves", "out", "outside", "over", "overall", "owing", "own", "p", "pa", "page", "pages", "part", "parted", "particular", "particularly", "parting", "parts", "past", "pe", "per", "perhaps", "pf", "pg", "ph", "pk", "pl", "place", "placed", "places", "please", "plus", "pm", "pmid", "pn", "point", "pointed", "pointing", "points", "poorly", "possible", "possibly", "potentially", "pp", "pr", "predominantly", "present", 
"presented", "presenting", "presents", "presumably", "previously", "primarily", "probably", "problem", "problems", "promptly", "proud", "provided", "provides", "pt", "put", "puts", "pw", "py", "q", "qa", "que", "quickly", "quite", "qv", "r", "ran", "rather", "rd", "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "reserved", "respectively", "resulted", "resulting", "results", "right", "ring", "ro", "room", "rooms", "round", "ru", "run", "rw", "s", "sa", "said", "same", "saw", "say", "saying", "says", "sb", "sc", "sd", "se", "sec", "second", "secondly", "seconds", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "sees", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "seventy", "several", "sg", "sh", "shall", "shan't", "shant", "she", "she'd", "she'll", "she's", "shed", "shell", "shes", "should", "should've", "shouldn", "shouldn't", "shouldnt", "show", "showed", "showing", "shown", "showns", "shows", "si", "side", "sides", "significant", "significantly", "similar", "similarly", "since", "sincere", "site", "six", "sixty", "sj", "sk", "sl", "slightly", "sm", "small", "smaller", "smallest", "sn", "so", "some", "somebody", "someday", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "specifically", "specified", "specify", "specifying", "sr", "st", "state", "states", "still", "stop", "strongly", "su", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure", "sv", "sy", "system", "sz", "t", "t's", "take", "taken", "taking", "tc", "td", "tell", "ten", "tends", "test", "text", "tf", "tg", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "that's", "that've", "thatll", "thats", "thatve", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "there'd", "there'll", "there're", "there's", "there've", 
"thereafter", "thereby", "thered", "therefore", "therein", "therell", "thereof", "therere", "theres", "thereto", "thereupon", "thereve", "these", "they", "they'd", "they'll", "they're", "they've", "theyd", "theyll", "theyre", "theyve", "thick", "thin", "thing", "things", "think", "thinks", "third", "thirty", "this", "thorough", "thoroughly", "those", "thou", "though", "thoughh", "thought", "thoughts", "thousand", "three", "throug", "through", "throughout", "thru", "thus", "til", "till", "tip", "tis", "tj", "tk", "tm", "tn", "to", "today", "together", "too", "took", "top", "toward", "towards", "tp", "tr", "tried", "tries", "trillion", "truly", "try", "trying", "ts", "tt", "turn", "turned", "turning", "turns", "tv", "tw", "twas", "twelve", "twenty", "twice", "two", "tz", "u", "ua", "ug", "uk", "um", "un", "under", "underneath", "undoing", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "up", "upon", "ups", "upwards", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "uucp", "uy", "uz", "v", "va", "value", "various", "vc", "ve", "versus", "very", "vg", "vi", "via", "viz", "vn", "vol", "vols", "vs", "vu", "w", "want", "wanted", "wanting", "wants", "was", "wasn", "wasn't", "wasnt", "way", "ways", "we", "we'd", "we'll", "we're", "we've", "web", "webpage", "website", "wed", "welcome", "well", "wells", "went", "were", "weren", "weren't", "werent", "weve", "wf", "what", "what'd", "what'll", "what's", "what've", "whatever", "whatll", "whats", "whatve", "when", "when'd", "when'll", "when's", "whence", "whenever", "where", "where'd", "where'll", "where's", "whereafter", "whereas", "whereby", "wherein", "wheres", "whereupon", "wherever", "whether", "which", "whichever", "while", "whilst", "whim", "whither", "who", "who'd", "who'll", "who's", "whod", "whoever", "whole", "wholl", "whom", "whomever", "whos", "whose", "why", "why'd", "why'll", "why's", "widely", "width", "will", "willing", "wish", "with", "within", "without", 
"won", "won't", "wonder", "wont", "words", "work", "worked", "working", "works", "world", "would", "would've", "wouldn", "wouldn't", "wouldnt", "ws", "www", "x", "y", "ye", "year", "years", "yes", "yet", "you", "you'd", "you'll", "you're", "you've", "youd", "youll", "young", "younger", "youngest", "your", "youre", "yours", "yourself", "yourselves", "youve", "yt", "yu", "z", "za", "zero", "zm", "zr", ]; ================================================ FILE: src/stopwords/epo.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_EPO: &[&str] = &[ "adiaŭ", "ajn", "al", "ankoraŭ", "antaŭ", "aŭ", "bonan", "bonvole", "bonvolu", "bv", "ci", "cia", "cian", "cin", "d-ro", "da", "de", "dek", "deka", "do", "doktor'", "doktoro", "du", "dua", "dum", "eble", "ekz", "ekzemple", "en", "estas", "estis", "estos", "estu", "estus", "eĉ", "f-no", "feliĉan", "for", "fraŭlino", "ha", "havas", "havis", "havos", "havu", "havus", "he", "ho", "hu", "ili", "ilia", "ilian", "ilin", "inter", "io", "ion", "iu", "iujn", "iun", "ja", "jam", "je", "jes", "k", "kaj", "ke", "kio", "kion", "kiu", "kiujn", "kiun", "kvankam", "kvar", "kvara", "kvazaŭ", "kvin", "kvina", "la", "li", "lia", "lian", "lin", "malantaŭ", "male", "malgraŭ", "mem", "mi", "mia", "mian", "min", "minus", "naŭ", "naŭa", "ne", "nek", "nenio", "nenion", "neniu", "neniun", "nepre", "ni", "nia", "nian", "nin", "nu", "nun", "nur", "ok", "oka", "oni", "onia", "onian", "onin", "plej", "pli", "plu", "plus", "por", "post", "preter", "s-no", "s-ro", "se", "sed", "sep", "sepa", "ses", "sesa", "si", "sia", "sian", "sin", "sinjor'", "sinjorino", "sinjoro", "sub", "super", "supren", "sur", "tamen", "tio", "tion", "tiu", "tiujn", "tiun", "tra", "tri", "tria", "tuj", "tute", "unu", "unua", "ve", "verŝajne", "vi", "via", "vian", "vin", "ĉi", "ĉio", "ĉion", "ĉiu", "ĉiujn", "ĉiun", 
"ĉu", "ĝi", "ĝia", "ĝian", "ĝin", "ĝis", "ĵus", "ŝi", "ŝia", "ŝin", ]; ================================================ FILE: src/stopwords/est.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_EST: &[&str] = &[ "aga", "ei", "et", "ja", "jah", "kas", "kui", "kõik", "ma", "me", "mida", "midagi", "mind", "minu", "mis", "mu", "mul", "mulle", "nad", "nii", "oled", "olen", "oli", "oma", "on", "pole", "sa", "seda", "see", "selle", "siin", "siis", "ta", "te", "ära", ]; ================================================ FILE: src/stopwords/fin.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_FIN: &[&str] = &[ "aiemmin", "aika", "aikaa", "aikaan", "aikaisemmin", "aikaisin", "aikajen", "aikana", "aikoina", "aikoo", "aikovat", "aina", "ainakaan", "ainakin", "ainoa", "ainoat", "aiomme", "aion", "aiotte", "aist", "aivan", "ajan", "alas", "alemmas", "alkuisin", "alkuun", "alla", "alle", "aloitamme", "aloitan", "aloitat", "aloitatte", "aloitattivat", "aloitettava", "aloitettevaksi", "aloitettu", "aloitimme", "aloitin", "aloitit", "aloititte", "aloittaa", "aloittamatta", "aloitti", "aloittivat", "alta", "aluksi", "alussa", "alusta", "annettavaksi", "annetteva", "annettu", "ansiosta", "antaa", "antamatta", "antoi", "aoua", "apu", "asia", "asiaa", "asian", "asiasta", "asiat", "asioiden", "asioihin", "asioita", "asti", "avuksi", "avulla", "avun", "avutta", "edelle", "edelleen", "edellä", "edeltä", "edemmäs", "edes", "edessä", "edestä", "ehkä", "ei", "eikä", "eilen", "eivät", "eli", "ellei", "elleivät", "ellemme", "ellen", "ellet", "ellette", "emme", "en", "enemmän", "eniten", "ennen", "ensi", "ensimmäinen", "ensimmäiseksi", "ensimmäisen", 
"ensimmäisenä", "ensimmäiset", "ensimmäisiksi", "ensimmäisinä", "ensimmäisiä", "ensimmäistä", "ensin", "entinen", "entisen", "entisiä", "entisten", "entistä", "enää", "eri", "erittäin", "erityisesti", "eräiden", "eräs", "eräät", "esi", "esiin", "esillä", "esimerkiksi", "et", "eteen", "etenkin", "etessa", "ette", "ettei", "että", "haikki", "halua", "haluaa", "haluamatta", "haluamme", "haluan", "haluat", "haluatte", "haluavat", "halunnut", "halusi", "halusimme", "halusin", "halusit", "halusitte", "halusivat", "halutessa", "haluton", "he", "hei", "heidän", "heidät", "heihin", "heille", "heillä", "heiltä", "heissä", "heistä", "heitä", "helposti", "heti", "hetkellä", "hieman", "hitaasti", "hoikein", "huolimatta", "huomenna", "hyvien", "hyviin", "hyviksi", "hyville", "hyviltä", "hyvin", "hyvinä", "hyvissä", "hyvistä", "hyviä", "hyvä", "hyvät", "hyvää", "hän", "häneen", "hänelle", "hänellä", "häneltä", "hänen", "hänessä", "hänestä", "hänet", "häntä", "ihan", "ilman", "ilmeisesti", "itse", "itsensä", "itseään", "ja", "jo", "johon", "joiden", "joihin", "joiksi", "joilla", "joille", "joilta", "joina", "joissa", "joista", "joita", "joka", "jokainen", "jokin", "joko", "joksi", "joku", "jolla", "jolle", "jolloin", "jolta", "jompikumpi", "jona", "jonka", "jonkin", "jonne", "joo", "jopa", "jos", "joskus", "jossa", "josta", "jota", "jotain", "joten", "jotenkin", "jotenkuten", "jotka", "jotta", "jouduimme", "jouduin", "jouduit", "jouduitte", "joudumme", "joudun", "joudutte", "joukkoon", "joukossa", "joukosta", "joutua", "joutui", "joutuivat", "joutumaan", "joutuu", "joutuvat", "juuri", "jälkeen", "jälleen", "jää", "kahdeksan", "kahdeksannen", "kahdella", "kahdelle", "kahdelta", "kahden", "kahdessa", "kahdesta", "kahta", "kahteen", "kai", "kaiken", "kaikille", "kaikilta", "kaikkea", "kaikki", "kaikkia", "kaikkiaan", "kaikkialla", "kaikkialle", "kaikkialta", "kaikkien", "kaikkin", "kaksi", "kannalta", "kannattaa", "kanssa", "kanssaan", "kanssamme", "kanssani", "kanssanne", 
"kanssasi", "kauan", "kauemmas", "kaukana", "kautta", "kehen", "keiden", "keihin", "keiksi", "keille", "keillä", "keiltä", "keinä", "keissä", "keistä", "keitten", "keittä", "keitä", "keneen", "keneksi", "kenelle", "kenellä", "keneltä", "kenen", "kenenä", "kenessä", "kenestä", "kenet", "kenettä", "kennessästä", "kenties", "kerran", "kerta", "kertaa", "keskellä", "kesken", "keskimäärin", "ketkä", "ketä", "kiitos", "kohti", "koko", "kokonaan", "kolmas", "kolme", "kolmen", "kolmesti", "koska", "koskaan", "kovin", "kuin", "kuinka", "kuinkan", "kuitenkaan", "kuitenkin", "kuka", "kukaan", "kukin", "kukka", "kumpainen", "kumpainenkaan", "kumpi", "kumpikaan", "kumpikin", "kun", "kuten", "kuuden", "kuusi", "kuutta", "kylliksi", "kyllä", "kymmenen", "kyse", "liian", "liki", "lisäksi", "lisää", "lla", "luo", "luona", "lähekkäin", "lähelle", "lähellä", "läheltä", "lähemmäs", "lähes", "lähinnä", "lähtien", "läpi", "mahdollisimman", "mahdollista", "me", "meidän", "meidät", "meihin", "meille", "meillä", "meiltä", "meissä", "meistä", "meitä", "melkein", "melko", "menee", "meneet", "menemme", "menen", "menet", "menette", "menevät", "meni", "menimme", "menin", "menit", "menivät", "mennessä", "mennyt", "menossa", "mihin", "mikin", "miksi", "mikä", "mikäli", "mikään", "mille", "milloin", "milloinkan", "millä", "miltä", "minkä", "minne", "minua", "minulla", "minulle", "minulta", "minun", "minussa", "minusta", "minut", "minuun", "minä", "missä", "mistä", "miten", "mitkä", "mitä", "mitään", "moi", "molemmat", "mones", "monesti", "monet", "moni", "moniaalla", "moniaalle", "moniaalta", "monta", "muassa", "muiden", "muita", "muka", "mukaan", "mukaansa", "mukana", "mutta", "muu", "muualla", "muualle", "muualta", "muuanne", "muulloin", "muun", "muut", "muuta", "muutama", "muutaman", "muuten", "myöhemmin", "myös", "myöskin", "myöskään", "myötä", "ne", "neljä", "neljän", "neljää", "niiden", "niihin", "niiksi", "niille", "niillä", "niiltä", "niin", "niinä", "niissä", "niistä", "niitä", "noiden", 
"noihin", "noiksi", "noilla", "noille", "noilta", "noin", "noina", "noissa", "noista", "noita", "nopeammin", "nopeasti", "nopeiten", "nro", "nuo", "nyt", "näiden", "näihin", "näiksi", "näille", "näillä", "näiltä", "näin", "näinä", "näissä", "näissähin", "näissälle", "näissältä", "näissästä", "näistä", "näitä", "nämä", "ohi", "oikea", "oikealla", "oikein", "ole", "olemme", "olen", "olet", "olette", "oleva", "olevan", "olevat", "oli", "olimme", "olin", "olisi", "olisimme", "olisin", "olisit", "olisitte", "olisivat", "olit", "olitte", "olivat", "olla", "olleet", "olli", "ollut", "oma", "omaa", "omaan", "omaksi", "omalle", "omalta", "oman", "omassa", "omat", "omia", "omien", "omiin", "omiksi", "omille", "omilta", "omissa", "omista", "on", "onkin", "onko", "ovat", "paikoittain", "paitsi", "pakosti", "paljon", "paremmin", "parempi", "parhaillaan", "parhaiten", "perusteella", "peräti", "pian", "pieneen", "pieneksi", "pienelle", "pienellä", "pieneltä", "pienempi", "pienestä", "pieni", "pienin", "poikki", "puolesta", "puolestaan", "päälle", "runsaasti", "saakka", "sadam", "sama", "samaa", "samaan", "samalla", "samallalta", "samallassa", "samallasta", "saman", "samat", "samoin", "sata", "sataa", "satojen", "se", "seitsemän", "sekä", "sen", "seuraavat", "siellä", "sieltä", "siihen", "siinä", "siis", "siitä", "sijaan", "siksi", "sille", "silloin", "sillä", "silti", "siltä", "sinne", "sinua", "sinulla", "sinulle", "sinulta", "sinun", "sinussa", "sinusta", "sinut", "sinuun", "sinä", "sisäkkäin", "sisällä", "siten", "sitten", "sitä", "ssa", "sta", "suoraan", "suuntaan", "suuren", "suuret", "suuri", "suuria", "suurin", "suurten", "taa", "taas", "taemmas", "tahansa", "tai", "takaa", "takaisin", "takana", "takia", "tallä", "tapauksessa", "tarpeeksi", "tavalla", "tavoitteena", "te", "teidän", "teidät", "teihin", "teille", "teillä", "teiltä", "teissä", "teistä", "teitä", "tietysti", "todella", "toinen", "toisaalla", "toisaalle", "toisaalta", "toiseen", "toiseksi", "toisella", 
"toiselle", "toiselta", "toisemme", "toisen", "toisensa", "toisessa", "toisesta", "toista", "toistaiseksi", "toki", "tosin", "tuhannen", "tuhat", "tule", "tulee", "tulemme", "tulen", "tulet", "tulette", "tulevat", "tulimme", "tulin", "tulisi", "tulisimme", "tulisin", "tulisit", "tulisitte", "tulisivat", "tulit", "tulitte", "tulivat", "tulla", "tulleet", "tullut", "tuntuu", "tuo", "tuohon", "tuoksi", "tuolla", "tuolle", "tuolloin", "tuolta", "tuon", "tuona", "tuonne", "tuossa", "tuosta", "tuota", "tuotä", "tuskin", "tykö", "tähän", "täksi", "tälle", "tällä", "tällöin", "tältä", "tämä", "tämän", "tänne", "tänä", "tänään", "tässä", "tästä", "täten", "tätä", "täysin", "täytyvät", "täytyy", "täällä", "täältä", "ulkopuolella", "usea", "useasti", "useimmiten", "usein", "useita", "uudeksi", "uudelleen", "uuden", "uudet", "uusi", "uusia", "uusien", "uusinta", "uuteen", "uutta", "vaan", "vahemmän", "vai", "vaiheessa", "vaikea", "vaikean", "vaikeat", "vaikeilla", "vaikeille", "vaikeilta", "vaikeissa", "vaikeista", "vaikka", "vain", "varmasti", "varsin", "varsinkin", "varten", "vasen", "vasenmalla", "vasta", "vastaan", "vastakkain", "vastan", "verran", "vielä", "vierekkäin", "vieressä", "vieri", "viiden", "viime", "viimeinen", "viimeisen", "viimeksi", "viisi", "voi", "voidaan", "voimme", "voin", "voisi", "voit", "voitte", "voivat", "vuoden", "vuoksi", "vuosi", "vuosien", "vuosina", "vuotta", "vähemmän", "vähintään", "vähiten", "vähän", "välillä", "yhdeksän", "yhden", "yhdessä", "yhteen", "yhteensä", "yhteydessä", "yhteyteen", "yhtä", "yhtäälle", "yhtäällä", "yhtäältä", "yhtään", "yhä", "yksi", "yksin", "yksittäin", "yleensä", "ylemmäs", "yli", "ylös", "ympäri", "älköön", "älä", ]; ================================================ FILE: src/stopwords/fra.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_FRA: 
&[&str] = &[ "a", "abord", "absolument", "afin", "ah", "ai", "aie", "aient", "aies", "ailleurs", "ainsi", "ait", "allaient", "allo", "allons", "allô", "alors", "anterieur", "anterieure", "anterieures", "apres", "après", "as", "assez", "attendu", "au", "aucun", "aucune", "aucuns", "aujourd", "aujourd'hui", "aupres", "auquel", "aura", "aurai", "auraient", "aurais", "aurait", "auras", "aurez", "auriez", "aurions", "aurons", "auront", "aussi", "autre", "autrefois", "autrement", "autres", "autrui", "aux", "auxquelles", "auxquels", "avaient", "avais", "avait", "avant", "avec", "avez", "aviez", "avions", "avoir", "avons", "ayant", "ayez", "ayons", "b", "bah", "bas", "basee", "bat", "beau", "beaucoup", "bien", "bigre", "bon", "boum", "bravo", "brrr", "c", "car", "ce", "ceci", "cela", "celle", "celle-ci", "celle-là", "celles", "celles-ci", "celles-là", "celui", "celui-ci", "celui-là", "celà", "cent", "cependant", "certain", "certaine", "certaines", "certains", "certes", "ces", "cet", "cette", "ceux", "ceux-ci", "ceux-là", "chacun", "chacune", "chaque", "cher", "chers", "chez", "chiche", "chut", "chère", "chères", "ci", "cinq", "cinquantaine", "cinquante", "cinquantième", "cinquième", "clac", "clic", "combien", "comme", "comment", "comparable", "comparables", "compris", "concernant", "contre", "couic", "crac", "d", "da", "dans", "de", "debout", "dedans", "dehors", "deja", "delà", "depuis", "dernier", "derniere", "derriere", "derrière", "des", "desormais", "desquelles", "desquels", "dessous", "dessus", "deux", "deuxième", "deuxièmement", "devant", "devers", "devra", "devrait", "different", "differentes", "differents", "différent", "différente", "différentes", "différents", "dire", "directe", "directement", "dit", "dite", "dits", "divers", "diverse", "diverses", "dix", "dix-huit", "dix-neuf", "dix-sept", "dixième", "doit", "doivent", "donc", "dont", "dos", "douze", "douzième", "dring", "droite", "du", "duquel", "durant", "dès", "début", "désormais", "e", "effet", "egale", 
"egalement", "egales", "eh", "elle", "elle-même", "elles", "elles-mêmes", "en", "encore", "enfin", "entre", "envers", "environ", "es", "essai", "est", "et", "etant", "etc", "etre", "eu", "eue", "eues", "euh", "eurent", "eus", "eusse", "eussent", "eusses", "eussiez", "eussions", "eut", "eux", "eux-mêmes", "exactement", "excepté", "extenso", "exterieur", "eûmes", "eût", "eûtes", "f", "fais", "faisaient", "faisant", "fait", "faites", "façon", "feront", "fi", "flac", "floc", "fois", "font", "force", "furent", "fus", "fusse", "fussent", "fusses", "fussiez", "fussions", "fut", "fûmes", "fût", "fûtes", "g", "gens", "h", "ha", "haut", "hein", "hem", "hep", "hi", "ho", "holà", "hop", "hormis", "hors", "hou", "houp", "hue", "hui", "huit", "huitième", "hum", "hurrah", "hé", "hélas", "i", "ici", "il", "ils", "importe", "j", "je", "jusqu", "jusque", "juste", "k", "l", "la", "laisser", "laquelle", "las", "le", "lequel", "les", "lesquelles", "lesquels", "leur", "leurs", "longtemps", "lors", "lorsque", "lui", "lui-meme", "lui-même", "là", "lès", "m", "ma", "maint", "maintenant", "mais", "malgre", "malgré", "maximale", "me", "meme", "memes", "merci", "mes", "mien", "mienne", "miennes", "miens", "mille", "mince", "mine", "minimale", "moi", "moi-meme", "moi-même", "moindres", "moins", "mon", "mot", "moyennant", "multiple", "multiples", "même", "mêmes", "n", "na", "naturel", "naturelle", "naturelles", "ne", "neanmoins", "necessaire", "necessairement", "neuf", "neuvième", "ni", "nombreuses", "nombreux", "nommés", "non", "nos", "notamment", "notre", "nous", "nous-mêmes", "nouveau", "nouveaux", "nul", "néanmoins", "nôtre", "nôtres", "o", "oh", "ohé", "ollé", "olé", "on", "ont", "onze", "onzième", "ore", "ou", "ouf", "ouias", "oust", "ouste", "outre", "ouvert", "ouverte", "ouverts", "o|", "où", "p", "paf", "pan", "par", "parce", "parfois", "parle", "parlent", "parler", "parmi", "parole", "parseme", "partant", "particulier", "particulière", "particulièrement", "pas", "passé", "pendant", 
"pense", "permet", "personne", "personnes", "peu", "peut", "peuvent", "peux", "pff", "pfft", "pfut", "pif", "pire", "pièce", "plein", "plouf", "plupart", "plus", "plusieurs", "plutôt", "possessif", "possessifs", "possible", "possibles", "pouah", "pour", "pourquoi", "pourrais", "pourrait", "pouvait", "prealable", "precisement", "premier", "première", "premièrement", "pres", "probable", "probante", "procedant", "proche", "près", "psitt", "pu", "puis", "puisque", "pur", "pure", "q", "qu", "quand", "quant", "quant-à-soi", "quanta", "quarante", "quatorze", "quatre", "quatre-vingt", "quatrième", "quatrièmement", "que", "quel", "quelconque", "quelle", "quelles", "quelqu'un", "quelque", "quelques", "quels", "qui", "quiconque", "quinze", "quoi", "quoique", "r", "rare", "rarement", "rares", "relative", "relativement", "remarquable", "rend", "rendre", "restant", "reste", "restent", "restrictif", "retour", "revoici", "revoilà", "rien", "s", "sa", "sacrebleu", "sait", "sans", "sapristi", "sauf", "se", "sein", "seize", "selon", "semblable", "semblaient", "semble", "semblent", "sent", "sept", "septième", "sera", "serai", "seraient", "serais", "serait", "seras", "serez", "seriez", "serions", "serons", "seront", "ses", "seul", "seule", "seulement", "si", "sien", "sienne", "siennes", "siens", "sinon", "six", "sixième", "soi", "soi-même", "soient", "sois", "soit", "soixante", "sommes", "son", "sont", "sous", "souvent", "soyez", "soyons", "specifique", "specifiques", "speculatif", "stop", "strictement", "subtiles", "suffisant", "suffisante", "suffit", "suis", "suit", "suivant", "suivante", "suivantes", "suivants", "suivre", "sujet", "superpose", "sur", "surtout", "t", "ta", "tac", "tandis", "tant", "tardive", "te", "tel", "telle", "tellement", "telles", "tels", "tenant", "tend", "tenir", "tente", "tes", "tic", "tien", "tienne", "tiennes", "tiens", "toc", "toi", "toi-même", "ton", "touchant", "toujours", "tous", "tout", "toute", "toutefois", "toutes", "treize", "trente", "tres", 
"trois", "troisième", "troisièmement", "trop", "très", "tsoin", "tsouin", "tu", "té", "u", "un", "une", "unes", "uniformement", "unique", "uniques", "uns", "v", "va", "vais", "valeur", "vas", "vers", "via", "vif", "vifs", "vingt", "vivat", "vive", "vives", "vlan", "voici", "voie", "voient", "voilà", "vont", "vos", "votre", "vous", "vous-mêmes", "vu", "vé", "vôtre", "vôtres", "w", "x", "y", "z", "zut", "à", "â", "ça", "ès", "étaient", "étais", "était", "étant", "état", "étiez", "étions", "été", "étée", "étées", "étés", "êtes", "être", "ô", ]; ================================================ FILE: src/stopwords/guj.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_GUJ: &[&str] = &[ "અંગે", "અંદર", "અથવા", "અને", "અમને", "અમારું", "અમે", "અહીં", "આ", "આગળ", "આથી", "આનું", "આને", "આપણને", "આપણું", "આપણે", "આપી", "આર", "આવી", "આવે", "ઉપર", "ઉભા", "ઊંચે", "ઊભું", "એ", "એક", "એન", "એના", "એનાં", "એની", "એનું", "એને", "એનો", "એમ", "એવા", "એવાં", "એવી", "એવું", "એવો", "ઓછું", "કંઈક", "કઈ", "કયું", "કયો", "કરતાં", "કરવું", "કરી", "કરીએ", "કરું", "કરે", "કરેલું", "કર્યા", "કર્યાં", "કર્યું", "કર્યો", "કાંઈ", "કે", "કેટલું", "કેમ", "કેવી", "કેવું", "કોઈ", "કોઈક", "કોણ", "કોણે", "કોને", "ક્યાં", "ક્યારે", "ખૂબ", "ગઈ", "ગયા", "ગયાં", "ગયું", "ગયો", "ઘણું", "છ", "છતાં", "છીએ", "છું", "છે", "છેક", "છો", "જ", "જાય", "જી", "જે", "જેટલું", "જેને", "જેમ", "જેવી", "જેવું", "જેવો", "જો", "જોઈએ", "જ્યાં", "જ્યારે", "ઝાઝું", "તને", "તમને", "તમારું", "તમે", "તા", "તારાથી", "તારામાં", "તારું", "તું", "તે", "તેં", "તેઓ", "તેણે", "તેથી", "તેના", "તેની", "તેનું", "તેને", "તેમ", "તેમનું", "તેમને", "તેવી", "તેવું", "તો", "ત્યાં", "ત્યારે", "થઇ", "થઈ", "થઈએ", "થતા", "થતાં", "થતી", "થતું", "થતો", "થયા", "થયાં", "થયું", "થયેલું", "થયો", "થવું", "થાઉં", "થાઓ", "થાય", "થી", "થોડું", "દરેક", "ન", "નં", "નં.", "નથી", "નહિ", "નહી", 
"નહીં", "ના", "ની", "નીચે", "નું", "ને", "નો", "પછી", "પણ", "પર", "પરંતુ", "પહેલાં", "પાછળ", "પાસે", "પોતાનું", "પ્રત્યેક", "ફક્ત", "ફરી", "ફરીથી", "બંને", "બધા", "બધું", "બની", "બહાર", "બહુ", "બાદ", "બે", "મને", "મા", "માં", "માટે", "માત્ર", "મારું", "મી", "મૂકવું", "મૂકી", "મૂક્યા", "મૂક્યાં", "મૂક્યું", "મેં", "રહી", "રહે", "રહેવું", "રહ્યા", "રહ્યાં", "રહ્યો", "રીતે", "રૂ.", "રૂા", "લેતા", "લેતું", "લેવા", "વગેરે", "વધુ", "શકે", "શા", "શું", "સરખું", "સામે", "સુધી", "હતા", "હતાં", "હતી", "હતું", "હવે", "હશે", "હશો", "હા", "હું", "હો", "હોઈ", "હોઈશ", "હોઈશું", "હોય", "હોવા", ]; ================================================ FILE: src/stopwords/heb.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_HEB: &[&str] = &[ "אבל", "או", "אולי", "אותה", "אותו", "אותי", "אותך", "אותם", "אותן", "אותנו", "אז", "אחר", "אחרות", "אחרי", "אחריכן", "אחרים", "אחרת", "אי", "איזה", "איך", "אין", "איפה", "איתה", "איתו", "איתי", "איתך", "איתכם", "איתכן", "איתם", "איתן", "איתנו", "אך", "אל", "אלה", "אלו", "אם", "אנחנו", "אני", "אס", "אף", "אצל", "אשר", "את", "אתה", "אתכם", "אתכן", "אתם", "אתן", "באיזומידה", "באמצע", "באמצעות", "בגלל", "בין", "בלי", "במידה", "במקוםשבו", "ברם", "בשביל", "בשעהש", "בתוך", "גם", "דרך", "הוא", "היא", "היה", "היכן", "היתה", "היתי", "הם", "הן", "הנה", "הסיבהשבגללה", "הרי", "ואילו", "ואת", "זאת", "זה", "זות", "יהיה", "יוכל", "יוכלו", "יותרמדי", "יכול", "יכולה", "יכולות", "יכולים", "יכל", "יכלה", "יכלו", "יש", "כאן", "כאשר", "כולם", "כולן", "כזה", "כי", "כיצד", "כך", "ככה", "כל", "כלל", "כמו", "כן", "כפי", "כש", "לא", "לאו", "לאיזותכלית", "לאן", "לבין", "לה", "להיות", "להם", "להן", "לו", "לי", "לכם", "לכן", "למה", "למטה", "למעלה", "למקוםשבו", "למרות", "לנו", "לעבר", "לעיכן", "לפיכך", "לפני", "מאד", "מאחורי", "מאיזוסיבה", "מאין", "מאיפה", "מבלי", "מבעד", "מדוע", "מה", "מהיכן", "מול", "מחוץ", 
"מי", "מכאן", "מכיוון", "מלבד", "מן", "מנין", "מסוגל", "מעט", "מעטים", "מעל", "מצד", "מקוםבו", "מתחת", "מתי", "נגד", "נגר", "נו", "עד", "עז", "על", "עלי", "עליה", "עליהם", "עליהן", "עליו", "עליך", "עליכם", "עלינו", "עם", "עצמה", "עצמהם", "עצמהן", "עצמו", "עצמי", "עצמם", "עצמן", "עצמנו", "פה", "רק", "שוב", "של", "שלה", "שלהם", "שלהן", "שלו", "שלי", "שלך", "שלכה", "שלכם", "שלכן", "שלנו", "שם", "תהיה", "תחת", ]; ================================================ FILE: src/stopwords/hin.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_HIN: &[&str] = &[ "अंदर", "अत", "अदि", "अप", "अपना", "अपनि", "अपनी", "अपने", "अभि", "अभी", "आदि", "आप", "इंहिं", "इंहें", "इंहों", "इतयादि", "इत्यादि", "इन", "इनका", "इन्हीं", "इन्हें", "इन्हों", "इस", "इसका", "इसकि", "इसकी", "इसके", "इसमें", "इसि", "इसी", "इसे", "उंहिं", "उंहें", "उंहों", "उन", "उनका", "उनकि", "उनकी", "उनके", "उनको", "उन्हीं", "उन्हें", "उन्हों", "उस", "उसके", "उसि", "उसी", "उसे", "एक", "एवं", "एस", "एसे", "ऐसे", "ओर", "और", "कइ", "कई", "कर", "करता", "करते", "करना", "करने", "करें", "कहते", "कहा", "का", "काफि", "काफ़ी", "कि", "किंहें", "किंहों", "कितना", "किन्हें", "किन्हों", "किया", "किर", "किस", "किसि", "किसी", "किसे", "की", "कुछ", "कुल", "के", "को", "कोइ", "कोई", "कोन", "कोनसा", "कौन", "कौनसा", "गया", "घर", "जब", "जहाँ", "जहां", "जा", "जिंहें", "जिंहों", "जितना", "जिधर", "जिन", "जिन्हें", "जिन्हों", "जिस", "जिसे", "जीधर", "जेसा", "जेसे", "जैसा", "जैसे", "जो", "तक", "तब", "तरह", "तिंहें", "तिंहों", "तिन", "तिन्हें", "तिन्हों", "तिस", "तिसे", "तो", "था", "थि", "थी", "थे", "दबारा", "दवारा", "दिया", "दुसरा", "दुसरे", "दूसरे", "दो", "द्वारा", "न", "नहिं", "नहीं", "ना", "निचे", "निहायत", "नीचे", "ने", "पर", "पहले", "पुरा", "पूरा", "पे", "फिर", "बनि", "बनी", "बहि", "बही", "बहुत", "बाद", "बाला", "बिलकुल", "भि", "भितर", "भी", "भीतर", "मगर", "मानो", "मे", 
"में", "यदि", "यह", "यहाँ", "यहां", "यहि", "यही", "या", "यिह", "ये", "रखें", "रवासा", "रहा", "रहे", "ऱ्वासा", "लिए", "लिये", "लेकिन", "व", "वगेरह", "वरग", "वर्ग", "वह", "वहाँ", "वहां", "वहिं", "वहीं", "वाले", "वुह", "वे", "वग़ैरह", "संग", "सकता", "सकते", "सबसे", "सभि", "सभी", "साथ", "साबुत", "साभ", "सारा", "से", "सो", "हि", "ही", "हुअ", "हुआ", "हुइ", "हुई", "हुए", "हे", "हें", "है", "हैं", "हो", "होता", "होति", "होती", "होते", "होना", "होने", ]; ================================================ FILE: src/stopwords/hrv.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_HRV: &[&str] = &[ "a", "ako", "ali", "bi", "bih", "bila", "bili", "bilo", "bio", "bismo", "biste", "biti", "bumo", "da", "do", "duž", "ga", "hoće", "hoćemo", "hoćete", "hoćeš", "hoću", "i", "iako", "ih", "ili", "iz", "ja", "je", "jedna", "jedne", "jedno", "jer", "jesam", "jesi", "jesmo", "jest", "jeste", "jesu", "jim", "joj", "još", "ju", "kada", "kako", "kao", "koja", "koje", "koji", "kojima", "koju", "kroz", "li", "me", "mene", "meni", "mi", "mimo", "moj", "moja", "moje", "mu", "na", "nad", "nakon", "nam", "nama", "nas", "naš", "naša", "naše", "našeg", "ne", "nego", "neka", "neki", "nekog", "neku", "nema", "netko", "neće", "nećemo", "nećete", "nećeš", "neću", "nešto", "ni", "nije", "nikoga", "nikoje", "nikoju", "nisam", "nisi", "nismo", "niste", "nisu", "njega", "njegov", "njegova", "njegovo", "njemu", "njezin", "njezina", "njezino", "njih", "njihov", "njihova", "njihovo", "njim", "njima", "njoj", "nju", "no", "o", "od", "odmah", "on", "ona", "oni", "ono", "ova", "pa", "pak", "po", "pod", "pored", "prije", "s", "sa", "sam", "samo", "se", "sebe", "sebi", "si", "smo", "ste", "su", "sve", "svi", "svog", "svoj", "svoja", "svoje", "svom", "ta", "tada", "taj", "tako", "te", "tebe", "tebi", "ti", "to", "toj", "tome", "tu", "tvoj", 
"tvoja", "tvoje", "u", "uz", "vam", "vama", "vas", "vaš", "vaša", "vaše", "već", "vi", "vrlo", "za", "zar", "će", "ćemo", "ćete", "ćeš", "ću", "što", ]; ================================================ FILE: src/stopwords/hun.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_HUN: &[&str] = &[ "a", "abba", "abban", "abbã³l", "abból", "addig", "ahhoz", "ahogy", "ahol", "aki", "akik", "akkor", "akár", "akã¡r", "alapján", "alapjã¡n", "alatt", "alatta", "alattad", "alattam", "alattatok", "alattuk", "alattunk", "alá", "alád", "alájuk", "alám", "alánk", "alátok", "alã¡", "alã¡d", "alã¡juk", "alã¡m", "alã¡nk", "alã¡tok", "alã³l", "alã³la", "alã³lad", "alã³lam", "alã³latok", "alã³luk", "alã³lunk", "alól", "alóla", "alólad", "alólam", "alólatok", "alóluk", "alólunk", "amely", "amelybol", "amelyek", "amelyekben", "amelyeket", "amelyet", "amelyik", "amelynek", "ami", "amikor", "amit", "amolyan", "amott", "amã\u{AD}g", "amíg", "annak", "annál", "annã¡l", "arra", "arrã³l", "arról", "attã³l", "attól", "az", "aznap", "azok", "azokat", "azokba", "azokban", "azokbã³l", "azokból", "azokhoz", "azokig", "azokkal", "azokká", "azokkã¡", "azoknak", "azoknál", "azoknã¡l", "azokon", "azokra", "azokrã³l", "azokról", "azoktã³l", "azoktól", "azokã©rt", "azokért", "azon", "azonban", "azonnal", "azt", "aztán", "aztã¡n", "azután", "azzal", "azzá", "azzã¡", "azã©rt", "azért", "bal", "balra", "ban", "be", "belã©", "belã©d", "belã©jã¼k", "belã©m", "belã©nk", "belã©tek", "belã¼l", "belå‘le", "belå‘led", "belå‘lem", "belå‘letek", "belå‘lã¼k", "belå‘lã¼nk", "belé", "beléd", "beléjük", "belém", "belénk", "belétek", "belül", "belőle", "belőled", "belőlem", "belőletek", "belőlük", "belőlünk", "ben", "benne", "benned", "bennem", "bennetek", "bennã¼k", "bennã¼nk", "bennük", "bennünk", "bár", "bárcsak", "bármilyen", "bã¡r", "bã¡rcsak", 
"bã¡rmilyen", "bãºcsãº", "búcsú", "cikk", "cikkek", "cikkeket", "csak", "csakhogy", "csupán", "csupã¡n", "de", "dehogy", "e", "ebbe", "ebben", "ebbå‘l", "ebből", "eddig", "egy", "egyebek", "egyebet", "egyedã¼l", "egyedül", "egyelå‘re", "egyelőre", "egyes", "egyet", "egyetlen", "egyik", "egymás", "egymã¡s", "egyre", "egyszerre", "egyã©b", "egyã¼tt", "egyéb", "együtt", "egã©sz", "egã©szen", "egész", "egészen", "ehhez", "ekkor", "el", "eleinte", "ellen", "ellenes", "elleni", "ellenã©re", "ellenére", "elmondta", "elså‘", "elså‘k", "elså‘sorban", "elså‘t", "elsõ", "első", "elsők", "elsősorban", "elsőt", "elã©", "elã©d", "elã©g", "elã©jã¼k", "elã©m", "elã©nk", "elã©tek", "elå‘bb", "elå‘l", "elå‘le", "elå‘led", "elå‘lem", "elå‘letek", "elå‘lã¼k", "elå‘lã¼nk", "elå‘szã¶r", "elå‘tt", "elå‘tte", "elå‘tted", "elå‘ttem", "elå‘ttetek", "elå‘ttã¼k", "elå‘ttã¼nk", "elå‘zå‘", "elé", "eléd", "elég", "eléjük", "elém", "elénk", "elétek", "elõ", "elõször", "elõtt", "elő", "előbb", "elől", "előle", "előled", "előlem", "előletek", "előlük", "előlünk", "először", "előtt", "előtte", "előtted", "előttem", "előttetek", "előttük", "előttünk", "előző", "emilyen", "engem", "ennek", "ennyi", "ennã©l", "ennél", "enyã©m", "enyém", "erre", "errå‘l", "erről", "esetben", "ettå‘l", "ettől", "ez", "ezek", "ezekbe", "ezekben", "ezekbå‘l", "ezekből", "ezeken", "ezeket", "ezekhez", "ezekig", "ezekkel", "ezekkã©", "ezekké", "ezeknek", "ezeknã©l", "ezeknél", "ezekre", "ezekrå‘l", "ezekről", "ezektå‘l", "ezektől", "ezekã©rt", "ezekért", "ezen", "ezentãºl", "ezentúl", "ezer", "ezret", "ezt", "ezután", "ezutã¡n", "ezzel", "ezzã©", "ezzé", "ezã©rt", "ezért", "fel", "fele", "felek", "felet", "felett", "felã©", "felé", "fent", "fenti", "fã©l", "fã¶lã©", "fél", "fölé", "gyakran", "ha", "hallã³", "halló", "hamar", "hanem", "harmadik", "harmadikat", "harminc", "hat", "hatodik", "hatodikat", "hatot", "hatvan", "helyett", "hetedik", "hetediket", "hetet", "hetven", "hirtelen", "hiszen", "hiába", "hiã¡ba", "hogy", 
"hogyan", "hol", "holnap", "holnapot", "honnan", "hova", "hozzá", "hozzád", "hozzájuk", "hozzám", "hozzánk", "hozzátok", "hozzã¡", "hozzã¡d", "hozzã¡juk", "hozzã¡m", "hozzã¡nk", "hozzã¡tok", "hurrá", "hurrã¡", "huszadik", "hány", "hányszor", "hármat", "három", "hát", "hátha", "hátulsó", "hã¡ny", "hã¡nyszor", "hã¡rmat", "hã¡rom", "hã¡t", "hã¡tha", "hã¡tulsã³", "hã©t", "hãºsz", "hét", "húsz", "ide", "ide-ð¾da", "ide-оda", "idã©n", "idén", "igazán", "igazã¡n", "igen", "ill", "ill.", "illetve", "ilyen", "ilyenkor", "immár", "immã¡r", "inkább", "inkã¡bb", "is", "ismã©t", "ismét", "ison", "itt", "jelenleg", "jobban", "jobbra", "jã³", "jã³l", "jã³lesik", "jã³val", "jã¶vå‘re", "jó", "jól", "jólesik", "jóval", "jövőre", "kell", "kellene", "kellett", "kelljen", "keressünk", "keresztül", "ketten", "kettå‘", "kettå‘t", "kettő", "kettőt", "kevã©s", "kevés", "ki", "kiben", "kibå‘l", "kiből", "kicsit", "kicsoda", "kihez", "kik", "kikbe", "kikben", "kikbå‘l", "kikből", "kiken", "kiket", "kikhez", "kikkel", "kikkã©", "kikké", "kiknek", "kiknã©l", "kiknél", "kikre", "kikrå‘l", "kikről", "kiktå‘l", "kiktől", "kikã©rt", "kikért", "kilenc", "kilencedik", "kilencediket", "kilencet", "kilencven", "kin", "kinek", "kinã©l", "kinél", "kire", "kirå‘l", "kiről", "kit", "kitå‘l", "kitől", "kivel", "kivã©", "kivé", "kiã©", "kiã©rt", "kié", "kiért", "korábban", "korã¡bban", "kã©pest", "kã©rem", "kã©rlek", "kã©sz", "kã©så‘", "kã©så‘bb", "kã©så‘n", "kã©t", "kã©tszer", "kã¶rã¼l", "kã¶szã¶nhetå‘en", "kã¶szã¶nã¶m", "kã¶zben", "kã¶zel", "kã¶zepesen", "kã¶zepã©n", "kã¶zã©", "kã¶zã¶tt", "kã¶zã¼l", "kã¼lã¶n", "kã¼lã¶nben", "kã¼lã¶nbã¶zå‘", "kã¼lã¶nbã¶zå‘bb", "kã¼lã¶nbã¶zå‘ek", "képest", "kérem", "kérlek", "kész", "késő", "később", "későn", "két", "kétszer", "kívül", "körül", "köszönhetően", "köszönöm", "közben", "közel", "közepesen", "közepén", "közé", "között", "közül", "külön", "különben", "különböző", "különbözőbb", "különbözőek", "lassan", "le", "legalább", "legalã¡bb", "legyen", "lehet", 
"lehetetlen", "lehetett", "lehetå‘leg", "lehetå‘sã©g", "lehetőleg", "lehetőség", "lenne", "lenni", "lennã©k", "lennã©nek", "lennék", "lennének", "lesz", "leszek", "lesznek", "leszã¼nk", "leszünk", "lett", "lettek", "lettem", "lettã¼nk", "lettünk", "lã©vå‘", "lévő", "ma", "maga", "magad", "magam", "magatokat", "magukat", "magunkat", "magát", "magã¡t", "mai", "majd", "majdnem", "manapság", "manapsã¡g", "meg", "megcsinál", "megcsinálnak", "megcsinã¡l", "megcsinã¡lnak", "megint", "megvan", "mellett", "mellette", "melletted", "mellettem", "mellettetek", "mellettã¼k", "mellettã¼nk", "mellettük", "mellettünk", "mellã©", "mellã©d", "mellã©jã¼k", "mellã©m", "mellã©nk", "mellã©tek", "mellå‘l", "mellå‘le", "mellå‘led", "mellå‘lem", "mellå‘letek", "mellå‘lã¼k", "mellå‘lã¼nk", "mellé", "melléd", "melléjük", "mellém", "mellénk", "mellétek", "mellől", "mellőle", "mellőled", "mellőlem", "mellőletek", "mellőlük", "mellőlünk", "mely", "melyek", "melyik", "mennyi", "mert", "mi", "miatt", "miatta", "miattad", "miattam", "miattatok", "miattuk", "miattunk", "mibe", "miben", "mibå‘l", "miből", "mihez", "mik", "mikbe", "mikben", "mikbå‘l", "mikből", "miken", "miket", "mikhez", "mikkel", "mikkã©", "mikké", "miknek", "miknã©l", "miknél", "mikor", "mikre", "mikrå‘l", "mikről", "miktå‘l", "miktől", "mikã©rt", "mikért", "milyen", "min", "mind", "mindegyik", "mindegyiket", "minden", "mindenesetre", "mindenki", "mindent", "mindenã¼tt", "mindenütt", "mindig", "mindketten", "minek", "minket", "mint", "mintha", "minã©l", "minél", "mire", "mirå‘l", "miről", "mit", "mitå‘l", "mitől", "mivel", "mivã©", "mivé", "miã©rt", "miért", "mondta", "most", "mostanáig", "mostanã¡ig", "már", "más", "másik", "másikat", "másnap", "második", "másodszor", "mások", "másokat", "mást", "mã¡r", "mã¡s", "mã¡sik", "mã¡sikat", "mã¡snap", "mã¡sodik", "mã¡sodszor", "mã¡sok", "mã¡sokat", "mã¡st", "mã©g", "mã©gis", "mã\u{AD}g", "mã¶gã©", "mã¶gã©d", "mã¶gã©jã¼k", "mã¶gã©m", "mã¶gã©nk", "mã¶gã©tek", "mã¶gã¶tt", "mã¶gã¶tte", 
"mã¶gã¶tted", "mã¶gã¶ttem", "mã¶gã¶ttetek", "mã¶gã¶ttã¼k", "mã¶gã¶ttã¼nk", "mã¶gã¼l", "mã¶gã¼le", "mã¶gã¼led", "mã¶gã¼lem", "mã¶gã¼letek", "mã¶gã¼lã¼k", "mã¶gã¼lã¼nk", "mãºltkor", "mãºlva", "még", "mégis", "míg", "mögé", "mögéd", "mögéjük", "mögém", "mögénk", "mögétek", "mögött", "mögötte", "mögötted", "mögöttem", "mögöttetek", "mögöttük", "mögöttünk", "mögül", "mögüle", "mögüled", "mögülem", "mögületek", "mögülük", "mögülünk", "múltkor", "múlva", "na", "nagy", "nagyobb", "nagyon", "naponta", "napot", "ne", "negyedik", "negyediket", "negyven", "neked", "nekem", "neki", "nekik", "nektek", "nekã¼nk", "nekünk", "nem", "nemcsak", "nemrã©g", "nemrég", "nincs", "nyolc", "nyolcadik", "nyolcadikat", "nyolcat", "nyolcvan", "nála", "nálad", "nálam", "nálatok", "náluk", "nálunk", "nã¡la", "nã¡lad", "nã¡lam", "nã¡latok", "nã¡luk", "nã¡lunk", "nã©gy", "nã©gyet", "nã©ha", "nã©hã¡ny", "nã©lkã¼l", "négy", "négyet", "néha", "néhány", "nélkül", "o", "oda", "ok", "olyan", "onnan", "ott", "pedig", "persze", "pár", "pã¡r", "pã©ldã¡ul", "például", "rajta", "rajtad", "rajtam", "rajtatok", "rajtuk", "rajtunk", "rendben", "rosszul", "rá", "rád", "rájuk", "rám", "ránk", "rátok", "rã¡", "rã¡d", "rã¡juk", "rã¡m", "rã¡nk", "rã¡tok", "rã©gen", "rã©gã³ta", "rã©szã©re", "rã³la", "rã³lad", "rã³lam", "rã³latok", "rã³luk", "rã³lunk", "rã¶gtã¶n", "régen", "régóta", "részére", "róla", "rólad", "rólam", "rólatok", "róluk", "rólunk", "rögtön", "s", "saját", "se", "sem", "semmi", "semmilyen", "semmisã©g", "semmiség", "senki", "soha", "sok", "sokan", "sokat", "sokkal", "sokszor", "sokáig", "sokã¡ig", "során", "sorã¡n", "stb.", "szemben", "szerbusz", "szerint", "szerinte", "szerinted", "szerintem", "szerintetek", "szerintã¼k", "szerintã¼nk", "szerintük", "szerintünk", "szervusz", "szinte", "számára", "száz", "századik", "százat", "szã¡mã¡ra", "szã¡z", "szã¡zadik", "szã¡zat", "szã©pen", "szã\u{AD}ves", "szã\u{AD}vesen", "szã\u{AD}veskedjã©k", "szépen", "szét", "szíves", "szívesen", "szíveskedjék", "så‘t", 
"sőt", "talán", "talã¡n", "tavaly", "te", "tegnap", "tegnapelå‘tt", "tegnapelőtt", "tehát", "tehã¡t", "tele", "teljes", "tessã©k", "tessék", "ti", "tied", "titeket", "tizedik", "tizediket", "tizenegy", "tizenegyedik", "tizenhat", "tizenhárom", "tizenhã¡rom", "tizenhã©t", "tizenhét", "tizenkettedik", "tizenkettå‘", "tizenkettő", "tizenkilenc", "tizenkã©t", "tizenkét", "tizennyolc", "tizennã©gy", "tizennégy", "tizenã¶t", "tizenöt", "tizet", "tovább", "további", "továbbá", "tovã¡bb", "tovã¡bbi", "távol", "tã¡vol", "tã©ged", "tã©nyleg", "tã\u{AD}z", "tã¶bb", "tã¶bbi", "tã¶bbszã¶r", "tãºl", "tå‘le", "tå‘led", "tå‘lem", "tå‘letek", "tå‘lã¼k", "tå‘lã¼nk", "téged", "tényleg", "tíz", "több", "többi", "többször", "túl", "tőle", "tőled", "tőlem", "tőletek", "tőlük", "tőlünk", "ugyanakkor", "ugyanez", "ugyanis", "ugye", "urak", "uram", "urat", "utoljára", "utoljã¡ra", "utolsã³", "utolsó", "után", "utána", "utã¡n", "vagy", "vagyis", "vagyok", "vagytok", "vagyunk", "vajon", "valahol", "valaki", "valakit", "valamelyik", "valami", "valamint", "való", "van", "vannak", "vele", "veled", "velem", "veletek", "velã¼k", "velã¼nk", "velük", "velünk", "vissza", "viszlát", "viszlã¡t", "viszont", "viszontlátásra", "viszontlã¡tã¡sra", "volna", "volnának", "volnã¡nak", "volnã©k", "volnék", "volt", "voltak", "voltam", "voltunk", "vã©gre", "vã©gã©n", "vã©gã¼l", "végre", "végén", "végül", "által", "általában", "ám", "át", "ã¡ltal", "ã¡ltalã¡ban", "ã¡m", "ã¡t", "ã©ljen", "ã©n", "ã©rte", "ã©rted", "ã©rtem", "ã©rtetek", "ã©rtã¼k", "ã©rtã¼nk", "ã©s", "ã©v", "ã©vben", "ã©ve", "ã©vek", "ã©ves", "ã©vi", "ã©vvel", "ã\u{AD}gy", "ã³ta", "ã¶n", "ã¶nbe", "ã¶nben", "ã¶nbå‘l", "ã¶nhã¶z", "ã¶nnek", "ã¶nnel", "ã¶nnã©l", "ã¶nre", "ã¶nrå‘l", "ã¶nt", "ã¶ntå‘l", "ã¶nã©rt", "ã¶nã¶k", "ã¶nã¶kbe", "ã¶nã¶kben", "ã¶nã¶kbå‘l", "ã¶nã¶ket", "ã¶nã¶khã¶z", "ã¶nã¶kkel", "ã¶nã¶knek", "ã¶nã¶knã©l", "ã¶nã¶kre", "ã¶nã¶krå‘l", "ã¶nã¶ktå‘l", "ã¶nã¶kã©rt", "ã¶nã¶kã¶n", "ã¶nã¶n", "ã¶t", "ã¶tven", "ã¶tã¶dik", "ã¶tã¶diket", "ã¶tã¶t", 
"ãºgy", "ãºgyis", "ãºgynevezett", "ãºjra", "ãºr", "å‘", "å‘k", "å‘ket", "å‘t", "éljen", "én", "éppen", "érte", "érted", "értem", "értetek", "értük", "értünk", "és", "év", "évben", "éve", "évek", "éves", "évi", "évvel", "így", "óta", "õ", "õk", "õket", "ön", "önbe", "önben", "önből", "önhöz", "önnek", "önnel", "önnél", "önre", "önről", "önt", "öntől", "önért", "önök", "önökbe", "önökben", "önökből", "önöket", "önökhöz", "önökkel", "önöknek", "önöknél", "önökre", "önökről", "önöktől", "önökért", "önökön", "önön", "össze", "öt", "ötven", "ötödik", "ötödiket", "ötöt", "úgy", "úgyis", "úgynevezett", "új", "újabb", "újra", "úr", "ő", "ők", "őket", "őt", ]; ================================================ FILE: src/stopwords/hye.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2022, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_HYE: &[&str] = &[ "այդ", "այլ", "այն", "այս", "դու", "դուք", "եմ", "են", "ենք", "ես", "եք", "է", "էի", "էին", "էինք", "էիր", "էիք", "էր", "ըստ", "թ", "ի", "ին", "իսկ", "իր", "կամ", "համար", "հետ", "հետո", "մենք", "մեջ", "մի", "ն", "նա", "նաև", "նրա", "նրանք", "որ", "որը", "որոնք", "որպես", "ու", "ում", "պիտի", "վրա", "և", ]; ================================================ FILE: src/stopwords/ind.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_IND: &[&str] = &[ "ada", "adalah", "adanya", "adapun", "agak", "agaknya", "agar", "akan", "akankah", "akhir", "akhiri", "akhirnya", "aku", "akulah", "amat", "amatlah", "anda", "andalah", "antar", "antara", "antaranya", "apa", "apaan", "apabila", "apakah", "apalagi", "apatah", "artinya", "asal", "asalkan", "atas", "atau", "ataukah", "ataupun", "awal", "awalnya", "bagai", "bagaikan", "bagaimana", "bagaimanakah", 
"bagaimanapun", "bagi", "bagian", "bahkan", "bahwa", "bahwasanya", "baik", "bakal", "bakalan", "balik", "banyak", "bapak", "baru", "bawah", "beberapa", "begini", "beginian", "beginikah", "beginilah", "begitu", "begitukah", "begitulah", "begitupun", "bekerja", "belakang", "belakangan", "belum", "belumlah", "benar", "benarkah", "benarlah", "berada", "berakhir", "berakhirlah", "berakhirnya", "berapa", "berapakah", "berapalah", "berapapun", "berarti", "berawal", "berbagai", "berdatangan", "beri", "berikan", "berikut", "berikutnya", "berjumlah", "berkali-kali", "berkata", "berkehendak", "berkeinginan", "berkenaan", "berlainan", "berlalu", "berlangsung", "berlebihan", "bermacam", "bermacam-macam", "bermaksud", "bermula", "bersama", "bersama-sama", "bersiap", "bersiap-siap", "bertanya", "bertanya-tanya", "berturut", "berturut-turut", "bertutur", "berujar", "berupa", "besar", "betul", "betulkah", "biasa", "biasanya", "bila", "bilakah", "bisa", "bisakah", "boleh", "bolehkah", "bolehlah", "buat", "bukan", "bukankah", "bukanlah", "bukannya", "bulan", "bung", "cara", "caranya", "cukup", "cukupkah", "cukuplah", "cuma", "dahulu", "dalam", "dan", "dapat", "dari", "daripada", "datang", "dekat", "demi", "demikian", "demikianlah", "dengan", "depan", "di", "dia", "diakhiri", "diakhirinya", "dialah", "diantara", "diantaranya", "diberi", "diberikan", "diberikannya", "dibuat", "dibuatnya", "didapat", "didatangkan", "digunakan", "diibaratkan", "diibaratkannya", "diingat", "diingatkan", "diinginkan", "dijawab", "dijelaskan", "dijelaskannya", "dikarenakan", "dikatakan", "dikatakannya", "dikerjakan", "diketahui", "diketahuinya", "dikira", "dilakukan", "dilalui", "dilihat", "dimaksud", "dimaksudkan", "dimaksudkannya", "dimaksudnya", "diminta", "dimintai", "dimisalkan", "dimulai", "dimulailah", "dimulainya", "dimungkinkan", "dini", "dipastikan", "diperbuat", "diperbuatnya", "dipergunakan", "diperkirakan", "diperlihatkan", "diperlukan", "diperlukannya", "dipersoalkan", "dipertanyakan", 
"dipunyai", "diri", "dirinya", "disampaikan", "disebut", "disebutkan", "disebutkannya", "disini", "disinilah", "ditambahkan", "ditandaskan", "ditanya", "ditanyai", "ditanyakan", "ditegaskan", "ditujukan", "ditunjuk", "ditunjuki", "ditunjukkan", "ditunjukkannya", "ditunjuknya", "dituturkan", "dituturkannya", "diucapkan", "diucapkannya", "diungkapkan", "dong", "dua", "dulu", "empat", "enggak", "enggaknya", "entah", "entahlah", "guna", "gunakan", "hal", "hampir", "hanya", "hanyalah", "hari", "harus", "haruslah", "harusnya", "hendak", "hendaklah", "hendaknya", "hingga", "ia", "ialah", "ibarat", "ibaratkan", "ibaratnya", "ibu", "ikut", "ingat", "ingat-ingat", "ingin", "inginkah", "inginkan", "ini", "inikah", "inilah", "itu", "itukah", "itulah", "jadi", "jadilah", "jadinya", "jangan", "jangankan", "janganlah", "jauh", "jawab", "jawaban", "jawabnya", "jelas", "jelaskan", "jelaslah", "jelasnya", "jika", "jikalau", "juga", "jumlah", "jumlahnya", "justru", "kala", "kalau", "kalaulah", "kalaupun", "kalian", "kami", "kamilah", "kamu", "kamulah", "kan", "kapan", "kapankah", "kapanpun", "karena", "karenanya", "kasus", "kata", "katakan", "katakanlah", "katanya", "ke", "keadaan", "kebetulan", "kecil", "kedua", "keduanya", "keinginan", "kelamaan", "kelihatan", "kelihatannya", "kelima", "keluar", "kembali", "kemudian", "kemungkinan", "kemungkinannya", "kenapa", "kepada", "kepadanya", "kesampaian", "keseluruhan", "keseluruhannya", "keterlaluan", "ketika", "khususnya", "kini", "kinilah", "kira", "kira-kira", "kiranya", "kita", "kitalah", "kok", "kurang", "lagi", "lagian", "lah", "lain", "lainnya", "lalu", "lama", "lamanya", "lanjut", "lanjutnya", "lebih", "lewat", "lima", "luar", "macam", "maka", "makanya", "makin", "malah", "malahan", "mampu", "mampukah", "mana", "manakala", "manalagi", "masa", "masalah", "masalahnya", "masih", "masihkah", "masing", "masing-masing", "mau", "maupun", "melainkan", "melakukan", "melalui", "melihat", "melihatnya", "memang", "memastikan", "memberi", 
"memberikan", "membuat", "memerlukan", "memihak", "meminta", "memintakan", "memisalkan", "memperbuat", "mempergunakan", "memperkirakan", "memperlihatkan", "mempersiapkan", "mempersoalkan", "mempertanyakan", "mempunyai", "memulai", "memungkinkan", "menaiki", "menambahkan", "menandaskan", "menanti", "menanti-nanti", "menantikan", "menanya", "menanyai", "menanyakan", "mendapat", "mendapatkan", "mendatang", "mendatangi", "mendatangkan", "menegaskan", "mengakhiri", "mengapa", "mengatakan", "mengatakannya", "mengenai", "mengerjakan", "mengetahui", "menggunakan", "menghendaki", "mengibaratkan", "mengibaratkannya", "mengingat", "mengingatkan", "menginginkan", "mengira", "mengucapkan", "mengucapkannya", "mengungkapkan", "menjadi", "menjawab", "menjelaskan", "menuju", "menunjuk", "menunjuki", "menunjukkan", "menunjuknya", "menurut", "menuturkan", "menyampaikan", "menyangkut", "menyatakan", "menyebutkan", "menyeluruh", "menyiapkan", "merasa", "mereka", "merekalah", "merupakan", "meski", "meskipun", "meyakini", "meyakinkan", "minta", "mirip", "misal", "misalkan", "misalnya", "mula", "mulai", "mulailah", "mulanya", "mungkin", "mungkinkah", "nah", "naik", "namun", "nanti", "nantinya", "nyaris", "nyatanya", "oleh", "olehnya", "pada", "padahal", "padanya", "pak", "paling", "panjang", "pantas", "para", "pasti", "pastilah", "penting", "pentingnya", "per", "percuma", "perlu", "perlukah", "perlunya", "pernah", "persoalan", "pertama", "pertama-tama", "pertanyaan", "pertanyakan", "pihak", "pihaknya", "pukul", "pula", "pun", "punya", "rasa", "rasanya", "rata", "rupanya", "saat", "saatnya", "saja", "sajalah", "saling", "sama", "sama-sama", "sambil", "sampai", "sampai-sampai", "sampaikan", "sana", "sangat", "sangatlah", "satu", "saya", "sayalah", "se", "sebab", "sebabnya", "sebagai", "sebagaimana", "sebagainya", "sebagian", "sebaik", "sebaik-baiknya", "sebaiknya", "sebaliknya", "sebanyak", "sebegini", "sebegitu", "sebelum", "sebelumnya", "sebenarnya", "seberapa", "sebesar", "sebetulnya", 
"sebisanya", "sebuah", "sebut", "sebutlah", "sebutnya", "secara", "secukupnya", "sedang", "sedangkan", "sedemikian", "sedikit", "sedikitnya", "seenaknya", "segala", "segalanya", "segera", "seharusnya", "sehingga", "seingat", "sejak", "sejauh", "sejenak", "sejumlah", "sekadar", "sekadarnya", "sekali", "sekali-kali", "sekalian", "sekaligus", "sekalipun", "sekarang", "sekecil", "seketika", "sekiranya", "sekitar", "sekitarnya", "sekurang-kurangnya", "sekurangnya", "sela", "selagi", "selain", "selaku", "selalu", "selama", "selama-lamanya", "selamanya", "selanjutnya", "seluruh", "seluruhnya", "semacam", "semakin", "semampu", "semampunya", "semasa", "semasih", "semata", "semata-mata", "semaunya", "sementara", "semisal", "semisalnya", "sempat", "semua", "semuanya", "semula", "sendiri", "sendirian", "sendirinya", "seolah", "seolah-olah", "seorang", "sepanjang", "sepantasnya", "sepantasnyalah", "seperlunya", "seperti", "sepertinya", "sepihak", "sering", "seringnya", "serta", "serupa", "sesaat", "sesama", "sesampai", "sesegera", "sesekali", "seseorang", "sesuatu", "sesuatunya", "sesudah", "sesudahnya", "setelah", "setempat", "setengah", "seterusnya", "setiap", "setiba", "setibanya", "setidak-tidaknya", "setidaknya", "setinggi", "seusai", "sewaktu", "siap", "siapa", "siapakah", "siapapun", "sini", "sinilah", "soal", "soalnya", "suatu", "sudah", "sudahkah", "sudahlah", "supaya", "tadi", "tadinya", "tahu", "tahun", "tak", "tambah", "tambahnya", "tampak", "tampaknya", "tandas", "tandasnya", "tanpa", "tanya", "tanyakan", "tanyanya", "tapi", "tegas", "tegasnya", "telah", "tempat", "tengah", "tentang", "tentu", "tentulah", "tentunya", "tepat", "terakhir", "terasa", "terbanyak", "terdahulu", "terdapat", "terdiri", "terhadap", "terhadapnya", "teringat", "teringat-ingat", "terjadi", "terjadilah", "terjadinya", "terkira", "terlalu", "terlebih", "terlihat", "termasuk", "ternyata", "tersampaikan", "tersebut", "tersebutlah", "tertentu", "tertuju", "terus", "terutama", "tetap", "tetapi", 
"tiap", "tiba", "tiba-tiba", "tidak", "tidakkah", "tidaklah", "tiga", "tinggi", "toh", "tunjuk", "turut", "tutur", "tuturnya", "ucap", "ucapnya", "ujar", "ujarnya", "umum", "umumnya", "ungkap", "ungkapnya", "untuk", "usah", "usai", "waduh", "wah", "wahai", "waktu", "waktunya", "walau", "walaupun", "wong", "yaitu", "yakin", "yakni", "yang", ]; ================================================ FILE: src/stopwords/ita.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_ITA: &[&str] = &[ "a", "abbastanza", "abbia", "abbiamo", "abbiano", "abbiate", "accidenti", "ad", "adesso", "affinche", "agl", "agli", "ahime", "ahimã¨", "ahimè", "ai", "al", "alcuna", "alcuni", "alcuno", "all", "alla", "alle", "allo", "allora", "altre", "altri", "altrimenti", "altro", "altrove", "altrui", "anche", "ancora", "anni", "anno", "ansa", "anticipo", "assai", "attesa", "attraverso", "avanti", "avemmo", "avendo", "avente", "aver", "avere", "averlo", "avesse", "avessero", "avessi", "avessimo", "aveste", "avesti", "avete", "aveva", "avevamo", "avevano", "avevate", "avevi", "avevo", "avrai", "avranno", "avrebbe", "avrebbero", "avrei", "avremmo", "avremo", "avreste", "avresti", "avrete", "avrà", "avrò", "avuta", "avute", "avuti", "avuto", "basta", "ben", "bene", "benissimo", "berlusconi", "brava", "bravo", "buono", "c", "casa", "caso", "cento", "certa", "certe", "certi", "certo", "che", "chi", "chicchessia", "chiunque", "ci", "ciascuna", "ciascuno", "cima", "cinque", "cio", "cioe", "cioã¨", "cioè", "circa", "citta", "città", "cittã", "ciã²", "ciò", "co", "codesta", "codesti", "codesto", "cogli", "coi", "col", "colei", "coll", "coloro", "colui", "come", "cominci", "comprare", "comunque", "con", "concernente", "conciliarsi", "conclusione", "consecutivi", "consecutivo", "consiglio", "contro", "cortesia", "cos", "cosa", "cosi", 
"cosã¬", "così", "cui", "d", "da", "dagl", "dagli", "dai", "dal", "dall", "dalla", "dalle", "dallo", "dappertutto", "davanti", "degl", "degli", "dei", "del", "dell", "della", "delle", "dello", "dentro", "detto", "deve", "devo", "di", "dice", "dietro", "dire", "dirimpetto", "diventa", "diventare", "diventato", "dopo", "doppio", "dov", "dove", "dovra", "dovrà", "dovrã", "dovunque", "due", "dunque", "durante", "e", "ebbe", "ebbero", "ebbi", "ecc", "ecco", "ed", "effettivamente", "egli", "ella", "entrambi", "eppure", "era", "erano", "eravamo", "eravate", "eri", "ero", "esempio", "esse", "essendo", "esser", "essere", "essi", "ex", "fa", "faccia", "facciamo", "facciano", "facciate", "faccio", "facemmo", "facendo", "facesse", "facessero", "facessi", "facessimo", "faceste", "facesti", "faceva", "facevamo", "facevano", "facevate", "facevi", "facevo", "fai", "fanno", "farai", "faranno", "fare", "farebbe", "farebbero", "farei", "faremmo", "faremo", "fareste", "faresti", "farete", "farà", "farò", "fatto", "favore", "fece", "fecero", "feci", "fin", "finalmente", "finche", "fine", "fino", "forse", "forza", "fosse", "fossero", "fossi", "fossimo", "foste", "fosti", "fra", "frattempo", "fu", "fui", "fummo", "fuori", "furono", "futuro", "generale", "gente", "gia", "giacche", "giorni", "giorno", "giu", "già", "giã", "gli", "gliela", "gliele", "glieli", "glielo", "gliene", "governo", "grande", "grazie", "gruppo", "ha", "haha", "hai", "hanno", "ho", "i", "ie", "ieri", "il", "improvviso", "in", "inc", "indietro", "infatti", "inoltre", "insieme", "intanto", "intorno", "invece", "io", "l", "la", "lasciato", "lato", "lavoro", "le", "lei", "li", "lo", "lontano", "loro", "lui", "lungo", "luogo", "là", "lã", "ma", "macche", "magari", "maggior", "mai", "male", "malgrado", "malissimo", "mancanza", "marche", "me", "medesimo", "mediante", "meglio", "meno", "mentre", "mesi", "mezzo", "mi", "mia", "mie", "miei", "mila", "miliardi", "milioni", "minimi", "ministro", "mio", "modo", "molta", "molti", 
"moltissimo", "molto", "momento", "mondo", "mosto", "nazionale", "ne", "negl", "negli", "nei", "nel", "nell", "nella", "nelle", "nello", "nemmeno", "neppure", "nessun", "nessuna", "nessuno", "niente", "no", "noi", "nome", "non", "nondimeno", "nonostante", "nonsia", "nostra", "nostre", "nostri", "nostro", "novanta", "nove", "nulla", "nuovi", "nuovo", "o", "od", "oggi", "ogni", "ognuna", "ognuno", "oltre", "oppure", "ora", "ore", "osi", "ossia", "ottanta", "otto", "paese", "parecchi", "parecchie", "parecchio", "parte", "partendo", "peccato", "peggio", "per", "perche", "perchã¨", "perchè", "perché", "percio", "perciã²", "perciò", "perfino", "pero", "persino", "persone", "perã²", "però", "piedi", "pieno", "piglia", "piu", "piuttosto", "piã¹", "più", "po", "pochissimo", "poco", "poi", "poiche", "possa", "possedere", "posteriore", "posto", "potrebbe", "preferibilmente", "presa", "press", "prima", "primo", "principalmente", "probabilmente", "promesso", "proprio", "puo", "pure", "purtroppo", "puã²", "può", "qua", "qualche", "qualcosa", "qualcuna", "qualcuno", "quale", "quali", "qualunque", "quando", "quanta", "quante", "quanti", "quanto", "quantunque", "quarto", "quasi", "quattro", "quel", "quella", "quelle", "quelli", "quello", "quest", "questa", "queste", "questi", "questo", "qui", "quindi", "quinto", "realmente", "recente", "recentemente", "registrazione", "relativo", "riecco", "rispetto", "salvo", "sara", "sarai", "saranno", "sarebbe", "sarebbero", "sarei", "saremmo", "saremo", "sareste", "saresti", "sarete", "sarà", "sarã", "sarò", "scola", "scopo", "scorso", "se", "secondo", "seguente", "seguito", "sei", "sembra", "sembrare", "sembrato", "sembrava", "sembri", "sempre", "senza", "sette", "si", "sia", "siamo", "siano", "siate", "siete", "sig", "solito", "solo", "soltanto", "sono", "sopra", "soprattutto", "sotto", "spesso", "srl", "sta", "stai", "stando", "stanno", "starai", "staranno", "starebbe", "starebbero", "starei", "staremmo", "staremo", "stareste", "staresti", 
"starete", "starà", "starò", "stata", "state", "stati", "stato", "stava", "stavamo", "stavano", "stavate", "stavi", "stavo", "stemmo", "stessa", "stesse", "stessero", "stessi", "stessimo", "stesso", "steste", "stesti", "stette", "stettero", "stetti", "stia", "stiamo", "stiano", "stiate", "sto", "su", "sua", "subito", "successivamente", "successivo", "sue", "sugl", "sugli", "sui", "sul", "sull", "sulla", "sulle", "sullo", "suo", "suoi", "tale", "tali", "talvolta", "tanto", "te", "tempo", "terzo", "th", "ti", "titolo", "torino", "tra", "tranne", "tre", "trenta", "triplo", "troppo", "trovato", "tu", "tua", "tue", "tuo", "tuoi", "tutta", "tuttavia", "tutte", "tutti", "tutto", "uguali", "ulteriore", "ultimo", "un", "una", "uno", "uomo", "va", "vai", "vale", "vari", "varia", "varie", "vario", "verso", "vi", "via", "vicino", "visto", "vita", "voi", "volta", "volte", "vostra", "vostre", "vostri", "vostro", "ã¨", "è", ]; ================================================ FILE: src/stopwords/jav.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) // Notice: we do not have stopwords for this language yet. 
pub static STOPWORDS_JAV: &[&str] = &[]; ================================================ FILE: src/stopwords/jpn.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_JPN: &[&str] = &[ "あそこ", "あっ", "あの", "あのかた", "あの人", "あり", "あります", "ある", "あれ", "い", "いう", "います", "いる", "う", "うち", "え", "お", "および", "おり", "おります", "か", "かつて", "から", "が", "き", "ここ", "こちら", "こと", "この", "これ", "これら", "さ", "さらに", "し", "しかし", "する", "ず", "せ", "せる", "そこ", "そして", "その", "その他", "その後", "それ", "それぞれ", "それで", "た", "ただし", "たち", "ため", "たり", "だ", "だっ", "だれ", "つ", "て", "で", "でき", "できる", "です", "では", "でも", "と", "という", "といった", "とき", "ところ", "として", "とともに", "とも", "と共に", "どこ", "どの", "な", "ない", "なお", "なかっ", "ながら", "なく", "なっ", "など", "なに", "なら", "なり", "なる", "なん", "に", "において", "における", "について", "にて", "によって", "により", "による", "に対して", "に対する", "に関する", "の", "ので", "のみ", "は", "ば", "へ", "ほか", "ほとんど", "ほど", "ます", "また", "または", "まで", "も", "もの", "ものの", "や", "よう", "より", "ら", "られ", "られる", "れ", "れる", "を", "ん", "何", "及び", "彼", "彼女", "我々", "特に", "私", "私達", "貴方", "貴方方", ]; ================================================ FILE: src/stopwords/kan.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0)
pub static STOPWORDS_KAN: &[&str] = &[ "ಆ", "ಈ", "ಅಥವಾ", "ಮತ್ತು", "ಆದರೆ", "ಎಂದು", "ಅವರ", "ಎಂಬ", "ಅವರು", "ಬಗ್ಗೆ", "ಇದೆ", "ಇದು", "ಮೂಲಕ", "ಅದು", "ಮೇಲೆ", "ಈಗ", "ಹಾಗೂ", "ಹೆಚ್ಚು", "ಅವರಿಗೆ", "ತಮ್ಮ", "ಮಾಡಿ", "ನಮ್ಮ", "ಮಾತ್ರ", "ದೊಡ್ಡ", "ಅದೇ", "ಕೂಡ", "ಯಾವುದೇ", "ಯಾವ", "ಆಗ", "ತುಂಬಾ", "ನಾವು", "ದಿನ", "ಬೇರೆ", "ಅವರನ್ನು", "ಎಲ್ಲಾ", "ನೀವು", "ಸಾಕಷ್ಟು", "ಕನ್ನಡ", "ಹೊಸ", "ಮುಂದೆ", "ಹೇಗೆ", "ನಂತರ", "ಇಲ್ಲಿ", "ಕೆಲಸ", "ಬಳಿಕ", "ಒಳ್ಳೆಯ", "ಹಾಗಾಗಿ", "ಜನ", "ಅದನ್ನು", "ಬಂದ", "ಕಾರಣ", "ಅವಕಾಶ", "ವರ್ಷ", "ನಿಮ್ಮ", "ಇತ್ತು", "ಹೇಳಿ", "ಮಾಡಿದ", "ಅದಕ್ಕೆ", "ಆಗಿ", "ಎಂಬುದು", "ಅಂತ", "ಕೆಲವು", "ಮೊದಲು", "ಬಂದು", "ಇದೇ", "ನೋಡಿ", "ಕೇವಲ", "ಎರಡು", "ಇನ್ನು", "ಅಷ್ಟೇ", "ಎಷ್ಟು", "ಮಾಡಬೇಕು", "ಹೀಗೆ", "ಕುರಿತು", "ಎಂದರೆ", "ಇನ್ನೂ", "ಮತ್ತೆ", "ಏನು", "ಮುಂದಿನ", "ಮಾಡುವ", "ವೇಳೆ", "ಜೊತೆಗೆ", ]; ================================================ FILE: src/stopwords/kat.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_KAT: &[&str] = &[ "ა.შ.", "აგერ", "აგრეთვე", "ალბათ", "ამაზე", "ამას", "ამასთან", "ამასთანავე", "ამგვარად", "ამდენად", "ამით", "ამის", "ამისთვის", "ამიტომ", "ამიტომაც", "ამჟამად", "ამჯერად", "ან", "ანუ", "არ", "არა", "არადა", "არათუ", "არამარტო", "არამედ", "არამხოლოდ", "არანაკლებ", "არასოდეს", "არაუადრეს", "არაუგვიანეს", "არაუმეტეს", "არსად", "არსაიდან", "არც", "არცერთ", "ასევე", "ასეც", "აქამდე", "აღარ", "აღარც", "ბოლოს", "ბოლოსკენ", "გამო", "გამუდმებით", "განსაკუთრებით", "გარდა", "გარეშე", "და", "დასასრულს", "დასაწყისში", "დროულად", "ე.ი.", "ე.წ.", "ეგებ", "ერთადერთი", "ერთადერთმა", "ერთ-ერთი", "ერთხელ", "ესოდე", "ვერ", "ვითომ", "ვინაიდან", "ვინძლო", "ვისაც", "ზემოაღნიშნულმა", "ზოგჯერ", "თავად", "თავადაც", "თავადვე", "თავდაპირველად", "თავიდანვე", "თავის მხრივ", "თან", "თანაც", "თანახმადაც", "თანდათან", "თვით", "თვითონ", "თვითონაც", "თვითონვე", "თითოეულმა", "თითქოს", "თუ", "თუკი", "თუმცა", "თუმცაღა", "თუნდაც", "იმავდროულად", "იმავე", "იმან", "იმას", "იმდენად", 
"იმთავითვე", "იმით", "იმის", "იმისთვის", "იმიტომ", "ისევე", "ისეთი", "ისეც", "იშვიათად", "კერძოდ", "კვლავ", "კი", "კიდევ", "მაგალითად", "მაგან", "მაგას", "მაგით", "მაგის", "მაგრამ", "მათი", "მაინც", "მანამ", "მანამდე", "მართალია", "მარტო", "მაშასადამე", "მაშინ", "მაშინვე", "მერე", "მეტად", "მთელი", "მიერ", "მით", "მიმართ", "მისივე", "მსგავსი", "მხოლოდ", "ნაწილობრივ", "ნეტავ", "ნეტავი", "ნუ", "ნურასოდეს", "ნურც", "ნუღარ", "ნუღარც", "ოდენ", "ოდესღაც", "ოღონდ", "პირველი", "პირიქით", "პრინციპში", "რადგან", "რადგანაც", "რათა", "რაკი", "რამდენად", "რამდენადაც", "რამეთუ", "რამენაირად", "რამეფრად", "რანაირადაც", "რასაკვირველია", "რასაც", "რაღაც", "რაც", "რითაც", "რისთვისაც", "როგორადაც", "როგორიც", "როგორიცაა", "როგორღაც", "როგორც", "როდესაც", "როდესღაც", "რომ", "რომელიმე", "რომელიც", "რომელსაც", "რომლებიც", "რომლითაც", "რომლის", "როცა", "საბოლოოდ", "სადაც", "სადღაც", "საერთოდ", "სათანადოდ", "საიდანაც", "სამომავლოდ", "სანამ", "სანამდე", "სრულად", "სულ", "სწორედ", "სხვადასხვა", "სხვები", "უკვე", "უნდა", "უსათუოდ", "უფრო", "უცებ", "უცნაურად", "ფაქტობრივად", "ყველა", "ყოველგვარი", "ყოველთვის", "ყოველი", "ყოველივე", "შედარებით", "შედეგად", "შემდგომ", "შემდგომში", "შემდეგ", "შესახებ", "შორის", "ჩვეულებრივ", "წინააღმდეგ", "წინაშე", "ხან", "ხოლმე", "ხოლო", "ხშირად", "ჯერაც", "ჯერჯერობით", "ამის გარდა", "ამის გარეშე", "ამის მიუხედავად", "ამასთან ერთად", "ამის მიხედვით", "ამის ნაცვლად", "ამის პასუხად", "ამასთან შედარებით", "ამბობს, რომ", "ამ დროს", "ამ თემაზე", "ამ მიზნით", "ამის საპირისპიროდ", "ამის გამო", "ამ მხრივ", "ამის უარსაყოფად", "ამის შედეგად", "ამ შემთხვევაში", "ამავე დროს", "ამას გარდა", "ამასთან დაკავშირებით", "ამის შემდეგ", "ამის შესაბამისად", "ამის შესახებ", "ამისგან განსხვავებით", "არა მარტო", "არა მხოლოდ", "არა უადრეს", "არა უგვიანეს", "არც ერთი", "არც კი", "არც მეორე", "ასე ვთქვათ", "ასე მაგალითად", "ასე რომ", "ასე შემდეგ", "ასევე განიხილავს", "აქედან გამომდინარე", "აქედან დასკვნა", "აღნიშნა რომ", "აღნიშნულთან დაკავშირებით", "აცხადებს რომ", "ბოლო ერთი", "ბოლო 
პერიოდში", "ბოლო წლებში", "გამოთქვა იმედი", "განაცხადა, რომ", "განმარტა, რომ", "გარდა ამისა", "გარშემო არსებული", "და სხვ.", "და სხვა", "დაადასტურა, რომ", "ეგრეთ წოდებული", "ეგრეთ წოდებულმა", "ერთი თვალსაზრისით", "ერთი მხრივ", "ერთის მხრივ", "ეს კი", "ესე იგი", "ვიდრე არ", "თავიდან ბოლომდე", "თუ რამდენად", "თუ როგორ", "იგივეა რაც", "იმ შემთხვევაში", "იმაზე მეტი", "იმაზე, რომ", "იმას, რომ", "იმასთან დაკავშირებით", "იმდენად რამდენადაც", "იმედი გამოთქვა", "იმის გამო", "იმის თაობაზე", "იმის საწინააღმდეგოდ", "იმისათვის, რომ", "იმისთვის, რათა", "იმისთვის, რომ", "იმიტომ, რომ", "ის, რომელიც", "ისე როგორც", "ისე, რომ", "ისევე როგორც", "ისეთი როგორიც", "იქიდან გამომდინარე", "კიდევ ერთხელ", "მაგრამ თუ", "მათ შორის", "მათი ვარაუდით", "მანამ, სანამ", "მას შემდეგ", "მაშინ, როცა", "მაშინაც კი", "მეორე მხრივ", "მეორეც ერთი", "მერე მეორე", "მით უფრო", "მიიჩნევს, რომ", "მისი განმარტებით", "მისი თქმით", "მისივე თქმით", "მიუხედავად ამისა", "ნურც კი", "პირველ რიგში", "რა დროსაც", "რა მიზეზითაც", "რაც შეეხება", "რაც შეიძლება", "რის გამოც", "რის საფუძველზედაც", "რის საფუძველზეც", "რის შედეგადაც", "რის შემდეგაც", "როგორც კი", "რომ არა", "რომ თუ", "რომელთა გამოც", "რომლის თანახმად", "რომლის თანახმადაც", "რომლის მიხედვითაც", "რომლის შესახებ", "საკითხთან დაკავშირებით", "სულ მცირე", "სულ ცოტა", "სხვა კუთხით", "სხვა მხრივ", "სხვა რამ", "სხვათა შორის", "უფრო მეტიც", "ყოველივე ეს", "შემდეგ უკვე", "ჩვენი განცხადებით", "ჯერ ერთი", "ჯერ კიდევ", "ამ ბოლო დროს", "ამა თუ იმ", "ასე თუ ისე", "აქედან ჩანს, რომ", "ბოლოს და ბოლოს", "გამომდინარე იქიდან, რომ", "და ასე შემდეგ", "ვინაიდან და რადგანაც", "თუ რის საფუძველზე", "იმის გათვალისწინებით, რომ", "იმის გამო, რომ", "იმის ნაცვლად, რომ", "ისევ და ისევ", "იქვე აღნიშნა, რომ", "იქიდან გამომდინარე, რომ", "კიდევ და კიდევ", "მაინც და მაინც", "მას შემდეგ, რაც", "მიუხედავად იმისა, თუ", "მიუხედავად იმისა, რომ", "როგორც უკვე ითქვა", "როდის და რატომ", "უფრო და უფრო", ]; ================================================ FILE: src/stopwords/khm.rs 
================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_KHM: &[&str] = &[ "ៗ", "។ល។", "៚", "។", "៕", "៖", "៙", "០", "១", "២", "៣", "៤", "៥", "៦", "៧", "៨", "៩", "៛", "នេះ", "នោះ", "ខ្ញុំ", "អ្នក", "គាត់", "នាង", "ពួក", "យើង", "ពួកគេ", "លោក", "អ្វី", "បាន", "ការ", "នៅ", "និង", "ដែល", "មាន", "ជា", "ថា", "ក្នុង", "របស់", "ពី", "មួយ", "នឹង", "ឲ្យ", "មិន", "ទៅ", "តែ", "ត្រូវ", "ដោយ", "ហើយ", "ឆ្នាំ", "ពេល", "គេ", "ប្រទេស", "អាច", "គឺ", "ក្រុម", "ធ្វើ", "ក៏", "លើ", "នៃ", "ដើម្បី", "មក", "ទី", "តាម", "ទេ", "ដល់", "វា", "ដែរ", "ខ្លួន", "សម្រាប់", "ក្រុមហ៊ុន", "ថ្ងៃ", "ចំនួន", "កម្ពុជា", "ឡើង", "ទៀត", "ទាំង", "បើ", "និយាយ", "ទទួល", "ដ៏", "ច្រើន", "ផង", "ដឹង", "ជាមួយ", "គ្នា", "ខែ", "នាក់", "កំពុង", "យ៉ាង", "តម្លៃ", "ប្រកួត", "ក្រុង", "តំបន់", "ភាព", "យក", "ជាង", "ចូល", "នូវ", "កាលពី", "ណា", "បន្ត", "ជាតិ", "រូប", "មនុស្ស", "កាល", "ចំពោះ", "ដូច", "ខណៈ", "វិញ", "មុន", "ភ្នំពេញ", "លើក", "ល្អ", "ខាង", "ដុល្លារ", "ឃើញ", "បញ្ហា", "ប្រើ", "ចាប់", "ទឹក", "តើ", "ប្រាក់", "ធំ", "ខ្មែរ", "ចេញ", "ខេត្ត", "ផ្នែក", "ថ្មី", "បង្ហាញ", "ស៊ី", "អាមេរិក", "គឺជា", "លក់", "ចង់", "ដាក់", "ម្នាក់", "រួម", "រថយន្ត", "ផ្លូវ", "ភាគរយ", "កើន", "ជួយ", "ពីរ", "លាន", "ផ្តល់", "រដ្ឋ", "ខ្លាំង", "ជាច្រើន", "ទីក្រុង", "ជន", "កីឡា", "ក្រោយ", "ប្រាប់", "រដ្ឋាភិបាល", "កាន់", "ការងារ", "រក", "ព្រោះ", "រឿង", "ប៉ុន្តែ", "ឡើយ", "មុខ", "ថ្លែង", "ធ្វើឲ្យ", "បី", "នាំ", "ច្បាប់", "ដី", "ដូចជា", "កម", "ផ្ទះ", "បញ្ជាក់", "ចុះ", "បំផុត", "ចិត្ត", "បែប", "ចិន", "កីឡាករ", "កញ្ញា", "គម្រោង", "បង្កើត", "នា", "សារ", "សេដ្ឋកិច្ច", "ធនាគារ", "អស់", "ភាគ", "កូន", "ប្រធាន", "ផ្សារ", "ខ្ពស់", "គ្មាន", "ណាស់", "សម្រេច", "គួរ", "គ្រប់", "ប្រជាជន", "បន្ថែម", "រយៈ", "ខ្លះ", "បទ", "ទិញ", "ទើប", "វិនិយោគ", "មានការ", "លេខ", "ថៃ", "មើល", "បុរស", "យុវជន", "ស្រី", "នយោបាយ", "កន្លែង", "គិត", "បើក", "ដូច្នេះ", "រូបថត", "វាយ", "ប្រភេទ", "សំខាន់", "បន្ទាប់ពី", "កម្មវិធី", "រយៈពេល", 
"ផលិត", "ឈ្នះ", "ពិភពលោក", "ភ្ញៀវ", "ដោយសារ", "ស្រុក", "អាយុ", "ចំណាយ", "អំពី", "ហ៊ុន", "សិក្សា", ]; ================================================ FILE: src/stopwords/kor.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_KOR: &[&str] = &[ "!", "\"", "$", "%", "&", "'", "(", ")", "*", "+", ",", "-", ".", "...", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ";", "<", "=", ">", "?", "@", "\\", "^", "_", "`", "|", "~", "·", "—", "——", "‘", "’", "“", "”", "…", "、", "。", "〈", "〉", "《", "》", "가", "가까스로", "가령", "각", "각각", "각자", "각종", "갖고말하자면", "같다", "같이", "개의치않고", "거니와", "거바", "거의", "것", "것과 같이", "것들", "게다가", "게우다", "겨우", "견지에서", "결과에 이르다", "결국", "결론을 낼 수 있다", "겸사겸사", "고려하면", "고로", "곧", "공동으로", "과", "과연", "관계가 있다", "관계없이", "관련이 있다", "관하여", "관한", "관해서는", "구", "구체적으로", "구토하다", "그", "그들", "그때", "그래", "그래도", "그래서", "그러나", "그러니", "그러니까", "그러면", "그러므로", "그러한즉", "그런 까닭에", "그런데", "그런즉", "그럼", "그럼에도 불구하고", "그렇게 함으로써", "그렇지", "그렇지 않다면", "그렇지 않으면", "그렇지만", "그렇지않으면", "그리고", "그리하여", "그만이다", "그에 따르는", "그위에", "그저", "그중에서", "그치지 않다", "근거로", "근거하여", "기대여", "기점으로", "기준으로", "기타", "까닭으로", "까악", "까지", "까지 미치다", "까지도", "꽈당", "끙끙", "끼익", "나", "나머지는", "남들", "남짓", "너", "너희", "너희들", "네", "넷", "년", "논하지 않다", "놀라다", "누가 알겠는가", "누구", "다른", "다른 방면으로", "다만", "다섯", "다소", "다수", "다시 말하자면", "다시말하면", "다음", "다음에", "다음으로", "단지", "답다", "당신", "당장", "대로 하다", "대하면", "대하여", "대해 말하자면", "대해서", "댕그", "더구나", "더군다나", "더라도", "더불어", "더욱더", "더욱이는", "도달하다", "도착하다", "동시에", "동안", "된바에야", "된이상", "두번째로", "둘", "둥둥", "뒤따라", "뒤이어", "든간에", "들", "등", "등등", "딩동", "따라", "따라서", "따위", "따지지 않다", "딱", "때", "때가 되어", "때문에", "또", "또한", "뚝뚝", "라 해도", "령", "로", "로 인하여", "로부터", "로써", "륙", "를", "마음대로", "마저", "마저도", "마치", "막론하고", "만 못하다", "만약", "만약에", "만은 아니다", "만이 아니다", "만일", "만큼", "말하자면", "말할것도 없고", "매", "매번", "메쓰겁다", "몇", "모", "모두", "무렵", "무릎쓰고", "무슨", "무엇", 
"무엇때문에", "물론", "및", "바꾸어말하면", "바꾸어말하자면", "바꾸어서 말하면", "바꾸어서 한다면", "바꿔 말하면", "바로", "바와같이", "밖에 안된다", "반대로", "반대로 말하자면", "반드시", "버금", "보는데서", "보다더", "보드득", "본대로", "봐", "봐라", "부류의 사람들", "부터", "불구하고", "불문하고", "붕붕", "비걱거리다", "비교적", "비길수 없다", "비로소", "비록", "비슷하다", "비추어 보아", "비하면", "뿐만 아니라", "뿐만아니라", "뿐이다", "삐걱", "삐걱거리다", "사", "삼", "상대적으로 말하자면", "생각한대로", "설령", "설마", "설사", "셋", "소생", "소인", "솨", "쉿", "습니까", "습니다", "시각", "시간", "시작하여", "시초에", "시키다", "실로", "심지어", "아", "아니", "아니나다를가", "아니라면", "아니면", "아니었다면", "아래윗", "아무거나", "아무도", "아야", "아울러", "아이", "아이고", "아이구", "아이야", "아이쿠", "아하", "아홉", "안 그러면", "않기 위하여", "않기 위해서", "알 수 있다", "알았어", "앗", "앞에서", "앞의것", "야", "약간", "양자", "어", "어기여차", "어느", "어느 년도", "어느것", "어느곳", "어느때", "어느쪽", "어느해", "어디", "어때", "어떠한", "어떤", "어떤것", "어떤것들", "어떻게", "어떻해", "어이", "어째서", "어쨋든", "어쩔수 없다", "어찌", "어찌됏든", "어찌됏어", "어찌하든지", "어찌하여", "언제", "언젠가", "얼마", "얼마 안 되는 것", "얼마간", "얼마나", "얼마든지", "얼마만큼", "얼마큼", "엉엉", "에", "에 가서", "에 달려 있다", "에 대해", "에 있다", "에 한하다", "에게", "에서", "여", "여기", "여덟", "여러분", "여보시오", "여부", "여섯", "여전히", "여차", "연관되다", "연이서", "영", "영차", "옆사람", "예", "예를 들면", "예를 들자면", "예컨대", "예하면", "오", "오로지", "오르다", "오자마자", "오직", "오호", "오히려", "와", "와 같은 사람들", "와르르", "와아", "왜", "왜냐하면", "외에도", "요만큼", "요만한 것", "요만한걸", "요컨대", "우르르", "우리", "우리들", "우선", "우에 종합한것과같이", "운운", "월", "위에서 서술한바와같이", "위하여", "위해서", "윙윙", "육", "으로", "으로 인하여", "으로서", "으로써", "을", "응", "응당", "의", "의거하여", "의지하여", "의해", "의해되다", "의해서", "이", "이 되다", "이 때문에", "이 밖에", "이 외에", "이 정도의", "이것", "이곳", "이때", "이라면", "이래", "이러이러하다", "이러한", "이런", "이럴정도로", "이렇게 많은 것", "이렇게되면", "이렇게말하자면", "이렇구나", "이로 인하여", "이르기까지", "이리하여", "이만큼", "이번", "이봐", "이상", "이어서", "이었다", "이와 같다", "이와 같은", "이와 반대로", "이와같다면", "이외에도", "이용하여", "이유만으로", "이젠", "이지만", "이쪽", "이천구", "이천육", "이천칠", "이천팔", "인 듯하다", "인젠", "일", "일것이다", "일곱", "일단", "일때", "일반적으로", "일지라도", "임에 틀림없다", "입각하여", "입장에서", "잇따라", "있다", "자", "자기", "자기집", "자마자", "자신", "잠깐", "잠시", "저", "저것", "저것만큼", "저기", "저쪽", "저희", "전부", "전자", "전후", "점에서 보아", "정도에 이르다", "제", "제각기", "제외하고", "조금", "조차", 
"조차도", "졸졸", "좀", "좋아", "좍좍", "주룩주룩", "주저하지 않고", "줄은 몰랏다", "줄은모른다", "중에서", "중의하나", "즈음하여", "즉", "즉시", "지든지", "지만", "지말고", "진짜로", "쪽으로", "차라리", "참", "참나", "첫번째로", "쳇", "총적으로", "총적으로 말하면", "총적으로 보면", "칠", "콸콸", "쾅쾅", "쿵", "타다", "타인", "탕탕", "토하다", "통하여", "툭", "퉤", "틈타", "팍", "팔", "퍽", "펄렁", "하", "하게될것이다", "하게하다", "하겠는가", "하고 있다", "하고있었다", "하곤하였다", "하구나", "하기 때문에", "하기 위하여", "하기는한데", "하기만 하면", "하기보다는", "하기에", "하나", "하느니", "하는 김에", "하는 편이 낫다", "하는것도", "하는것만 못하다", "하는것이 낫다", "하는바", "하더라도", "하도다", "하도록시키다", "하도록하다", "하든지", "하려고하다", "하마터면", "하면 할수록", "하면된다", "하면서", "하물며", "하여금", "하여야", "하자마자", "하지 않는다면", "하지 않도록", "하지마", "하지마라", "하지만", "하하", "한 까닭에", "한 이유는", "한 후", "한다면", "한다면 몰라도", "한데", "한마디", "한적이있다", "한켠으로는", "한항목", "할 따름이다", "할 생각이다", "할 줄 안다", "할 지경이다", "할 힘이 있다", "할때", "할만하다", "할망정", "할뿐", "할수있다", "할수있어", "할줄알다", "할지라도", "할지언정", "함께", "해도된다", "해도좋다", "해봐요", "해서는 안된다", "해야한다", "해요", "했어요", "향하다", "향하여", "향해서", "허", "허걱", "허허", "헉", "헉헉", "헐떡헐떡", "형식으로 쓰여", "혹시", "혹은", "혼자", "훨씬", "휘익", "휴", "흐흐", "흥", "힘입어", "︿", "!", "#", "$", "%", "&", "(", ")", "*", "+", ",", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ":", ";", "<", ">", "?", "@", "[", "]", "{", "|", "}", "~", "¥", ]; ================================================ FILE: src/stopwords/lat.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) // Notice: we do not have stopwords for this language yet. 
// NOTE(review): the header notice just above ("we do not have stopwords for this language yet") is stale: the Latin stopword list below is populated.
pub static STOPWORDS_LAT: &[&str] = &[ "a", "ab", "ac", "ad", "at", "atque", "aut", "autem", "cum", "de", "dum", "e", "erant", "erat", "est", "et", "etiam", "ex", "haec", "hic", "hoc", "in", "ita", "me", "nec", "neque", "non", "per", "qua", "quae", "quam", "qui", "quibus", "quidem", "quo", "quod", "re", "rebus", "rem", "res", "sed", "si", "sic", "sunt", "tamen", "tandem", "te", "ut", "vel", ]; ================================================ FILE: src/stopwords/lav.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_LAV: &[&str] = &[ "aiz", "ap", "apakš", "apakšpus", "ar", "arī", "augšpus", "bet", "bez", "bija", "biji", "biju", "bijām", "bijāt", "būs", "būsi", "būsiet", "būsim", "būt", "būšu", "caur", "diemžēl", "diezin", "droši", "dēļ", "esam", "esat", "esi", "esmu", "gan", "gar", "iekam", "iekams", "iekām", "iekāms", "iekš", "iekšpus", "ik", "ir", "it", "itin", "iz", "ja", "jau", "jeb", "jebšu", "jel", "jo", "jā", "ka", "kamēr", "kaut", "kolīdz", "kopš", "kā", "kļuva", "kļuvi", "kļuvu", "kļuvām", "kļuvāt", "kļūs", "kļūsi", "kļūsiet", "kļūsim", "kļūst", "kļūstam", "kļūstat", "kļūsti", "kļūstu", "kļūt", "kļūšu", "labad", "lai", "lejpus", "līdz", "līdzko", "ne", "nebūt", "nedz", "nekā", "nevis", "nezin", "no", "nu", "nē", "otrpus", "pa", "par", "pat", "pie", "pirms", "pret", "priekš", "pār", "pēc", "starp", "tad", "tak", "tapi", "taps", "tapsi", "tapsiet", "tapsim", "tapt", "tapāt", "tapšu", "taču", "te", "tiec", "tiek", "tiekam", "tiekat", "tieku", "tik", "tika", "tikai", "tiki", "tikko", "tiklab", "tiklīdz", "tiks", "tiksiet", "tiksim", "tikt", "tiku", "tikvien", "tikām", "tikāt", "tikšu", "tomēr", "topat", "turpretim", "turpretī", "tā", "tādēļ", "tālab", "tāpēc", "un", "uz", "vai", "var", "varat", "varēja", "varēji", "varēju", "varējām", "varējāt", "varēs", "varēsi", "varēsiet", "varēsim",
"varēt", "varēšu", "vien", "virs", "virspus", "vis", "viņpus", "zem", "ārpus", "šaipus", ]; ================================================ FILE: src/stopwords/lit.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) // Notice: Lithuanian diacritics were restored here; the imported list was Windows-1257 text mis-decoded as Latin-1, so accented entries could never match real UTF-8 tokens. pub static STOPWORDS_LIT: &[&str] = &[ "abi", "abidvi", "abiejose", "abiejuose", "abiejų", "abiem", "abigaliai", "abipus", "abu", "abudu", "ai", "ana", "anaiptol", "anaisiais", "anajai", "anajam", "anajame", "anapus", "anas", "anasai", "anasis", "anei", "aniedvi", "anieji", "aniesiems", "anoji", "anojo", "anojoje", "anokia", "anoks", "anosiomis", "anosioms", "anosios", "anosiose", "anot", "ant", "antai", "anuodu", "anuoju", "anuosiuose", "anuosius", "anąja", "anąją", "anąjį", "anąsias", "anųjų", "apie", "aplink", "ar", "arba", "argi", "arti", "aukščiau", "aš", "be", "bei", "beje", "bemaž", "bent", "bet", "betgi", "beveik", "dar", "dargi", "daugmaž", "deja", "dėka", "dėl", "dėlei", "dėlto", "ech", "et", "gal", "galbūt", "galgi", "gan", "gana", "gi", "greta", "idant", "iki", "ir", "irgi", "it", "itin", "iš", "išilgai", "išvis", "jaisiais", "jajai", "jajam", "jajame", "jei", "jeigu", "ji", "jiedu", "jiedvi", "jieji", "jiesiems", "jinai", "jis", "jisai", "jog", "joji", "jojo", "jojoje", "jokia", "joks", "josiomis", "josioms", "josios", "josiose", "judu", "judvi", "juk", "jumis", "jums", "jumyse", "juodu", "juoju", "juosiuose", "juosius", "jus", "jąja", "jąją", "jąsias", "jįjį", "jųjų", "jūs", "jūsiškis", "jūsiškė", "jūsų", "kad", "kada", "kadangi", "kai", "kaip", "kaipgi", "kas", "katra", "katras", "katriedvi", "katruodu", "kažin", "kažkas", "kažkatra", "kažkatras", "kažkokia", "kažkoks", "kažkuri", "kažkuris", "kiaurai", "kiek", "kiekvienas", "kieno", "kita", "kitas", "kitokia", "kitoks", "kodėl", "kokia", "koks", "kol", "kolei", "kone", "kuomet", "kur", "kurgi", "kuri", "kuriedvi", "kuris", "kuriuodu",
"lai", "lig", "ligi", "link", "lyg", "man", "manaisiais", "manajai", "manajam", "manajame", "manas", "manasai", "manasis", "mane", "manieji", "maniesiems", "manim", "manimi", "maniškis", "maniškė", "mano", "manoji", "manojo", "manojoje", "manosiomis", "manosioms", "manosios", "manosiose", "manuoju", "manuosiuose", "manuosius", "manyje", "manąja", "manąją", "manąjį", "manąsias", "manęs", "manųjų", "mat", "maždaug", "mažne", "mes", "mudu", "mudvi", "mumis", "mums", "mumyse", "mus", "mūsiškis", "mūsiškė", "mūsų", "na", "nagi", "ne", "nebe", "nebent", "negi", "negu", "nei", "nejau", "nejaugi", "nekaip", "nelyginant", "nes", "net", "netgi", "netoli", "neva", "nors", "nuo", "nė", "o", "ogi", "oi", "paeiliui", "pagal", "pakeliui", "palaipsniui", "palei", "pas", "pasak", "paskos", "paskui", "paskum", "pat", "pati", "patiems", "paties", "pats", "patys", "patį", "pačiais", "pačiam", "pačiame", "pačiu", "pačiuose", "pačius", "pačių", "per", "pernelyg", "pirm", "pirma", "pirmiau", "po", "prie", "prieš", "priešais", "pro", "pusiau", "rasi", "rodos", "sau", "savaisiais", "savajai", "savajam", "savajame", "savas", "savasai", "savasis", "save", "savieji", "saviesiems", "savimi", "saviškis", "saviškė", "savo", "savoji", "savojo", "savojoje", "savosiomis", "savosioms", "savosios", "savosiose", "savuoju", "savuosiuose", "savuosius", "savyje", "savąja", "savąją", "savąjį", "savąsias", "savęs", "savųjų", "skersai", "skradžiai", "stačiai", "su", "sulig", "ta", "tad", "tai", "taigi", "taip", "taipogi", "taisiais", "tajai", "tajam", "tajame", "tamsta", "tarp", "tarsi", "tartum", "tarytum", "tas", "tasai", "tau", "tavaisiais", "tavajai", "tavajam", "tavajame", "tavas", "tavasai", "tavasis", "tave", "tavieji", "taviesiems", "tavimi", "taviškis", "taviškė", "tavo", "tavoji", "tavojo", "tavojoje", "tavosiomis", "tavosioms", "tavosios", "tavosiose", "tavuoju", "tavuosiuose", "tavuosius", "tavyje", "tavąja", "tavąją", "tavąjį", "tavąsias", "tavęs", "tavųjų", "tačiau", "te", "tegu", "tegul",
"tiedvi", "tieji", "ties", "tiesiems", "tiesiog", "tik", "tikriausiai", "tiktai", "toji", "tojo", "tojoje", "tokia", "toks", "tol", "tolei", "toliau", "tosiomis", "tosioms", "tosios", "tosiose", "tu", "tuodu", "tuoju", "tuosiuose", "tuosius", "turbūt", "tąja", "tąją", "tąjį", "tąsias", "tųjų", "tūlas", "už", "užtat", "užvis", "va", "vai", "viduj", "vidury", "vien", "vienas", "vienokia", "vienoks", "vietoj", "virš", "viršuj", "viršum", "vis", "vis dėlto", "visa", "visas", "visgi", "visokia", "visoks", "vos", "vėl", "vėlgi", "ypač", "į", "įkypai", "įstrižai", "šalia", "še", "ši", "šiaisiais", "šiajai", "šiajam", "šiajame", "šiapus", "šiedvi", "šieji", "šiesiems", "šioji", "šiojo", "šiojoje", "šiokia", "šioks", "šiosiomis", "šiosioms", "šiosios", "šiosiose", "šis", "šisai", "šit", "šita", "šitas", "šitiedvi", "šitokia", "šitoks", "šituodu", "šiuodu", "šiuoju", "šiuosiuose", "šiuosius", "šiąja", "šiąją", "šiąsias", "šiųjų", "štai", "šįjį", "žemiau", ]; ================================================ FILE: src/stopwords/mal.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_MAL: &[&str] = &[ "കാണാന്‍", "നിന്ന്", "കുറഞ്ഞ", "മുഴുവന്", "കൂടാതെ", "ആദ്യം", "ഈ", "കൂടുതല്‍", "താങ്കള്‍", "എന്നാല്", "അതിനു", "ശേഷം", "ചെയ്യുന്നു", "ഇവിടത്തെ", "വേണ്ടി", "ഏറ്റവും", "ഇതില്", "വേണ്ടിയും", "ആണ്", "സ്ഥിതിചെയ്യുന്നു", "സ്ഥിതി", "സ്ഥിതിചെയ്യുന്ന", "ചെയ്യണം", "നമ്മുടെ", "ഇപ്പോള്", "ഒരു", "തന്റെ", "ചെയ്യുന്ന", "എന്ന", "ചെയ്യുന്നത്", "ഉണ്ട്", "മുന്‍പ്", "മുമ്പ്", "കൂടെ", "ചേര്‍ത്തു", "ഇപ്രകാരം", "എന്നിവയുടെ", "കഴിയും", "എന്നീ", "ഇതാണ്", "വളരെ", "കാരണം", "ഇവിടത്തെ", "എപ്പോഴും", "കൊണ്ട്", "നല്ല", "ധാരാളം", "എപ്പോഴും", "ഇവ", "കാരണം", "ഇതു", "മാത്രമല്ല", "മറ്റു", "എന്നിവ", "കൂടിയാണ്", "ഇടയില്", "ഇല്ല", "എന്നാണ്", "എന്നു", "കുറച്ച്", "അതായത്", "എന്തെന്നാല്", "എന്നറിയപ്പെടുന്നു", "കിടക്കുന്ന", "പോയാല്", "ഇത്", "എല്ലാ",
"വേണ്ടി", "ഇവിടെ", "വരുന്നു", "പോലുള്ള", "വലിയ", "പറഞ്ഞ്", "ഇതിനെ", "കൊടുത്തിട്ടും", "എന്ന്", "വേണം", "ഒരുപോലെ", "ഒരു പോലെ", "കാര്യമാണ്", "കഴിയുന്നു", "വളരെ", "അധികം", "വളരെ അധികം", "വളരെയധികം", "പോയി", "ഉണ്ടാകുന്നുണ്ട്", "പക്ഷേ", "അതേ", "കൊണ്ട്", "ഏത്", "നിന്നും", "എത്താന്‍", "അടുത്ത്", "ആയി", "എന്നു പറയുന്നു", "ഇപ്പോൾ", "ഏകദേശം", "എന്നുപറയുന്നു", "കാണാൻ", "ആ", "വിവിധ", "ഇതിന്റെ", "നിന്നു", "ഇതിന്", "അടുത്ത", "അടുത്തുള്ള", "പല", "പ്രധാന", "നിലനിൽക്കുന്ന", "നിലനിൽക്കുന്നത്", "മുതലായവ", "മുതലായവക്ക്", "വേണ്ട", "പ്രാധാന്യം", ]; ================================================ FILE: src/stopwords/mar.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_MAR: &[&str] = &[ "अधिक", "अनेक", "अशी", "असलयाचे", "असलेल्या", "असा", "असून", "असे", "आज", "आणि", "आता", "आपल्या", "आला", "आली", "आले", "आहे", "आहेत", "एक", "एका", "कमी", "करणयात", "करून", "का", "काम", "काय", "काही", "किवा", "की", "केला", "केली", "केले", "कोटी", "गेल्या", "घेऊन", "जात", "झाला", "झाली", "झाले", "झालेल्या", "टा", "डॉ", "तर", "तरी", "तसेच", "ता", "ती", "तीन", "ते", "तो", "त्या", "त्याचा", "त्याची", "त्याच्या", "त्याना", "त्यानी", "त्यामुळे", "त्री", "दिली", "दोन", "न", "नाही", "निर्ण्य", "पण", "पम", "परयतन", "पाटील", "म", "मात्र", "माहिती", "मी", "मुबी", "म्हणजे", "म्हणाले", "म्हणून", "या", "याचा", "याची", "याच्या", "याना", "यानी", "येणार", "येत", "येथील", "येथे", "लाख", "व", "व्यकत", "सर्व", "सागित्ले", "सुरू", "हजार", "हा", "ही", "हे", "होणार", "होत", "होता", "होती", "होते", ]; ================================================ FILE: src/stopwords/mkd.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) // Notice: we do not have stopwords for this language yet. 
pub static STOPWORDS_MKD: &[&str] = &[]; ================================================ FILE: src/stopwords/mod.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) // All stopwords are sourced from: https://github.com/stopwords-iso // Last update: 7th March 2019 pub mod afr; pub mod aka; pub mod amh; pub mod ara; pub mod aze; pub mod bel; pub mod ben; pub mod bul; pub mod cat; pub mod ces; pub mod cmn; pub mod dan; pub mod deu; pub mod ell; pub mod eng; pub mod epo; pub mod est; pub mod fin; pub mod fra; pub mod guj; pub mod heb; pub mod hin; pub mod hrv; pub mod hun; pub mod hye; pub mod ind; pub mod ita; pub mod jav; pub mod jpn; pub mod kan; pub mod kat; pub mod khm; pub mod kor; pub mod lat; pub mod lav; pub mod lit; pub mod mal; pub mod mar; pub mod mkd; pub mod mya; pub mod nep; pub mod nld; pub mod nob; pub mod ori; pub mod pan; pub mod pes; pub mod pol; pub mod por; pub mod ron; pub mod rus; pub mod sin; pub mod slk; pub mod slv; pub mod sna; pub mod spa; pub mod srp; pub mod swe; pub mod tam; pub mod tel; pub mod tgl; pub mod tha; pub mod tuk; pub mod tur; pub mod ukr; pub mod urd; pub mod uzb; pub mod vie; pub mod yid; pub mod zul; ================================================ FILE: src/stopwords/mya.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_MYA: &[&str] = &[ "အပေါ်", "အနက်", "အမြဲတမ်း", "အတွင်းတွင်", "မကြာမီ", "မတိုင်မီ", "ဒါ့အပြင်", "အောက်မှာ", "အထဲမှာ", "ဘယ်တော့မျှ", "မကြာခဏ", "တော်တော်လေး", "စဉ်တွင်", "နှင့်အတူ", "နှင့်", "နှင့်တကွ", "ကျွန်တော်", "ကျွန်မ", "ငါ", "ကျုပ်", "ကျွနု်ပ်", "ကျနော်", "ကျမ", "သူ", "သူမ", "ထိုဟာ", "ထိုအရာ", "ဤအရာ", "ထို", "၄င်း", "ကျွန်တော်တို့", "ကျွန်မတို့", "ငါတို့", "ကျုပ်တို့", 
"ကျွနု်ပ်တို့", "ကျနော်တို့", "ကျမတို့", "သင်", "သင်တို့", "နင်တို့", "မင်း", "မင်းတို့", "သူတို့", "ကျွန်တော်အား", "ကျွန်တော်ကို", "ကျွန်မကို", "ငါကို", "ကျုပ်ကို", "ကျွနု်ပ်ကို", "သူ့ကို", "သူမကို", "ထိုအရာကို", "သင့်ကို", "သင်တို့ကို", "နင်တို့ကို", "မင်းကို", "မင်းတို့ကို", "ငါတို့ကို", "ကျုပ်တို့ကို", "ကျွနု်ပ်တို့ကို", "မိမိကိုယ်တိုင်", "မိမိဘာသာ", "မင်းကိုယ်တိုင်", "မင်းဘာသာ", "မင်းတို့ကိုယ်တိုင်", "မင်းတို့ဘာသာ", "သူကိုယ်တိုင်", "ကိုယ်တိုင်", "သူမကိုယ်တိုင်", "သူ့ဘာသာ", "သူ့ကိုယ်ကို", "ကိုယ့်ကိုယ်ကို", "မိမိကိုယ်ကို", "၄င်းပင်", "ထိုအရာပင်", "သည့်", "မည့်", "တဲ့", "ကျွနု်ပ်၏", "ကျွန်တော်၏", "ကျွန်မ၏", "ကျနော်၏", "ကျမ၏", "သူ၏", "သူမ၏", "ထိုအရာ၏", "ထိုဟာ၏", "ကျွနု်ပ်တို့၏", "ငါတို့၏", "ကျွန်တော်တို့၏", "ကျွန်မတို့၏", "ကျနော်တို့၏", "ကျမတို့၏", "သင်၏", "သင်တို့၏", "မင်း၏", "မင်းတို့၏", "သူတို့၏", "ကျွန်တော့်ဟာ", "ကျွန်မဟာ", "ကျနော်၏ဟာ", "ကျမ၏ဟာ", "ကျမဟာ", "ကျနော်ဟာ", "သူဟာ", "သူမဟာ", "သူ့ဟာ", "ကျွနု်ပ်တို့ဟာ", "ကျွန်တော်တို့ဟာ", "ကျွန်မတို့ဟာ", "သင်တို့ဟာ", "မင်းတို့ဟာ", "သူတို့ဟာ", "သူမတို့ဟာ", "ဤအရာ", "ဟောဒါ", "ဟောဒီ", "ဟောဒီဟာ", "ဒီဟာ", "ဒါ", "ထိုအရာ", "၄င်းအရာ", "ယင်းအရာ", "အဲဒါ", "ဟိုဟာ", "အချို့", "တစ်ခုခု", "အဘယ်မဆို", "ဘယ်အရာမဆို", "အဘယ်မည်သော", "အကြင်", "အရာရာတိုင်း", "စိုးစဉ်မျှ", "စိုးစဉ်းမျှ", "ဘယ်လောက်မဆို", "တစ်စုံတစ်ရာ", "တစုံတရာ", "အလျဉ်းမဟုတ်", "မည်သည့်နည်းနှင့်မျှမဟုတ်", "အလျဉ်းမရှိသော", "အခြားဖြစ်သော", "အခြားသော", "အခြားတစ်ခု", "အခြားတစ်ယောက်", "အားလုံး", "အရာရာတိုင်း", "အကုန်လုံး", "အလုံးစုံ", "အရာခပ်သိမ်း", "တစ်ခုစီ", "အသီးသီး", "တစ်ဦးဦး", "တစ်ခုခု", "ကိုယ်စီကိုယ်ငှ", "ကိုယ်စီ", "တစ်ဦးစီ", "တစ်ယောက်စီ", "တစ်ခုစီ", "အကုန်", "အပြည့်အစုံ", "လုံးလုံး", "နှစ်ခုလုံး", "နှစ်ယောက်လုံး", "နှစ်ဘက်လုံး", "တစ်စုံတစ်ရာ", "တစ်စုံတစ်ခု", "တစုံတခု", "တစ်စုံတစ်ယောက်", "တစုံတယောက်", "တစ်ယောက်ယောက်", "မည်သူမဆို", "ဘာမျှမရှိ", "ဘာမှမရှိ", "အဘယ်အရာမျှမရှိ", "လူတိုင်း", "လူတကာ", "နှင့်", "ပြီးလျှင်", "၄င်းနောက်", "သို့မဟုတ်", "သို့တည်းမဟုတ်", "သို့မဟုတ်လျှင်", "ဒါမှမဟုတ်", "ဖြစ်စေ", "သို့စေကာမူ", "ဒါပေမယ့်", "ဒါပေမဲ့", "မှတစ်ပါး", "မှလွဲလျှင်", 
"အဘယ်ကြောင့်ဆိုသော်", "သောကြောင့်", "သဖြင့်", "၍", "သည့်အတွက်ကြောင့်", "လျှင်", "ပါက", "အကယ်၍", "သော်ငြားလည်း", "စေကာမူ", "နည်းတူ", "ပေမယ့်", "ပေမဲ့", "ထိုနည်းတူစွာ", "ထိုနည်းတူ", "ကဲ့သို့", "သကဲ့သို့", "ယင်းကဲ့သို့", "ထိုကဲ့သို့", "နှင့်စပ်လျဉ်း၍", "ဤမျှ", "ဤမျှလောက်", "ဤကဲ့သို့", "အခုလောက်ထိ", "ဒါကတော့", "အဘယ်ကဲ့သလို့", "မည်ကဲ့သို့", "မည်သည့်နည်းနှင့်", "မည်သည့်နည်းဖြင့်", "မည်သည့်နည့်နှင့်မဆို", "မည်သည့်နည်းဖြင့်မဆို", "မည်သို့", "ဘယ်လိုလဲ", "သို့ပေတည့်", "သို့ပေမည့်", "ဘယ်နည်းနှင့်", "မည်ရွေ့မည်မျှ", "အဘယ်မျှလောက်", "ဘယ်လောက်", "မည်သူ", "ဘယ်သူ", "မည်သည့်အကြောင်းကြောင့်", "ဘာအတွက်ကြောင့်", "အဘယ်ကြောင့်", "မည်သည့်အတွက်ကြောင့်", "ဘာကြောင့်", "ဘာအတွက်နဲ့လဲ", "မည်သည်", "ဘာလဲ", "အဘယ်အရာနည်း", "မည်သည့်အရပ်မှာ", "ဘယ်နေရာတွင်", "မည်သည့်နေရာတွင်", "မည်သည့်နေရာသို့", "ဘယ်နေရာသို့", "ဘယ်နေရာမှာ", "ဘယ်သူ၏", "မည်သည့်အရာ၏", "မည်သည့်အခါ", "ဘယ်အချိန်", "ဘယ်အခါ", "မည်သည့်အချိန်", "ဘယ်တော့", "မည်သူကို", "မည်သူက", "ဘယ်သူ့ကို", "မည်သူမည်ဝါ", "မည်သည့်အရာ", "ဘယ်အရာ", "မည်သို့ပင်ဖြစ်စေ", "ဘယ်လိုပဲဖြစ်ဖြစ်", "မည်ရွေ့မည်မျှဖြစ်စေ", "မည်သည့်နည်းနှင့်မဆို", "ဘယ်နည်းနဲ့ဖြစ်ဖြစ်", "မည်သူမဆို", "ဘယ်သူမဆို", "အဘယ်သူမဆို", "မည်သည့်အရာမဆို", "ဘာဖြစ်ဖြစ်", "မည်သည့်အရာဖြစ်ဖြစ်", "မည်သည့်အရပ်၌မဆို", "မည်သည့်နေရာမဆို", "ဘယ်အခါမဆို", "ဘယ်အချိန်မဆို", "ဘယ်အခါဖြစ်ဖြစ်", "အချိန်အခါမရွေး", ]; ================================================ FILE: src/stopwords/nep.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_NEP: &[&str] = &[ "छ", "र", "पनि", "छन्", "लागि", "भएको", "गरेको", "भने", "गर्न", "गर्ने", "हो", "तथा", "यो", "रहेको", "उनले", "थियो", "हुने", "गरेका", "थिए", "गर्दै", "तर", "नै", "को", "मा", "हुन्", "भन्ने", "हुन", "गरी", "त", "हुन्छ", "अब", "के", "रहेका", "गरेर", "छैन", "दिए", "भए", "यस", "ले", "गर्नु", "औं", "सो", "त्यो", "कि", "जुन", "यी", "का", "गरि", "ती", "न", "छु", "छौं", "लाई", "नि", "उप", "अक्सर", "आदि", "कसरी", 
"क्रमशः", "चाले", "अगाडी", "अझै", "अनुसार", "अन्तर्गत", "अन्य", "अन्यत्र", "अन्यथा", "अरु", "अरुलाई", "अर्को", "अर्थात", "अर्थात्", "अलग", "आए", "आजको", "ओठ", "आत्म", "आफू", "आफूलाई", "आफ्नै", "आफ्नो", "आयो", "उदाहरण", "उनको", "उहालाई", "एउटै", "एक", "एकदम", "कतै", "कम से कम", "कसै", "कसैले", "कहाँबाट", "कहिलेकाहीं", "का", "किन", "किनभने", "कुनै", "कुरा", "कृपया", "केही", "कोही", "गए", "गरौं", "गर्छ", "गर्छु", "गर्नुपर्छ", "गयौ", "गैर", "चार", "चाहनुहुन्छ", "चाहन्छु", "चाहिए", "छू", "जताततै", "जब", "जबकि", "जसको", "जसबाट", "जसमा", "जसलाई", "जसले", "जस्तै", "जस्तो", "जस्तोसुकै", "जहाँ", "जान", "जाहिर", "जे", "जो", "ठीक", "तत्काल", "तदनुसार", "तपाईको", "तपाई", "पर्याप्त", "पहिले", "पहिलो", "पहिल्यै", "पाँच", "पाँचौं", "तल", "तापनी", "तिनी", "तिनीहरू", "तिनीहरुको", "तिनिहरुलाई", "तिमी", "तिर", "तीन", "तुरुन्तै", "तेस्रो", "तेस्कारण", "पूर्व", "प्रति", "प्रतेक", "प्लस", "फेरी", "बने", "त्सपछि", "त्सैले", "त्यहाँ", "थिएन", "दिनुभएको", "दिनुहुन्छ", "दुई", "देखि", "बरु", "बारे", "बाहिर", "देखिन्छ", "देखियो", "देखे", "देखेको", "देखेर", "दोस्रो", "धेरै", "नजिकै", "नत्र", "नयाँ", "निम्ति", "बाहेक", "बीच", "बीचमा", "भन", "निम्न", "निम्नानुसार", "निर्दिष्ट", "नौ", "पक्का", "पक्कै", "पछि", "पछिल्लो", "पटक", "पर्छ", "पर्थ्यो", "भन्छन्", "भन्", "भन्छु", "भन्दा", "भन्नुभयो", "भर", "भित्र", "भित्री", "म", "मलाई", "मात्र", "माथि", "मुख्य", "मेरो", "यति", "यथोचित", "यदि", "यद्यपि", "यसको", "यसपछि", "यसबाहेक", "यसरी", "यसो", "यस्तो", "यहाँ", "यहाँसम्म", "या", "रही", "राखे", "राख्छ", "राम्रो", "रूप", "लगभग", "वरीपरी", "वास्तवमा", "बिरुद्ध", "बिशेष", "सायद", "शायद", "संग", "संगै", "सक्छ", "सट्टा", "सधै", "सबै", "सबैलाई", "समय", "सम्भव", "सम्म", "सही", "साँच्चै", "सात", "साथ", "साथै", "सारा", "सोही", "स्पष्ट", "हरे", "हरेक", ]; ================================================ FILE: src/stopwords/nld.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 
(MPL v2.0) pub static STOPWORDS_NLD: &[&str] = &[ "aan", "aangaande", "aangezien", "achte", "achter", "achterna", "af", "afgelopen", "al", "aldaar", "aldus", "alhoewel", "alias", "alle", "allebei", "alleen", "alles", "als", "alsnog", "altijd", "altoos", "ander", "andere", "anders", "anderszins", "beetje", "behalve", "behoudens", "beide", "beiden", "ben", "beneden", "bent", "bepaald", "betreffende", "bij", "bijna", "bijv", "binnen", "binnenin", "blijkbaar", "blijken", "boven", "bovenal", "bovendien", "bovengenoemd", "bovenstaand", "bovenvermeld", "buiten", "bv", "daar", "daardoor", "daarheen", "daarin", "daarna", "daarnet", "daarom", "daarop", "daaruit", "daarvanlangs", "dan", "dat", "de", "deden", "deed", "der", "derde", "derhalve", "dertig", "deze", "dhr", "die", "dikwijls", "dit", "doch", "doe", "doen", "doet", "door", "doorgaand", "drie", "duizend", "dus", "echter", "een", "eens", "eer", "eerdat", "eerder", "eerlang", "eerst", "eerste", "eigen", "eigenlijk", "elk", "elke", "en", "enig", "enige", "enigszins", "enkel", "er", "erdoor", "erg", "ergens", "etc", "etcetera", "even", "eveneens", "evenwel", "gauw", "ge", "gedurende", "geen", "gehad", "gekund", "geleden", "gelijk", "gemoeten", "gemogen", "genoeg", "geweest", "gewoon", "gewoonweg", "haar", "haarzelf", "had", "hadden", "hare", "heb", "hebben", "hebt", "hedden", "heeft", "heel", "hem", "hemzelf", "hen", "het", "hetzelfde", "hier", "hierbeneden", "hierboven", "hierin", "hierna", "hierom", "hij", "hijzelf", "hoe", "hoewel", "honderd", "hun", "hunne", "ieder", "iedere", "iedereen", "iemand", "iets", "ik", "ikzelf", "in", "inderdaad", "inmiddels", "intussen", "inzake", "is", "ja", "je", "jezelf", "jij", "jijzelf", "jou", "jouw", "jouwe", "juist", "jullie", "kan", "klaar", "kon", "konden", "krachtens", "kun", "kunnen", "kunt", "laatst", "later", "liever", "lijken", "lijkt", "maak", "maakt", "maakte", "maakten", "maar", "mag", "maken", "me", "meer", "meest", "meestal", "men", "met", "mevr", "mezelf", "mij", 
"mijn", "mijnent", "mijner", "mijzelf", "minder", "miss", "misschien", "missen", "mits", "mocht", "mochten", "moest", "moesten", "moet", "moeten", "mogen", "mr", "mrs", "mw", "na", "naar", "nadat", "nam", "namelijk", "nee", "neem", "negen", "nemen", "nergens", "net", "niemand", "niet", "niets", "niks", "noch", "nochtans", "nog", "nogal", "nooit", "nu", "nv", "of", "ofschoon", "om", "omdat", "omhoog", "omlaag", "omstreeks", "omtrent", "omver", "ondanks", "onder", "ondertussen", "ongeveer", "ons", "onszelf", "onze", "onzeker", "ooit", "ook", "op", "opnieuw", "opzij", "over", "overal", "overeind", "overige", "overigens", "paar", "pas", "per", "precies", "recent", "redelijk", "reeds", "rond", "rondom", "samen", "sedert", "sinds", "sindsdien", "slechts", "sommige", "spoedig", "steeds", "tamelijk", "te", "tegen", "tegenover", "tenzij", "terwijl", "thans", "tien", "tiende", "tijdens", "tja", "toch", "toe", "toen", "toenmaals", "toenmalig", "tot", "totdat", "tussen", "twee", "tweede", "u", "uit", "uitgezonderd", "uw", "vaak", "vaakwat", "van", "vanaf", "vandaan", "vanuit", "vanwege", "veel", "veeleer", "veertig", "verder", "verscheidene", "verschillende", "vervolgens", "via", "vier", "vierde", "vijf", "vijfde", "vijftig", "vol", "volgend", "volgens", "voor", "vooraf", "vooral", "vooralsnog", "voorbij", "voordat", "voordezen", "voordien", "voorheen", "voorop", "voorts", "vooruit", "vrij", "vroeg", "waar", "waarom", "waarschijnlijk", "wanneer", "want", "waren", "was", "wat", "we", "wederom", "weer", "weg", "wegens", "weinig", "wel", "weldra", "welk", "welke", "werd", "werden", "werder", "wezen", "whatever", "wie", "wiens", "wier", "wij", "wijzelf", "wil", "wilden", "willen", "word", "worden", "wordt", "zal", "ze", "zei", "zeker", "zelf", "zelfde", "zelfs", "zes", "zeven", "zich", "zichzelf", "zij", "zijn", "zijne", "zijzelf", "zo", "zoals", "zodat", "zodra", "zonder", "zou", "zouden", "zowat", "zulk", "zulke", "zullen", "zult", ]; 
================================================ FILE: src/stopwords/nob.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_NOB: &[&str] = &[ "og", "i", "jeg", "det", "at", "en", "et", "den", "til", "er", "som", "på", "de", "med", "han", "av", "ikke", "ikkje", "der", "så", "var", "meg", "seg", "men", "ett", "har", "om", "vi", "min", "mitt", "ha", "hadde", "hun", "nå", "over", "da", "ved", "fra", "du", "ut", "sin", "dem", "oss", "opp", "man", "kan", "hans", "hvor", "eller", "hva", "skal", "selv", "sjøl", "her", "alle", "vil", "bli", "ble", "blei", "blitt", "kunne", "inn", "når", "være", "kom", "noen", "noe", "ville", "dere", "som", "deres", "kun", "ja", "etter", "ned", "skulle", "denne", "for", "deg", "si", "sine", "sitt", "mot", "å", "meget", "hvorfor", "dette", "disse", "uten", "hvordan", "ingen", "din", "ditt", "blir", "samme", "hvilken", "hvilke", "sånn", "inni", "mellom", "vår", "hver", "hvem", "vors", "hvis", "både", "bare", "enn", "fordi", "før", "mange", "også", "slik", "vært", "være", "båe", "begge", "siden", "dykk", "dykkar", "dei", "deira", "deires", "deim", "di", "då", "eg", "ein", "eit", "eitt", "elles", "honom", "hjå", "ho", "hoe", "henne", "hennar", "hennes", "hoss", "hossen", "ikkje", "ingi", "inkje", "korleis", "korso", "kva", "kvar", "kvarhelst", "kven", "kvi", "kvifor", "me", "medan", "mi", "mine", "mykje", "no", "nokon", "noka", "nokor", "noko", "nokre", "si", "sia", "sidan", "so", "somt", "somme", "um", "upp", "vere", "vore", "verte", "vort", "varte", "vart", ]; ================================================ FILE: src/stopwords/ori.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) // Notice: we do not have stopwords for 
this language yet. pub static STOPWORDS_ORI: &[&str] = &[]; ================================================ FILE: src/stopwords/pan.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_PAN: &[&str] = &[ "ਦੇ", "0", "ਵਿੱਚ", "ਦਾ", "ਅਤੇ", "ਦੀ", "ਇੱਕ", "ਨੂੰ", "ਹੈ", "ਤੋਂ", "ਇਸ", "ਇਹ", "ਨੇ", "ਤੇ", "ਨਾਲ", "1", "ਲਈ", "ਵੀ", "ਸੀ", "ਵਿਚ", "ਕਿ", "ਜੋ", "ਉਹ", "ਉਸ", "ਹਨ", "ਜਾਂਦਾ", "ਕੀਤਾ", "2", "ਗਿਆ", "ਹੀ", "ਕੇ", "ਜਾਂ", "ਦੀਆਂ", "ਜਿਸ", "ਕਰਨ", "ਹੋ", "ਕਰ", "ਆਪਣੇ", "ਕੀਤੀ", "ਤੌਰ", "ਬਾਅਦ", "ਨਹੀਂ", "ਭਾਰਤੀ", "ਪਿੰਡ", "3", "ਸਿੰਘ", "ਉੱਤੇ", "ਸਾਲ", "।", "ਪੰਜਾਬ", "ਸਭ", "ਭਾਰਤ", "ਉਨ੍ਹਾਂ", "ਹੁੰਦਾ", "ਤੱਕ", "ਇਕ", "ਹੋਇਆ", "ਜਨਮ", "ਬਹੁਤ", "ਪਰ", "ਦੁਆਰਾ", "ਰੂਪ", "4", "ਹੋਰ", "ਕੰਮ", "ਆਪਣੀ", "ਤਾਂ", "ਸਮੇਂ", "ਪੰਜਾਬੀ", "ਗਈ", "ਦਿੱਤਾ", "ਦੋ", "ਕਿਸੇ", "ਕਈ", "ਜਾ", "ਵਾਲੇ", "ਸ਼ੁਰੂ", "5", "ਉਸਨੇ", "ਕਿਹਾ", "ਹੋਣ", "ਲੋਕ", "ਜਾਂਦੀ", "ਵਿੱਚੋਂ", "ਨਾਮ", "ਜਦੋਂ", "ਪਹਿਲਾਂ", "ਕਰਦਾ", "ਹੁੰਦੀ", "ਹੋਏ", "ਸਨ", "ਵਜੋਂ", "ਰਾਜ", "ਮੁੱਖ", "ਕਰਦੇ", "ਕੁਝ", "ਸਾਰੇ", "ਹੁੰਦੇ", "ਸ਼ਹਿਰ", "ਭਾਸ਼ਾ", "6", "ਹੋਈ", "ਅਨੁਸਾਰ", "ਸਕਦਾ", "ਆਮ", "ਵੱਖ", "ਕੋਈ", "ਵਾਰ", "ਗਏ", "ਖੇਤਰ", "ਜੀ", "ਕਾਰਨ", "ਕਰਕੇ", "ਜਿਵੇਂ", "ਜ਼ਿਲ੍ਹੇ", "ਲੋਕਾਂ", "ਚ", "ਸਾਹਿਤ", "ਸਦੀ", "ਬਾਰੇ", "ਜਾਂਦੇ", "ਵਾਲਾ", "ਜਾਣ", "ਪਹਿਲੀ", "ਪ੍ਰਾਪਤ", "ਰਿਹਾ", "ਵਾਲੀ", "ਨਾਂ", "ਦੌਰਾਨ", "ਤਰ੍ਹਾਂ", "7", "ਯੂਨੀਵਰਸਿਟੀ", "ਨਾ", "ਏ", "ਤਿੰਨ", "ਇਨ੍ਹਾਂ", "ਗੁਰੂ", "ਇਸਨੂੰ", "ਇਹਨਾਂ", "ਪਿਤਾ", "ਲਿਆ", "ਸ਼ਾਮਲ", "ਸ਼ਬਦ", "ਅੰਗਰੇਜ਼ੀ", "ਉਸਨੂੰ", "ਉਹਨਾਂ", "8", "ਸਥਿਤ", "ਫਿਰ", "ਜੀਵਨ", "ਸਕੂਲ", "ਹੁਣ", "ਦਿਨ", "ਕੀਤੇ", "ਆਦਿ", "ਵੱਧ", "ਲੈ", "ਘਰ", "ਵੱਲ", "ਦੇਸ਼", "ਵਲੋਂ", "ਬਣ", "ਵੀਂ", "ਫਿਲਮ", "ਉਮਰ", "ਬਲਾਕ", "ਰਹੇ", "10", "ਸਾਹਿਬ", "ਕਰਦੀ", "ਹਰ", "ਪੈਦਾ", "ਘੱਟ", "9", "ਲੇਖਕ", "ਹਿੱਸਾ", "ਫ਼ਿਲਮ", "ਮੌਤ", "ਜਿੱਥੇ", "ਵੱਡਾ", "ਵਿਖੇ", "ਆਪਣਾ", "ਪਹਿਲਾ", "ਵਰਤੋਂ", "ਆਪ", "ਕਰਨਾ", "ਵਿਆਹ", "ਰਹੀ", "ਰਾਹੀਂ", "ਦਿੱਤੀ", "ਉਸਦੇ", "ਪਰਿਵਾਰ", "ਆ", "ਦੂਜੇ", "ਅਮਰੀਕਾ", "ਮੰਨਿਆ", "ਇਸਦੇ", "ਈ", "ਕਾਲਜ", "ਸਰਕਾਰ", "ਇੱਥੇ", "ਪਾਕਿਸਤਾਨ", "ਸ਼ਾਮਿਲ", "ਵਿਗਿਆਨ", "ਉਸਦੀ", "ਪੇਸ਼", "ਕਿਉਂਕਿ", "ਪਹਿਲੇ", "ਧਰਮ", "ਮਸ਼ਹੂਰ", "ਅੰਦਰ", "ਵਿਚੋਂ", "ਜਿਨ੍ਹਾਂ", 
"ਜਾਣਿਆ", "ਪਾਣੀ", "ਇਲਾਵਾ", "ਅਰਥ", "ਚਾਰ", "ਪ੍ਰਸਿੱਧ", "ਨਾਵਲ", "ਵੱਡੇ", "ਵੱਲੋਂ", "ਕਹਾਣੀ", "ਵਿਸ਼ਵ", "ਮੂਲ", "ਅਮਰੀਕੀ", "ਸਥਾਨ", "ਇਤਿਹਾਸ", "ਕੁੱਝ", "ਵਿਕਾਸ", "ਉੱਤਰ", "ਸਿੱਖਿਆ", "ਹਿੰਦੀ", "ਪ੍ਰਮੁੱਖ", "ਰਚਨਾ", "ਬਣਾਇਆ", "ਵਿਸ਼ੇਸ਼", "ਡਾ", "ਉੱਪਰ", "ਪੱਛਮੀ", "ਦੇਣ", "ਇਸਦਾ", "ਸਕਦੇ", "ਰੱਖਿਆ", "ਕਵੀ", "ਦਿੱਲੀ", "ਵੱਡੀ", "ਭੂਮਿਕਾ", "ਸਮਾਜ", "ਕਾਵਿ", "ਕੀ", "ਕੋਲ", "ਦ", "ਗੱਲ", "ਸੰਸਾਰ", "ਭਾਗ", "ਆਈ", "ਦੱਖਣ", "ਅੱਜ", "ਸਿੱਖ", "ਕਹਿੰਦੇ", "ਸੰਗੀਤ", "ਕਿਲੋਮੀਟਰ", "ਜਿਹਨਾਂ", "ਸਭਾ", "ਜਿਸਦਾ", "ਜਨਵਰੀ", "ਕਵਿਤਾ", "ਮੈਂਬਰ", "ਲਿਖਿਆ", "ਮਾਂ", "ਕਲਾ", "ਪੰਜ", "ਥਾਂ", "ਹੇਠ", "ਜਿਆਦਾ", "ਵਰਤਿਆ", "ਮਾਰਚ", "ਡੀ", "ਅਕਤੂਬਰ", "ਤਕ", "ਨਾਟਕ", "ਬੀ", "ਖਾਸ", "ਇਸੇ", "ਆਧੁਨਿਕ", "ਅਗਸਤ", "ਤਿਆਰ", "ਮਾਤਾ", "ਬਣਾਉਣ", "ਨਵੰਬਰ", "ਵਿਅਕਤੀ", "ਦੱਖਣੀ", "ਦਸੰਬਰ", "ਆਫ", "ਗੀਤ", "ਗਿਣਤੀ", "ਕਾਲ", "ਖੋਜ", "ਸਾਲਾਂ", "ਪੂਰੀ", "ਸਮਾਂ", "ਜ਼ਿਆਦਾ", "ਇਸਦੀ", "ਸਕਦੀ", "ਵਿਚਕਾਰ", "ਰਾਜਧਾਨੀ", "ਉਸਦਾ", "ਜੁਲਾਈ", "ਜੂਨ", "ਅਧੀਨ", "ਸਥਾਪਨਾ", "ਸੇਵਾ", "ਭਾਵ", "ਵਰਗ", "ਛੋਟੇ", "ਦਿੰਦਾ", "ਸਮਾਜਿਕ", "ਹੁੰਦੀਆਂ", "ਟੀਮ", "ਔਰਤਾਂ", "ਅਕਸਰ", "ਪ੍ਰਕਾਸ਼ਿਤ", "ਉਰਦੂ", "ਰੰਗ", "ਪਾਰਟੀ", "ਬਣਾ", "ਪ੍ਰਭਾਵ", "ਸ਼ੁਰੂਆਤ", "ਲਗਭਗ", "ਮਈ", "ਸਿਰਫ", "ਨੇੜੇ", "ਜਿਸਨੂੰ", "ਹਾਲਾਂਕਿ", "ਦੂਰ", "ਸਤੰਬਰ", "ਕਿਤਾਬ", "ਕਦੇ", "ਉੱਤਰੀ", "ਪ੍ਰਕਾਰ", "ਇਸਨੇ", "ਪ੍ਰਦੇਸ਼", "ਅੱਗੇ", "ਸੰਯੁਕਤ", "ਪੜ੍ਹਾਈ", "ਵਧੇਰੇ", "ਨਾਲ਼", "ਮਨੁੱਖ", "ਬਾਕੀ", "ਪ੍ਰਧਾਨ", "ਦੂਜੀ", "ਕੁੱਲ", "ਆਫ਼", "ਅਧਿਐਨ", "ਰਾਸ਼ਟਰੀ", "ਪੁੱਤਰ", "ਅੰਤਰਰਾਸ਼ਟਰੀ", "ਧਰਤੀ", "ਕੇਂਦਰ", "ਦੇਸ਼ਾਂ", "ਮੱਧ", "ਜ਼ਿਲ੍ਹਾ", "ਸਾਰੀਆਂ", "ਪੱਧਰ", "ਹੋਵੇ", "ਜੇ", "ਭਾਈ", "ਰਹਿਣ", "ਪੁਰਸਕਾਰ", "ਸਭਿਆਚਾਰ", "ਪਤਾ", "ਪਾਸੇ", "ਨਵੇਂ", "ਕੰਪਨੀ", "ਬਾਹਰ", "ਵੇਲੇ", "ਸੰਨ", "ਪੂਰਬੀ", "ਵਿਚਾਰ", "ਕਾਰਜ", "ਪੀ", "ਮਹੱਤਵਪੂਰਨ", "ਦੁਨੀਆਂ", "ਧਾਰਮਿਕ", "ਮਨੁੱਖੀ", "ਸਮੂਹ", "ਅਜਿਹੇ", "ਲਾਲ", "ਦੂਜਾ", "ਭਰਾ", "ਸ੍ਰੀ", "ਅੰਤ", "ਜਾਂਦੀਆਂ", "ਸ਼ਾਹ", "ਰਹਿੰਦੇ", "ਮਹਾਨ", "ਚੀਨ", "ਮੀਟਰ", "ਵਰਗੇ", "ਨਾਲੋਂ", "ਹਾਸਲ", "ਕਿਸਮ", "ਅਜਿਹਾ", "ਬਣਿਆ", "ਭਰ", "ਛੱਡ", "ਲੈਣ", "ਹਿੱਸੇ", "ਟੀ", "ਲਿਖੇ", "ਮਿਲ", "ਮੌਜੂਦ", "ਦਿੱਤੇ", "ਵਾਸਤੇ", "ਵਾਲੀਆਂ", "ਵਧੀਆ", "ਰੂਸੀ", "ਜਾਰੀ", "ਸਰਕਾਰੀ", "ਡਿਗਰੀ", "ਪੱਛਮ", "ਲੜਾਈ", "ਭਾਸ਼ਾਵਾਂ", "ਰਾਜਾ", "ਜਲੰਧਰ", "ਹਿੰਦੂ", "ਔਰਤ", "ਜੰਗ", "ਬਾਬਾ", "ਬੱਚਿਆਂ", "ਮੰਤਰੀ", "ਪਟਿਆਲਾ", "ਵਾਂਗ", "ਆਉਣ", "ਭਾਵੇਂ", "ਕੇਵਲ", "ਐਸ", "ਪ੍ਰਾਚੀਨ", "ਰਹਿੰਦਾ", "ਬੋਲੀ", "ਅਵਾਰਡ", "ਨਗਰ", "ਖੇਡਾਂ", "ਫਿਲਮਾਂ", "ਬੱਚੇ", "ਕੌਰ", "ਤੋ", "ਪ੍ਰਤੀ", 
"ਕੁਆਂਟਮ", "ਅਬਾਦੀ", "ਪੁਸਤਕ", "ਐਮ", "ਰਾਮ", "ਖੇਤਰਾਂ", "ਫਰਵਰੀ", "ਕ੍ਰਿਕਟ", "ਪੈਂਦਾ", "ਇਤਿਹਾਸਕ", "ਲੱਗ", "ਬ੍ਰਿਟਿਸ਼", "ਆਇਆ", "ਮਿਲਦਾ", ]; ================================================ FILE: src/stopwords/pes.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_PES: &[&str] = &[ "!", ",", ".", ":", ";", "،", "؛", "؟", "آباد", "آره", "آری", "آمد", "آمده", "آن", "آنان", "آنجا", "آنطور", "آنقدر", "آنكه", "آنها", "آنچه", "آنکه", "آورد", "آورده", "آيد", "آی", "آیا", "آیند", "اتفاقا", "اثرِ", "احتراما", "احتمالا", "اخیر", "اری", "از", "ازجمله", "اساسا", "است", "استفاد", "استفاده", "اش", "اشکارا", "اصلا", "اصولا", "اعلام", "اغلب", "اكنون", "الان", "البته", "البتّه", "ام", "اما", "امروز", "امروزه", "امسال", "امشب", "امور", "ان", "انجام", "اند", "انشاالله", "انصافا", "انطور", "انقدر", "انها", "انچنان", "انکه", "انگار", "او", "اول", "اولا", "اي", "ايشان", "ايم", "اين", "اينكه", "اکثرا", "اکنون", "اگر", "ای", "ایا", "اید", "ایشان", "ایم", "این", "اینجا", "ایند", "اینطور", "اینقدر", "اینها", "اینچنین", "اینک", "اینکه", "اینگونه", "با", "بار", "بارة", "باره", "بارها", "باز", "بازهم", "باش", "باشد", "باشم", "باشند", "باشيم", "باشی", "باشید", "باشیم", "بالا", "بالاخره", "بالایِ", "بالطبع", "بايد", "باید", "بتوان", "بتواند", "بتوانی", "بتوانیم", "بخش", "بخشی", "بخواه", "بخواهد", "بخواهم", "بخواهند", "بخواهی", "بخواهید", "بخواهیم", "بد", "بدون", "بر", "برابر", "برابرِ", "براحتی", "براساس", "براستی", "براي", "برای", "برایِ", "برخوردار", "برخي", "برخی", "برداري", "برعکس", "بروز", "بزرگ", "بزودی", "بسا", "بسيار", "بسياري", "بسیار", "بسیاری", "بطور", "بعد", "بعدا", "بعدها", "بعری", "بعضا", "بعضي", "بلافاصله", "بلكه", "بله", "بلکه", "بلی", "بنابراين", "بنابراین", "بندي", "به", "بهتر", "بهترين", "بود", "بودم", "بودن", "بودند", "بوده", "بودی", "بودید", "بودیم", "بویژه", "بي", "بيست", "بيش", "بيشتر", "بيشتري", "بين", "بکن", 
"بکند", "بکنم", "بکنند", "بکنی", "بکنید", "بکنیم", "بگو", "بگوید", "بگویم", "بگویند", "بگویی", "بگویید", "بگوییم", "بگیر", "بگیرد", "بگیرم", "بگیرند", "بگیری", "بگیرید", "بگیریم", "بی", "بیا", "بیاب", "بیابد", "بیابم", "بیابند", "بیابی", "بیابید", "بیابیم", "بیاور", "بیاورد", "بیاورم", "بیاورند", "بیاوری", "بیاورید", "بیاوریم", "بیاید", "بیایم", "بیایند", "بیایی", "بیایید", "بیاییم", "بیرون", "بیرونِ", "بیش", "بیشتر", "بیشتری", "بین", "ت", "تا", "تازه", "تاكنون", "تان", "تاکنون", "تحت", "تر", "تر براساس", "ترين", "تقریبا", "تلویحا", "تمام", "تماما", "تمامي", "تنها", "تو", "تواند", "توانست", "توانستم", "توانستن", "توانستند", "توانسته", "توانستی", "توانستیم", "توانم", "توانند", "توانی", "توانید", "توانیم", "توسط", "تولِ", "تویِ", "ثانیا", "جا", "جاي", "جايي", "جای", "جدا", "جديد", "جدید", "جريان", "جریان", "جز", "جلوگيري", "جلویِ", "جمعا", "جناح", "جهت", "حاضر", "حال", "حالا", "حتما", "حتي", "حتی", "حداکثر", "حدودا", "حدودِ", "حق", "خارجِ", "خب", "خدمات", "خصوصا", "خلاصه", "خواست", "خواستم", "خواستن", "خواستند", "خواسته", "خواستی", "خواستید", "خواستیم", "خواهد", "خواهم", "خواهند", "خواهيم", "خواهی", "خواهید", "خواهیم", "خوب", "خود", "خودت", "خودتان", "خودش", "خودشان", "خودم", "خودمان", "خوشبختانه", "خويش", "خویش", "خویشتن", "خیاه", "خیر", "خیلی", "داد", "دادم", "دادن", "دادند", "داده", "دادی", "دادید", "دادیم", "دار", "دارد", "دارم", "دارند", "داريم", "داری", "دارید", "داریم", "داشت", "داشتم", "داشتن", "داشتند", "داشته", "داشتی", "داشتید", "داشتیم", "دانست", "دانند", "دایم", "دایما", "در", "درباره", "درمجموع", "درون", "دریغ", "دقیقا", "دنبالِ", "ده", "دهد", "دهم", "دهند", "دهی", "دهید", "دهیم", "دو", "دوباره", "دوم", "ديده", "ديروز", "ديگر", "ديگران", "ديگري", "دیر", "دیروز", "دیگر", "دیگران", "دیگری", "را", "راحت", "راسا", "راستی", "راه", "رسما", "رسید", "رفت", "رفته", "رو", "روب", "روز", "روزانه", "روزهاي", "روي", "روی", "رویِ", "ريزي", "زمان", "زمانی", "زمینه", "زود", "زياد", "زير", "زيرا", "زیر", "زیرِ", "سابق", "ساخته", "سازي", "سالانه", "سالیانه", "سایر", 
"سراسر", "سرانجام", "سریعا", "سریِ", "سعي", "سمتِ", "سوم", "سوي", "سوی", "سویِ", "سپس", "شان", "شايد", "شاید", "شخصا", "شد", "شدم", "شدن", "شدند", "شده", "شدی", "شدید", "شدیدا", "شدیم", "شش", "شش نداشته", "شما", "شناسي", "شود", "شوم", "شوند", "شونده", "شوی", "شوید", "شویم", "صرفا", "صورت", "ضدِّ", "ضدِّ", "ضمن", "طبعا", "طبقِ", "طبیعتا", "طرف", "طريق", "طریق", "طور", "طي", "طی", "ظاهرا", "عدم", "عقبِ", "علّتِ", "علیه", "عمدا", "عمدتا", "عمل", "عملا", "عنوان", "عنوانِ", "غالبا", "غير", "غیر", "فردا", "فعلا", "فقط", "فكر", "فوق", "قابل", "قبل", "قبلا", "قدری", "قصدِ", "قطعا", "كرد", "كردم", "كردن", "كردند", "كرده", "كسي", "كل", "كمتر", "كند", "كنم", "كنند", "كنيد", "كنيم", "كه", "لااقل", "لطفا", "لطفاً", "ما", "مان", "مانند", "مانندِ", "مبادا", "متاسفانه", "متعاقبا", "مثل", "مثلا", "مثلِ", "مجانی", "مجددا", "مجموعا", "مختلف", "مدام", "مدت", "مدّتی", "مردم", "مرسی", "مستقیما", "مسلما", "مطمینا", "معمولا", "مقابل", "ممکن", "من", "موارد", "مورد", "موقتا", "مي", "ميليارد", "ميليون", "مگر", "می", "می شود", "میان", "می‌رسد", "می‌رود", "می‌شود", "می‌کنیم", "ناشي", "نام", "ناگاه", "ناگهان", "ناگهانی", "نبايد", "نباید", "نبود", "نخست", "نخستين", "نخواهد", "نخواهم", "نخواهند", "نخواهی", "نخواهید", "نخواهیم", "ندارد", "ندارم", "ندارند", "نداری", "ندارید", "نداریم", "نداشت", "نداشتم", "نداشتند", "نداشته", "نداشتی", "نداشتید", "نداشتیم", "نزديك", "نزدِ", "نزدیکِ", "نسبتا", "نشان", "نشده", "نظير", "نظیر", "نكرده", "نمايد", "نمي", "نمی", "نمی‌شود", "نه", "نهایتا", "نوع", "نوعي", "نوعی", "نيز", "نيست", "نگاه", "نیز", "نیست", "ها", "هاي", "هايي", "های", "هایی", "هبچ", "هر", "هرچه", "هرگز", "هزار", "هست", "هستم", "هستند", "هستيم", "هستی", "هستید", "هستیم", "هفت", "هم", "همان", "همه", "همواره", "همين", "همچنان", "همچنين", "همچنین", "همچون", "همیشه", "همین", "هنوز", "هنگام", "هنگامِ", "هنگامی", "هيچ", "هیچ", "هیچگاه", "و", "واقعا", "واقعی", "وجود", "وسطِ", "وضع", "وقتي", "وقتی", "وقتیکه", "ولی", "وي", "وگو", "وی", "ویژه", "يا", "يابد", "يك", "يكديگر", "يكي", "ّه", "٪", "پارسال", 
"پاعینِ", "پس", "پنج", "پيش", "پیدا", "پیش", "پیشاپیش", "پیشتر", "پیشِ", "چرا", "چطور", "چقدر", "چنان", "چنانچه", "چنانکه", "چند", "چندین", "چنين", "چنین", "چه", "چهار", "چو", "چون", "چيزي", "چگونه", "چیز", "چیزی", "چیست", "کاش", "کامل", "کاملا", "کتبا", "کجا", "کجاست", "کدام", "کرد", "کردم", "کردن", "کردند", "کرده", "کردی", "کردید", "کردیم", "کس", "کسانی", "کسی", "کل", "کلا", "کم", "کماکان", "کمتر", "کمتری", "کمی", "کن", "کنار", "کنارِ", "کند", "کنم", "کنند", "کننده", "کنون", "کنونی", "کنی", "کنید", "کنیم", "که", "کو", "کَی", "کی", "گاه", "گاهی", "گذاري", "گذاشته", "گذشته", "گردد", "گرفت", "گرفتم", "گرفتن", "گرفتند", "گرفته", "گرفتی", "گرفتید", "گرفتیم", "گروهي", "گفت", "گفتم", "گفتن", "گفتند", "گفته", "گفتی", "گفتید", "گفتیم", "گه", "گهگاه", "گو", "گويد", "گويند", "گویا", "گوید", "گویم", "گویند", "گویی", "گویید", "گوییم", "گيرد", "گيري", "گیرد", "گیرم", "گیرند", "گیری", "گیرید", "گیریم", "ی", "یا", "یابد", "یابم", "یابند", "یابی", "یابید", "یابیم", "یافت", "یافتم", "یافتن", "یافته", "یافتی", "یافتید", "یافتیم", "یعنی", "یقینا", "یه", "یک", "یکی", "۰", "۱", "۲", "۳", "۴", "۵", "۶", "۷", "۸", "۹", ]; ================================================ FILE: src/stopwords/pol.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_POL: &[&str] = &[ "a", "aby", "ach", "acz", "aczkolwiek", "aj", "albo", "ale", "ależ", "ani", "aż", "bardziej", "bardzo", "bez", "bo", "bowiem", "by", "byli", "bym", "bynajmniej", "być", "był", "była", "było", "były", "będzie", "będą", "cali", "cała", "cały", "chce", "choć", "ci", "ciebie", "cię", "co", "cokolwiek", "coraz", "coś", "czasami", "czasem", "czemu", "czy", "czyli", "często", "daleko", "dla", "dlaczego", "dlatego", "do", "dobrze", "dokąd", "dość", "dr", "dużo", "dwa", "dwaj", "dwie", "dwoje", "dzisiaj", "dziś", "gdy", "gdyby", "gdyż", "gdzie", "gdziekolwiek", 
"gdzieś", "go", "godz", "hab", "i", "ich", "ii", "iii", "ile", "im", "inna", "inne", "inny", "innych", "inż", "iv", "ix", "iż", "ja", "jak", "jakaś", "jakby", "jaki", "jakichś", "jakie", "jakiś", "jakiż", "jakkolwiek", "jako", "jakoś", "je", "jeden", "jedna", "jednak", "jednakże", "jedno", "jednym", "jedynie", "jego", "jej", "jemu", "jest", "jestem", "jeszcze", "jeśli", "jeżeli", "już", "ją", "każdy", "kiedy", "kierunku", "kilka", "kilku", "kimś", "kto", "ktokolwiek", "ktoś", "która", "które", "którego", "której", "który", "których", "którym", "którzy", "ku", "lat", "lecz", "lub", "ma", "mają", "mam", "mamy", "mało", "mgr", "mi", "miał", "mimo", "między", "mnie", "mną", "mogą", "moi", "moim", "moja", "moje", "może", "możliwe", "można", "mu", "musi", "my", "mój", "na", "nad", "nam", "nami", "nas", "nasi", "nasz", "nasza", "nasze", "naszego", "naszych", "natomiast", "natychmiast", "nawet", "nic", "nich", "nie", "niech", "niego", "niej", "niemu", "nigdy", "nim", "nimi", "nią", "niż", "no", "nowe", "np", "nr", "o", "o.o.", "obok", "od", "ok", "około", "on", "ona", "one", "oni", "ono", "oraz", "oto", "owszem", "pan", "pana", "pani", "pl", "po", "pod", "podczas", "pomimo", "ponad", "ponieważ", "powinien", "powinna", "powinni", "powinno", "poza", "prawie", "prof", "przecież", "przed", "przede", "przedtem", "przez", "przy", "raz", "razie", "roku", "również", "sam", "sama", "się", "skąd", "sobie", "sobą", "sposób", "swoje", "są", "ta", "tak", "taka", "taki", "takich", "takie", "także", "tam", "te", "tego", "tej", "tel", "temu", "ten", "teraz", "też", "to", "tobie", "tobą", "toteż", "trzeba", "tu", "tutaj", "twoi", "twoim", "twoja", "twoje", "twym", "twój", "ty", "tych", "tylko", "tym", "tys", "tzw", "tę", "u", "ul", "vi", "vii", "viii", "vol", "w", "wam", "wami", "was", "wasi", "wasz", "wasza", "wasze", "we", "według", "wie", "wiele", "wielu", "więc", "więcej", "wszyscy", "wszystkich", "wszystkie", "wszystkim", "wszystko", "wtedy", "www", "wy", "właśnie", "wśród", "xi", 
"xii", "xiii", "xiv", "xv", "z", "za", "zapewne", "zawsze", "zaś", "ze", "zeznowu", "znowu", "znów", "został", "zł", "żaden", "żadna", "żadne", "żadnych", "że", "żeby", ]; ================================================ FILE: src/stopwords/por.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_POR: &[&str] = &[ "a", "acerca", "adeus", "agora", "ainda", "alem", "algmas", "algo", "algumas", "alguns", "ali", "além", "ambas", "ambos", "ano", "anos", "antes", "ao", "aonde", "aos", "apenas", "apoio", "apontar", "apos", "após", "aquela", "aquelas", "aquele", "aqueles", "aqui", "aquilo", "as", "assim", "através", "atrás", "até", "aí", "baixo", "bastante", "bem", "boa", "boas", "bom", "bons", "breve", "cada", "caminho", "catorze", "cedo", "cento", "certamente", "certeza", "cima", "cinco", "coisa", "com", "como", "comprido", "conhecido", "conselho", "contra", "contudo", "corrente", "cuja", "cujas", "cujo", "cujos", "custa", "cá", "da", "daquela", "daquelas", "daquele", "daqueles", "dar", "das", "de", "debaixo", "dela", "delas", "dele", "deles", "demais", "dentro", "depois", "desde", "desligado", "dessa", "dessas", "desse", "desses", "desta", "destas", "deste", "destes", "deve", "devem", "deverá", "dez", "dezanove", "dezasseis", "dezassete", "dezoito", "dia", "diante", "direita", "dispoe", "dispoem", "diversa", "diversas", "diversos", "diz", "dizem", "dizer", "do", "dois", "dos", "doze", "duas", "durante", "dá", "dão", "dúvida", "e", "ela", "elas", "ele", "eles", "em", "embora", "enquanto", "entao", "entre", "então", "era", "eram", "essa", "essas", "esse", "esses", "esta", "estado", "estamos", "estar", "estará", "estas", "estava", "estavam", "este", "esteja", "estejam", "estejamos", "estes", "esteve", "estive", "estivemos", "estiver", "estivera", "estiveram", "estiverem", "estivermos", "estivesse", 
"estivessem", "estiveste", "estivestes", "estivéramos", "estivéssemos", "estou", "está", "estás", "estávamos", "estão", "eu", "exemplo", "falta", "fará", "favor", "faz", "fazeis", "fazem", "fazemos", "fazer", "fazes", "fazia", "faço", "fez", "fim", "final", "foi", "fomos", "for", "fora", "foram", "forem", "forma", "formos", "fosse", "fossem", "foste", "fostes", "fui", "fôramos", "fôssemos", "geral", "grande", "grandes", "grupo", "ha", "haja", "hajam", "hajamos", "havemos", "havia", "hei", "hoje", "hora", "horas", "houve", "houvemos", "houver", "houvera", "houveram", "houverei", "houverem", "houveremos", "houveria", "houveriam", "houvermos", "houverá", "houverão", "houveríamos", "houvesse", "houvessem", "houvéramos", "houvéssemos", "há", "hão", "iniciar", "inicio", "ir", "irá", "isso", "ista", "iste", "isto", "já", "lado", "lhe", "lhes", "ligado", "local", "logo", "longe", "lugar", "lá", "maior", "maioria", "maiorias", "mais", "mal", "mas", "me", "mediante", "meio", "menor", "menos", "meses", "mesma", "mesmas", "mesmo", "mesmos", "meu", "meus", "mil", "minha", "minhas", "momento", "muito", "muitos", "máximo", "mês", "na", "nada", "nao", "naquela", "naquelas", "naquele", "naqueles", "nas", "nem", "nenhuma", "nessa", "nessas", "nesse", "nesses", "nesta", "nestas", "neste", "nestes", "no", "noite", "nome", "nos", "nossa", "nossas", "nosso", "nossos", "nova", "novas", "nove", "novo", "novos", "num", "numa", "numas", "nunca", "nuns", "não", "nível", "nós", "número", "o", "obra", "obrigada", "obrigado", "oitava", "oitavo", "oito", "onde", "ontem", "onze", "os", "ou", "outra", "outras", "outro", "outros", "para", "parece", "parte", "partir", "paucas", "pegar", "pela", "pelas", "pelo", "pelos", "perante", "perto", "pessoas", "pode", "podem", "poder", "poderá", "podia", "pois", "ponto", "pontos", "por", "porque", "porquê", "portanto", "posição", "possivelmente", "posso", "possível", "pouca", "pouco", "poucos", "povo", "primeira", "primeiras", "primeiro", "primeiros", 
"promeiro", "propios", "proprio", "própria", "próprias", "próprio", "próprios", "próxima", "próximas", "próximo", "próximos", "puderam", "pôde", "põe", "põem", "quais", "qual", "qualquer", "quando", "quanto", "quarta", "quarto", "quatro", "que", "quem", "quer", "quereis", "querem", "queremas", "queres", "quero", "questão", "quieto", "quinta", "quinto", "quinze", "quáis", "quê", "relação", "sabe", "sabem", "saber", "se", "segunda", "segundo", "sei", "seis", "seja", "sejam", "sejamos", "sem", "sempre", "sendo", "ser", "serei", "seremos", "seria", "seriam", "será", "serão", "seríamos", "sete", "seu", "seus", "sexta", "sexto", "sim", "sistema", "sob", "sobre", "sois", "somente", "somos", "sou", "sua", "suas", "são", "sétima", "sétimo", "só", "tal", "talvez", "tambem", "também", "tanta", "tantas", "tanto", "tarde", "te", "tem", "temos", "tempo", "tendes", "tenha", "tenham", "tenhamos", "tenho", "tens", "tentar", "tentaram", "tente", "tentei", "ter", "terceira", "terceiro", "terei", "teremos", "teria", "teriam", "terá", "terão", "teríamos", "teu", "teus", "teve", "tinha", "tinham", "tipo", "tive", "tivemos", "tiver", "tivera", "tiveram", "tiverem", "tivermos", "tivesse", "tivessem", "tiveste", "tivestes", "tivéramos", "tivéssemos", "toda", "todas", "todo", "todos", "trabalhar", "trabalho", "treze", "três", "tu", "tua", "tuas", "tudo", "tão", "tém", "têm", "tínhamos", "um", "uma", "umas", "uns", "usa", "usar", "vai", "vais", "valor", "veja", "vem", "vens", "ver", "verdade", "verdadeiro", "vez", "vezes", "viagem", "vindo", "vinte", "você", "vocês", "vos", "vossa", "vossas", "vosso", "vossos", "vários", "vão", "vêm", "vós", "zero", "à", "às", "área", "é", "éramos", "és", "último", ]; ================================================ FILE: src/stopwords/ron.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static 
STOPWORDS_RON: &[&str] = &[ "a", "abia", "acea", "aceasta", "această", "aceea", "aceeasi", "acei", "aceia", "acel", "acela", "acelasi", "acele", "acelea", "acest", "acesta", "aceste", "acestea", "acestei", "acestia", "acestui", "aceşti", "aceştia", "acolo", "acord", "acum", "adica", "ai", "aia", "aibă", "aici", "aiurea", "al", "ala", "alaturi", "ale", "alea", "alt", "alta", "altceva", "altcineva", "alte", "altfel", "alti", "altii", "altul", "am", "anume", "apoi", "ar", "are", "as", "asa", "asemenea", "asta", "astazi", "astea", "astfel", "astăzi", "asupra", "atare", "atat", "atata", "atatea", "atatia", "ati", "atit", "atita", "atitea", "atitia", "atunci", "au", "avea", "avem", "aveţi", "avut", "azi", "aş", "aşadar", "aţi", "b", "ba", "bine", "bucur", "bună", "c", "ca", "cam", "cand", "capat", "care", "careia", "carora", "caruia", "cat", "catre", "caut", "ce", "cea", "ceea", "cei", "ceilalti", "cel", "cele", "celor", "ceva", "chiar", "ci", "cinci", "cind", "cine", "cineva", "cit", "cita", "cite", "citeva", "citi", "citiva", "conform", "contra", "cu", "cui", "cum", "cumva", "curând", "curînd", "când", "cât", "câte", "câtva", "câţi", "cînd", "cît", "cîte", "cîtva", "cîţi", "că", "căci", "cărei", "căror", "cărui", "către", "d", "da", "daca", "dacă", "dar", "dat", "datorită", "dată", "dau", "de", "deasupra", "deci", "decit", "degraba", "deja", "deoarece", "departe", "desi", "despre", "deşi", "din", "dinaintea", "dintr", "dintr-", "dintre", "doar", "doi", "doilea", "două", "drept", "dupa", "după", "dă", "e", "ea", "ei", "el", "ele", "era", "eram", "este", "eu", "exact", "eşti", "f", "face", "fara", "fata", "fel", "fi", "fie", "fiecare", "fii", "fim", "fiu", "fiţi", "foarte", "fost", "frumos", "fără", "g", "geaba", "graţie", "h", "halbă", "i", "ia", "iar", "ieri", "ii", "il", "imi", "in", "inainte", "inapoi", "inca", "incit", "insa", "intr", "intre", "isi", "iti", "j", "k", "l", "la", "le", "li", "lor", "lui", "lângă", "lîngă", "m", "ma", "mai", "mare", "mea", "mei", 
"mele", "mereu", "meu", "mi", "mie", "mine", "mod", "mult", "multa", "multe", "multi", "multă", "mulţi", "mulţumesc", "mâine", "mîine", "mă", "n", "ne", "nevoie", "ni", "nici", "niciodata", "nicăieri", "nimeni", "nimeri", "nimic", "niste", "nişte", "noastre", "noastră", "noi", "noroc", "nostri", "nostru", "nou", "noua", "nouă", "noştri", "nu", "numai", "o", "opt", "or", "ori", "oricare", "orice", "oricine", "oricum", "oricând", "oricât", "oricînd", "oricît", "oriunde", "p", "pai", "parca", "patra", "patru", "patrulea", "pe", "pentru", "peste", "pic", "pina", "plus", "poate", "pot", "prea", "prima", "primul", "prin", "printr-", "putini", "puţin", "puţina", "puţină", "până", "pînă", "r", "rog", "s", "sa", "sa-mi", "sa-ti", "sai", "sale", "sau", "se", "si", "sint", "sintem", "spate", "spre", "sub", "sunt", "suntem", "sunteţi", "sus", "sută", "sînt", "sîntem", "sînteţi", "să", "săi", "său", "t", "ta", "tale", "te", "ti", "timp", "tine", "toata", "toate", "toată", "tocmai", "tot", "toti", "totul", "totusi", "totuşi", "toţi", "trei", "treia", "treilea", "tu", "tuturor", "tăi", "tău", "u", "ul", "ului", "un", "una", "unde", "undeva", "unei", "uneia", "unele", "uneori", "unii", "unor", "unora", "unu", "unui", "unuia", "unul", "v", "va", "vi", "voastre", "voastră", "voi", "vom", "vor", "vostru", "vouă", "voştri", "vreme", "vreo", "vreun", "vă", "x", "z", "zece", "zero", "zi", "zice", "îi", "îl", "îmi", "împotriva", "în", "înainte", "înaintea", "încotro", "încât", "încît", "între", "întrucât", "întrucît", "îţi", "ăla", "ălea", "ăsta", "ăstea", "ăştia", "şapte", "şase", "şi", "ştiu", "ţi", "ţie", ]; ================================================ FILE: src/stopwords/rus.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_RUS: &[&str] = &[ "c", "а", "алло", "без", "белый", "близко", "более", "больше", 
"большой", "будем", "будет", "будете", "будешь", "будто", "буду", "будут", "будь", "бы", "бывает", "бывь", "был", "была", "были", "было", "быть", "в", "важная", "важное", "важные", "важный", "вам", "вами", "вас", "ваш", "ваша", "ваше", "ваши", "вверх", "вдали", "вдруг", "ведь", "везде", "вернуться", "весь", "вечер", "взгляд", "взять", "вид", "видел", "видеть", "вместе", "вне", "вниз", "внизу", "во", "вода", "война", "вокруг", "вон", "вообще", "вопрос", "восемнадцатый", "восемнадцать", "восемь", "восьмой", "вот", "впрочем", "времени", "время", "все", "все еще", "всегда", "всего", "всем", "всеми", "всему", "всех", "всею", "всю", "всюду", "вся", "всё", "второй", "вы", "выйти", "г", "где", "главный", "глаз", "говорил", "говорит", "говорить", "год", "года", "году", "голова", "голос", "город", "да", "давать", "давно", "даже", "далекий", "далеко", "дальше", "даром", "дать", "два", "двадцатый", "двадцать", "две", "двенадцатый", "двенадцать", "дверь", "двух", "девятнадцатый", "девятнадцать", "девятый", "девять", "действительно", "дел", "делал", "делать", "делаю", "дело", "день", "деньги", "десятый", "десять", "для", "до", "довольно", "долго", "должен", "должно", "должный", "дом", "дорога", "друг", "другая", "другие", "других", "друго", "другое", "другой", "думать", "душа", "е", "его", "ее", "ей", "ему", "если", "есть", "еще", "ещё", "ею", "её", "ж", "ждать", "же", "жена", "женщина", "жизнь", "жить", "за", "занят", "занята", "занято", "заняты", "затем", "зато", "зачем", "здесь", "земля", "знать", "значит", "значить", "и", "иди", "идти", "из", "или", "им", "имеет", "имел", "именно", "иметь", "ими", "имя", "иногда", "их", "к", "каждая", "каждое", "каждые", "каждый", "кажется", "казаться", "как", "какая", "какой", "кем", "книга", "когда", "кого", "ком", "комната", "кому", "конец", "конечно", "которая", "которого", "которой", "которые", "который", "которых", "кроме", "кругом", "кто", "куда", "лежать", "лет", "ли", "лицо", "лишь", "лучше", "любить", "люди", "м", "маленький", 
"мало", "мать", "машина", "между", "меля", "менее", "меньше", "меня", "место", "миллионов", "мимо", "минута", "мир", "мира", "мне", "много", "многочисленная", "многочисленное", "многочисленные", "многочисленный", "мной", "мною", "мог", "могу", "могут", "мож", "может", "может быть", "можно", "можхо", "мои", "мой", "мор", "москва", "мочь", "моя", "моё", "мы", "на", "наверху", "над", "надо", "назад", "наиболее", "найти", "наконец", "нам", "нами", "народ", "нас", "начала", "начать", "наш", "наша", "наше", "наши", "не", "него", "недавно", "недалеко", "нее", "ней", "некоторый", "нельзя", "нем", "немного", "нему", "непрерывно", "нередко", "несколько", "нет", "нею", "неё", "ни", "нибудь", "ниже", "низко", "никакой", "никогда", "никто", "никуда", "ним", "ними", "них", "ничего", "ничто", "но", "новый", "нога", "ночь", "ну", "нужно", "нужный", "нх", "о", "об", "оба", "обычно", "один", "одиннадцатый", "одиннадцать", "однажды", "однако", "одного", "одной", "оказаться", "окно", "около", "он", "она", "они", "оно", "опять", "особенно", "остаться", "от", "ответить", "отец", "откуда", "отовсюду", "отсюда", "очень", "первый", "перед", "писать", "плечо", "по", "под", "подойди", "подумать", "пожалуйста", "позже", "пойти", "пока", "пол", "получить", "помнить", "понимать", "понять", "пор", "пора", "после", "последний", "посмотреть", "посреди", "потом", "потому", "почему", "почти", "правда", "прекрасно", "при", "про", "просто", "против", "процентов", "путь", "пятнадцатый", "пятнадцать", "пятый", "пять", "работа", "работать", "раз", "разве", "рано", "раньше", "ребенок", "решить", "россия", "рука", "русский", "ряд", "рядом", "с", "с кем", "сам", "сама", "сами", "самим", "самими", "самих", "само", "самого", "самой", "самом", "самому", "саму", "самый", "свет", "свое", "своего", "своей", "свои", "своих", "свой", "свою", "сделать", "сеаой", "себе", "себя", "сегодня", "седьмой", "сейчас", "семнадцатый", "семнадцать", "семь", "сидеть", "сила", "сих", "сказал", "сказала", "сказать", "сколько", 
"слишком", "слово", "случай", "смотреть", "сначала", "снова", "со", "собой", "собою", "советский", "совсем", "спасибо", "спросить", "сразу", "стал", "старый", "стать", "стол", "сторона", "стоять", "страна", "суть", "считать", "т", "та", "так", "такая", "также", "таки", "такие", "такое", "такой", "там", "твои", "твой", "твоя", "твоё", "те", "тебе", "тебя", "тем", "теми", "теперь", "тех", "то", "тобой", "тобою", "товарищ", "тогда", "того", "тоже", "только", "том", "тому", "тот", "тою", "третий", "три", "тринадцатый", "тринадцать", "ту", "туда", "тут", "ты", "тысяч", "у", "увидеть", "уж", "уже", "улица", "уметь", "утро", "хороший", "хорошо", "хотел бы", "хотеть", "хоть", "хотя", "хочешь", "час", "часто", "часть", "чаще", "чего", "человек", "чем", "чему", "через", "четвертый", "четыре", "четырнадцатый", "четырнадцать", "что", "чтоб", "чтобы", "чуть", "шестнадцатый", "шестнадцать", "шестой", "шесть", "эта", "эти", "этим", "этими", "этих", "это", "этого", "этой", "этом", "этому", "этот", "эту", "я", "являюсь", ]; ================================================ FILE: src/stopwords/sin.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) // Notice: we do not have stopwords for this language yet. 
pub static STOPWORDS_SIN: &[&str] = &[]; ================================================ FILE: src/stopwords/slk.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2020, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_SLK: &[&str] = &[ "a", "aby", "aj", "ak", "akej", "akejže", "ako", "akom", "akomže", "akou", "akouže", "akože", "aká", "akáže", "aké", "akého", "akéhože", "akému", "akémuže", "akéže", "akú", "akúže", "aký", "akých", "akýchže", "akým", "akými", "akýmiže", "akýmže", "akýže", "ale", "alebo", "ani", "asi", "avšak", "až", "ba", "bez", "bezo", "bol", "bola", "boli", "bolo", "bude", "budem", "budeme", "budete", "budeš", "budú", "buď", "by", "byť", "cez", "cezo", "dnes", "do", "ešte", "ho", "hoci", "i", "iba", "ich", "im", "inej", "inom", "iná", "iné", "iného", "inému", "iní", "inú", "iný", "iných", "iným", "inými", "ja", "je", "jeho", "jej", "jemu", "ju", "k", "kam", "kamže", "každou", "každá", "každé", "každého", "každému", "každí", "každú", "každý", "každých", "každým", "každými", "kde", "kej", "kejže", "keď", "keďže", "kie", "kieho", "kiehože", "kiemu", "kiemuže", "kieže", "koho", "kom", "komu", "kou", "kouže", "kto", "ktorej", "ktorou", "ktorá", "ktoré", "ktorí", "ktorú", "ktorý", "ktorých", "ktorým", "ktorými", "ku", "ká", "káže", "ké", "kéže", "kú", "kúže", "ký", "kýho", "kýhože", "kým", "kýmu", "kýmuže", "kýže", "lebo", "leda", "ledaže", "len", "ma", "majú", "mal", "mala", "mali", "mať", "medzi", "mi", "mne", "mnou", "moja", "moje", "mojej", "mojich", "mojim", "mojimi", "mojou", "moju", "možno", "mu", "musia", "musieť", "musí", "musím", "musíme", "musíte", "musíš", "my", "má", "mám", "máme", "máte", "máš", "môcť", "môj", "môjho", "môže", "môžem", "môžeme", "môžete", "môžeš", "môžu", "mňa", "na", "nad", "nado", "najmä", "nami", "naša", "naše", "našej", "naši", "našich", "našim", "našimi", "našou", "ne", "nech", "neho", "nej", 
"nejakej", "nejakom", "nejakou", "nejaká", "nejaké", "nejakého", "nejakému", "nejakú", "nejaký", "nejakých", "nejakým", "nejakými", "nemu", "než", "nich", "nie", "niektorej", "niektorom", "niektorou", "niektorá", "niektoré", "niektorého", "niektorému", "niektorú", "niektorý", "niektorých", "niektorým", "niektorými", "nielen", "niečo", "nim", "nimi", "nič", "ničoho", "ničom", "ničomu", "ničím", "no", "nám", "nás", "náš", "nášho", "ním", "o", "od", "odo", "on", "ona", "oni", "ono", "ony", "oň", "oňho", "po", "pod", "podo", "podľa", "pokiaľ", "popod", "popri", "potom", "poza", "pre", "pred", "predo", "preto", "pretože", "prečo", "pri", "práve", "s", "sa", "seba", "sebe", "sebou", "sem", "si", "sme", "so", "som", "ste", "svoj", "svoja", "svoje", "svojho", "svojich", "svojim", "svojimi", "svojou", "svoju", "svojím", "sú", "ta", "tak", "takej", "takejto", "taká", "takáto", "také", "takého", "takéhoto", "takému", "takémuto", "takéto", "takí", "takú", "takúto", "taký", "takýto", "takže", "tam", "teba", "tebe", "tebou", "teda", "tej", "tejto", "ten", "tento", "ti", "tie", "tieto", "tiež", "to", "toho", "tohoto", "tohto", "tom", "tomto", "tomu", "tomuto", "toto", "tou", "touto", "tu", "tvoj", "tvoja", "tvoje", "tvojej", "tvojho", "tvoji", "tvojich", "tvojim", "tvojimi", "tvojím", "ty", "tá", "táto", "tí", "títo", "tú", "túto", "tých", "tým", "tými", "týmto", "u", "už", "v", "vami", "vaša", "vaše", "vašej", "vaši", "vašich", "vašim", "vaším", "veď", "viac", "vo", "vy", "vám", "vás", "váš", "vášho", "však", "všetci", "všetka", "všetko", "všetky", "všetok", "z", "za", "začo", "začože", "zo", "áno", "čej", "či", "čia", "čie", "čieho", "čiemu", "čiu", "čo", "čoho", "čom", "čomu", "čou", "čože", "čí", "čím", "čími", "ďalšia", "ďalšie", "ďalšieho", "ďalšiemu", "ďalšiu", "ďalšom", "ďalšou", "ďalší", "ďalších", "ďalším", "ďalšími", "ňom", "ňou", "ňu", "že", ]; ================================================ FILE: src/stopwords/slv.rs ================================================ 
// Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_SLV: &[&str] = &[ "a", "ali", "april", "avgust", "b", "bi", "bil", "bila", "bile", "bili", "bilo", "biti", "blizu", "bo", "bodo", "bojo", "bolj", "bom", "bomo", "boste", "bova", "boš", "brez", "c", "cel", "cela", "celi", "celo", "d", "da", "daleč", "dan", "danes", "datum", "december", "deset", "deseta", "deseti", "deseto", "devet", "deveta", "deveti", "deveto", "do", "dober", "dobra", "dobri", "dobro", "dokler", "dol", "dolg", "dolga", "dolgi", "dovolj", "drug", "druga", "drugi", "drugo", "dva", "dve", "e", "eden", "en", "ena", "ene", "eni", "enkrat", "eno", "etc.", "f", "februar", "g", "g.", "ga", "ga.", "gor", "gospa", "gospod", "h", "halo", "i", "idr.", "ii", "iii", "in", "iv", "ix", "iz", "j", "januar", "jaz", "je", "ji", "jih", "jim", "jo", "julij", "junij", "jutri", "k", "kadarkoli", "kaj", "kajti", "kako", "kakor", "kamor", "kamorkoli", "kar", "karkoli", "katerikoli", "kdaj", "kdo", "kdorkoli", "ker", "ki", "kje", "kjer", "kjerkoli", "ko", "koder", "koderkoli", "koga", "komu", "kot", "kratek", "kratka", "kratke", "kratki", "l", "lahka", "lahke", "lahki", "lahko", "le", "lep", "lepa", "lepe", "lepi", "lepo", "leto", "m", "maj", "majhen", "majhna", "majhni", "malce", "malo", "manj", "marec", "me", "med", "medtem", "mene", "mesec", "mi", "midva", "midve", "mnogo", "moj", "moja", "moje", "mora", "morajo", "moram", "moramo", "morate", "moraš", "morem", "mu", "n", "na", "nad", "naj", "najina", "najino", "najmanj", "naju", "največ", "nam", "narobe", "nas", "nato", "nazaj", "naš", "naša", "naše", "ne", "nedavno", "nedelja", "nek", "neka", "nekaj", "nekatere", "nekateri", "nekatero", "nekdo", "neke", "nekega", "neki", "nekje", "neko", "nekoga", "nekoč", "ni", "nikamor", "nikdar", "nikjer", "nikoli", "nič", "nje", "njega", "njegov", "njegova", "njegovo", "njej", "njemu", "njen", "njena", "njeno", 
"nji", "njih", "njihov", "njihova", "njihovo", "njiju", "njim", "njo", "njun", "njuna", "njuno", "no", "nocoj", "november", "npr.", "o", "ob", "oba", "obe", "oboje", "od", "odprt", "odprta", "odprti", "okoli", "oktober", "on", "onadva", "one", "oni", "onidve", "osem", "osma", "osmi", "osmo", "oz.", "p", "pa", "pet", "peta", "petek", "peti", "peto", "po", "pod", "pogosto", "poleg", "poln", "polna", "polni", "polno", "ponavadi", "ponedeljek", "ponovno", "potem", "povsod", "pozdravljen", "pozdravljeni", "prav", "prava", "prave", "pravi", "pravo", "prazen", "prazna", "prazno", "prbl.", "precej", "pred", "prej", "preko", "pri", "pribl.", "približno", "primer", "pripravljen", "pripravljena", "pripravljeni", "proti", "prva", "prvi", "prvo", "r", "ravno", "redko", "res", "reč", "s", "saj", "sam", "sama", "same", "sami", "samo", "se", "sebe", "sebi", "sedaj", "sedem", "sedma", "sedmi", "sedmo", "sem", "september", "seveda", "si", "sicer", "skoraj", "skozi", "slab", "smo", "so", "sobota", "spet", "sreda", "srednja", "srednji", "sta", "ste", "stran", "stvar", "sva", "t", "ta", "tak", "taka", "take", "taki", "tako", "takoj", "tam", "te", "tebe", "tebi", "tega", "težak", "težka", "težki", "težko", "ti", "tista", "tiste", "tisti", "tisto", "tj.", "tja", "to", "toda", "torek", "tretja", "tretje", "tretji", "tri", "tu", "tudi", "tukaj", "tvoj", "tvoja", "tvoje", "u", "v", "vaju", "vam", "vas", "vaš", "vaša", "vaše", "ve", "vedno", "velik", "velika", "veliki", "veliko", "vendar", "ves", "več", "vi", "vidva", "vii", "viii", "visok", "visoka", "visoke", "visoki", "vsa", "vsaj", "vsak", "vsaka", "vsakdo", "vsake", "vsaki", "vsakomur", "vse", "vsega", "vsi", "vso", "včasih", "včeraj", "x", "z", "za", "zadaj", "zadnji", "zakaj", "zaprta", "zaprti", "zaprto", "zdaj", "zelo", "zunaj", "č", "če", "često", "četrta", "četrtek", "četrti", "četrto", "čez", "čigav", "š", "šest", "šesta", "šesti", "šesto", "štiri", "ž", "že", ]; ================================================ FILE: 
src/stopwords/sna.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) // Notice: we do not have stopwords for this language yet. pub static STOPWORDS_SNA: &[&str] = &[]; ================================================ FILE: src/stopwords/spa.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_SPA: &[&str] = &[ "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "_", "a", "actualmente", "acuerdo", "adelante", "ademas", "además", "adrede", "afirmó", "agregó", "ahi", "ahora", "ahí", "al", "algo", "alguna", "algunas", "alguno", "algunos", "algún", "alli", "allí", "alrededor", "ambos", "ampleamos", "antano", "antaño", "ante", "anterior", "antes", "apenas", "aproximadamente", "aquel", "aquella", "aquellas", "aquello", "aquellos", "aqui", "aquél", "aquélla", "aquéllas", "aquéllos", "aquí", "arriba", "arribaabajo", "aseguró", "asi", "así", "atras", "aun", "aunque", "ayer", "añadió", "aún", "b", "bajo", "bastante", "bien", "breve", "buen", "buena", "buenas", "bueno", "buenos", "c", "cada", "casi", "cerca", "cierta", "ciertas", "cierto", "ciertos", "cinco", "claro", "comentó", "como", "con", "conmigo", "conocer", "conseguimos", "conseguir", "considera", "consideró", "consigo", "consigue", "consiguen", "consigues", "contigo", "contra", "cosas", "creo", "cual", "cuales", "cualquier", "cuando", "cuanta", "cuantas", "cuanto", "cuantos", "cuatro", "cuenta", "cuál", "cuáles", "cuándo", "cuánta", "cuántas", "cuánto", "cuántos", "cómo", "d", "da", "dado", "dan", "dar", "de", "debajo", "debe", "deben", "debido", "decir", "dejó", "del", "delante", "demasiado", "demás", "dentro", "deprisa", "desde", "despacio", "despues", "después", "detras", "detrás", "dia", "dias", 
"dice", "dicen", "dicho", "dieron", "diferente", "diferentes", "dijeron", "dijo", "dio", "donde", "dos", "durante", "día", "días", "dónde", "e", "ejemplo", "el", "ella", "ellas", "ello", "ellos", "embargo", "empleais", "emplean", "emplear", "empleas", "empleo", "en", "encima", "encuentra", "enfrente", "enseguida", "entonces", "entre", "era", "erais", "eramos", "eran", "eras", "eres", "es", "esa", "esas", "ese", "eso", "esos", "esta", "estaba", "estabais", "estaban", "estabas", "estad", "estada", "estadas", "estado", "estados", "estais", "estamos", "estan", "estando", "estar", "estaremos", "estará", "estarán", "estarás", "estaré", "estaréis", "estaría", "estaríais", "estaríamos", "estarían", "estarías", "estas", "este", "estemos", "esto", "estos", "estoy", "estuve", "estuviera", "estuvierais", "estuvieran", "estuvieras", "estuvieron", "estuviese", "estuvieseis", "estuviesen", "estuvieses", "estuvimos", "estuviste", "estuvisteis", "estuviéramos", "estuviésemos", "estuvo", "está", "estábamos", "estáis", "están", "estás", "esté", "estéis", "estén", "estés", "ex", "excepto", "existe", "existen", "explicó", "expresó", "f", "fin", "final", "fue", "fuera", "fuerais", "fueran", "fueras", "fueron", "fuese", "fueseis", "fuesen", "fueses", "fui", "fuimos", "fuiste", "fuisteis", "fuéramos", "fuésemos", "g", "general", "gran", "grandes", "gueno", "h", "ha", "haber", "habia", "habida", "habidas", "habido", "habidos", "habiendo", "habla", "hablan", "habremos", "habrá", "habrán", "habrás", "habré", "habréis", "habría", "habríais", "habríamos", "habrían", "habrías", "habéis", "había", "habíais", "habíamos", "habían", "habías", "hace", "haceis", "hacemos", "hacen", "hacer", "hacerlo", "haces", "hacia", "haciendo", "hago", "han", "has", "hasta", "hay", "haya", "hayamos", "hayan", "hayas", "hayáis", "he", "hecho", "hemos", "hicieron", "hizo", "horas", "hoy", "hube", "hubiera", "hubierais", "hubieran", "hubieras", "hubieron", "hubiese", "hubieseis", "hubiesen", "hubieses", "hubimos", 
"hubiste", "hubisteis", "hubiéramos", "hubiésemos", "hubo", "i", "igual", "incluso", "indicó", "informo", "informó", "intenta", "intentais", "intentamos", "intentan", "intentar", "intentas", "intento", "ir", "j", "junto", "k", "l", "la", "lado", "largo", "las", "le", "lejos", "les", "llegó", "lleva", "llevar", "lo", "los", "luego", "lugar", "m", "mal", "manera", "manifestó", "mas", "mayor", "me", "mediante", "medio", "mejor", "mencionó", "menos", "menudo", "mi", "mia", "mias", "mientras", "mio", "mios", "mis", "misma", "mismas", "mismo", "mismos", "modo", "momento", "mucha", "muchas", "mucho", "muchos", "muy", "más", "mí", "mía", "mías", "mío", "míos", "n", "nada", "nadie", "ni", "ninguna", "ningunas", "ninguno", "ningunos", "ningún", "no", "nos", "nosotras", "nosotros", "nuestra", "nuestras", "nuestro", "nuestros", "nueva", "nuevas", "nuevo", "nuevos", "nunca", "o", "ocho", "os", "otra", "otras", "otro", "otros", "p", "pais", "para", "parece", "parte", "partir", "pasada", "pasado", "paìs", "peor", "pero", "pesar", "poca", "pocas", "poco", "pocos", "podeis", "podemos", "poder", "podria", "podriais", "podriamos", "podrian", "podrias", "podrá", "podrán", "podría", "podrían", "poner", "por", "por qué", "porque", "posible", "primer", "primera", "primero", "primeros", "principalmente", "pronto", "propia", "propias", "propio", "propios", "proximo", "próximo", "próximos", "pudo", "pueda", "puede", "pueden", "puedo", "pues", "q", "qeu", "que", "quedó", "queremos", "quien", "quienes", "quiere", "quiza", "quizas", "quizá", "quizás", "quién", "quiénes", "qué", "r", "raras", "realizado", "realizar", "realizó", "repente", "respecto", "s", "sabe", "sabeis", "sabemos", "saben", "saber", "sabes", "sal", "salvo", "se", "sea", "seamos", "sean", "seas", "segun", "segunda", "segundo", "según", "seis", "ser", "sera", "seremos", "será", "serán", "serás", "seré", "seréis", "sería", "seríais", "seríamos", "serían", "serías", "seáis", "señaló", "si", "sido", "siempre", "siendo", "siete", 
"sigue", "siguiente", "sin", "sino", "sobre", "sois", "sola", "solamente", "solas", "solo", "solos", "somos", "son", "soy", "soyos", "su", "supuesto", "sus", "suya", "suyas", "suyo", "suyos", "sé", "sí", "sólo", "t", "tal", "tambien", "también", "tampoco", "tan", "tanto", "tarde", "te", "temprano", "tendremos", "tendrá", "tendrán", "tendrás", "tendré", "tendréis", "tendría", "tendríais", "tendríamos", "tendrían", "tendrías", "tened", "teneis", "tenemos", "tener", "tenga", "tengamos", "tengan", "tengas", "tengo", "tengáis", "tenida", "tenidas", "tenido", "tenidos", "teniendo", "tenéis", "tenía", "teníais", "teníamos", "tenían", "tenías", "tercera", "ti", "tiempo", "tiene", "tienen", "tienes", "toda", "todas", "todavia", "todavía", "todo", "todos", "total", "trabaja", "trabajais", "trabajamos", "trabajan", "trabajar", "trabajas", "trabajo", "tras", "trata", "través", "tres", "tu", "tus", "tuve", "tuviera", "tuvierais", "tuvieran", "tuvieras", "tuvieron", "tuviese", "tuvieseis", "tuviesen", "tuvieses", "tuvimos", "tuviste", "tuvisteis", "tuviéramos", "tuviésemos", "tuvo", "tuya", "tuyas", "tuyo", "tuyos", "tú", "u", "ultimo", "un", "una", "unas", "uno", "unos", "usa", "usais", "usamos", "usan", "usar", "usas", "uso", "usted", "ustedes", "v", "va", "vais", "valor", "vamos", "van", "varias", "varios", "vaya", "veces", "ver", "verdad", "verdadera", "verdadero", "vez", "vosotras", "vosotros", "voy", "vuestra", "vuestras", "vuestro", "vuestros", "w", "x", "y", "ya", "yo", "z", "él", "éramos", "ésa", "ésas", "ése", "ésos", "ésta", "éstas", "éste", "éstos", "última", "últimas", "último", "últimos", ]; ================================================ FILE: src/stopwords/srp.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_SRP: &[&str] = &[ "a", "avaj", "ako", "al", "ali", "arh", "au", "ah", "aha", "aj", 
"bar", "bi", "bila", "bili", "bilo", "bismo", "biste", "bih", "bijasmo", "bijaste", "bijah", "bijahu", "bijaše", "biće", "blizu", "broj", "brr", "bude", "budimo", "budite", "budu", "budući", "bum", "buć", "vam", "vama", "vas", "vaša", "vaše", "vašim", "vašima", "valjda", "veoma", "verovatno", "već", "većina", "vi", "video", "više", "vrlo", "vrh", "ga", "gde", "gic", "god", "gore", "gđekoje", "da", "dakle", "dana", "danas", "daj", "dva", "de", "deder", "delimice", "delimično", "dem", "do", "dobar", "dobiti", "dovečer", "dokle", "dole", "donekle", "dosad", "doskoro", "dotad", "dotle", "došao", "doći", "drugamo", "drugde", "drugi", "e", "evo", "eno", "eto", "eh", "ehe", "ej", "želela", "želele", "želeli", "želelo", "želeh", "želeći", "želi", "za", "zaista", "zar", "zatim", "zato", "zahvaliti", "zašto", "zbilja", "zimus", "znati", "zum", "i", "ide", "iz", "izvan", "izvoli", "između", "iznad", "ikada", "ikakav", "ikakva", "ikakve", "ikakvi", "ikakvim", "ikakvima", "ikakvih", "ikakvo", "ikakvog", "ikakvoga", "ikakvom", "ikakvome", "ikakvoj", "ili", "im", "ima", "imam", "imao", "ispod", "ih", "iju", "ići", "kad", "kada", "koga", "kojekakav", "kojima", "koju", "krišom", "lani", "li", "mali", "manji", "me", "mene", "meni", "mi", "mimo", "misli", "mnogo", "mogu", "mora", "morao", "moj", "moja", "moje", "moji", "moju", "moći", "mu", "na", "nad", "nakon", "nam", "nama", "nas", "naša", "naše", "našeg", "naši", "naći", "ne", "negde", "neka", "nekad", "neke", "nekog", "neku", "nema", "nemam", "neko", "neće", "nećemo", "nećete", "nećeš", "neću", "ni", "nikada", "nikoga", "nikoje", "nikoji", "nikoju", "nisam", "nisi", "niste", "nisu", "ništa", "nijedan", "no", "o", "ova", "ovako", "ovamo", "ovaj", "ovde", "ove", "ovim", "ovima", "ovo", "ovoj", "od", "odmah", "oko", "okolo", "on", "onaj", "one", "onim", "onima", "onom", "onoj", "onu", "osim", "ostali", "otišao", "pa", "pak", "pitati", "po", "povodom", "pod", "podalje", "poželjan", "poželjna", "poizdalje", "poimence", "ponekad", 
"popreko", "pored", "posle", "potaman", "potrbuške", "pouzdano", "početak", "pojedini", "praviti", "prvi", "preko", "prema", "prije", "put", "pljus", "radije", "s", "sa", "sav", "sada", "sam", "samo", "sasvim", "sva", "svaki", "svi", "svim", "svog", "svom", "svoj", "svoja", "svoje", "svoju", "svu", "svugde", "se", "sebe", "sebi", "si", "smeti", "smo", "stvar", "stvarno", "ste", "su", "sutra", "ta", "tačno", "tako", "takođe", "tamo", "tvoj", "tvoja", "tvoje", "tvoji", "tvoju", "te", "tebe", "tebi", "ti", "tima", "to", "tome", "toj", "tu", "u", "uvek", "uvijek", "uz", "uza", "uzalud", "uzduž", "uzeti", "umalo", "unutra", "upotrebiti", "uprkos", "učinio", "učiniti", "halo", "hvala", "hej", "hm", "hop", "hoće", "hoćemo", "hoćete", "hoćeš", "hoću", "htedoste", "htedoh", "htedoše", "htela", "htele", "hteli", "hteo", "htejasmo", "htejaste", "htejahu", "hura", "često", "čijem", "čiji", "čijim", "čijima", "šic", "štagod", "što", "štogod", "ja", "je", "jedan", "jedini", "jedna", "jedne", "jedni", "jedno", "jednom", "jer", "jesam", "jesi", "jesmo", "jesu", "jim", "joj", "ju", "juče", "njegova", "njegovo", "njezin", "njezina", "njezino", "njemu", "njen", "njim", "njima", "njihova", "njihovo", "njoj", "nju", "će", "ćemo", "ćete", "ćeš", "ću", ]; ================================================ FILE: src/stopwords/swe.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_SWE: &[&str] = &[ "aderton", "adertonde", "adjö", "aldrig", "alla", "allas", "allt", "alltid", "alltså", "andra", "andras", "annan", "annat", "artonde", "artonn", "att", "av", "bakom", "bara", "behöva", "behövas", "behövde", "behövt", "beslut", "beslutat", "beslutit", "bland", "blev", "bli", "blir", "blivit", "bort", "borta", "bra", "bäst", "bättre", "båda", "bådas", "dag", "dagar", "dagarna", "dagen", "de", "del", "delen", "dem", "den", 
"denna", "deras", "dess", "dessa", "det", "detta", "dig", "din", "dina", "dit", "ditt", "dock", "dom", "du", "där", "därför", "då", "e", "efter", "eftersom", "ej", "elfte", "eller", "elva", "emot", "en", "enkel", "enkelt", "enkla", "enligt", "ens", "er", "era", "ers", "ert", "ett", "ettusen", "fanns", "fem", "femte", "femtio", "femtionde", "femton", "femtonde", "fick", "fin", "finnas", "finns", "fjorton", "fjortonde", "fjärde", "fler", "flera", "flesta", "fram", "framför", "från", "fyra", "fyrtio", "fyrtionde", "få", "får", "fått", "följande", "för", "före", "förlåt", "förra", "första", "genast", "genom", "gick", "gjorde", "gjort", "god", "goda", "godare", "godast", "gott", "gälla", "gäller", "gällt", "gärna", "gå", "går", "gått", "gör", "göra", "ha", "hade", "haft", "han", "hans", "har", "heller", "hellre", "helst", "helt", "henne", "hennes", "hit", "hon", "honom", "hundra", "hundraen", "hundraett", "hur", "här", "hög", "höger", "högre", "högst", "i", "ibland", "icke", "idag", "igen", "igår", "imorgon", "in", "inför", "inga", "ingen", "ingenting", "inget", "innan", "inne", "inom", "inte", "inuti", "ja", "jag", "jo", "ju", "just", "jämfört", "kan", "kanske", "knappast", "kom", "komma", "kommer", "kommit", "kr", "kunde", "kunna", "kunnat", "kvar", "legat", "ligga", "ligger", "lika", "likställd", "likställda", "lilla", "lite", "liten", "litet", "länge", "längre", "längst", "lätt", "lättare", "lättast", "långsam", "långsammare", "långsammast", "långsamt", "långt", "låt", "man", "med", "mej", "mellan", "men", "mer", "mera", "mest", "mig", "min", "mina", "mindre", "minst", "mitt", "mittemot", "mot", "mycket", "många", "måste", "möjlig", "möjligen", "möjligt", "möjligtvis", "ned", "nederst", "nedersta", "nedre", "nej", "ner", "ni", "nio", "nionde", "nittio", "nittionde", "nitton", "nittonde", "nog", "noll", "nr", "nu", "nummer", "när", "nästa", "någon", "någonting", "något", "några", "nån", "nånting", "nåt", "nödvändig", "nödvändiga", "nödvändigt", "nödvändigtvis", 
"och", "också", "ofta", "oftast", "olika", "olikt", "om", "oss", "på", "rakt", "redan", "rätt", "sa", "sade", "sagt", "samma", "sedan", "senare", "senast", "sent", "sex", "sextio", "sextionde", "sexton", "sextonde", "sig", "sin", "sina", "sist", "sista", "siste", "sitt", "sitta", "sju", "sjunde", "sjuttio", "sjuttionde", "sjutton", "sjuttonde", "själv", "sjätte", "ska", "skall", "skulle", "slutligen", "små", "smått", "snart", "som", "stor", "stora", "stort", "större", "störst", "säga", "säger", "sämre", "sämst", "så", "sådan", "sådana", "sådant", "ta", "tack", "tar", "tidig", "tidigare", "tidigast", "tidigt", "till", "tills", "tillsammans", "tio", "tionde", "tjugo", "tjugoen", "tjugoett", "tjugonde", "tjugotre", "tjugotvå", "tjungo", "tolfte", "tolv", "tre", "tredje", "trettio", "trettionde", "tretton", "trettonde", "två", "tvåhundra", "under", "upp", "ur", "ursäkt", "ut", "utan", "utanför", "ute", "va", "vad", "var", "vara", "varför", "varifrån", "varit", "varje", "varken", "vars", "varsågod", "vart", "vem", "vems", "verkligen", "vi", "vid", "vidare", "viktig", "viktigare", "viktigast", "viktigt", "vilka", "vilkas", "vilken", "vilket", "vill", "väl", "vänster", "vänstra", "värre", "vår", "våra", "vårt", "än", "ännu", "är", "även", "åt", "åtminstone", "åtta", "åttio", "åttionde", "åttonde", "över", "övermorgon", "överst", "övre", ]; ================================================ FILE: src/stopwords/tam.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_TAM: &[&str] = &[ "ஒரு", "என்று", "மற்றும்", "இந்த", "இது", "என்ற", "கொண்டு", "என்பது", "பல", "ஆகும்", "அல்லது", "அவர்", "நான்", "உள்ள", "அந்த", "இவர்", "என", "முதல்", "என்ன", "இருந்து", "சில", "என்", "போன்ற", "வேண்டும்", "வந்து", "இதன்", "அது", "அவன்", "தான்", "பலரும்", "என்னும்", "மேலும்", "பின்னர்", "கொண்ட", "இருக்கும்", "தனது", "உள்ளது", 
"போது", "என்றும்", "அதன்", "தன்", "பிறகு", "அவர்கள்", "வரை", "அவள்", "நீ", "ஆகிய", "இருந்தது", "உள்ளன", "வந்த", "இருந்த", "மிகவும்", "இங்கு", "மீது", "ஓர்", "இவை", "இந்தக்", "பற்றி", "வரும்", "வேறு", "இரு", "இதில்", "போல்", "இப்போது", "அவரது", "மட்டும்", "இந்தப்", "எனும்", "மேல்", "பின்", "சேர்ந்த", "ஆகியோர்", "எனக்கு", "இன்னும்", "அந்தப்", "அன்று", "ஒரே", "மிக", "அங்கு", "பல்வேறு", "விட்டு", "பெரும்", "அதை", "பற்றிய", "உன்", "அதிக", "அந்தக்", "பேர்", "இதனால்", "அவை", "அதே", "ஏன்", "முறை", "யார்", "என்பதை", "எல்லாம்", "மட்டுமே", "இங்கே", "அங்கே", "இடம்", "இடத்தில்", "அதில்", "நாம்", "அதற்கு", "எனவே", "பிற", "சிறு", "மற்ற", "விட", "எந்த", "எனவும்", "எனப்படும்", "எனினும்", "அடுத்த", "இதனை", "இதை", "கொள்ள", "இந்தத்", "இதற்கு", "அதனால்", "தவிர", "போல", "வரையில்", "சற்று", "எனக்", ]; ================================================ FILE: src/stopwords/tel.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) // Notice: we do not have stopwords for this language yet. 
pub static STOPWORDS_TEL: &[&str] = &[]; ================================================ FILE: src/stopwords/tgl.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2022, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_TGL: &[&str] = &[ "akin", "aking", "ako", "alin", "am", "amin", "aming", "ang", "ano", "anumang", "apat", "at", "atin", "ating", "ay", "bababa", "bago", "bakit", "bawat", "bilang", "dahil", "dalawa", "dapat", "din", "dito", "doon", "gagawin", "gayunman", "ginagawa", "ginawa", "ginawang", "gumawa", "gusto", "habang", "hanggang", "hindi", "huwag", "iba", "ibaba", "ibabaw", "ibig", "ikaw", "ilagay", "ilalim", "ilan", "inyong", "isa", "isang", "itaas", "ito", "iyo", "iyon", "iyong", "ka", "kahit", "kailangan", "kailanman", "kami", "kanila", "kanilang", "kanino", "kanya", "kanyang", "kapag", "kapwa", "karamihan", "katiyakan", "katulad", "kaya", "kaysa", "ko", "kong", "kulang", "kumuha", "kung", "laban", "lahat", "lamang", "likod", "lima", "maaari", "maaaring", "maging", "mahusay", "makita", "marami", "marapat", "masyado", "may", "mayroon", "mga", "minsan", "mismo", "mula", "muli", "na", "nabanggit", "naging", "nagkaroon", "nais", "nakita", "namin", "napaka", "narito", "nasaan", "ng", "ngayon", "ni", "nila", "nilang", "nito", "niya", "niyang", "noon", "o", "pa", "paano", "pababa", "paggawa", "pagitan", "pagkakaroon", "pagkatapos", "palabas", "pamamagitan", "panahon", "pangalawa", "para", "paraan", "pareho", "pataas", "pero", "pumunta", "pumupunta", "sa", "saan", "sabi", "sabihin", "sarili", "sila", "sino", "siya", "tatlo", "tayo", "tulad", "tungkol", "una", "walang", ]; ================================================ FILE: src/stopwords/tha.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL 
v2.0) pub static STOPWORDS_THA: &[&str] = &[ "กล่าว", "กว่า", "กัน", "กับ", "การ", "ก็", "ก่อน", "ขณะ", "ขอ", "ของ", "ขึ้น", "คง", "ครั้ง", "ความ", "คือ", "จะ", "จัด", "จาก", "จึง", "ช่วง", "ซึ่ง", "ดัง", "ด้วย", "ด้าน", "ตั้ง", "ตั้งแต่", "ตาม", "ต่อ", "ต่าง", "ต่างๆ", "ต้อง", "ถึง", "ถูก", "ถ้า", "ทั้ง", "ทั้งนี้", "ทาง", "ทำ", "ทำให้", "ที่", "ที่สุด", "ทุก", "นอกจาก", "นัก", "นั้น", "นำ", "นี้", "น่า", "บาง", "ผล", "ผ่าน", "พบ", "พร้อม", "มา", "มาก", "มี", "ยัง", "รวม", "ระหว่าง", "รับ", "ราย", "ร่วม", "ลง", "วัน", "ว่า", "สำหรับ", "สุด", "ส่ง", "ส่วน", "หนึ่ง", "หรือ", "หลัง", "หลังจาก", "หลาย", "หาก", "อยาก", "อยู่", "อย่าง", "ออก", "อะไร", "อาจ", "อีก", "เขา", "เข้า", "เคย", "เฉพาะ", "เช่น", "เดียว", "เดียวกัน", "เนื่องจาก", "เปิด", "เปิดเผย", "เป็น", "เป็นการ", "เพราะ", "เพื่อ", "เมื่อ", "เรา", "เริ่ม", "เลย", "เห็น", "เอง", "แต่", "แบบ", "แรก", "และ", "แล้ว", "แห่ง", "โดย", "ใน", "ให้", "ได้", "ไป", "ไม่", "ไว้", ]; ================================================ FILE: src/stopwords/tuk.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) // Notice: we do not have stopwords for this language yet. 
pub static STOPWORDS_TUK: &[&str] = &[]; ================================================ FILE: src/stopwords/tur.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_TUR: &[&str] = &[ "acaba", "acep", "adamakıllı", "adeta", "ait", "altmýþ", "altmış", "altý", "altı", "ama", "amma", "anca", "ancak", "arada", "artýk", "aslında", "aynen", "ayrıca", "az", "açıkça", "açıkçası", "bana", "bari", "bazen", "bazý", "bazı", "başkası", "baţka", "belki", "ben", "benden", "beni", "benim", "beri", "beriki", "beþ", "beş", "beţ", "bilcümle", "bile", "bin", "binaen", "binaenaleyh", "bir", "biraz", "birazdan", "birbiri", "birden", "birdenbire", "biri", "birice", "birileri", "birisi", "birkaç", "birkaçı", "birkez", "birlikte", "birçok", "birçoğu", "birþey", "birþeyi", "birşey", "birşeyi", "birţey", "bitevi", "biteviye", "bittabi", "biz", "bizatihi", "bizce", "bizcileyin", "bizden", "bize", "bizi", "bizim", "bizimki", "bizzat", "boşuna", "bu", "buna", "bunda", "bundan", "bunlar", "bunları", "bunların", "bunu", "bunun", "buracıkta", "burada", "buradan", "burası", "böyle", "böylece", "böylecene", "böylelikle", "böylemesine", "böylesine", "büsbütün", "bütün", "cuk", "cümlesi", "da", "daha", "dahi", "dahil", "dahilen", "daima", "dair", "dayanarak", "de", "defa", "dek", "demin", "demincek", "deminden", "denli", "derakap", "derhal", "derken", "deđil", "değil", "değin", "diye", "diđer", "diğer", "diğeri", "doksan", "dokuz", "dolayı", "dolayısıyla", "doğru", "dört", "edecek", "eden", "ederek", "edilecek", "ediliyor", "edilmesi", "ediyor", "elbet", "elbette", "elli", "emme", "en", "enikonu", "epey", "epeyce", "epeyi", "esasen", "esnasında", "etmesi", "etraflı", "etraflıca", "etti", "ettiği", "ettiğini", "evleviyetle", "evvel", "evvela", "evvelce", "evvelden", "evvelemirde", "evveli", "eđer", "eğer", "fakat", "filanca", "gah", 
"gayet", "gayetle", "gayri", "gayrı", "gelgelelim", "gene", "gerek", "gerçi", "geçende", "geçenlerde", "gibi", "gibilerden", "gibisinden", "gine", "göre", "gırla", "hakeza", "halbuki", "halen", "halihazırda", "haliyle", "handiyse", "hangi", "hangisi", "hani", "hariç", "hasebiyle", "hasılı", "hatta", "hele", "hem", "henüz", "hep", "hepsi", "her", "herhangi", "herkes", "herkesin", "hiç", "hiçbir", "hiçbiri", "hoş", "hulasaten", "iken", "iki", "ila", "ile", "ilen", "ilgili", "ilk", "illa", "illaki", "imdi", "indinde", "inen", "insermi", "ise", "ister", "itibaren", "itibariyle", "itibarıyla", "iyi", "iyice", "iyicene", "için", "iş", "işte", "iţte", "kadar", "kaffesi", "kah", "kala", "kanýmca", "karşın", "katrilyon", "kaynak", "kaçı", "kelli", "kendi", "kendilerine", "kendini", "kendisi", "kendisine", "kendisini", "kere", "kez", "keza", "kezalik", "keşke", "keţke", "ki", "kim", "kimden", "kime", "kimi", "kimisi", "kimse", "kimsecik", "kimsecikler", "külliyen", "kýrk", "kýsaca", "kırk", "kısaca", "lakin", "leh", "lütfen", "maada", "madem", "mademki", "mamafih", "mebni", "međer", "meğer", "meğerki", "meğerse", "milyar", "milyon", "mu", "mü", "mý", "mı", "nasýl", "nasıl", "nasılsa", "nazaran", "naşi", "ne", "neden", "nedeniyle", "nedenle", "nedense", "nerde", "nerden", "nerdeyse", "nere", "nerede", "nereden", "neredeyse", "neresi", "nereye", "netekim", "neye", "neyi", "neyse", "nice", "nihayet", "nihayetinde", "nitekim", "niye", "niçin", "o", "olan", "olarak", "oldu", "olduklarını", "oldukça", "olduğu", "olduğunu", "olmadı", "olmadığı", "olmak", "olması", "olmayan", "olmaz", "olsa", "olsun", "olup", "olur", "olursa", "oluyor", "on", "ona", "onca", "onculayın", "onda", "ondan", "onlar", "onlardan", "onlari", "onlarýn", "onları", "onların", "onu", "onun", "oracık", "oracıkta", "orada", "oradan", "oranca", "oranla", "oraya", "otuz", "oysa", "oysaki", "pek", "pekala", "peki", "pekçe", "peyderpey", "rağmen", "sadece", "sahi", "sahiden", "sana", "sanki", "sekiz", "seksen", 
"sen", "senden", "seni", "senin", "siz", "sizden", "sizi", "sizin", "sonra", "sonradan", "sonraları", "sonunda", "tabii", "tam", "tamam", "tamamen", "tamamıyla", "tarafından", "tek", "trilyon", "tüm", "var", "vardı", "vasıtasıyla", "ve", "velev", "velhasıl", "velhasılıkelam", "veya", "veyahut", "ya", "yahut", "yakinen", "yakında", "yakından", "yakınlarda", "yalnız", "yalnızca", "yani", "yapacak", "yapmak", "yaptı", "yaptıkları", "yaptığı", "yaptığını", "yapılan", "yapılması", "yapıyor", "yedi", "yeniden", "yenilerde", "yerine", "yetmiþ", "yetmiş", "yetmiţ", "yine", "yirmi", "yok", "yoksa", "yoluyla", "yüz", "yüzünden", "zarfında", "zaten", "zati", "zira", "çabuk", "çabukça", "çeşitli", "çok", "çokları", "çoklarınca", "çokluk", "çoklukla", "çokça", "çoğu", "çoğun", "çoğunca", "çoğunlukla", "çünkü", "öbür", "öbürkü", "öbürü", "önce", "önceden", "önceleri", "öncelikle", "öteki", "ötekisi", "öyle", "öylece", "öylelikle", "öylemesine", "öz", "üzere", "üç", "þey", "þeyden", "þeyi", "þeyler", "þu", "þuna", "þunda", "þundan", "þunu", "şayet", "şey", "şeyden", "şeyi", "şeyler", "şu", "şuna", "şuncacık", "şunda", "şundan", "şunlar", "şunları", "şunu", "şunun", "şura", "şuracık", "şuracıkta", "şurası", "şöyle", "ţayet", "ţimdi", "ţu", "ţöyle", ]; ================================================ FILE: src/stopwords/ukr.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) // Notice: Ukrainian stopwords, kept in ascending code-point order. The entries "від", "якої", "є", "і" and "їх" were restored from mis-encoded box-drawing characters (presumably Windows-1251 bytes decoded as KOI8-R) that could never match real text. pub static STOPWORDS_UKR: &[&str] = &[ "але", "ви", "вона", "вони", "воно", "від", "він", "з", "й", "коли", "ми", "нам", "про", "та", "ти", "хоча", "це", "цей", "чи", "чого", "що", "як", "якої", "є", "і", "із", "інших", "їх", ]; ================================================ FILE: src/stopwords/urd.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License:
Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_URD: &[&str] = &[ "آئی", "آئے", "آج", "آخر", "آخرکبر", "آدهی", "آًب", "آٹھ", "آیب", "اة", "اخبزت", "اختتبم", "ادھر", "ارد", "اردگرد", "ارکبى", "اش", "اضتعوبل", "اضتعوبلات", "اضطرذ", "اضکب", "اضکی", "اضکے", "اطراف", "اغیب", "افراد", "الگ", "اور", "اوًچب", "اوًچبئی", "اوًچی", "اوًچے", "اى", "اً", "اًذر", "اًہیں", "اٹھبًب", "اپٌب", "اپٌے", "اچھب", "اچھی", "اچھے", "اکثر", "اکٹھب", "اکٹھی", "اکٹھے", "اکیلا", "اکیلی", "اکیلے", "اگرچہ", "اہن", "ایطے", "ایک", "ب", "ت", "تبزٍ", "تت", "تر", "ترتیت", "تریي", "تعذاد", "تن", "تو", "توبم", "توہی", "توہیں", "تٌہب", "تک", "تھب", "تھوڑا", "تھوڑی", "تھوڑے", "تھی", "تھے", "تیي", "ثب", "ثبئیں", "ثبترتیت", "ثبری", "ثبرے", "ثبعث", "ثبلا", "ثبلترتیت", "ثبہر", "ثدبئے", "ثرآں", "ثراں", "ثرش", "ثعذ", "ثغیر", "ثلٌذ", "ثلٌذوثبلا", "ثلکہ", "ثي", "ثٌب", "ثٌبرہب", "ثٌبرہی", "ثٌبرہے", "ثٌبًب", "ثٌذ", "ثٌذکرو", "ثٌذکرًب", "ثٌذی", "ثڑا", "ثڑوں", "ثڑی", "ثڑے", "ثھر", "ثھرا", "ثھراہوا", "ثھرپور", "ثھی", "ثہت", "ثہتر", "ثہتری", "ثہتریي", "ثیچ", "ج", "خب", "خبرہب", "خبرہی", "خبرہے", "خبهوظ", "خبًب", "خبًتب", "خبًتی", "خبًتے", "خبًٌب", "خت", "ختن", "خجکہ", "خص", "خططرذ", "خلذی", "خو", "خواى", "خوًہی", "خوکہ", "خٌبة", "خگہ", "خگہوں", "خگہیں", "خیطب", "خیطبکہ", "در", "درخبت", "درخہ", "درخے", "درزقیقت", "درضت", "دش", "دفعہ", "دلچطپ", "دلچطپی", "دلچطپیبں", "دو", "دور", "دوراى", "دوضرا", "دوضروں", "دوضری", "دوضرے", "دوًوں", "دکھبئیں", "دکھبتب", "دکھبتی", "دکھبتے", "دکھبو", "دکھبًب", "دکھبیب", "دی", "دیب", "دیتب", "دیتی", "دیتے", "دیر", "دیٌب", "دیکھو", "دیکھٌب", "دیکھی", "دیکھیں", "دے", "ر", "راضتوں", "راضتہ", "راضتے", "رریعہ", "رریعے", "رکي", "رکھ", "رکھب", "رکھتب", "رکھتبہوں", "رکھتی", "رکھتے", "رکھی", "رکھے", "رہب", "رہی", "رہے", "ز", "زبصل", "زبضر", "زبل", "زبلات", "زبلیہ", "زصوں", "زصہ", "زصے", "زقبئق", "زقیتیں", "زقیقت", "زکن", "زکویہ", "زیبدٍ", "صبف", "صسیر", "صفر", "صورت", "صورتسبل", "صورتوں", "صورتیں", "ض", "ضبت", "ضبتھ", "ضبدٍ", "ضبرا", "ضبرے", "ضبل", "ضبلوں", "ضت", "ضرور", "ضرورت", 
"ضروری", "ضلطلہ", "ضوچ", "ضوچب", "ضوچتب", "ضوچتی", "ضوچتے", "ضوچو", "ضوچٌب", "ضوچی", "ضوچیں", "ضکب", "ضکتب", "ضکتی", "ضکتے", "ضکٌب", "ضکی", "ضکے", "ضیذھب", "ضیذھی", "ضیذھے", "ضیکٌڈ", "ضے", "طرف", "طریق", "طریقوں", "طریقہ", "طریقے", "طور", "طورپر", "ظبہر", "ع", "عذد", "عظین", "علاقوں", "علاقہ", "علاقے", "علاوٍ", "عووهی", "غبیذ", "غخص", "غذ", "غروع", "غروعبت", "غے", "فرد", "فی", "ق", "قجل", "قجیلہ", "قطن", "لئے", "لا", "لازهی", "لو", "لوجب", "لوجی", "لوجے", "لوسبت", "لوسہ", "لوگ", "لوگوں", "لڑکپي", "لگتب", "لگتی", "لگتے", "لگٌب", "لگی", "لگیں", "لگے", "لی", "لیب", "لیٌب", "لیں", "لے", "ه", "هتعلق", "هختلف", "هسترم", "هسترهہ", "هسطوش", "هسیذ", "هطئلہ", "هطئلے", "هطبئل", "هطتعول", "هطلق", "هعلوم", "هػتول", "هلا", "هوکي", "هوکٌبت", "هوکٌہ", "هٌبضت", "هڑا", "هڑًب", "هڑے", "هکول", "هگر", "هہرثبى", "هیرا", "هیری", "هیرے", "هیں", "و", "وار", "والے", "وٍ", "ًئی", "ًئے", "ًب", "ًبپطٌذ", "ًبگسیر", "ًطجت", "ًقطہ", "ًو", "ًوخواى", "ًکبلٌب", "ًکتہ", "ًہ", "ًہیں", "ًیب", "ًے", "ٓ آش", "ٹھیک", "پبئے", "پبش", "پبًب", "پبًچ", "پر", "پراًب", "پطٌذ", "پل", "پورا", "پوچھب", "پوچھتب", "پوچھتی", "پوچھتے", "پوچھو", "پوچھوں", "پوچھٌب", "پوچھیں", "پچھلا", "پھر", "پہلا", "پہلی", "پہلےضی", "پہلےضے", "پہلےضےہی", "پیع", "چبر", "چبہب", "چبہٌب", "چبہے", "چلا", "چلو", "چلیں", "چلے", "چکب", "چکی", "چکیں", "چکے", "چھوٹب", "چھوٹوں", "چھوٹی", "چھوٹے", "چھہ", "چیسیں", "ڈھوًڈا", "ڈھوًڈلیب", "ڈھوًڈو", "ڈھوًڈًب", "ڈھوًڈی", "ڈھوًڈیں", "ک", "کئی", "کئے", "کب", "کبفی", "کبم", "کت", "کجھی", "کرا", "کرتب", "کرتبہوں", "کرتی", "کرتے", "کرتےہو", "کررہب", "کررہی", "کررہے", "کرو", "کرًب", "کریں", "کرے", "کطی", "کل", "کن", "کوئی", "کوتر", "کورا", "کوروں", "کورٍ", "کورے", "کوطي", "کوى", "کوًطب", "کوًطی", "کوًطے", "کھولا", "کھولو", "کھولٌب", "کھولی", "کھولیں", "کھولے", "کہ", "کہب", "کہتب", "کہتی", "کہتے", "کہو", "کہوں", "کہٌب", "کہی", "کہیں", "کہے", "کی", "کیب", "کیطب", "کیطرف", "کیطے", "کیلئے", "کیوًکہ", "کیوں", "کیے", "کے", "کےثعذ", "کےرریعے", "گئی", "گئے", "گب", "گرد", "گروٍ", "گروپ", "گروہوں", "گٌتی", "گی", "گیب", 
"گے", "ہر", "ہن", "ہو", "ہوئی", "ہوئے", "ہوا", "ہوبرا", "ہوبری", "ہوبرے", "ہوتب", "ہوتی", "ہوتے", "ہورہب", "ہورہی", "ہورہے", "ہوضکتب", "ہوضکتی", "ہوضکتے", "ہوًب", "ہوًی", "ہوًے", "ہوچکب", "ہوچکی", "ہوچکے", "ہوگئی", "ہوگئے", "ہوگیب", "ہوں", "ہی", "ہیں", "ہے", "ی", "یقیٌی", "یہ", "یہبں", ]; ================================================ FILE: src/stopwords/uzb.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) // Notice: we do not have stopwords for this language yet. pub static STOPWORDS_UZB: &[&str] = &[]; ================================================ FILE: src/stopwords/vie.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_VIE: &[&str] = &[ "a ha", "a-lô", "ai", "ai ai", "ai nấy", "alô", "amen", "anh", "bao giờ", "bao lâu", "bao nhiêu", "bao nả", "bay biến", "biết", "biết bao", "biết bao nhiêu", "biết chừng nào", "biết mấy", "biết đâu", "biết đâu chừng", "biết đâu đấy", "bà", "bài", "bác", "bây bẩy", "bây chừ", "bây giờ", "bây nhiêu", "bèn", "béng", "bông", "bạn", "bản", "bất chợt", "bất cứ", "bất giác", "bất kì", "bất kể", "bất kỳ", "bất luận", "bất nhược", "bất quá", "bất thình lình", "bất tử", "bất đồ", "bấy", "bấy chầy", "bấy chừ", "bấy giờ", "bấy lâu", "bấy lâu nay", "bấy nay", "bấy nhiêu", "bập bà bập bõm", "bập bõm", "bắt đầu từ", "bằng", "bằng không", "bằng nấy", "bằng ấy", "bển", "bệt", "bị", "bỏ mẹ", "bỗng", "bỗng chốc", "bỗng dưng", "bỗng không", "bỗng nhiên", "bỗng đâu", "bộ", "bội phần", "bớ", "bởi", "bởi chưng", "bởi nhưng", "bởi thế", "bởi vì", "bởi vậy", "bức", "cao", "cha", "cha chả", "chao ôi", "chiếc", "cho", "cho nên", "cho tới", "cho tới khi", "cho đến", "cho đến khi", "choa", "chu cha", "chui cha", "chung cục", "chung 
qui", "chung quy", "chung quy lại", "chuyện", "chành chạnh", "chí chết", "chính", "chính là", "chính thị", "chùn chùn", "chùn chũn", "chú", "chú mày", "chú mình", "chúng mình", "chúng ta", "chúng tôi", "chăn chắn", "chăng", "chưa", "chầm chập", "chậc", "chắc", "chắc hẳn", "chẳng lẽ", "chẳng những", "chẳng nữa", "chẳng phải", "chết nỗi", "chết thật", "chết tiệt", "chỉ", "chỉn", "chốc chốc", "chớ", "chớ chi", "chợt", "chủn", "chứ", "chứ lị", "coi bộ", "coi mòi", "con", "cu cậu", "cuốn", "cuộc", "càng", "các", "cái", "cây", "còn", "có", "có chăng là", "có dễ", "có thể", "có vẻ", "cóc khô", "cô", "cô mình", "công nhiên", "cùng", "cùng cực", "cùng nhau", "cùng với", "căn", "căn cắt", "cũng", "cũng như", "cũng vậy", "cũng vậy thôi", "cơ", "cơ chừng", "cơ hồ", "cơ mà", "cơn", "cả", "cả thảy", "cả thể", "cảm ơn", "cần", "cật lực", "cật sức", "cậu", "cổ lai", "của", "cứ", "cứ việc", "cực lực", "do", "do vì", "do vậy", "do đó", "duy", "dào", "dì", "dù cho", "dù rằng", "dưới", "dạ", "dần dà", "dần dần", "dầu sao", "dẫu", "dẫu sao", "dễ sợ", "dễ thường", "dở chừng", "dữ", "em", "giữa", "gì", "hay", "hoàn toàn", "hoặc", "hơn", "hầu hết", "họ", "hỏi", "khi", "khác", "không", "luôn", "là", "làm", "lên", "lúc", "lại", "lần", "lớn", "muốn", "mà", "mình", "mỗi", "một", "một cách", "mới", "mợ", "ngay", "ngay cả", "ngay khi", "ngay lúc", "ngay lập tức", "ngay tức khắc", "ngay từ", "nghe chừng", "nghe đâu", "nghen", "nghiễm nhiên", "nghỉm", "ngoài", "ngoài ra", "ngoải", "ngày", "ngày càng", "ngày ngày", "ngày xưa", "ngày xửa", "ngôi", "ngõ hầu", "ngăn ngắt", "ngươi", "người", "ngọn", "ngọt", "ngộ nhỡ", "nh", "nhau", "nhiên hậu", "nhiều", "nhiệt liệt", "nhung nhăng", "nhà", "nhân dịp", "nhân tiện", "nhé", "nhón nhén", "như", "như chơi", "như không", "như quả", "như thể", "như tuồng", "như vậy", "nhưng", "nhưng mà", "nhược bằng", "nhất", "nhất loạt", "nhất luật", "nhất mực", "nhất nhất", "nhất quyết", "nhất sinh", "nhất thiết", "nhất tâm", "nhất tề", "nhất đán", "nhất định", "nhận", 
"nhỉ", "nhỡ ra", "những", "những ai", "những như", "nào", "này", "nên", "nên chi", "nó", "nóc", "nói", "năm", "nơi", "nấy", "nếu", "nếu như", "nền", "nọ", "nớ", "nức nở", "nữa", "oai oái", "oái", "pho", "phè", "phóc", "phót", "phăn phắt", "phương chi", "phải", "phải chi", "phải chăng", "phắt", "phỉ phui", "phỏng", "phỏng như", "phốc", "phụt", "phứt", "qua", "qua quít", "qua quýt", "quyết", "quyết nhiên", "quyển", "quá", "quá chừng", "quá lắm", "quá sá", "quá thể", "quá trời", "quá xá", "quá đỗi", "quá độ", "quá ư", "quý hồ", "quả", "quả là", "quả tang", "quả thật", "quả tình", "quả vậy", "quả đúng", "ra", "ra phết", "ra sao", "ra trò", "ren rén", "riu ríu", "riêng", "riệt", "rày", "ráo", "ráo trọi", "rén", "rích", "rón rén", "rút cục", "răng", "rất", "rằng", "rằng là", "rốt cuộc", "rốt cục", "rồi", "rứa", "sa sả", "sao", "sau", "sau chót", "sau cuối", "sau cùng", "sau đó", "so", "song le", "suýt", "sì", "sạch", "sất", "sắp", "sẽ", "số", "số là", "sốt sột", "sở dĩ", "sự", "tanh", "tha hồ", "than ôi", "thanh", "theo", "thi thoảng", "thoạt", "thoạt nhiên", "thoắt", "thuần", "thà", "thà là", "thà rằng", "thành ra", "thành thử", "thái quá", "tháng", "thì", "thì thôi", "thình lình", "thím", "thôi", "thúng thắng", "thương ôi", "thường", "thảo hèn", "thảo nào", "thấy", "thẩy", "thậm", "thậm chí", "thật lực", "thật ra", "thật vậy", "thế", "thế là", "thế mà", "thế nào", "thế nên", "thế ra", "thế thì", "thế à", "thếch", "thỉnh thoảng", "thỏm", "thốc", "thốc tháo", "thốt", "thốt nhiên", "thộc", "thời gian", "thục mạng", "thửa", "thực ra", "thực sự", "thực vậy", "tiếp theo", "tiếp đó", "tiện thể", "toà", "toé khói", "toẹt", "trong", "trên", "trước", "trước kia", "trước nay", "trước tiên", "trước đây", "trước đó", "trếu tráo", "trển", "trệt", "trệu trạo", "trỏng", "trời đất ơi", "trừ phi", "tuy", "tuy nhiên", "tuy rằng", "tuy thế", "tuy vậy", "tuyệt nhiên", "tuần tự", "tuốt luốt", "tuốt tuồn tuột", "tuốt tuột", "tà tà", "tênh", "tít mù", "tò te", "tôi", "tông tốc", "tù tì", "tăm 
tắp", "tại", "tại vì", "tấm", "tấn", "tất cả", "tất thảy", "tất tần tật", "tất tật", "tắp", "tắp lự", "tọt", "tỏ ra", "tỏ vẻ", "tốc tả", "tối ư", "tột", "tớ", "tới", "tức thì", "tức tốc", "từ", "từng", "tự vì", "tựu trung", "veo", "veo veo", "việc", "vung thiên địa", "vung tàn tán", "vung tán tàn", "và", "vào", "vâng", "vèo", "vì", "vì chưng", "vì thế", "vì vậy", "ví bằng", "ví dù", "ví phỏng", "ví thử", "vô hình trung", "vô kể", "vô luận", "vô vàn", "văng tê", "vạn nhất", "vả chăng", "vả lại", "vẫn", "vậy", "vậy là", "vậy thì", "về", "vị tất", "vốn dĩ", "với", "với lại", "vở", "vụt", "vừa", "vừa mới", "xa xả", "xiết bao", "xon xón", "xoành xoạch", "xoét", "xoẳn", "xoẹt", "xuất kì bất ý", "xuất kỳ bất ý", "xuể", "xuống", "xăm xúi", "xăm xăm", "xăm xắm", "xềnh xệch", "xệp", "à", "à ơi", "ào", "á", "á à", "ái", "ái chà", "ái dà", "áng", "âu là", "ô hay", "ô hô", "ô kê", "ô kìa", "ôi chao", "ôi thôi", "ông", "úi", "úi chà", "úi dào", "ý", "ý chừng", "ý da", "đang", "đi", "điều", "đành đạch", "đáng lí", "đáng lý", "đáng lẽ", "đánh đùng", "đáo để", "đây", "đã", "đó", "được", "đại loại", "đại nhân", "đại phàm", "đại để", "đến", "đến nỗi", "đều", "để", "ơ", "ơ hay", "ơ kìa", "ơi", "ư", "ạ", "ạ ơi", "ấy", "ầu ơ", "ắt", "ắt hẳn", "ắt là", "ối dào", "ối giời", "ối giời ơi", "ồ", "ổng", "ớ", "ờ", "ở", "ở trên", "ủa", "ứ hự", "ứ ừ", "ừ", "ử", ]; ================================================ FILE: src/stopwords/yid.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) // This is an alias for HEB stopwords, but I may be mistaken there. 
pub static STOPWORDS_YID: &[&str] = &[ "אבל", "או", "אולי", "אותה", "אותו", "אותי", "אותך", "אותם", "אותן", "אותנו", "אז", "אחר", "אחרות", "אחרי", "אחריכן", "אחרים", "אחרת", "אי", "איזה", "איך", "אין", "איפה", "איתה", "איתו", "איתי", "איתך", "איתכם", "איתכן", "איתם", "איתן", "איתנו", "אך", "אל", "אלה", "אלו", "אם", "אנחנו", "אני", "אס", "אף", "אצל", "אשר", "את", "אתה", "אתכם", "אתכן", "אתם", "אתן", "באיזומידה", "באמצע", "באמצעות", "בגלל", "בין", "בלי", "במידה", "במקוםשבו", "ברם", "בשביל", "בשעהש", "בתוך", "גם", "דרך", "הוא", "היא", "היה", "היכן", "היתה", "היתי", "הם", "הן", "הנה", "הסיבהשבגללה", "הרי", "ואילו", "ואת", "זאת", "זה", "זות", "יהיה", "יוכל", "יוכלו", "יותרמדי", "יכול", "יכולה", "יכולות", "יכולים", "יכל", "יכלה", "יכלו", "יש", "כאן", "כאשר", "כולם", "כולן", "כזה", "כי", "כיצד", "כך", "ככה", "כל", "כלל", "כמו", "כן", "כפי", "כש", "לא", "לאו", "לאיזותכלית", "לאן", "לבין", "לה", "להיות", "להם", "להן", "לו", "לי", "לכם", "לכן", "למה", "למטה", "למעלה", "למקוםשבו", "למרות", "לנו", "לעבר", "לעיכן", "לפיכך", "לפני", "מאד", "מאחורי", "מאיזוסיבה", "מאין", "מאיפה", "מבלי", "מבעד", "מדוע", "מה", "מהיכן", "מול", "מחוץ", "מי", "מכאן", "מכיוון", "מלבד", "מן", "מנין", "מסוגל", "מעט", "מעטים", "מעל", "מצד", "מקוםבו", "מתחת", "מתי", "נגד", "נגר", "נו", "עד", "עז", "על", "עלי", "עליה", "עליהם", "עליהן", "עליו", "עליך", "עליכם", "עלינו", "עם", "עצמה", "עצמהם", "עצמהן", "עצמו", "עצמי", "עצמם", "עצמן", "עצמנו", "פה", "רק", "שוב", "של", "שלה", "שלהם", "שלהן", "שלו", "שלי", "שלך", "שלכה", "שלכם", "שלכן", "שלנו", "שם", "תהיה", "תחת", ]; ================================================ FILE: src/stopwords/zul.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub static STOPWORDS_ZUL: &[&str] = &[ "futhi", "kahle", "kakhulu", "kanye", "khona", "kodwa", "kungani", "kusho", "la", "lakhe", "lapho", "mina", "ngesikhathi", "nje", 
"phansi", "phezulu", "u", "ukuba", "ukuthi", "ukuze", "uma", "wahamba", "wakhe", "wami", "wase", "wathi", "yakhe", "zakhe", "zonke", ]; ================================================ FILE: src/store/fst.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) use fst::automaton::AlwaysMatch; use fst::set::Stream as FSTStream; use fst::{ Automaton, Error as FSTError, IntoStreamer, Set as FSTSet, SetBuilder as FSTSetBuilder, Streamer, }; use fst_levenshtein::Levenshtein; use fst_regex::Regex; use hashbrown::{HashMap, HashSet}; use radix::RadixNum; use regex_syntax::escape as regex_escape; use std::collections::VecDeque; use std::fmt; use std::fs::{self, File}; use std::io::{self, BufRead, BufReader, BufWriter, Write}; use std::iter::FromIterator; use std::path::{Path, PathBuf}; use std::str; use std::sync::{Arc, Mutex, RwLock}; use std::thread; use std::time::{Duration, SystemTime}; use super::generic::{ StoreGeneric, StoreGenericActionBuilder, StoreGenericBuilder, StoreGenericPool, }; use super::keyer::StoreKeyerHasher; use crate::lexer::ranges::LexerRegexRange; use crate::APP_CONF; pub struct StoreFSTPool; pub struct StoreFSTBuilder; pub struct StoreFST { graph: FSTSet, target: StoreFSTKey, pending: StoreFSTPending, last_used: Arc>, last_consolidated: Arc>, } #[derive(Default)] pub struct StoreFSTPending { pop: Arc>>>, push: Arc>>>, } pub struct StoreFSTActionBuilder; pub struct StoreFSTAction { store: StoreFSTBox, } #[derive(PartialEq, Eq, Hash, Clone, Copy)] pub struct StoreFSTKey { collection_hash: StoreFSTAtom, bucket_hash: StoreFSTAtom, } pub struct StoreFSTMisc; #[derive(Copy, Clone)] enum StoreFSTPathMode { Permanent, Temporary, Backup, } type StoreFSTAtom = u32; type StoreFSTBox = Arc; const WORD_LIMIT_LENGTH: usize = 40; const ATOM_HASH_RADIX: usize = 16; lazy_static! 
{ pub static ref GRAPH_ACCESS_LOCK: Arc> = Arc::new(RwLock::new(false)); static ref GRAPH_ACQUIRE_LOCK: Arc> = Arc::new(Mutex::new(())); static ref GRAPH_REBUILD_LOCK: Arc> = Arc::new(Mutex::new(())); static ref GRAPH_POOL: Arc>> = Arc::new(RwLock::new(HashMap::new())); static ref GRAPH_CONSOLIDATE: Arc>> = Arc::new(RwLock::new(HashSet::new())); } impl StoreFSTPathMode { fn extension(&self) -> &'static str { match self { StoreFSTPathMode::Permanent => ".fst", StoreFSTPathMode::Temporary => ".fst.tmp", StoreFSTPathMode::Backup => ".fst.bck", } } } impl StoreFSTPool { pub fn count() -> (usize, usize) { ( GRAPH_POOL.read().unwrap().len(), GRAPH_CONSOLIDATE.read().unwrap().len(), ) } pub fn acquire<'a, T: Into<&'a str>>(collection: T, bucket: T) -> Result { let (collection_str, bucket_str) = (collection.into(), bucket.into()); let pool_key = StoreFSTKey::from_str(collection_str, bucket_str); // Freeze acquire lock, and reference it in context // Notice: this prevents two graphs on the same collection to be opened at the same time. let _acquire = GRAPH_ACQUIRE_LOCK.lock().unwrap(); // Acquire a thread-safe store pool reference in read mode let graph_pool_read = GRAPH_POOL.read().unwrap(); if let Some(store_fst) = graph_pool_read.get(&pool_key) { Self::proceed_acquire_cache("fst", collection_str, pool_key, store_fst) } else { info!( "fst store not in pool for collection: {} <{:x?}> / bucket: {} <{:x?}>, opening it", collection_str, pool_key.collection_hash, bucket_str, pool_key.bucket_hash ); // Important: we need to drop the read reference first, to avoid dead-locking \ // when acquiring the RWLock in write mode in this block. 
drop(graph_pool_read); Self::proceed_acquire_open("fst", collection_str, pool_key, &*GRAPH_POOL) } } pub fn janitor() { Self::proceed_janitor( "fst", &*GRAPH_POOL, APP_CONF.store.fst.pool.inactive_after, &*GRAPH_ACCESS_LOCK, ) } pub fn backup(path: &Path) -> Result<(), io::Error> { debug!("backing up all fst stores to path: {:?}", path); // Create backup directory (full path) fs::create_dir_all(path)?; // Proceed dump action (backup) Self::dump_action( "backup", StoreFSTPathMode::Permanent, &*APP_CONF.store.fst.path, path, &Self::backup_item, ) } pub fn restore(path: &Path) -> Result<(), io::Error> { debug!("restoring all fst stores from path: {:?}", path); // Proceed dump action (restore) Self::dump_action( "restore", StoreFSTPathMode::Backup, path, &*APP_CONF.store.fst.path, &Self::restore_item, ) } pub fn consolidate(force: bool) { debug!("scanning for fst store pool items to consolidate"); // Notice: we do not consolidate all items at each tick, we try to even out multiple \ // consolidation tasks over time. This lowers the overall HZ of the tasker system for \ // certain heavy tasks, which is better to spread out consolidation steps over time over \ // a large number of very active buckets. // Acquire rebuild lock, and reference it in context // Notice: this prevents two consolidate operations to be executed at the same time. let _rebuild = GRAPH_REBUILD_LOCK.lock().unwrap(); // Exit trap: Register is empty? Abort there. 
if GRAPH_CONSOLIDATE.read().unwrap().is_empty() { info!("no fst store pool items to consolidate in register"); return; } // Step 1: List keys to be consolidated let mut keys_consolidate: Vec = Vec::new(); { // Acquire access lock (in blocking write mode), and reference it in context // Notice: this prevents store to be acquired from any context let _access = GRAPH_ACCESS_LOCK.write().unwrap(); let (graph_pool_read, graph_consolidate_read) = ( GRAPH_POOL.read().unwrap(), GRAPH_CONSOLIDATE.read().unwrap(), ); for key in &*graph_consolidate_read { if let Some(store) = graph_pool_read.get(key) { // Important: be lenient with system clock going back to a past duration, \ // since we may be running in a virtualized environment where clock is not \ // guaranteed to be monotonic. This is done to avoid poisoning associated \ // mutexes by crashing on unwrap(). let not_consolidated_for = store .last_consolidated .read() .unwrap() .elapsed() .unwrap_or_else(|err| { error!( "fst key: {} last consolidated duration clock issue, zeroing: {}", key, err ); // Assuming a zero seconds fallback duration Duration::from_secs(0) }) .as_secs(); if force || not_consolidated_for >= APP_CONF.store.fst.graph.consolidate_after { info!( "fst key: {} not consolidated for: {} seconds, may consolidate", key, not_consolidated_for ); keys_consolidate.push(*key); } else { debug!( "fst key: {} not consolidated for: {} seconds, no consolidate", key, not_consolidated_for ); } } } } // Exit trap: Nothing to consolidate yet? Abort there. 
if keys_consolidate.is_empty() { info!("no fst store pool items need to consolidate at the moment"); return; } // Step 2: Clear keys to be consolidated from register { // Acquire access lock (in blocking write mode), and reference it in context // Notice: this prevents store to be acquired from any context let _access = GRAPH_ACCESS_LOCK.write().unwrap(); let mut graph_consolidate_write = GRAPH_CONSOLIDATE.write().unwrap(); for key in &keys_consolidate { graph_consolidate_write.remove(key); debug!("fst key: {} cleared from consolidate register", key); } } // Step 3: Consolidate FSTs, one-by-one (sequential locking; this avoids global locks) let (mut count_moved, mut count_pushed, mut count_popped) = (0, 0, 0); { for key in &keys_consolidate { { // As we may be renaming the FST file, ensure no consumer out of this is \ // trying to access the FST file as it gets processed. This also waits for \ // current consumers to finish reading the FST, and prevents any new \ // consumer from opening it while we are not done there. let _access = GRAPH_ACCESS_LOCK.write().unwrap(); let do_close = if let Some(store) = GRAPH_POOL.read().unwrap().get(key) { debug!("fst key: {} consolidate started", key); let consolidate_counts = Self::consolidate_item(store); count_moved += consolidate_counts.1; count_pushed += consolidate_counts.2; count_popped += consolidate_counts.3; debug!("fst key: {} consolidate complete", key); // Should close this FST? consolidate_counts.0 } else { false }; // Nuke old opened FST? // Notice: last consolidated date will be bumped to a new date in the future \ // when a push or pop operation will be done, thus effectively scheduling \ // a consolidation in the future properly. 
// Notice: we remove this one early as to release write lock early if do_close { GRAPH_POOL.write().unwrap().remove(key); } } // Give a bit of time to other threads before continuing (a consolidate operation \ // must not block all other threads until it completes); this method tells the \ // thread scheduler to give a bit of priority to other threads, and get back \ // to this thread's work when other threads are done. On large setups, this \ // loop can starve other threads due to the locks used (unfortunately they \ // are all necessary). thread::yield_now(); } } info!( "done scanning for fst store pool items to consolidate (move: {}, push: {}, pop: {})", count_moved, count_pushed, count_popped ); } fn dump_action( action: &str, path_mode: StoreFSTPathMode, read_path: &Path, write_path: &Path, fn_item: &dyn Fn(&Path, &Path, &str, &str) -> Result<(), io::Error>, ) -> Result<(), io::Error> { let fst_extension = path_mode.extension(); let fst_extension_len = fst_extension.len(); // Iterate on FST collections for collection in fs::read_dir(read_path)? { let collection = collection?; // Actual collection found? if let (Ok(collection_file_type), Some(collection_name)) = (collection.file_type(), collection.file_name().to_str()) { if collection_file_type.is_dir() { debug!("fst collection ongoing {}: {}", action, collection_name); // Create write folder for collection fs::create_dir_all(write_path.join(collection_name))?; // Iterate on FST collection buckets for bucket in fs::read_dir(read_path.join(collection_name))? { let bucket = bucket?; // Actual bucket found? 
if let (Ok(bucket_file_type), Some(bucket_file_name)) = (bucket.file_type(), bucket.file_name().to_str()) { let bucket_file_name_len = bucket_file_name.len(); if bucket_file_type.is_file() && bucket_file_name_len > fst_extension_len && bucket_file_name.ends_with(fst_extension) { // Acquire bucket name (from full file name) let bucket_name = &bucket_file_name[..(bucket_file_name_len - fst_extension_len)]; debug!( "fst bucket ongoing {}: {}/{}", action, collection_name, bucket_name ); fn_item(write_path, &bucket.path(), collection_name, bucket_name)?; } } } } } } Ok(()) } fn backup_item( backup_path: &Path, _origin_path: &Path, collection_name: &str, bucket_name: &str, ) -> Result<(), io::Error> { // Acquire access lock (in blocking write mode), and reference it in context // Notice: this prevents store to be acquired from any context let _access = GRAPH_ACCESS_LOCK.write().unwrap(); // Generate path to FST backup let fst_backup_path = backup_path.join(collection_name).join(format!( "{}{}", bucket_name, StoreFSTPathMode::Backup.extension() )); debug!( "fst bucket: {}/{} backing up to path: {:?}", collection_name, bucket_name, fst_backup_path ); // Erase any previously-existing FST backup fs::remove_file(&fst_backup_path).ok(); // Stream actual FST data to FST backup let backup_fst_file = File::create(&fst_backup_path)?; let mut backup_fst_writer = BufWriter::new(backup_fst_file); let mut count_words = 0; // Convert names to hashes (as names are hashes encoded as base-16 strings, but we need \ // them as proper integers) if let (Ok(collection_radix), Ok(bucket_radix)) = ( RadixNum::from_str(collection_name, ATOM_HASH_RADIX), RadixNum::from_str(bucket_name, ATOM_HASH_RADIX), ) { if let (Ok(collection_hash), Ok(bucket_hash)) = (collection_radix.as_decimal(), bucket_radix.as_decimal()) { let origin_fst = StoreFSTBuilder::open( collection_hash as StoreFSTAtom, bucket_hash as StoreFSTAtom, ) .map_err(|_| io_error!("graph open failure"))?; let mut origin_fst_stream = 
origin_fst.stream(); while let Some(word) = origin_fst_stream.next() { count_words += 1; // Write word, and append a new line backup_fst_writer.write_all(word)?; backup_fst_writer.write_all(b"\n")?; } info!( "fst bucket: {}/{} backed up to path: {:?} ({} words)", collection_name, bucket_name, fst_backup_path, count_words ); } } Ok(()) } fn restore_item( _backup_path: &Path, origin_path: &Path, collection_name: &str, bucket_name: &str, ) -> Result<(), io::Error> { // Acquire access lock (in blocking write mode), and reference it in context // Notice: this prevents store to be acquired from any context let _access = GRAPH_ACCESS_LOCK.write().unwrap(); debug!( "fst bucket: {}/{} restoring from path: {:?}", collection_name, bucket_name, origin_path ); // Convert names to hashes (as names are hashes encoded as base-16 strings, but we need \ // them as proper integers) if let (Ok(collection_radix), Ok(bucket_radix)) = ( RadixNum::from_str(collection_name, ATOM_HASH_RADIX), RadixNum::from_str(bucket_name, ATOM_HASH_RADIX), ) { if let (Ok(collection_hash), Ok(bucket_hash)) = (collection_radix.as_decimal(), bucket_radix.as_decimal()) { // Force a FST store close StoreFSTBuilder::close( collection_hash as StoreFSTAtom, bucket_hash as StoreFSTAtom, ); // Generate path to FST let fst_path = StoreFSTBuilder::path( StoreFSTPathMode::Permanent, collection_hash as StoreFSTAtom, Some(bucket_hash as StoreFSTAtom), ); // Remove existing FST data? 
if fst_path.exists() { fs::remove_file(&fst_path)?; } // Stream backup words to restored FST let fst_writer = BufWriter::new(File::create(&fst_path)?); let fst_backup_reader = BufReader::new(File::open(&origin_path)?); let mut fst_builder = FSTSetBuilder::new(fst_writer) .map_err(|_| io_error!("graph restore builder failure"))?; for word in fst_backup_reader.lines() { let word = word?; fst_builder .insert(word) .map_err(|_| io_error!("graph restore word insert failure"))?; } fst_builder .finish() .map_err(|_| io_error!("graph restore finish failure"))?; info!( "fst bucket: {}/{} restored to path: {:?} from backup: {:?}", collection_name, bucket_name, fst_path, origin_path ); } } Ok(()) } fn consolidate_item(store: &StoreFSTBox) -> (bool, usize, usize, usize) { let (mut should_close, mut count_moved, mut count_pushed, mut count_popped) = (false, 0, 0, 0); // Acquire write references to pending sets let (mut pending_push_write, mut pending_pop_write) = ( store.pending.push.write().unwrap(), store.pending.pop.write().unwrap(), ); // Do consolidate? (any change committed) // Notice: if both pending sets are empty do not consolidate as there may have been a \ // push then a pop of this push, nulling out any committed change. if pending_push_write.len() > 0 || pending_pop_write.len() > 0 { // Read old FST (or default to empty FST) if let Ok(old_fst) = StoreFSTBuilder::open(store.target.collection_hash, store.target.bucket_hash) { // Initialize the new FST (temporary) let bucket_tmp_path = StoreFSTBuilder::path( StoreFSTPathMode::Temporary, store.target.collection_hash, Some(store.target.bucket_hash), ); let bucket_tmp_path_parent = bucket_tmp_path.parent().unwrap(); if fs::create_dir_all(&bucket_tmp_path_parent).is_ok() { // Erase any previously-existing temporary FST (eg. process stopped while \ // writing the temporary FST); there is no guarantee this succeeds. 
fs::remove_file(&bucket_tmp_path).ok(); if let Ok(tmp_fst_file) = File::create(&bucket_tmp_path) { let tmp_fst_writer = BufWriter::new(tmp_fst_file); // Create a builder that can be used to insert new key-value pairs. if let Ok(mut tmp_fst_builder) = FSTSetBuilder::new(tmp_fst_writer) { // Convert push keys to an ordered vector // Notice: we must go from a Vec to a VecDeque as to sort values, \ // which is a requirement for FST insertions. let mut ordered_push_vec: Vec<&[u8]> = Vec::from_iter(pending_push_write.iter().map(|item| item.as_ref())); ordered_push_vec.sort(); let mut ordered_push: VecDeque<&[u8]> = VecDeque::from_iter(ordered_push_vec); // Append words not in pop list to new FST (ie. old words minus pop \ // words) let mut old_fst_stream = old_fst.stream(); 'old: while let Some(old_fst_word) = old_fst_stream.next() { // Append new words from front? (ie. push words) // Notice: as an FST is ordered, inserts would fail if they are \ // committed out-of-order. Thus, the only way to check for \ // order is there. // Notice: a quick check is done before engaging in the loop, to \ // prevent any de-optimized jump instruction, as we may call \ // this code block a lot on large FSTs, and the loop should not \ // be engaged that often on stabilized FSTs (ie. mature FSTs). if let Some(push_first_ref) = ordered_push.front() { // Engage the loop? if *push_first_ref <= old_fst_word { while let Some(push_front_ref) = ordered_push.front() { if *push_front_ref <= old_fst_word { // Pop front item and consume it // Notice: as we validated previously that there \ // is a front value, this unwrap is safe. 
let push_front = ordered_push.pop_front().unwrap(); if StoreFSTMisc::check_over_limits( tmp_fst_builder.bytes_written() as usize, count_pushed + count_moved, ) { // FST cannot accept more items (limits reached) warn!("limit reached on new from old in fst"); // Important: stop the main loop (limit reached) break 'old; } if let Err(err) = tmp_fst_builder.insert(push_front) { // Could not insert word in FST error!( "failed inserting new from old in fst: {}", err ); } else { // Word inserted in FST count_pushed += 1; } // Continue scanning next word (may also come \ // before this FST word in order) continue; } // Important: stop loop on next front item (always \ // the same) break; } } } // Restore old word (if not popped) if !pending_pop_write.contains(old_fst_word) { if StoreFSTMisc::check_over_limits( tmp_fst_builder.bytes_written() as usize, count_pushed + count_moved, ) { // FST cannot accept more items (limits reached) warn!("limit reached on old word in fst"); // Important: stop the main loop (limit reached) break 'old; } if let Err(err) = tmp_fst_builder.insert(old_fst_word) { // Could not move word to FST error!("failed inserting old word in fst: {}", err); } else { // Word moved to FST count_moved += 1; } } else { count_popped += 1; } } // Complete FST with last pushed items // Notice: this is necessary if the FST was empty, or if we have push \ // items that come after the last ordered word of the FST. 
while let Some(push_front) = ordered_push.pop_front() { if StoreFSTMisc::check_over_limits( tmp_fst_builder.bytes_written() as usize, count_pushed + count_moved, ) { // FST cannot accept more items (limits reached) warn!("limit reached on new word from complete in fst"); // Important: stop the main loop (limit reached) break; } if let Err(err) = tmp_fst_builder.insert(push_front) { // Could not insert word in FST error!( "failed inserting new word from complete in fst: {}", err ); } else { // Word inserted in FST count_pushed += 1; } } // Finish building new FST if tmp_fst_builder.finish().is_ok() { // Should close open store reference to old FST should_close = true; // Replace old FST with new FST (this nukes the old FST) // Notice: there is no need to re-open the new FST, as it will be \ // automatically opened on its next access. let bucket_final_path = StoreFSTBuilder::path( StoreFSTPathMode::Permanent, store.target.collection_hash, Some(store.target.bucket_hash), ); // Proceed temporary FST to final FST path rename if fs::rename(&bucket_tmp_path, &bucket_final_path).is_ok() { info!("done consolidate fst at path: {:?}", bucket_final_path); } else { error!( "error consolidating fst at path: {:?}", bucket_final_path ); } } else { error!( "error finishing building temporary fst at path: {:?}", bucket_tmp_path ); } } else { error!( "error starting building temporary fst at path: {:?}", bucket_tmp_path ); } } else { error!( "error initializing temporary fst at path: {:?}", bucket_tmp_path ); } } else { error!( "error initializing temporary fst directory at path: {:?}", bucket_tmp_path_parent ); } } else { error!("error opening old fst"); } // Reset all pending sets *pending_push_write = HashSet::new(); *pending_pop_write = HashSet::new(); } (should_close, count_moved, count_pushed, count_popped) } } impl StoreGenericPool for StoreFSTPool {} impl StoreFSTBuilder { fn open(collection_hash: StoreFSTAtom, bucket_hash: StoreFSTAtom) -> Result { debug!( "opening 
finite-state transducer graph for collection: <{:x?}> and bucket: <{:x?}>", collection_hash, bucket_hash ); let collection_bucket_path = Self::path( StoreFSTPathMode::Permanent, collection_hash, Some(bucket_hash), ); if collection_bucket_path.exists() { // Open graph at path for collection // Notice: this is unsafe, as loaded memory is a memory-mapped file, that cannot be \ // guaranteed not to be muted while we own a read handle to it. Though, we use \ // higher-level locking mechanisms on all callers of this method, so we are safe. unsafe { FSTSet::from_path(collection_bucket_path) } } else { // FST does not exist on disk, generate an empty FST for now; until a consolidation \ // task occurs and populates the on-disk-FST. let empty_iter: Vec<&str> = Vec::new(); FSTSet::from_iter(empty_iter) } } fn close(collection_hash: StoreFSTAtom, bucket_hash: StoreFSTAtom) { debug!( "closing finite-state transducer graph for collection: <{:x?}> and bucket: <{:x?}>", collection_hash, bucket_hash ); let bucket_target = StoreFSTKey::from_atom(collection_hash, bucket_hash); GRAPH_POOL.write().unwrap().remove(&bucket_target); GRAPH_CONSOLIDATE.write().unwrap().remove(&bucket_target); } fn path( mode: StoreFSTPathMode, collection_hash: StoreFSTAtom, bucket_hash: Option, ) -> PathBuf { let mut final_path = APP_CONF .store .fst .path .join(format!("{:x?}", collection_hash)); if let Some(bucket_hash) = bucket_hash { final_path = final_path.join(format!("{:x?}{}", bucket_hash, mode.extension())); } final_path } } impl StoreGenericBuilder for StoreFSTBuilder { fn build(pool_key: StoreFSTKey) -> Result { Self::open(pool_key.collection_hash, pool_key.bucket_hash) .map(|graph| { let now = SystemTime::now(); StoreFST { graph, target: pool_key, pending: StoreFSTPending::default(), last_used: Arc::new(RwLock::new(now)), last_consolidated: Arc::new(RwLock::new(now)), } }) .map_err(|err| { error!("failed opening fst: {}", err); }) } } impl StoreFST { pub fn cardinality(&self) -> usize { 
self.graph.len() } pub fn as_stream(&self) -> FSTStream<'_, AlwaysMatch> { self.graph.into_stream() } pub fn lookup_begins(&self, word: &str) -> Result, ()> { // Notice: this regex maps over an unicode range, for speed reasons at scale. \ // We found out that the 'match any' syntax ('.*') was super-slow. Using the restrictive \ // syntax below divided the cost of eg. a search query by 2. The regex below has been \ // found out to be nearly zero-cost to compile and execute, for whatever reason. // Regex format: '{escaped_word}([{unicode_range}]*)' let mut regex_str = regex_escape(word); regex_str.push('('); let write_result = LexerRegexRange::from(word) .unwrap_or_default() .write_to(&mut regex_str); regex_str.push_str("*)"); // Regex write failed? (this should not happen) if let Err(err) = write_result { error!( "could not lookup word in fst via 'begins': {} because regex write failed: {}", word, err ); return Err(()); } // Proceed word lookup debug!( "looking-up word in fst via 'begins': {} with regex: {}", word, regex_str ); if let Ok(regex) = Regex::new(®ex_str) { Ok(self.graph.search(regex).into_stream()) } else { Err(()) } } pub fn lookup_typos( &self, word: &str, max_factor: Option, ) -> Result, ()> { // Allow more typos in word as the word gets longer, up to a maximum limit let mut typo_factor = match word.len() { 1 | 2 | 3 => 0, 4 | 5 | 6 => 1, 7 | 8 | 9 => 2, _ => 3, }; // Cap typo factor to set maximum? if let Some(max_factor) = max_factor { if typo_factor > max_factor { typo_factor = max_factor; } } debug!( "looking-up word in fst via 'typos': {} with typo factor: {}", word, typo_factor ); if let Ok(fuzzy) = Levenshtein::new(word, typo_factor) { Ok(self.graph.search(fuzzy).into_stream()) } else { Err(()) } } pub fn should_consolidate(&self) { // Check if not already scheduled if !GRAPH_CONSOLIDATE.read().unwrap().contains(&self.target) { // Schedule target for next consolidation tick (ie. 
collection + bucket tuple) GRAPH_CONSOLIDATE.write().unwrap().insert(self.target); // Bump 'last consolidated' time, effectively de-bouncing consolidation to a fixed \ // and predictable tick time in the future. let mut last_consolidated_value = self.last_consolidated.write().unwrap(); *last_consolidated_value = SystemTime::now(); // Perform an early drop of the lock (frees up write lock early) drop(last_consolidated_value); info!("graph consolidation scheduled on pool key: {}", self.target); } else { debug!( "graph consolidation already scheduled on pool key: {}", self.target ); } } } impl StoreGeneric for StoreFST { fn ref_last_used(&self) -> &RwLock { &self.last_used } } impl StoreFSTActionBuilder { pub fn access(store: StoreFSTBox) -> StoreFSTAction { Self::build(store) } pub fn erase<'a, T: Into<&'a str>>(collection: T, bucket: Option) -> Result { Self::dispatch_erase("fst", collection, bucket) } fn build(store: StoreFSTBox) -> StoreFSTAction { StoreFSTAction { store } } } impl StoreGenericActionBuilder for StoreFSTActionBuilder { fn proceed_erase_collection(collection_str: &str) -> Result { let path_mode = StoreFSTPathMode::Permanent; let collection_atom = StoreKeyerHasher::to_compact(collection_str); let collection_path = StoreFSTBuilder::path(path_mode, collection_atom, None); // Force a FST graph close (on all contained buckets) // Notice: we first need to scan for opened buckets in-memory, as not all FSTs may be \ // committed to disk; thus some FST stores that exist in-memory may not exist on-disk. 
let mut bucket_atoms: Vec = Vec::new(); { let graph_pool_read = GRAPH_POOL.read().unwrap(); for target_key in graph_pool_read.keys() { if target_key.collection_hash == collection_atom { bucket_atoms.push(target_key.bucket_hash); } } } if !bucket_atoms.is_empty() { debug!( "will force-close {} fst buckets for collection: {}", bucket_atoms.len(), collection_str ); let (mut graph_pool_write, mut graph_consolidate_write) = ( GRAPH_POOL.write().unwrap(), GRAPH_CONSOLIDATE.write().unwrap(), ); for bucket_atom in bucket_atoms { debug!( "fst bucket graph force close for bucket: {}/<{:x?}>", collection_str, bucket_atom ); let bucket_target = StoreFSTKey::from_atom(collection_atom, bucket_atom); graph_pool_write.remove(&bucket_target); graph_consolidate_write.remove(&bucket_target); } } // Remove all FSTs on-disk if collection_path.exists() { debug!( "fst collection store exists, erasing: {}/* at path: {:?}", collection_str, &collection_path ); // Remove FST graph storage from filesystem let erase_result = fs::remove_dir_all(&collection_path); if erase_result.is_ok() { debug!("done with fst collection erasure"); Ok(1) } else { Err(()) } } else { debug!( "fst collection store does not exist, consider already erased: {}/* at path: {:?}", collection_str, &collection_path ); Ok(0) } } fn proceed_erase_bucket(collection_str: &str, bucket_str: &str) -> Result { debug!( "sub-erase on fst bucket: {} for collection: {}", bucket_str, collection_str ); let (collection_atom, bucket_atom) = ( StoreKeyerHasher::to_compact(collection_str), StoreKeyerHasher::to_compact(bucket_str), ); let bucket_path = StoreFSTBuilder::path( StoreFSTPathMode::Permanent, collection_atom, Some(bucket_atom), ); // Force a FST graph close StoreFSTBuilder::close(collection_atom, bucket_atom); // Remove FST on-disk if bucket_path.exists() { debug!( "fst bucket graph exists, erasing: {}/{} at path: {:?}", collection_str, bucket_str, &bucket_path ); // Remove FST graph storage from filesystem let erase_result = 
fs::remove_file(&bucket_path); if erase_result.is_ok() { debug!("done with fst bucket erasure"); Ok(1) } else { Err(()) } } else { debug!( "fst bucket graph does not exist, consider already erased: {}/{} at path: {:?}", collection_str, bucket_str, &bucket_path ); Ok(0) } } } impl StoreFSTAction { pub fn push_word(&self, word: &str) -> bool { // Word over limit? (abort, the FST does not perform well over large words) if Self::word_over_limit(word) { return false; } let word_bytes = word.as_bytes(); // Nuke word from 'pop' set? (void a previous un-consolidated commit) if self.store.pending.pop.read().unwrap().contains(word_bytes) { self.store.pending.pop.write().unwrap().remove(word_bytes); } // Add word in 'push' set? (only if word is not in FST) // Notice: also check whether FST is over limits or not from there, to avoid stacking \ // words that could never be consolidated to final FST anyway. let graph_fst = self.store.graph.as_fst(); if !self.store.graph.contains(&word) && !self.store.pending.push.read().unwrap().contains(word_bytes) && self.store.pending.push.read().unwrap().len() < APP_CONF.store.fst.graph.max_words && !StoreFSTMisc::check_over_limits(graph_fst.size(), graph_fst.len()) { self.store .pending .push .write() .unwrap() .insert(word_bytes.to_vec()); self.store.should_consolidate(); // Pushed true } else { // Not pushed false } } pub fn pop_word(&self, word: &str) -> bool { // Word over limit? (abort, the FST does not perform well over large words) if Self::word_over_limit(word) { return false; } let word_bytes = word.as_bytes(); // Nuke word from 'push' set? (void a previous un-consolidated commit) if self.store.pending.push.read().unwrap().contains(word_bytes) { self.store.pending.push.write().unwrap().remove(word_bytes); } // Add word in 'pop' set? 
(only if word is in FST) if self.store.graph.contains(word_bytes) && !self.store.pending.pop.read().unwrap().contains(word_bytes) { self.store .pending .pop .write() .unwrap() .insert(word_bytes.to_vec()); self.store.should_consolidate(); // Popped true } else { // Not popped false } } pub fn suggest_words( &self, from_word: &str, limit: usize, max_typo_factor: Option, ) -> Option> { // Word over limit? (abort, the FST does not perform well over large words) if Self::word_over_limit(from_word) { return None; } let mut found_words = Vec::with_capacity(limit); // Try to complete provided word if let Ok(stream) = self.store.lookup_begins(from_word) { debug!("looking up for word: {} in 'begins' fst stream", from_word); Self::find_words_stream(stream, &mut found_words, limit); } // Try to fuzzy-suggest other words? (eg. correct typos) if found_words.len() < limit { if let Ok(stream) = self.store.lookup_typos(from_word, max_typo_factor) { debug!("looking up for word: {} in 'typos' fst stream", from_word); Self::find_words_stream(stream, &mut found_words, limit); } } if !found_words.is_empty() { Some(found_words) } else { None } } pub fn list_words(&self, limit: usize, offset: usize) -> Result, ()> { let stream = self.store.as_stream(); // Enumerate words from FST stream match stream .into_strs() .map(|words| words.into_iter().skip(offset).take(limit).collect()) { Err(err) => { debug!("conversion of stream failed: {}", err.to_string()); Err(()) } Ok(words) => Ok(words), } } pub fn count_words(&self) -> usize { self.store.cardinality() } fn word_over_limit(word: &str) -> bool { if word.len() > WORD_LIMIT_LENGTH { debug!("got over-limit fst word: {}", word); true } else { false } } fn find_words_stream( mut stream: FSTStream, found_words: &mut Vec, limit: usize, ) { while let Some(word) = stream.next() { if let Ok(word_str) = str::from_utf8(word) { let word_string = word_str.to_string(); if !found_words.contains(&word_string) { found_words.push(word_string); // Requested 
limit reached? Stop there. if found_words.len() >= limit { break; } } } } } } impl StoreFSTMisc { pub fn count_collection_buckets<'a, T: Into<&'a str>>(collection: T) -> Result { let mut count = 0; let path_mode = StoreFSTPathMode::Permanent; let collection_atom = StoreKeyerHasher::to_compact(collection.into()); let collection_path = StoreFSTBuilder::path(path_mode, collection_atom, None); if collection_path.exists() { // Scan collection directory for contained buckets (count them) if let Ok(entries) = fs::read_dir(&collection_path) { let fst_extension = path_mode.extension(); let fst_extension_len = fst_extension.len(); for entry in entries.flatten() { if let Some(entry_name) = entry.file_name().to_str() { let entry_name_len = entry_name.len(); // FST file found? This is a bucket. if entry_name_len > fst_extension_len && entry_name.ends_with(fst_extension) { count += 1; } } } } else { error!("failed reading directory for count: {:?}", collection_path); return Err(()); } } Ok(count) } fn check_over_limits(bytes_count: usize, words_count: usize) -> bool { // Over bytes limit? let max_size = APP_CONF.store.fst.graph.max_size * 1024; if bytes_count >= max_size { info!( "fst has exceeded maximum allowed bytes: {} over limit: {}", bytes_count, max_size ); return true; } // Over words limit? 
if words_count >= APP_CONF.store.fst.graph.max_words { info!( "fst has exceeded maximum allowed words: {} over limit: {}", words_count, APP_CONF.store.fst.graph.max_words ); return true; } // Not over limit false } } impl StoreFSTKey { pub fn from_atom(collection_hash: StoreFSTAtom, bucket_hash: StoreFSTAtom) -> StoreFSTKey { StoreFSTKey { collection_hash, bucket_hash, } } pub fn from_str(collection_str: &str, bucket_str: &str) -> StoreFSTKey { StoreFSTKey { collection_hash: StoreKeyerHasher::to_compact(collection_str), bucket_hash: StoreKeyerHasher::to_compact(bucket_str), } } } impl fmt::Display for StoreFSTKey { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "<{:x?}>/<{:x?}>", self.collection_hash, self.bucket_hash) } } #[cfg(test)] mod tests { use super::*; #[test] fn it_acquires_graph() { assert!(StoreFSTPool::acquire("c:test:1", "b:test:1").is_ok()); } #[test] fn it_janitors_graph() { StoreFSTPool::janitor(); } #[test] fn it_proceeds_primitives() { let store = StoreFSTPool::acquire("c:test:2", "b:test:2").unwrap(); assert!(store.lookup_typos("valerien", None).is_ok()); } } ================================================ FILE: src/store/generic.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) use core::cmp::Eq; use core::hash::Hash; use hashbrown::HashMap; use std::fmt::Display; use std::sync::{Arc, RwLock}; use std::time::{Duration, SystemTime}; pub trait StoreGeneric { fn ref_last_used(&self) -> &RwLock; } pub trait StoreGenericPool< K: Hash + Eq + Copy + Display, S: StoreGeneric, B: StoreGenericBuilder, > { fn proceed_acquire_cache( kind: &str, collection_str: &str, pool_key: K, store: &Arc, ) -> Result, ()> { debug!( "{} store acquired from pool for collection: {} (pool key: {})", kind, collection_str, pool_key ); // Bump store last used date (avoids early janitor eviction) let mut 
last_used_value = store.ref_last_used().write().unwrap(); *last_used_value = SystemTime::now(); // Perform an early drop of the lock (frees up write lock early) drop(last_used_value); Ok(store.clone()) } fn proceed_acquire_open( kind: &str, collection_str: &str, pool_key: K, pool: &Arc>>>, ) -> Result, ()> { match B::build(pool_key) { Ok(store) => { // Acquire a thread-safe store pool reference in write mode let mut store_pool_write = pool.write().unwrap(); let store_box = Arc::new(store); store_pool_write.insert(pool_key, store_box.clone()); debug!( "opened and cached {} store in pool for collection: {} (pool key: {})", kind, collection_str, pool_key ); Ok(store_box) } Err(_) => { error!( "failed opening {} store for collection: {} (pool key: {})", kind, collection_str, pool_key ); Err(()) } } } fn proceed_janitor( kind: &str, pool: &Arc>>>, inactive_after: u64, access_lock: &Arc>, ) { debug!("scanning for {} store pool items to janitor", kind); // Acquire access lock (in blocking write mode), and reference it in context // Notice: this prevents store to be acquired from any context let _access = access_lock.write().unwrap(); let mut removal_register: Vec = Vec::new(); for (collection_bucket, store) in pool.read().unwrap().iter() { // Important: be lenient with system clock going back to a past duration, since \ // we may be running in a virtualized environment where clock is not guaranteed \ // to be monotonic. This is done to avoid poisoning associated mutexes by \ // crashing on unwrap(). 
let last_used_elapsed = store .ref_last_used() .read() .unwrap() .elapsed() .unwrap_or_else(|err| { error!( "store pool item: {} last used duration clock issue, zeroing: {}", collection_bucket, err ); // Assuming a zero seconds fallback duration Duration::from_secs(0) }) .as_secs(); if last_used_elapsed >= inactive_after { debug!( "found expired {} store pool item: {}; elapsed time: {}s", kind, collection_bucket, last_used_elapsed ); // Notice: the bucket value needs to be cloned, as we cannot reference as value \ // that will outlive referenced value once we remove it from its owner set. removal_register.push(*collection_bucket); } else { debug!( "found non-expired {} store pool item: {}; elapsed time: {}s", kind, collection_bucket, last_used_elapsed ); } } if !removal_register.is_empty() { let mut store_pool_write = pool.write().unwrap(); for collection_bucket in &removal_register { store_pool_write.remove(collection_bucket); } } info!( "done scanning for {} store pool items to janitor, expired {} items, now has {} items", kind, removal_register.len(), pool.read().unwrap().len() ); } } pub trait StoreGenericBuilder { fn build(pool_key: K) -> Result; } pub trait StoreGenericActionBuilder { fn proceed_erase_collection(collection_str: &str) -> Result; fn proceed_erase_bucket(collection_str: &str, bucket_str: &str) -> Result; fn dispatch_erase<'a, T: Into<&'a str>>( kind: &str, collection: T, bucket: Option, ) -> Result { let collection_str = collection.into(); info!("{} erase requested on collection: {}", kind, collection_str); if let Some(bucket) = bucket { Self::proceed_erase_bucket(collection_str, bucket.into()) } else { Self::proceed_erase_collection(collection_str) } } } ================================================ FILE: src/store/identifiers.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) use 
std::hash::Hasher; use twox_hash::XxHash32; pub type StoreObjectIID = u32; pub type StoreObjectOID<'a> = &'a str; pub type StoreTermHashed = u32; pub struct StoreTermHash; pub enum StoreMetaKey { IIDIncr, } pub enum StoreMetaValue { IIDIncr(StoreObjectIID), } impl StoreMetaKey { pub fn as_u32(&self) -> u32 { match self { StoreMetaKey::IIDIncr => 0, } } } impl StoreTermHash { pub fn from(term: &str) -> StoreTermHashed { let mut hasher = XxHash32::with_seed(0); hasher.write(term.as_bytes()); hasher.finish() as u32 } } #[cfg(test)] mod tests { use super::*; #[test] fn it_converts_meta_key_to_u32() { assert_eq!(StoreMetaKey::IIDIncr.as_u32(), 0); } #[test] fn it_hashes_term() { assert_eq!(StoreTermHash::from("hash:1"), 3637660813); assert_eq!(StoreTermHash::from("hash:2"), 3577985381); } } ================================================ FILE: src/store/item.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub struct StoreItemBuilder; #[derive(PartialEq, Debug)] pub struct StoreItem<'a>( pub StoreItemPart<'a>, pub Option>, pub Option>, ); #[derive(Copy, Clone, PartialEq, Debug)] pub struct StoreItemPart<'a>(&'a str); // TODO: Change variant names #[allow(clippy::enum_variant_names)] #[derive(PartialEq, Debug)] pub enum StoreItemError { InvalidCollection, InvalidBucket, InvalidObject, } const STORE_ITEM_PART_LEN_MIN: usize = 0; const STORE_ITEM_PART_LEN_MAX: usize = 128; impl<'a> StoreItemPart<'a> { pub fn from_str(part: &'a str) -> Result { let len = part.len(); if len > STORE_ITEM_PART_LEN_MIN && len <= STORE_ITEM_PART_LEN_MAX && part.chars().all(|character| character.is_ascii()) { Ok(StoreItemPart(part)) } else { Err(()) } } pub fn as_str(&self) -> &'a str { self.0 } } impl<'a> From> for &'a str { fn from(part: StoreItemPart<'a>) -> Self { part.as_str() } } impl StoreItemBuilder { pub fn from_depth_1(collection: 
&str) -> Result, StoreItemError> { // Validate & box collection if let Ok(collection_item) = StoreItemPart::from_str(collection) { Ok(StoreItem(collection_item, None, None)) } else { Err(StoreItemError::InvalidCollection) } } pub fn from_depth_2<'a>( collection: &'a str, bucket: &'a str, ) -> Result, StoreItemError> { // Validate & box collection + bucket match ( StoreItemPart::from_str(collection), StoreItemPart::from_str(bucket), ) { (Ok(collection_item), Ok(bucket_item)) => { Ok(StoreItem(collection_item, Some(bucket_item), None)) } (Err(_), _) => Err(StoreItemError::InvalidCollection), (_, Err(_)) => Err(StoreItemError::InvalidBucket), } } pub fn from_depth_3<'a>( collection: &'a str, bucket: &'a str, object: &'a str, ) -> Result, StoreItemError> { // Validate & box collection + bucket + object match ( StoreItemPart::from_str(collection), StoreItemPart::from_str(bucket), StoreItemPart::from_str(object), ) { (Ok(collection_item), Ok(bucket_item), Ok(object_item)) => Ok(StoreItem( collection_item, Some(bucket_item), Some(object_item), )), (Err(_), _, _) => Err(StoreItemError::InvalidCollection), (_, Err(_), _) => Err(StoreItemError::InvalidBucket), (_, _, Err(_)) => Err(StoreItemError::InvalidObject), } } } #[cfg(test)] mod tests { use super::*; #[test] fn it_builds_store_item_depth_1() { assert_eq!( StoreItemBuilder::from_depth_1("c:test:1"), Ok(StoreItem(StoreItemPart("c:test:1"), None, None)) ); assert_eq!( StoreItemBuilder::from_depth_1(""), Err(StoreItemError::InvalidCollection) ); } #[test] fn it_builds_store_item_depth_2() { assert_eq!( StoreItemBuilder::from_depth_2("c:test:2", "b:test:2"), Ok(StoreItem( StoreItemPart("c:test:2"), Some(StoreItemPart("b:test:2")), None )) ); assert_eq!( StoreItemBuilder::from_depth_2("", "b:test:2"), Err(StoreItemError::InvalidCollection) ); assert_eq!( StoreItemBuilder::from_depth_2("c:test:2", ""), Err(StoreItemError::InvalidBucket) ); } #[test] fn it_builds_store_item_depth_3() { assert_eq!( 
StoreItemBuilder::from_depth_3("c:test:3", "b:test:3", "o:test:3"),
            Ok(StoreItem(
                StoreItemPart("c:test:3"),
                Some(StoreItemPart("b:test:3")),
                Some(StoreItemPart("o:test:3"))
            ))
        );
        assert_eq!(
            StoreItemBuilder::from_depth_3("", "b:test:3", "o:test:3"),
            Err(StoreItemError::InvalidCollection)
        );
        assert_eq!(
            StoreItemBuilder::from_depth_3("c:test:3", "", "o:test:3"),
            Err(StoreItemError::InvalidBucket)
        );
        assert_eq!(
            StoreItemBuilder::from_depth_3("c:test:3", "b:test:3", ""),
            Err(StoreItemError::InvalidObject)
        );
    }
}

================================================
FILE: src/store/keyer.rs
================================================
// Sonic
//
// Fast, lightweight and schema-less search backend
// Copyright: 2019, Valerian Saliou
// License: Mozilla Public License v2.0 (MPL v2.0)

use byteorder::{ByteOrder, LittleEndian, ReadBytesExt};
use std::fmt;
use std::hash::Hasher;
use std::io::Cursor;
use twox_hash::XxHash32;

use super::identifiers::*;

pub struct StoreKeyerBuilder;

pub struct StoreKeyer {
    key: StoreKeyerKey,
}

pub struct StoreKeyerHasher;

enum StoreKeyerIdx<'a> {
    MetaToValue(&'a StoreMetaKey),
    TermToIIDs(StoreTermHashed),
    OIDToIID(StoreObjectOID<'a>),
    IIDToOID(StoreObjectIID),
    IIDToTerms(StoreObjectIID),
}

// Key format: [idx<1B> | bucket<4B> | route<4B>]
pub type StoreKeyerKey = [u8; 9];

// Prefix format: [idx<1B> | bucket<4B>]
pub type StoreKeyerPrefix = [u8; 5];

impl<'a> StoreKeyerIdx<'a> {
    /// Maps the index kind to the byte stored first in the key.
    pub fn to_index(&self) -> u8 {
        match self {
            StoreKeyerIdx::MetaToValue(_) => 0,
            StoreKeyerIdx::TermToIIDs(_) => 1,
            StoreKeyerIdx::OIDToIID(_) => 2,
            StoreKeyerIdx::IIDToOID(_) => 3,
            StoreKeyerIdx::IIDToTerms(_) => 4,
        }
    }
}

impl StoreKeyerBuilder {
    pub fn meta_to_value<'a>(bucket: &'a str, meta: &'a StoreMetaKey) -> StoreKeyer {
        Self::make(StoreKeyerIdx::MetaToValue(meta), bucket)
    }

    pub fn term_to_iids(bucket: &str, term_hash: StoreTermHashed) -> StoreKeyer {
        Self::make(StoreKeyerIdx::TermToIIDs(term_hash), bucket)
    }

    pub fn oid_to_iid<'a>(bucket: &'a str, oid: StoreObjectOID<'a>) -> StoreKeyer {
        Self::make(StoreKeyerIdx::OIDToIID(oid), bucket)
    }

    pub fn iid_to_oid(bucket: &str, iid: StoreObjectIID) -> StoreKeyer {
        Self::make(StoreKeyerIdx::IIDToOID(iid), bucket)
    }

    pub fn iid_to_terms(bucket: &str, iid: StoreObjectIID) -> StoreKeyer {
        Self::make(StoreKeyerIdx::IIDToTerms(iid), bucket)
    }

    /// Wraps the binary key built for the index + bucket in a keyer.
    fn make<'a>(idx: StoreKeyerIdx<'a>, bucket: &'a str) -> StoreKeyer {
        let key = Self::build_key(idx, bucket);

        StoreKeyer { key }
    }

    /// Builds the binary key for the index + bucket.
    ///
    /// The bucket is stored as its compact hash, and the route as its compact value; both \
    ///   are written as little-endian 32-bit integers.
    fn build_key<'a>(idx: StoreKeyerIdx<'a>, bucket: &'a str) -> StoreKeyerKey {
        // Key format: [idx<1B> | bucket<4B> | route<4B>]
        let mut key: StoreKeyerKey = [0; 9];

        // [idx<1B>]
        key[0] = idx.to_index();

        // [bucket<4B>] (encoded from u32 to array of u8, ie. binary)
        LittleEndian::write_u32(&mut key[1..5], StoreKeyerHasher::to_compact(bucket));

        // [route<4B>] (encoded from u32 to array of u8, ie. binary)
        LittleEndian::write_u32(&mut key[5..9], Self::route_to_compact(&idx));

        key
    }

    /// Reduces the index route to a 32-bit value: meta keys map to their numeric \
    ///   identifier, OIDs are hashed, and numeric routes are used as-is.
    fn route_to_compact(idx: &StoreKeyerIdx) -> u32 {
        match idx {
            StoreKeyerIdx::MetaToValue(route) => route.as_u32(),
            StoreKeyerIdx::OIDToIID(route) => StoreKeyerHasher::to_compact(route),
            StoreKeyerIdx::TermToIIDs(route)
            | StoreKeyerIdx::IIDToOID(route)
            | StoreKeyerIdx::IIDToTerms(route) => *route,
        }
    }
}

impl StoreKeyer {
    /// Returns a copy of the full binary key.
    pub fn as_bytes(&self) -> StoreKeyerKey {
        self.key
    }

    /// Returns the leading bytes of the key, shared by all keys of a same index + bucket.
    pub fn as_prefix(&self) -> StoreKeyerPrefix {
        // Prefix format: [idx<1B> | bucket<4B>]
        let mut prefix: StoreKeyerPrefix = [0; 5];

        prefix.copy_from_slice(&self.key[..5]);

        prefix
    }
}

impl StoreKeyerHasher {
    #![allow(clippy::wrong_self_convention)]

    /// Hashes a key part to a compact 32-bit value (XxHash32, seeded with zero).
    pub fn to_compact(part: &str) -> u32 {
        let mut hasher = XxHash32::with_seed(0);

        hasher.write(part.as_bytes());

        // Notice: `finish()` is narrowed to 32 bits with a cast
        hasher.finish() as u32
    }
}

impl fmt::Display for StoreKeyer {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // Convert to number
        // Notice: the byte order type argument of `read_u32` was missing from the reviewed \
        //   text; it is restored to little-endian, ie. the byte order used when building keys.
        let key_bucket = Cursor::new(&self.key[1..5])
            .read_u32::<LittleEndian>()
            .unwrap_or(0);
        let key_route = Cursor::new(&self.key[5..9])
            .read_u32::<LittleEndian>()
            .unwrap_or(0);

        write!(
            f,
            "'{}:{:x?}:{:x?}' {:?}",
            self.key[0], key_bucket, key_route, self.key
        )
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn it_keys_meta_to_value() {
        assert_eq!(
            StoreKeyerBuilder::meta_to_value("bucket:1", &StoreMetaKey::IIDIncr).as_bytes(),
            [0, 108, 244, 29, 93, 0, 0, 0, 0]
        );
    }

    #[test]
    fn it_keys_term_to_iids() {
        assert_eq!(
            StoreKeyerBuilder::term_to_iids("bucket:2", 772137347).as_bytes(),
            [1, 50, 220, 166, 65, 131, 225, 5, 46]
        );
        assert_eq!(
            StoreKeyerBuilder::term_to_iids("bucket:2", 3582484684).as_bytes(),
            [1, 50, 220, 166, 65, 204, 96, 136, 213]
        );
    }

    #[test]
    fn it_keys_oid_to_iid() {
        assert_eq!(
            StoreKeyerBuilder::oid_to_iid("bucket:3", &"conversation:6501e83a".to_string())
                .as_bytes(),
            [2, 171, 194, 213, 57, 31, 156, 118, 213]
        );
    }

    #[test]
    fn it_keys_iid_to_oid() {
        assert_eq!(
            StoreKeyerBuilder::iid_to_oid("bucket:4", 10292198).as_bytes(),
            [3, 105, 12, 54, 147, 230, 11, 157, 0]
        );
    }

    #[test]
    fn it_keys_iid_to_terms() {
        assert_eq!(
            StoreKeyerBuilder::iid_to_terms("bucket:5", 1).as_bytes(),
            [4, 137, 142, 73, 67, 1, 0, 0, 0]
        );
        assert_eq!(
            StoreKeyerBuilder::iid_to_terms("bucket:5", 20).as_bytes(),
            [4, 137, 142, 73, 67, 20, 0, 0, 0]
        );
    }

    #[test]
    fn it_hashes_compact() {
        assert_eq!(StoreKeyerHasher::to_compact("key:1"), 3370353088);
        assert_eq!(StoreKeyerHasher::to_compact("key:2"), 1042559698);
    }

    #[test]
    fn it_formats_key() {
        assert_eq!(
            &format!("{}", StoreKeyerBuilder::term_to_iids("bucket:6", 72137347)),
            "'1:71198b49:44cba83' [1, 73, 139, 25, 113, 131, 186, 76, 4]"
        );
        assert_eq!(
            &format!(
                "{}",
                StoreKeyerBuilder::meta_to_value("bucket:6", &StoreMetaKey::IIDIncr)
            ),
            "'0:71198b49:0' [0, 73, 139, 25, 113, 0, 0, 0, 0]"
        );
    }
}

#[cfg(all(feature = "benchmark", test))]
mod benches {
    extern crate test;

    use super::*;
    use test::Bencher;

    #[bench]
    fn bench_hash_compact_short(b: &mut Bencher) {
        b.iter(|| StoreKeyerHasher::to_compact("key:bench:1"));
    }

    #[bench]
    fn bench_hash_compact_long(b: &mut
Bencher) { b.iter(|| { StoreKeyerHasher::to_compact( "key:bench:2:long:long:long:long:long:long:long:long:long:long:long:long:long:long", ) }); } #[bench] fn bench_key_meta_to_value(b: &mut Bencher) { b.iter(|| StoreKeyerBuilder::meta_to_value("bucket:bench:1", &StoreMetaKey::IIDIncr)); } #[bench] fn bench_key_term_to_iids(b: &mut Bencher) { b.iter(|| StoreKeyerBuilder::term_to_iids("bucket:bench:2", 772137347)); } #[bench] fn bench_key_oid_to_iid(b: &mut Bencher) { let key = "conversation:6501e83a".to_string(); b.iter(|| StoreKeyerBuilder::oid_to_iid("bucket:bench:3", &key)); } #[bench] fn bench_key_iid_to_oid(b: &mut Bencher) { b.iter(|| StoreKeyerBuilder::iid_to_oid("bucket:bench:4", 10292198)); } #[bench] fn bench_key_iid_to_terms(b: &mut Bencher) { b.iter(|| StoreKeyerBuilder::iid_to_terms("bucket:bench:5", 1)); } } ================================================ FILE: src/store/kv.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) use byteorder::{ByteOrder, LittleEndian, ReadBytesExt}; use hashbrown::HashMap; use radix::RadixNum; use rocksdb::backup::{ BackupEngine as DBBackupEngine, BackupEngineOptions as DBBackupEngineOptions, RestoreOptions as DBRestoreOptions, }; use rocksdb::{ DBCompactionStyle, DBCompressionType, Env as DBEnv, Error as DBError, FlushOptions, Options as DBOptions, WriteBatch, WriteOptions, DB, }; use std::fmt; use std::fs; use std::io::{self, Cursor}; use std::path::{Path, PathBuf}; use std::str; use std::sync::{Arc, Mutex, RwLock}; use std::thread; use std::time::{Duration, SystemTime}; use std::vec::Drain; use super::generic::{ StoreGeneric, StoreGenericActionBuilder, StoreGenericBuilder, StoreGenericPool, }; use super::identifiers::*; use super::item::StoreItemPart; use super::keyer::{StoreKeyerBuilder, StoreKeyerHasher, StoreKeyerKey, StoreKeyerPrefix}; use crate::APP_CONF; pub 
struct StoreKVPool; pub struct StoreKVBuilder; pub struct StoreKV { database: DB, last_used: Arc>, last_flushed: Arc>, pub lock: RwLock, } pub struct StoreKVActionBuilder; pub struct StoreKVAction<'a> { store: Option, bucket: StoreItemPart<'a>, } #[derive(PartialEq, Eq, Hash, Clone, Copy)] pub struct StoreKVKey { collection_hash: StoreKVAtom, } #[derive(PartialEq)] pub enum StoreKVAcquireMode { Any, OpenOnly, } type StoreKVAtom = u32; type StoreKVBox = Arc; const ATOM_HASH_RADIX: usize = 16; lazy_static! { pub static ref STORE_ACCESS_LOCK: Arc> = Arc::new(RwLock::new(false)); static ref STORE_ACQUIRE_LOCK: Arc> = Arc::new(Mutex::new(())); static ref STORE_FLUSH_LOCK: Arc> = Arc::new(Mutex::new(())); static ref STORE_POOL: Arc>> = Arc::new(RwLock::new(HashMap::new())); } impl StoreKVPool { pub fn count() -> usize { STORE_POOL.read().unwrap().len() } pub fn acquire<'a, T: Into<&'a str>>( mode: StoreKVAcquireMode, collection: T, ) -> Result, ()> { let collection_str = collection.into(); let pool_key = StoreKVKey::from_str(collection_str); // Freeze acquire lock, and reference it in context // Notice: this prevents two databases on the same collection to be opened at the same time. let _acquire = STORE_ACQUIRE_LOCK.lock().unwrap(); // Acquire a thread-safe store pool reference in read mode let store_pool_read = STORE_POOL.read().unwrap(); if let Some(store_kv) = store_pool_read.get(&pool_key) { Self::proceed_acquire_cache("kv", collection_str, pool_key, store_kv).map(Some) } else { info!( "kv store not in pool for collection: {} {}, opening it", collection_str, pool_key ); // Important: we need to drop the read reference first, to avoid \ // dead-locking when acquiring the RWLock in write mode in this block. drop(store_pool_read); // Check if can open database? let can_open_db = if mode == StoreKVAcquireMode::OpenOnly { StoreKVBuilder::path(pool_key.collection_hash).exists() } else { true }; // Open KV database? (ie. 
we do not need to create a new KV database file tree if \ // the database does not exist yet on disk and we are just looking to read data from \ // it) if can_open_db { Self::proceed_acquire_open("kv", collection_str, pool_key, &*STORE_POOL).map(Some) } else { Ok(None) } } } pub fn janitor() { Self::proceed_janitor( "kv", &*STORE_POOL, APP_CONF.store.kv.pool.inactive_after, &*STORE_ACCESS_LOCK, ) } pub fn backup(path: &Path) -> Result<(), io::Error> { debug!("backing up all kv stores to path: {:?}", path); // Create backup directory (full path) fs::create_dir_all(path)?; // Proceed dump action (backup) Self::dump_action("backup", &*APP_CONF.store.kv.path, path, &Self::backup_item) } pub fn restore(path: &Path) -> Result<(), io::Error> { debug!("restoring all kv stores from path: {:?}", path); // Proceed dump action (restore) Self::dump_action( "restore", path, &*APP_CONF.store.kv.path, &Self::restore_item, ) } pub fn flush(force: bool) { debug!("scanning for kv store pool items to flush to disk"); // Acquire flush lock, and reference it in context // Notice: this prevents two flush operations to be executed at the same time. let _flush = STORE_FLUSH_LOCK.lock().unwrap(); // Step 1: List keys to be flushed let mut keys_flush: Vec = Vec::new(); { // Acquire access lock (in blocking write mode), and reference it in context // Notice: this prevents store to be acquired from any context let _access = STORE_ACCESS_LOCK.write().unwrap(); let store_pool_read = STORE_POOL.read().unwrap(); for (key, store) in &*store_pool_read { // Important: be lenient with system clock going back to a past duration, since \ // we may be running in a virtualized environment where clock is not guaranteed \ // to be monotonic. This is done to avoid poisoning associated mutexes by \ // crashing on unwrap(). 
let not_flushed_for = store .last_flushed .read() .unwrap() .elapsed() .unwrap_or_else(|err| { error!( "kv key: {} last flush duration clock issue, zeroing: {}", key, err ); // Assuming a zero seconds fallback duration Duration::from_secs(0) }) .as_secs(); if force || not_flushed_for >= APP_CONF.store.kv.database.flush_after { info!( "kv key: {} not flushed for: {} seconds, may flush", key, not_flushed_for ); keys_flush.push(*key); } else { debug!( "kv key: {} not flushed for: {} seconds, no flush", key, not_flushed_for ); } } } // Exit trap: Nothing to flush yet? Abort there. if keys_flush.is_empty() { info!("no kv store pool items need to be flushed at the moment"); return; } // Step 2: Flush KVs, one-by-one (sequential locking; this avoids global locks) let mut count_flushed = 0; { for key in &keys_flush { { // Acquire access lock (in blocking write mode), and reference it in context // Notice: this prevents store to be acquired from any context let _access = STORE_ACCESS_LOCK.write().unwrap(); if let Some(store) = STORE_POOL.read().unwrap().get(key) { debug!("kv key: {} flush started", key); if let Err(err) = store.flush() { error!("kv key: {} flush failed: {}", key, err); } else { count_flushed += 1; debug!("kv key: {} flush complete", key); } // Bump 'last flushed' time *store.last_flushed.write().unwrap() = SystemTime::now(); } } // Give a bit of time to other threads before continuing thread::yield_now(); } } info!( "done scanning for kv store pool items to flush to disk (flushed: {})", count_flushed ); } fn dump_action( action: &str, read_path: &Path, write_path: &Path, fn_item: &dyn Fn(&Path, &Path, &str) -> Result<(), io::Error>, ) -> Result<(), io::Error> { // Iterate on KV collections for collection in fs::read_dir(read_path)? { let collection = collection?; // Actual collection found? 
if let (Ok(collection_file_type), Some(collection_name)) = (collection.file_type(), collection.file_name().to_str()) { if collection_file_type.is_dir() { debug!("kv collection ongoing {}: {}", action, collection_name); fn_item(write_path, &collection.path(), collection_name)?; } } } Ok(()) } fn backup_item( backup_path: &Path, _origin_path: &Path, collection_name: &str, ) -> Result<(), io::Error> { // Acquire access lock (in blocking write mode), and reference it in context // Notice: this prevents store to be acquired from any context let _access = STORE_ACCESS_LOCK.write().unwrap(); // Generate path to KV backup let kv_backup_path = backup_path.join(collection_name); debug!( "kv collection: {} backing up to path: {:?}", collection_name, kv_backup_path ); // Erase any previously-existing KV backup if kv_backup_path.exists() { fs::remove_dir_all(&kv_backup_path)?; } // Create backup folder for collection fs::create_dir_all(backup_path.join(collection_name))?; // Convert names to hashes (as names are hashes encoded as base-16 strings, but we need \ // them as proper integers) if let Ok(collection_radix) = RadixNum::from_str(collection_name, ATOM_HASH_RADIX) { if let Ok(collection_hash) = collection_radix.as_decimal() { let origin_kv = StoreKVBuilder::open(collection_hash as StoreKVAtom) .map_err(|_| io_error!("database open failure"))?; // Initialize KV database backup engine let kv_backup_options = DBBackupEngineOptions::new(&kv_backup_path) .map_err(|_| io_error!("backup engine options acquire failure"))?; let kv_backup_environment = DBEnv::new() .map_err(|_| io_error!("backup engine environment acquire failure"))?; let mut kv_backup_engine = DBBackupEngine::open(&kv_backup_options, &kv_backup_environment) .map_err(|_| io_error!("backup engine failure"))?; // Proceed actual KV database backup kv_backup_engine .create_new_backup(&origin_kv) .map_err(|_| io_error!("database backup failure"))?; info!( "kv collection: {} backed up to path: {:?}", collection_name, 
kv_backup_path ); } } Ok(()) } fn restore_item( _backup_path: &Path, origin_path: &Path, collection_name: &str, ) -> Result<(), io::Error> { // Acquire access lock (in blocking write mode), and reference it in context // Notice: this prevents store to be acquired from any context let _access = STORE_ACCESS_LOCK.write().unwrap(); debug!( "kv collection: {} restoring from path: {:?}", collection_name, origin_path ); // Convert names to hashes (as names are hashes encoded as base-16 strings, but we need \ // them as proper integers) if let Ok(collection_radix) = RadixNum::from_str(collection_name, ATOM_HASH_RADIX) { if let Ok(collection_hash) = collection_radix.as_decimal() { // Force a KV store close StoreKVBuilder::close(collection_hash as StoreKVAtom); // Generate path to KV let kv_path = StoreKVBuilder::path(collection_hash as StoreKVAtom); // Remove existing KV database data? if kv_path.exists() { fs::remove_dir_all(&kv_path)?; } // Create KV folder for collection fs::create_dir_all(&kv_path)?; // Initialize KV database backup engine let kv_backup_options = DBBackupEngineOptions::new(&origin_path) .map_err(|_| io_error!("backup engine options acquire failure"))?; let kv_backup_environment = DBEnv::new() .map_err(|_| io_error!("backup engine environment acquire failure"))?; let mut kv_backup_engine = DBBackupEngine::open(&kv_backup_options, &kv_backup_environment) .map_err(|_| io_error!("backup engine failure"))?; kv_backup_engine .restore_from_latest_backup(&kv_path, &kv_path, &DBRestoreOptions::default()) .map_err(|_| io_error!("database restore failure"))?; info!( "kv collection: {} restored to path: {:?} from backup: {:?}", collection_name, kv_path, origin_path ); } } Ok(()) } } impl StoreGenericPool for StoreKVPool {} impl StoreKVBuilder { fn open(collection_hash: StoreKVAtom) -> Result { debug!( "opening key-value database for collection: <{:x?}>", collection_hash ); // Configure database options let db_options = Self::configure(); // Open database at path 
for collection DB::open(&db_options, Self::path(collection_hash)) } fn close(collection_hash: StoreKVAtom) { debug!( "closing key-value database for collection: <{:x?}>", collection_hash ); let mut store_pool_write = STORE_POOL.write().unwrap(); let collection_target = StoreKVKey::from_atom(collection_hash); store_pool_write.remove(&collection_target); } fn path(collection_hash: StoreKVAtom) -> PathBuf { APP_CONF .store .kv .path .join(format!("{:x?}", collection_hash)) } fn configure() -> DBOptions { debug!("configuring key-value database"); // Make database options let mut db_options = DBOptions::default(); // Set static options db_options.create_if_missing(true); db_options.set_use_fsync(false); db_options.set_compaction_style(DBCompactionStyle::Level); db_options.set_min_write_buffer_number(1); db_options.set_max_write_buffer_number(2); // Set dynamic options db_options.set_compression_type(if APP_CONF.store.kv.database.compress { DBCompressionType::Zstd } else { DBCompressionType::None }); db_options.set_max_open_files(if let Some(value) = APP_CONF.store.kv.database.max_files { value as i32 } else { -1 }); db_options.increase_parallelism(APP_CONF.store.kv.database.parallelism as i32); db_options.set_max_subcompactions(APP_CONF.store.kv.database.max_compactions as u32); db_options.set_max_background_jobs( (APP_CONF.store.kv.database.max_compactions + APP_CONF.store.kv.database.max_flushes) as i32, ); db_options.set_write_buffer_size(APP_CONF.store.kv.database.write_buffer * 1024); db_options } } impl StoreGenericBuilder for StoreKVBuilder { fn build(pool_key: StoreKVKey) -> Result { Self::open(pool_key.collection_hash) .map(|db| { let now = SystemTime::now(); StoreKV { database: db, last_used: Arc::new(RwLock::new(now)), last_flushed: Arc::new(RwLock::new(now)), lock: RwLock::new(false), } }) .map_err(|err| { error!("failed opening kv: {}", err); }) } } impl StoreKV { pub fn get(&self, key: &[u8]) -> Result>, DBError> { self.database.get(key) } pub fn 
put(&self, key: &[u8], data: &[u8]) -> Result<(), DBError> { let mut batch = WriteBatch::default(); batch.put(key, data); self.do_write(batch) } pub fn delete(&self, key: &[u8]) -> Result<(), DBError> { let mut batch = WriteBatch::default(); batch.delete(key); self.do_write(batch) } fn flush(&self) -> Result<(), DBError> { // Generate flush options let mut flush_options = FlushOptions::default(); flush_options.set_wait(true); // Perform flush (in blocking mode) self.database.flush_opt(&flush_options) } fn do_write(&self, batch: WriteBatch) -> Result<(), DBError> { // Configure this write let mut write_options = WriteOptions::default(); // WAL disabled? if !APP_CONF.store.kv.database.write_ahead_log { debug!("ignoring wal for kv write"); write_options.disable_wal(true); } else { debug!("using wal for kv write"); write_options.disable_wal(false); } // Commit this write self.database.write_opt(batch, &write_options) } } impl StoreGeneric for StoreKV { fn ref_last_used(&self) -> &RwLock { &self.last_used } } impl StoreKVActionBuilder { pub fn access(bucket: StoreItemPart, store: Option) -> StoreKVAction { Self::build(bucket, store) } pub fn erase<'a, T: Into<&'a str>>(collection: T, bucket: Option) -> Result { Self::dispatch_erase("kv", collection, bucket) } fn build(bucket: StoreItemPart, store: Option) -> StoreKVAction { StoreKVAction { store, bucket } } } impl StoreGenericActionBuilder for StoreKVActionBuilder { fn proceed_erase_collection(collection_str: &str) -> Result { let collection_atom = StoreKeyerHasher::to_compact(collection_str); let collection_path = StoreKVBuilder::path(collection_atom); // Force a KV store close StoreKVBuilder::close(collection_atom); if collection_path.exists() { debug!( "kv collection store exists, erasing: {}/* at path: {:?}", collection_str, &collection_path ); // Remove KV store storage from filesystem let erase_result = fs::remove_dir_all(&collection_path); if erase_result.is_ok() { debug!("done with kv collection erasure"); Ok(1) 
} else { Err(()) } } else { debug!( "kv collection store does not exist, consider already erased: {}/* at path: {:?}", collection_str, &collection_path ); Ok(0) } } fn proceed_erase_bucket(_collection: &str, _bucket: &str) -> Result { // This one is not implemented, as we need to acquire the collection; which would cause \ // a party-killer dead-lock. Err(()) } } impl<'a> StoreKVAction<'a> { /// Meta-to-Value mapper /// /// [IDX=0] ((meta)) ~> ((value)) pub fn get_meta_to_value(&self, meta: StoreMetaKey) -> Result, ()> { if let Some(ref store) = self.store { let store_key = StoreKeyerBuilder::meta_to_value(self.bucket.as_str(), &meta); debug!("store get meta-to-value: {}", store_key); match store.get(&store_key.as_bytes()) { Ok(Some(value)) => { debug!("got meta-to-value: {}", store_key); Ok(if let Ok(value) = str::from_utf8(&value) { match meta { StoreMetaKey::IIDIncr => value .parse::() .ok() .map(StoreMetaValue::IIDIncr) .or(None), } } else { None }) } Ok(None) => { debug!("no meta-to-value found: {}", store_key); Ok(None) } Err(err) => { error!( "error getting meta-to-value: {} with trace: {}", store_key, err ); Err(()) } } } else { Ok(None) } } pub fn set_meta_to_value(&self, meta: StoreMetaKey, value: StoreMetaValue) -> Result<(), ()> { if let Some(ref store) = self.store { let store_key = StoreKeyerBuilder::meta_to_value(self.bucket.as_str(), &meta); debug!("store set meta-to-value: {}", store_key); let value_string = match value { StoreMetaValue::IIDIncr(iid_incr) => iid_incr.to_string(), }; store .put(&store_key.as_bytes(), value_string.as_bytes()) .or(Err(())) } else { Err(()) } } /// Term-to-IIDs mapper /// /// [IDX=1] ((term)) ~> [((iid))] pub fn get_term_to_iids( &self, term_hashed: StoreTermHashed, ) -> Result>, ()> { if let Some(ref store) = self.store { let store_key = StoreKeyerBuilder::term_to_iids(self.bucket.as_str(), term_hashed); debug!("store get term-to-iids: {}", store_key); match store.get(&store_key.as_bytes()) { Ok(Some(value)) => { 
debug!( "got term-to-iids: {} with encoded value: {:?}", store_key, &*value ); Self::decode_u32_list(&*value) .or(Err(())) .map(|value_decoded| { debug!( "got term-to-iids: {} with decoded value: {:?}", store_key, &value_decoded ); Some(value_decoded) }) } Ok(None) => { debug!("no term-to-iids found: {}", store_key); Ok(None) } Err(err) => { error!( "error getting term-to-iids: {} with trace: {}", store_key, err ); Err(()) } } } else { Ok(None) } } pub fn set_term_to_iids( &self, term_hashed: StoreTermHashed, iids: &[StoreObjectIID], ) -> Result<(), ()> { if let Some(ref store) = self.store { let store_key = StoreKeyerBuilder::term_to_iids(self.bucket.as_str(), term_hashed); debug!("store set term-to-iids: {}", store_key); // Encode IID list into storage serialized format let iids_encoded = Self::encode_u32_list(iids); debug!( "store set term-to-iids: {} with encoded value: {:?}", store_key, iids_encoded ); store.put(&store_key.as_bytes(), &iids_encoded).or(Err(())) } else { Err(()) } } pub fn delete_term_to_iids(&self, term_hashed: StoreTermHashed) -> Result<(), ()> { if let Some(ref store) = self.store { let store_key = StoreKeyerBuilder::term_to_iids(self.bucket.as_str(), term_hashed); debug!("store delete term-to-iids: {}", store_key); store.delete(&store_key.as_bytes()).or(Err(())) } else { Err(()) } } /// OID-to-IID mapper /// /// [IDX=2] ((oid)) ~> ((iid)) pub fn get_oid_to_iid(&self, oid: StoreObjectOID<'a>) -> Result, ()> { if let Some(ref store) = self.store { let store_key = StoreKeyerBuilder::oid_to_iid(self.bucket.as_str(), oid); debug!("store get oid-to-iid: {}", store_key); match store.get(&store_key.as_bytes()) { Ok(Some(value)) => { debug!( "got oid-to-iid: {} with encoded value: {:?}", store_key, &*value ); Self::decode_u32(&*value).or(Err(())).map(|value_decoded| { debug!( "got oid-to-iid: {} with decoded value: {:?}", store_key, &value_decoded ); Some(value_decoded) }) } Ok(None) => { debug!("no oid-to-iid found: {}", store_key); Ok(None) } 
Err(err) => { error!( "error getting oid-to-iid: {} with trace: {}", store_key, err ); Err(()) } } } else { Ok(None) } } pub fn set_oid_to_iid(&self, oid: StoreObjectOID<'a>, iid: StoreObjectIID) -> Result<(), ()> { if let Some(ref store) = self.store { let store_key = StoreKeyerBuilder::oid_to_iid(self.bucket.as_str(), oid); debug!("store set oid-to-iid: {}", store_key); // Encode IID let iid_encoded = Self::encode_u32(iid); debug!( "store set oid-to-iid: {} with encoded value: {:?}", store_key, iid_encoded ); store.put(&store_key.as_bytes(), &iid_encoded).or(Err(())) } else { Err(()) } } pub fn delete_oid_to_iid(&self, oid: StoreObjectOID<'a>) -> Result<(), ()> { if let Some(ref store) = self.store { let store_key = StoreKeyerBuilder::oid_to_iid(self.bucket.as_str(), oid); debug!("store delete oid-to-iid: {}", store_key); store.delete(&store_key.as_bytes()).or(Err(())) } else { Err(()) } } /// IID-to-OID mapper /// /// [IDX=3] ((iid)) ~> ((oid)) pub fn get_iid_to_oid(&self, iid: StoreObjectIID) -> Result, ()> { if let Some(ref store) = self.store { let store_key = StoreKeyerBuilder::iid_to_oid(self.bucket.as_str(), iid); debug!("store get iid-to-oid: {}", store_key); match store.get(&store_key.as_bytes()) { Ok(Some(value)) => Ok(str::from_utf8(&value).ok().map(|value| value.to_string())), Ok(None) => Ok(None), Err(_) => Err(()), } } else { Ok(None) } } pub fn set_iid_to_oid(&self, iid: StoreObjectIID, oid: StoreObjectOID<'a>) -> Result<(), ()> { if let Some(ref store) = self.store { let store_key = StoreKeyerBuilder::iid_to_oid(self.bucket.as_str(), iid); debug!("store set iid-to-oid: {}", store_key); store.put(&store_key.as_bytes(), oid.as_bytes()).or(Err(())) } else { Err(()) } } pub fn delete_iid_to_oid(&self, iid: StoreObjectIID) -> Result<(), ()> { if let Some(ref store) = self.store { let store_key = StoreKeyerBuilder::iid_to_oid(self.bucket.as_str(), iid); debug!("store delete iid-to-oid: {}", store_key); store.delete(&store_key.as_bytes()).or(Err(())) } 
else { Err(()) } } /// IID-to-Terms mapper /// /// [IDX=4] ((iid)) ~> [((term))] pub fn get_iid_to_terms( &self, iid: StoreObjectIID, ) -> Result>, ()> { if let Some(ref store) = self.store { let store_key = StoreKeyerBuilder::iid_to_terms(self.bucket.as_str(), iid); debug!("store get iid-to-terms: {}", store_key); match store.get(&store_key.as_bytes()) { Ok(Some(value)) => { debug!( "got iid-to-terms: {} with encoded value: {:?}", store_key, &*value ); Self::decode_u32_list(&*value) .or(Err(())) .map(|value_decoded| { debug!( "got iid-to-terms: {} with decoded value: {:?}", store_key, &value_decoded ); if !value_decoded.is_empty() { Some(value_decoded) } else { None } }) } Ok(None) => Ok(None), Err(_) => Err(()), } } else { Ok(None) } } pub fn set_iid_to_terms( &self, iid: StoreObjectIID, terms_hashed: &[StoreTermHashed], ) -> Result<(), ()> { if let Some(ref store) = self.store { let store_key = StoreKeyerBuilder::iid_to_terms(self.bucket.as_str(), iid); debug!("store set iid-to-terms: {}", store_key); // Encode term list into storage serialized format let terms_hashed_encoded = Self::encode_u32_list(terms_hashed); debug!( "store set iid-to-terms: {} with encoded value: {:?}", store_key, terms_hashed_encoded ); store .put(&store_key.as_bytes(), &terms_hashed_encoded) .or(Err(())) } else { Err(()) } } pub fn delete_iid_to_terms(&self, iid: StoreObjectIID) -> Result<(), ()> { if let Some(ref store) = self.store { let store_key = StoreKeyerBuilder::iid_to_terms(self.bucket.as_str(), iid); debug!("store delete iid-to-terms: {}", store_key); store.delete(&store_key.as_bytes()).or(Err(())) } else { Err(()) } } pub fn batch_flush_bucket( &self, iid: StoreObjectIID, oid: StoreObjectOID<'a>, iid_terms_hashed: &[StoreTermHashed], ) -> Result { let mut count = 0; debug!( "store batch flush bucket: {} with hashed terms: {:?}", iid, iid_terms_hashed ); // Delete OID <> IID association match ( self.delete_oid_to_iid(oid), self.delete_iid_to_oid(iid), 
self.delete_iid_to_terms(iid), ) { (Ok(_), Ok(_), Ok(_)) => { // Delete IID from each associated term for iid_term in iid_terms_hashed { if let Ok(Some(mut iid_term_iids)) = self.get_term_to_iids(*iid_term) { if iid_term_iids.contains(&iid) { count += 1; // Remove IID from list of IIDs iid_term_iids.retain(|cur_iid| cur_iid != &iid); } let is_ok = if iid_term_iids.is_empty() { self.delete_term_to_iids(*iid_term).is_ok() } else { self.set_term_to_iids(*iid_term, &iid_term_iids).is_ok() }; if !is_ok { return Err(()); } } } Ok(count) } _ => Err(()), } } pub fn batch_truncate_object( &self, term_hashed: StoreTermHashed, term_iids_drain: Drain, ) -> Result { let mut count = 0; for term_iid_drain in term_iids_drain { debug!("store batch truncate object iid: {}", term_iid_drain); // Nuke term in IID to Terms list if let Ok(Some(mut term_iid_drain_terms)) = self.get_iid_to_terms(term_iid_drain) { count += 1; term_iid_drain_terms.retain(|cur_term| cur_term != &term_hashed); // IID to Terms list is empty? Flush whole object. 
if term_iid_drain_terms.is_empty() { // Acquire OID for this drained IID if let Ok(Some(term_iid_drain_oid)) = self.get_iid_to_oid(term_iid_drain) { if self .batch_flush_bucket(term_iid_drain, &term_iid_drain_oid, &Vec::new()) .is_err() { error!( "failed executing store batch truncate object batch-flush-bucket" ); } } else { error!("failed getting store batch truncate object iid-to-oid"); } } else { // Update IID to Terms list if self .set_iid_to_terms(term_iid_drain, &term_iid_drain_terms) .is_err() { error!("failed setting store batch truncate object iid-to-terms"); } } } } Ok(count) } pub fn batch_erase_bucket(&self) -> Result { if let Some(ref store) = self.store { // Generate all key prefix values (with dummy post-prefix values; we dont care) let (k_meta_to_value, k_term_to_iids, k_oid_to_iid, k_iid_to_oid, k_iid_to_terms) = ( StoreKeyerBuilder::meta_to_value(self.bucket.as_str(), &StoreMetaKey::IIDIncr), StoreKeyerBuilder::term_to_iids(self.bucket.as_str(), 0), StoreKeyerBuilder::oid_to_iid(self.bucket.as_str(), &String::new()), StoreKeyerBuilder::iid_to_oid(self.bucket.as_str(), 0), StoreKeyerBuilder::iid_to_terms(self.bucket.as_str(), 0), ); let key_prefixes: [StoreKeyerPrefix; 5] = [ k_meta_to_value.as_prefix(), k_term_to_iids.as_prefix(), k_oid_to_iid.as_prefix(), k_iid_to_oid.as_prefix(), k_iid_to_terms.as_prefix(), ]; // Scan all keys per-prefix and nuke them right away for key_prefix in &key_prefixes { debug!( "store batch erase bucket: {} for prefix: {:?}", self.bucket.as_str(), key_prefix ); // Generate start and end prefix for batch delete (in other words, the minimum \ // key value possible, and the highest key value possible) let key_prefix_start: StoreKeyerKey = [ key_prefix[0], key_prefix[1], key_prefix[2], key_prefix[3], key_prefix[4], 0, 0, 0, 0, ]; let key_prefix_end: StoreKeyerKey = [ key_prefix[0], key_prefix[1], key_prefix[2], key_prefix[3], key_prefix[4], 255, 255, 255, 255, ]; // Batch-delete keys matching range let mut batch = 
WriteBatch::default(); batch.delete_range(&key_prefix_start, &key_prefix_end); // Commit operation to database if let Err(err) = store.do_write(batch) { error!( "failed in store batch erase bucket: {} with error: {}", self.bucket.as_str(), err ); } else { // Ensure last key is deleted (as RocksDB end key is exclusive; while \ // start key is inclusive, we need to ensure the end-of-range key is \ // deleted) store.delete(&key_prefix_end).ok(); debug!( "succeeded in store batch erase bucket: {}", self.bucket.as_str() ); } } info!( "done processing store batch erase bucket: {}", self.bucket.as_str() ); Ok(1) } else { Err(()) } } fn encode_u32(decoded: u32) -> [u8; 4] { let mut encoded = [0; 4]; LittleEndian::write_u32(&mut encoded, decoded); encoded } fn decode_u32(encoded: &[u8]) -> Result { Cursor::new(encoded).read_u32::().or(Err(())) } fn encode_u32_list(decoded: &[u32]) -> Vec { // Pre-reserve required capacity as to avoid heap resizes (50% performance gain relative \ // to initializing this with a zero-capacity) let mut encoded = Vec::with_capacity(decoded.len() * 4); for decoded_item in decoded { encoded.extend(&Self::encode_u32(*decoded_item)) } encoded } fn decode_u32_list(encoded: &[u8]) -> Result, ()> { // Pre-reserve required capacity as to avoid heap resizes (50% performance gain relative \ // to initializing this with a zero-capacity) let mut decoded = Vec::with_capacity(encoded.len() / 4); for encoded_chunk in encoded.chunks(4) { if let Ok(decoded_chunk) = Self::decode_u32(encoded_chunk) { decoded.push(decoded_chunk); } else { return Err(()); } } Ok(decoded) } } impl StoreKVKey { pub fn from_atom(collection_hash: StoreKVAtom) -> StoreKVKey { StoreKVKey { collection_hash } } pub fn from_str(collection_str: &str) -> StoreKVKey { StoreKVKey { collection_hash: StoreKeyerHasher::to_compact(collection_str), } } } impl fmt::Display for StoreKVKey { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { write!(f, "<{:x?}>", self.collection_hash) } } 
// Unit tests for the key-value store: pool acquisition, janitor, raw store primitives, \
//   action mappers, and the u32 encoders/decoders.
// Notice: the pool and action tests go through `StoreKVPool::acquire` in `Any` mode, which \
//   may open a database for the test collections.
#[cfg(test)]
mod tests {
    use super::*;

    // Acquiring a store in `Any` mode should succeed
    #[test]
    fn it_acquires_database() {
        assert!(StoreKVPool::acquire(StoreKVAcquireMode::Any, "c:test:1").is_ok());
    }

    // Smoke test: only checks that a janitor pass completes
    #[test]
    fn it_janitors_database() {
        StoreKVPool::janitor();
    }

    // Raw get / put / delete on the same key, in that order
    #[test]
    fn it_proceeds_primitives() {
        let store = StoreKVPool::acquire(StoreKVAcquireMode::Any, "c:test:2")
            .unwrap()
            .unwrap();

        assert!(store.get(&[0]).is_ok());
        assert!(store.put(&[0], &[1, 0, 0, 0]).is_ok());
        assert!(store.delete(&[0]).is_ok());
    }

    // Exercises each mapper (get, set, then delete) through an action bound to a bucket; \
    //   only checks that every call returns `Ok`, not the values that come back.
    #[test]
    fn it_proceeds_actions() {
        let store = StoreKVPool::acquire(StoreKVAcquireMode::Any, "c:test:3").unwrap();
        let action =
            StoreKVActionBuilder::access(StoreItemPart::from_str("b:test:3").unwrap(), store);

        // Meta-to-Value
        assert!(action.get_meta_to_value(StoreMetaKey::IIDIncr).is_ok());
        assert!(action
            .set_meta_to_value(StoreMetaKey::IIDIncr, StoreMetaValue::IIDIncr(1))
            .is_ok());

        // Term-to-IIDs
        assert!(action.get_term_to_iids(1).is_ok());
        assert!(action.set_term_to_iids(1, &[0, 1, 2]).is_ok());
        assert!(action.delete_term_to_iids(1).is_ok());

        // OID-to-IID
        assert!(action.get_oid_to_iid(&"s".to_string()).is_ok());
        assert!(action.set_oid_to_iid(&"s".to_string(), 4).is_ok());
        assert!(action.delete_oid_to_iid(&"s".to_string()).is_ok());

        // IID-to-OID
        assert!(action.get_iid_to_oid(4).is_ok());
        assert!(action.set_iid_to_oid(4, &"s".to_string()).is_ok());
        assert!(action.delete_iid_to_oid(4).is_ok());

        // IID-to-Terms
        assert!(action.get_iid_to_terms(4).is_ok());
        assert!(action.set_iid_to_terms(4, &[45402]).is_ok());
        assert!(action.delete_iid_to_terms(4).is_ok());
    }

    // A u32 is encoded on 4 bytes, least significant byte first
    #[test]
    fn it_encodes_atom() {
        assert_eq!(StoreKVAction::encode_u32(0), [0, 0, 0, 0]);
        assert_eq!(StoreKVAction::encode_u32(1), [1, 0, 0, 0]);
        assert_eq!(StoreKVAction::encode_u32(45402), [90, 177, 0, 0]);
    }

    // Decoding is the inverse of the encoding checked above
    #[test]
    fn it_decodes_atom() {
        assert_eq!(StoreKVAction::decode_u32(&[0, 0, 0, 0]), Ok(0));
        assert_eq!(StoreKVAction::decode_u32(&[1, 0, 0, 0]), Ok(1));
        assert_eq!(StoreKVAction::decode_u32(&[90, 177, 0, 0]), Ok(45402));
    }

    // A list is encoded as the concatenation of its encoded items
    #[test]
    fn it_encodes_atom_list() {
        assert_eq!(
            StoreKVAction::encode_u32_list(&[0, 2, 3]),
            [0, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0]
        );
        assert_eq!(StoreKVAction::encode_u32_list(&[45402]), [90, 177, 0, 0]);
    }

    // A list is decoded 4 bytes at a time
    #[test]
    fn it_decodes_atom_list() {
        assert_eq!(
            StoreKVAction::decode_u32_list(&[0, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0]),
            Ok(vec![0, 2, 3])
        );
        assert_eq!(
            StoreKVAction::decode_u32_list(&[90, 177, 0, 0]),
            Ok(vec![45402])
        );
    }
}
io_error { ($error:expr) => { io::Error::new(io::ErrorKind::Other, $error) }; } ================================================ FILE: src/store/mod.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) #[macro_use] mod macros; mod generic; mod keyer; pub mod fst; pub mod identifiers; pub mod item; pub mod kv; pub mod operation; ================================================ FILE: src/store/operation.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) use crate::executor::count::ExecutorCount; use crate::executor::flushb::ExecutorFlushB; use crate::executor::flushc::ExecutorFlushC; use crate::executor::flusho::ExecutorFlushO; use crate::executor::list::ExecutorList; use crate::executor::pop::ExecutorPop; use crate::executor::push::ExecutorPush; use crate::executor::search::ExecutorSearch; use crate::executor::suggest::ExecutorSuggest; use crate::query::actions::Query; pub struct StoreOperationDispatch; impl StoreOperationDispatch { pub fn dispatch(query: Query) -> Result, ()> { // Dispatch de-constructed query to its target executor match query { Query::Search(store, query_id, lexer, limit, offset) => { ExecutorSearch::execute(store, query_id, lexer, limit, offset) .map(|results| results.map(|results| results.join(" "))) } Query::Suggest(store, query_id, lexer, limit) => { ExecutorSuggest::execute(store, query_id, lexer, limit) .map(|results| results.map(|results| results.join(" "))) } Query::List(store, query_id, limit, offset) => { ExecutorList::execute(store, query_id, limit, offset) .map(|results| results.join(" ")) .map(|results| Some(results)) } Query::Push(store, lexer) => ExecutorPush::execute(store, lexer).map(|_| None), Query::Pop(store, lexer) => { 
ExecutorPop::execute(store, lexer).map(|count| Some(count.to_string())) } Query::Count(store) => { ExecutorCount::execute(store).map(|count| Some(count.to_string())) } Query::FlushC(store) => { ExecutorFlushC::execute(store).map(|count| Some(count.to_string())) } Query::FlushB(store) => { ExecutorFlushB::execute(store).map(|count| Some(count.to_string())) } Query::FlushO(store) => { ExecutorFlushO::execute(store).map(|count| Some(count.to_string())) } } } } ================================================ FILE: src/tasker/mod.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) pub mod runtime; pub mod shutdown; ================================================ FILE: src/tasker/runtime.rs ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) use std::thread; use std::time::{Duration, Instant}; use crate::store::fst::StoreFSTPool; use crate::store::kv::StoreKVPool; pub struct TaskerBuilder; pub struct Tasker; const TASKER_TICK_INTERVAL: Duration = Duration::from_secs(10); impl TaskerBuilder { pub fn build() -> Tasker { Tasker {} } } impl Tasker { pub fn run(&self) { info!("tasker is now active"); loop { // Hold for next aggregate run thread::sleep(TASKER_TICK_INTERVAL); debug!("running a tasker tick..."); let tick_start = Instant::now(); Self::tick(); let tick_took = tick_start.elapsed(); info!( "ran tasker tick (took {}s + {}ms)", tick_took.as_secs(), tick_took.subsec_millis() ); } } fn tick() { // Proceed all tick actions // #1: Janitors StoreKVPool::janitor(); StoreFSTPool::janitor(); // #2: Others StoreKVPool::flush(false); StoreFSTPool::consolidate(false); } } ================================================ FILE: src/tasker/shutdown.rs 
================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Valerian Saliou // License: Mozilla Public License v2.0 (MPL v2.0) #[cfg(windows)] mod platform { // Notice: the following module is inspired from a fork of `graceful`, which implements \ // Windows support upon the original `graceful` crate; find the fork at: \ // https://github.com/Git0Shuai/graceful use std::sync::mpsc::{sync_channel, Receiver, SyncSender}; use std::sync::Mutex; use winapi::shared::minwindef::{BOOL, DWORD, TRUE}; use winapi::um::consoleapi::SetConsoleCtrlHandler; lazy_static! { static ref CHANNEL: (SyncSender, Mutex>) = { let channel = sync_channel(0); (channel.0, Mutex::new(channel.1)) }; } unsafe extern "system" fn handler(event: DWORD) -> BOOL { CHANNEL.0.send(event).unwrap(); CHANNEL.0.send(0).unwrap(); TRUE } pub struct ShutdownSignal; impl ShutdownSignal { pub fn new() -> ShutdownSignal { unsafe { SetConsoleCtrlHandler(Some(handler), TRUE) }; ShutdownSignal } pub fn at_exit(&self, handler: F) { let event = { let receiver = CHANNEL.1.lock().unwrap(); receiver.recv().unwrap() }; handler(event as usize); CHANNEL.1.lock().unwrap().recv().unwrap(); } } } #[cfg(unix)] mod platform { // Notice: the following module is inspired from `graceful`, which can be found at: \ // https://github.com/0x1997/graceful use nix::sys::signal::{SigSet, SIGINT, SIGQUIT, SIGTERM}; pub struct ShutdownSignal(SigSet); impl ShutdownSignal { pub fn new() -> ShutdownSignal { let mut mask = SigSet::empty(); ShutdownSignal::init(&mut mask).unwrap(); ShutdownSignal(mask) } fn init(mask: &mut SigSet) -> nix::Result<()> { mask.add(SIGINT); mask.add(SIGQUIT); mask.add(SIGTERM); mask.thread_block() } pub fn at_exit(&self, handler: F) { let signal = self.0.wait().unwrap(); handler(signal as usize); } } } pub use platform::ShutdownSignal; ================================================ FILE: tests/integration/.gitignore 
================================================ instance/data/ runner/node_modules/ ================================================ FILE: tests/integration/instance/config.cfg ================================================ # Sonic # Configuration file (integration tests) [server] log_level = "warn" [channel] inet = "127.0.0.1:1491" auth_password = "password:test" [channel.search] [store] [store.kv] [store.kv.pool] [store.kv.database] [store.fst] [store.fst.pool] [store.fst.graph] ================================================ FILE: tests/integration/runner/package.json ================================================ { "name": "sonic-tests-integration", "description": "Sonic integration tests", "version": "1.0.0", "main": "runner.js", "homepage": "https://github.com/valeriansaliou/sonic", "license": "ISC", "engineStrict": true, "engines": { "node": ">=10.0.0", "npm": ">=6.0.0" }, "scripts": { "test": "echo \"Error: no test specified\" && exit 1" }, "author": { "name": "Nikita Vilunov", "email": "nikitaoryol@gmail.com" }, "dependencies": { "sonic-channel": "^1.2.5" } } ================================================ FILE: tests/integration/runner/runner.js ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Nikita Vilunov // License: Mozilla Public License v2.0 (MPL v2.0) const SonicChannel = require("sonic-channel"); function connect(channel, name) { return new Promise((resolve, reject) => { channel.connect({ connected() { console.info( `=== Sonic Channel succeeded to connect to host (${name}) ===` ); resolve(channel); }, disconnected() { console.error(`=== Sonic Channel is now disconnected (${name}) ===`); }, timeout() { console.error(`=== Sonic Channel connection timed out (${name}) ===`); }, retrying() { console.error(`=== Trying to reconnect to Sonic Channel (${name}) ===`); }, error(error) { console.error( `=== Sonic Channel failed to connect to host (${name}) ===`, error 
); reject(error); } }); }); } async function main(scenario) { let parameters = { host : "localhost", port : 1491, auth : "password:test" }; // Connect to Sonic Channel let search = new SonicChannel.Search(parameters); let ingest = new SonicChannel.Ingest(parameters); await Promise.all([ connect(search, "search"), connect(ingest, "ingest") ]); // Run scenario await scenario(search, ingest); // Close Sonic Channel await Promise.all([ search.close(), ingest.close() ]); } function wrapper(name, scenario, timeout) { console.log(`=== Running test scenario ${name} ===`) timeout = (timeout || 1000); let timer = new Promise((_, reject) => { setTimeout(() => { reject("Timeout reached"); }, timeout); }); let start = process.hrtime(); Promise.race([ main(scenario), timer ]) .then( () => { let end = process.hrtime(start); console.log( `=== Test scenario ${name} succedeed, execution time: ` + `${end[0] + end[1] / 1e9} s ===` ); }, (error) => { let end = process.hrtime(start); console.error( (`=== Test scenario ${name} failed, execution time: ` + `${end[0] + end[1] / 1e9} s ===`), `\nERROR >> ${error}` ); process.exit(-1); } ); } module.exports = wrapper; ================================================ FILE: tests/integration/scenarios/insert.js ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Nikita Vilunov // License: Mozilla Public License v2.0 (MPL v2.0) const expected_documents = { "conversation:1" : ( "Batch normalization is a technique for improving the speed, " + "performance, and stability of artificial neural networks" ), "conversation:2" : ( "This scratch technique is much like the transform in some ways" ) }; const unexpected_documents = { "conversation:3" : "Glissando is a glide from one pitch to another" } async function run(search, ingest) { // Ingest documents for (const key in expected_documents) { await ingest.push("messages", "default", key, expected_documents[key]); } for 
(const key in unexpected_documents) { await ingest.push("messages", "default", key, unexpected_documents[key]); } // Perform search on ingested documents let response = await search.query("messages", "default", "technique"); for (const key in expected_documents) { if (!response.includes(key) === true) { throw `Expected document ${key} was not found`; } } for (const key in unexpected_documents) { if (response.includes(key) === true) { throw `Unexpected document ${key} was returned`; } } } require("../runner/runner.js")( "Insert & Search", run ); ================================================ FILE: tests/integration/scenarios/ping.js ================================================ // Sonic // // Fast, lightweight and schema-less search backend // Copyright: 2019, Nikita Vilunov // License: Mozilla Public License v2.0 (MPL v2.0) async function run(search) { // Perform a ping await search.ping(); } require("../runner/runner.js")( "Ping", run ); ================================================ FILE: tests/integration/scripts/run.sh ================================================ #!/bin/bash ## # Sonic # Fast, lightweight and schema-less search backend # # Copyright: 2019, Nikita Vilunov , \ # 2019, Valerian Saliou # License: Mozilla Public License v2.0 (MPL v2.0) ## ABSPATH=$(cd "$(dirname "$0")"; pwd) TESTSPATH="$ABSPATH/../" STATUS=0 # Run tests pushd "$TESTSPATH" > /dev/null # Install test dependencies from a clean state pushd "./runner/" > /dev/null npm ci popd # Run each test scenario for scenario in $(find ./scenarios/ -name "*.js") do [[ -d ./instance/data/ ]] && rm -r ./instance/data/ # Run sonic from a clean state pushd "./instance/" > /dev/null cargo run -- --config config.cfg & SONIC_PID=$! sleep 2 popd # Run scenario node $scenario [[ $? -eq 0 ]] || STATUS=1 # Stop Sonic kill $SONIC_PID wait $SONIC_PID done [[ -d ./instance/data/ ]] && rm -r ./instance/data/ popd exit $STATUS