gitextract_9bwzq4yg/

├── .github/
│   ├── conda/
│   │   ├── bld.bat
│   │   ├── build.sh
│   │   └── meta.yaml
│   ├── stale.yml
│   └── workflows/
│       ├── CI.yml
│       ├── build_documentation.yml
│       ├── build_pr_documentation.yml
│       ├── delete_doc_comment.yml
│       ├── delete_doc_comment_trigger.yml
│       ├── docs-check.yml
│       ├── node-release.yml
│       ├── node.yml
│       ├── python-release.yml
│       ├── python.yml
│       ├── rust-release.yml
│       ├── rust.yml
│       ├── stale.yml
│       ├── trufflehog.yml
│       └── upload_pr_documentation.yml
├── .gitignore
├── CITATION.cff
├── LICENSE
├── README.md
├── RELEASE.md
├── bindings/
│   ├── node/
│   │   ├── .cargo/
│   │   │   └── config.toml
│   │   ├── .editorconfig
│   │   ├── .eslintrc.yml
│   │   ├── .gitattributes
│   │   ├── .gitignore
│   │   ├── .prettierignore
│   │   ├── .taplo.toml
│   │   ├── .yarn/
│   │   │   └── releases/
│   │   │       └── yarn-3.5.1.cjs
│   │   ├── .yarnrc.yml
│   │   ├── Cargo.toml
│   │   ├── LICENSE
│   │   ├── Makefile
│   │   ├── README.md
│   │   ├── build.rs
│   │   ├── examples/
│   │   │   └── documentation/
│   │   │       ├── pipeline.test.ts
│   │   │       └── quicktour.test.ts
│   │   ├── index.d.ts
│   │   ├── index.js
│   │   ├── jest.config.js
│   │   ├── lib/
│   │   │   └── bindings/
│   │   │       ├── __mocks__/
│   │   │       │   ├── merges.txt
│   │   │       │   ├── vocab.json
│   │   │       │   └── vocab.txt
│   │   │       ├── decoders.test.ts
│   │   │       ├── encoding.test.ts
│   │   │       ├── models.test.ts
│   │   │       ├── normalizers.test.ts
│   │   │       ├── post-processors.test.ts
│   │   │       ├── pre-tokenizers.test.ts
│   │   │       ├── tokenizer.test.ts
│   │   │       └── utils.test.ts
│   │   ├── npm/
│   │   │   ├── android-arm-eabi/
│   │   │   │   ├── README.md
│   │   │   │   └── package.json
│   │   │   ├── android-arm64/
│   │   │   │   ├── README.md
│   │   │   │   └── package.json
│   │   │   ├── darwin-arm64/
│   │   │   │   ├── README.md
│   │   │   │   └── package.json
│   │   │   ├── darwin-x64/
│   │   │   │   ├── README.md
│   │   │   │   └── package.json
│   │   │   ├── freebsd-x64/
│   │   │   │   ├── README.md
│   │   │   │   └── package.json
│   │   │   ├── linux-arm-gnueabihf/
│   │   │   │   ├── README.md
│   │   │   │   └── package.json
│   │   │   ├── linux-arm64-gnu/
│   │   │   │   ├── README.md
│   │   │   │   └── package.json
│   │   │   ├── linux-arm64-musl/
│   │   │   │   ├── README.md
│   │   │   │   └── package.json
│   │   │   ├── linux-x64-gnu/
│   │   │   │   ├── README.md
│   │   │   │   └── package.json
│   │   │   ├── linux-x64-musl/
│   │   │   │   ├── README.md
│   │   │   │   └── package.json
│   │   │   ├── win32-arm64-msvc/
│   │   │   │   ├── README.md
│   │   │   │   └── package.json
│   │   │   ├── win32-ia32-msvc/
│   │   │   │   ├── README.md
│   │   │   │   └── package.json
│   │   │   └── win32-x64-msvc/
│   │   │       ├── README.md
│   │   │       └── package.json
│   │   ├── package.json
│   │   ├── rustfmt.toml
│   │   ├── src/
│   │   │   ├── arc_rwlock_serde.rs
│   │   │   ├── decoders.rs
│   │   │   ├── encoding.rs
│   │   │   ├── lib.rs
│   │   │   ├── models.rs
│   │   │   ├── normalizers.rs
│   │   │   ├── pre_tokenizers.rs
│   │   │   ├── processors.rs
│   │   │   ├── tasks/
│   │   │   │   ├── mod.rs
│   │   │   │   ├── models.rs
│   │   │   │   └── tokenizer.rs
│   │   │   ├── tokenizer.rs
│   │   │   ├── trainers.rs
│   │   │   └── utils.rs
│   │   ├── tsconfig.json
│   │   └── types.ts
│   └── python/
│       ├── .cargo/
│       │   └── config.toml
│       ├── .gitignore
│       ├── CHANGELOG.md
│       ├── Cargo.toml
│       ├── MANIFEST.in
│       ├── Makefile
│       ├── README.md
│       ├── benches/
│       │   └── test_tiktoken.py
│       ├── conftest.py
│       ├── docs/
│       │   └── pyo3.md
│       ├── examples/
│       │   ├── custom_components.py
│       │   ├── example.py
│       │   ├── train_bert_wordpiece.py
│       │   ├── train_bytelevel_bpe.py
│       │   ├── train_with_datasets.py
│       │   └── using_the_visualizer.ipynb
│       ├── py_src/
│       │   └── tokenizers/
│       │       ├── __init__.py
│       │       ├── __init__.pyi
│       │       ├── decoders/
│       │       │   └── __init__.py
│       │       ├── decoders.pyi
│       │       ├── implementations/
│       │       │   ├── __init__.py
│       │       │   ├── base_tokenizer.py
│       │       │   ├── bert_wordpiece.py
│       │       │   ├── byte_level_bpe.py
│       │       │   ├── char_level_bpe.py
│       │       │   ├── sentencepiece_bpe.py
│       │       │   └── sentencepiece_unigram.py
│       │       ├── models/
│       │       │   └── __init__.py
│       │       ├── models.pyi
│       │       ├── normalizers/
│       │       │   └── __init__.py
│       │       ├── normalizers.pyi
│       │       ├── pre_tokenizers/
│       │       │   └── __init__.py
│       │       ├── pre_tokenizers.pyi
│       │       ├── processors/
│       │       │   └── __init__.py
│       │       ├── processors.pyi
│       │       ├── py.typed
│       │       ├── tokenizers.pyi
│       │       ├── tools/
│       │       │   ├── __init__.py
│       │       │   ├── visualizer-styles.css
│       │       │   └── visualizer.py
│       │       ├── trainers/
│       │       │   ├── __init__.py
│       │       │   └── __init__.pyi
│       │       └── trainers.pyi
│       ├── pyproject.toml
│       ├── rust-toolchain
│       ├── scripts/
│       │   ├── convert.py
│       │   ├── sentencepiece_extractor.py
│       │   └── spm_parity_check.py
│       ├── setup.cfg
│       ├── src/
│       │   ├── decoders.rs
│       │   ├── encoding.rs
│       │   ├── error.rs
│       │   ├── lib.rs
│       │   ├── models.rs
│       │   ├── normalizers.rs
│       │   ├── pre_tokenizers.rs
│       │   ├── processors.rs
│       │   ├── token.rs
│       │   ├── tokenizer.rs
│       │   ├── trainers.rs
│       │   └── utils/
│       │       ├── iterators.rs
│       │       ├── mod.rs
│       │       ├── normalization.rs
│       │       ├── pretokenization.rs
│       │       ├── regex.rs
│       │       └── serde_pyo3.rs
│       ├── stub.py
│       ├── test.txt
│       ├── tests/
│       │   ├── __init__.py
│       │   ├── bindings/
│       │   │   ├── __init__.py
│       │   │   ├── test_decoders.py
│       │   │   ├── test_encoding.py
│       │   │   ├── test_models.py
│       │   │   ├── test_normalizers.py
│       │   │   ├── test_pre_tokenizers.py
│       │   │   ├── test_processors.py
│       │   │   ├── test_tokenizer.py
│       │   │   └── test_trainers.py
│       │   ├── documentation/
│       │   │   ├── __init__.py
│       │   │   ├── test_pipeline.py
│       │   │   ├── test_quicktour.py
│       │   │   └── test_tutorial_train_from_iterators.py
│       │   ├── implementations/
│       │   │   ├── __init__.py
│       │   │   ├── test_base_tokenizer.py
│       │   │   ├── test_bert_wordpiece.py
│       │   │   ├── test_byte_level_bpe.py
│       │   │   ├── test_char_bpe.py
│       │   │   └── test_sentencepiece.py
│       │   ├── test_serialization.py
│       │   └── utils.py
│       └── tools/
│           └── stub-gen/
│               ├── Cargo.toml
│               └── src/
│                   └── main.rs
├── docs/
│   ├── Makefile
│   ├── README.md
│   ├── source/
│   │   ├── _ext/
│   │   │   ├── entities.py
│   │   │   ├── rust_doc.py
│   │   │   └── toctree_tags.py
│   │   ├── _static/
│   │   │   ├── css/
│   │   │   │   ├── Calibre-Medium.otf
│   │   │   │   ├── Calibre-Regular.otf
│   │   │   │   ├── Calibre-Thin.otf
│   │   │   │   ├── code-snippets.css
│   │   │   │   └── huggingface.css
│   │   │   └── js/
│   │   │       └── custom.js
│   │   ├── api/
│   │   │   ├── node.inc
│   │   │   ├── python.inc
│   │   │   ├── reference.rst
│   │   │   └── rust.inc
│   │   ├── components.rst
│   │   ├── conf.py
│   │   ├── entities.inc
│   │   ├── index.rst
│   │   ├── installation/
│   │   │   ├── main.rst
│   │   │   ├── node.inc
│   │   │   ├── python.inc
│   │   │   └── rust.inc
│   │   ├── pipeline.rst
│   │   ├── quicktour.rst
│   │   └── tutorials/
│   │       └── python/
│   │           └── training_from_memory.rst
│   └── source-doc-builder/
│       ├── _toctree.yml
│       ├── api/
│       │   ├── added-tokens.mdx
│       │   ├── decoders.mdx
│       │   ├── encode-inputs.mdx
│       │   ├── encoding.mdx
│       │   ├── input-sequences.mdx
│       │   ├── models.mdx
│       │   ├── normalizers.mdx
│       │   ├── post-processors.mdx
│       │   ├── pre-tokenizers.mdx
│       │   ├── tokenizer.mdx
│       │   ├── trainers.mdx
│       │   └── visualizer.mdx
│       ├── components.mdx
│       ├── index.mdx
│       ├── installation.mdx
│       ├── pipeline.mdx
│       ├── quicktour.mdx
│       └── training_from_memory.mdx
└── tokenizers/
    ├── CHANGELOG.md
    ├── Cargo.toml
    ├── Makefile
    ├── README.md
    ├── README.tpl
    ├── benches/
    │   ├── added_vocab_deserialize.rs
    │   ├── bert_benchmark.rs
    │   ├── bpe_benchmark.rs
    │   ├── common/
    │   │   └── mod.rs
    │   ├── layout_benchmark.rs
    │   ├── llama3_benchmark.rs
    │   └── unigram_benchmark.rs
    ├── examples/
    │   ├── encode_batch.rs
    │   ├── serialization.rs
    │   └── unstable_wasm/
    │       ├── .gitignore
    │       ├── Cargo.toml
    │       ├── README.md
    │       ├── src/
    │       │   ├── lib.rs
    │       │   └── utils.rs
    │       ├── tests/
    │       │   └── web.rs
    │       └── www/
    │           ├── .gitignore
    │           ├── .travis.yml
    │           ├── LICENSE-APACHE
    │           ├── LICENSE-MIT
    │           ├── README.md
    │           ├── bootstrap.js
    │           ├── index.html
    │           ├── index.js
    │           ├── package.json
    │           └── webpack.config.js
    ├── rust-toolchain
    ├── src/
    │   ├── decoders/
    │   │   ├── bpe.rs
    │   │   ├── byte_fallback.rs
    │   │   ├── ctc.rs
    │   │   ├── fuse.rs
    │   │   ├── mod.rs
    │   │   ├── sequence.rs
    │   │   ├── strip.rs
    │   │   └── wordpiece.rs
    │   ├── lib.rs
    │   ├── models/
    │   │   ├── bpe/
    │   │   │   ├── mod.rs
    │   │   │   ├── model.rs
    │   │   │   ├── serialization.rs
    │   │   │   ├── trainer.rs
    │   │   │   └── word.rs
    │   │   ├── mod.rs
    │   │   ├── unigram/
    │   │   │   ├── lattice.rs
    │   │   │   ├── mod.rs
    │   │   │   ├── model.rs
    │   │   │   ├── serialization.rs
    │   │   │   ├── trainer.rs
    │   │   │   └── trie.rs
    │   │   ├── wordlevel/
    │   │   │   ├── mod.rs
    │   │   │   ├── serialization.rs
    │   │   │   └── trainer.rs
    │   │   └── wordpiece/
    │   │       ├── mod.rs
    │   │       ├── serialization.rs
    │   │       └── trainer.rs
    │   ├── normalizers/
    │   │   ├── bert.rs
    │   │   ├── byte_level.rs
    │   │   ├── mod.rs
    │   │   ├── precompiled.rs
    │   │   ├── prepend.rs
    │   │   ├── replace.rs
    │   │   ├── strip.rs
    │   │   ├── unicode.rs
    │   │   └── utils.rs
    │   ├── pre_tokenizers/
    │   │   ├── bert.rs
    │   │   ├── byte_level.rs
    │   │   ├── delimiter.rs
    │   │   ├── digits.rs
    │   │   ├── fixed_length.rs
    │   │   ├── metaspace.rs
    │   │   ├── mod.rs
    │   │   ├── punctuation.rs
    │   │   ├── sequence.rs
    │   │   ├── split.rs
    │   │   ├── unicode_scripts/
    │   │   │   ├── mod.rs
    │   │   │   ├── pre_tokenizer.rs
    │   │   │   └── scripts.rs
    │   │   └── whitespace.rs
    │   ├── processors/
    │   │   ├── bert.rs
    │   │   ├── mod.rs
    │   │   ├── roberta.rs
    │   │   ├── sequence.rs
    │   │   └── template.rs
    │   ├── tokenizer/
    │   │   ├── added_vocabulary.rs
    │   │   ├── encoding.rs
    │   │   ├── mod.rs
    │   │   ├── normalizer.rs
    │   │   ├── pattern.rs
    │   │   ├── pre_tokenizer.rs
    │   │   └── serialization.rs
    │   └── utils/
    │       ├── cache.rs
    │       ├── fancy.rs
    │       ├── from_pretrained.rs
    │       ├── iter.rs
    │       ├── mod.rs
    │       ├── onig.rs
    │       ├── padding.rs
    │       ├── parallelism.rs
    │       ├── progress.rs
    │       └── truncation.rs
    └── tests/
        ├── added_tokens.rs
        ├── common/
        │   └── mod.rs
        ├── documentation.rs
        ├── from_pretrained.rs
        ├── offsets.rs
        ├── serialization.rs
        ├── stream.rs
        ├── training.rs
        └── unigram.rs