[
  {
    "path": ".github/workflows/build_wheels.yml",
    "content": "name: Build wheels\n\non: [push, pull_request, workflow_dispatch]\n\nconcurrency:\n  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}\n  cancel-in-progress: true\n\njobs:\n  build_wheels:\n    name: py${{ matrix.python-version }} on ${{ matrix.os }}\n    runs-on: ${{ matrix.os }}\n    timeout-minutes: 60\n    strategy:\n      fail-fast: false\n      matrix:\n        # cibuildwheel builds linux wheels inside a manylinux container\n        # it also takes care of procuring the correct python version for us\n        os: [ubuntu-latest, windows-latest, macos-latest]\n        python-version: [39, 310, 311, 312, 313, 313t, 314, 314t]\n\n    steps:\n      - uses: actions/checkout@v6\n\n      - uses: pypa/cibuildwheel@v3.1.4\n        env:\n          CIBW_BUILD: \"cp${{ matrix.python-version}}-*\"\n          CIBW_ENABLE: cpython-freethreading\n\n      - uses: actions/upload-artifact@v6\n        with:\n          name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }}\n          path: ./wheelhouse/*.whl\n\n  build_wheels_aarch64:\n    name: py${{ matrix.python-version }} on ${{ matrix.os }} (aarch64)\n    runs-on: ${{ matrix.os }}\n    timeout-minutes: 60\n    strategy:\n      fail-fast: false\n      matrix:\n        os: [ubuntu-24.04-arm]\n        python-version: [39, 310, 311, 312, 313, 313t, 314, 314t]\n\n    steps:\n      - uses: actions/checkout@v6\n\n      - name: Build wheels\n        uses: pypa/cibuildwheel@v3.1.4\n        env:\n          CIBW_BUILD: \"cp${{ matrix.python-version}}-*\"\n          CIBW_ARCHS: aarch64\n          CIBW_BUILD_VERBOSITY: 3\n          # https://github.com/rust-lang/cargo/issues/10583\n          CIBW_ENVIRONMENT_LINUX: PATH=\"$PATH:$HOME/.cargo/bin\" CARGO_NET_GIT_FETCH_WITH_CLI=true\n          CIBW_ENABLE: cpython-freethreading\n\n      - uses: actions/upload-artifact@v6\n        with:\n          name: cibw-wheels-aarch64-${{ matrix.os }}-${{ strategy.job-index }}\n          path: 
./wheelhouse/*.whl\n\n  build_sdist:\n    name: sdist\n    runs-on: ubuntu-latest\n    timeout-minutes: 60\n    steps:\n      - uses: actions/checkout@v6\n      - uses: actions/setup-python@v6\n        name: Install Python\n        with:\n          python-version: \"3.9\"\n      - name: Run check-manifest\n        run: |\n          pip install check-manifest\n          check-manifest -v\n      - name: Build sdist\n        run: |\n          pip install --upgrade build\n          python -m build --sdist\n      - uses: actions/upload-artifact@v6\n        with:\n          name: cibw-wheels-${{ matrix.os }}-${{ strategy.job-index }}\n          path: ./dist/*.tar.gz\n\n  join_artifacts:\n    name: Join artifacts\n    runs-on: ubuntu-latest\n    needs: [build_wheels, build_wheels_aarch64, build_sdist]\n    steps:\n     - name: Merge artifacts\n       uses: actions/upload-artifact/merge@v4\n       with:\n         name: cibw-wheels\n         pattern: cibw-wheels-*\n         delete-merged: true\n"
  },
  {
    "path": ".gitignore",
    "content": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\nshare/python-wheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# Environments\n.env\n.venv\n\n# Tools\n.mypy_cache\n.coverage\n.hypothesis\nhtmlcov\n\n# General\n.DS_Store\n\nCargo.lock\ntarget/\n"
  },
  {
    "path": "CHANGELOG.md",
    "content": "# Changelog\n\nThis is the changelog for the open source version of tiktoken.\n\n## [v0.12.0]\n- Build wheels for Python 3.14\n- Build musllinux aarch64 wheels\n- Support for free-threaded Python\n- Update version of `pyo3` and `rustc-hash`\n- Avoid use of `blobfile` for reading local files\n- Recognise `gpt-5` model identifier\n- Minor performance improvement for file reading\n\n## [v0.11.0]\n- Support for `GPT-5`\n- Update version of `pyo3`\n- Use new Rust edition\n- Fix special token handling in `encode_to_numpy`\n- Better error handling\n- Improvements to private APIs\n\n## [v0.10.0]\n- Support for newer models\n- Improvements to private APIs\n\n## [v0.9.0]\n- Support for `o1` and `o3` models\n- Better error messages when loading invalid vocabulary files\n- Support for encoding to numpy arrays\n- Delayed imports when not strictly necessary\n\n## [v0.8.0]\n\n- Support for `o1-` and `chatgpt-4o-` models\n- Build wheels for Python 3.13\n- Add possessive quantifiers to limit backtracking in regular expressions, thanks to @l0rinc!\n- Provide a better error message and type for invalid token decode\n- Permit tuples in type hints\n- Better error message for passing invalid input to `get_encoding`\n- Better error messages during plugin loading\n- Add a `__version__` attribute\n- Update versions of `pyo3`, `regex`, `fancy-regex`\n- Drop support for Python 3.8\n\n## [v0.7.0]\n\n- Support for `gpt-4o`\n- Performance improvements\n\n## [v0.6.0]\n\n- Optimise regular expressions for a 20% performance improvement, thanks to @paplorinc!\n- Add `text-embedding-3-*` models to `encoding_for_model`\n- Check content hash for downloaded files\n- Allow pickling `Encoding` objects. 
Registered `Encoding` will be pickled by reference\n- Workaround PyO3 bug for frozenset conversion\n\nThank you to @paplorinc, @mdwelsh, @Praneet460!\n\n## [v0.5.2]\n\n- Build wheels for Python 3.12\n- Update version of PyO3 to allow multiple imports\n- Avoid permission errors when using default cache logic\n\n## [v0.5.1]\n\n- Add `encoding_name_for_model`, undo some renames to variables that are implementation details\n\n## [v0.5.0]\n\n- Add `tiktoken._educational` submodule to better document how byte pair encoding works\n- Ensure `encoding_for_model` knows about several new models\n- Add `decode_with_offsets`\n- Better error for failures with the plugin mechanism\n- Make more tests public\n- Update versions of dependencies\n\n## [v0.4.0]\n\n- Add `decode_batch` and `decode_bytes_batch`\n- Improve error messages and handling\n\n## [v0.3.3]\n\n- `tiktoken` will now make a best effort attempt to replace surrogate pairs with the corresponding\n  Unicode character and will replace lone surrogates with the Unicode replacement character.\n\n## [v0.3.2]\n\n- Add encoding for GPT-4\n\n## [v0.3.1]\n\n- Build aarch64 wheels\n- Make `blobfile` an optional dependency\n\nThank you to @messense for the environment variable that makes cargo not OOM under emulation!\n\n## [v0.3.0]\n\n- Improve performance by 5-20%; thank you to @nistath!\n- Add `gpt-3.5-turbo` models to `encoding_for_model`\n- Add prefix matching to `encoding_for_model` to better support future model versions\n- Fix a bug in the README instructions on extending tiktoken\n- Update the set of available encodings\n- Add packaging metadata\n\n## [v0.2.0]\n\n- Add `tiktoken.encoding_for_model` to get the encoding for a specific model\n- Improve portability of caching logic\n\nThank you to @fritzo, @arvid220u, @khanhvu207, @henriktorget for various small corrections\n\n## [v0.1.2]\n\n- Avoid use of `blobfile` for public files\n- Add support for Python 3.8\n- Add py.typed\n- Improve the public tests\n\n## [v0.1.1]\n\n- 
Initial release\n"
  },
  {
    "path": "Cargo.toml",
    "content": "[package]\nname = \"tiktoken\"\nversion = \"0.12.0\"\nedition = \"2024\"\n\n[lib]\nname = \"tiktoken\"\ncrate-type = [\"cdylib\", \"rlib\"]\n\n[features]\ndefault = []\npython = [\n    \"pyo3\",\n]\n\n[dependencies]\npyo3 = { version = \"0.27.2\", default-features = false, features = [\n    \"extension-module\",\n    \"macros\",\n], optional = true }\n\n# tiktoken dependencies\nfancy-regex = \"0.17.0\"\nregex = \"1.10.3\"\nrustc-hash = \"2\"\nbstr = \"1.5.0\"\n"
  },
  {
    "path": "LICENSE",
    "content": "MIT License\n\nCopyright (c) 2022 OpenAI, Shantanu Jain\n\nPermission is hereby granted, free of charge, to any person obtaining a copy\nof this software and associated documentation files (the \"Software\"), to deal\nin the Software without restriction, including without limitation the rights\nto use, copy, modify, merge, publish, distribute, sublicense, and/or sell\ncopies of the Software, and to permit persons to whom the Software is\nfurnished to do so, subject to the following conditions:\n\nThe above copyright notice and this permission notice shall be included in all\ncopies or substantial portions of the Software.\n\nTHE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\nIMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\nFITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\nAUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\nLIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\nOUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\nSOFTWARE.\n"
  },
  {
    "path": "MANIFEST.in",
    "content": "include *.svg\ninclude *.toml\ninclude *.md\ninclude Makefile\nglobal-include py.typed\nrecursive-include scripts *.py\nrecursive-include tests *.py\nrecursive-include src *.rs\n"
  },
  {
    "path": "README.md",
    "content": "# ⏳ tiktoken\n\ntiktoken is a fast [BPE](https://en.wikipedia.org/wiki/Byte_pair_encoding) tokeniser for use with\nOpenAI's models.\n\n```python\nimport tiktoken\nenc = tiktoken.get_encoding(\"o200k_base\")\nassert enc.decode(enc.encode(\"hello world\")) == \"hello world\"\n\n# To get the tokeniser corresponding to a specific model in the OpenAI API:\nenc = tiktoken.encoding_for_model(\"gpt-4o\")\n```\n\nThe open source version of `tiktoken` can be installed from [PyPI](https://pypi.org/project/tiktoken):\n```\npip install tiktoken\n```\n\nThe tokeniser API is documented in `tiktoken/core.py`.\n\nExample code using `tiktoken` can be found in the\n[OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb).\n\n\n## Performance\n\n`tiktoken` is between 3-6x faster than a comparable open source tokeniser:\n\n![image](https://raw.githubusercontent.com/openai/tiktoken/main/perf.svg)\n\nPerformance measured on 1GB of text using the GPT-2 tokeniser, using `GPT2TokenizerFast` from\n`tokenizers==0.13.2`, `transformers==4.24.0` and `tiktoken==0.2.0`.\n\n\n## Getting help\n\nPlease post questions in the [issue tracker](https://github.com/openai/tiktoken/issues).\n\nIf you work at OpenAI, make sure to check the internal documentation or feel free to contact\n@shantanu.\n\n## What is BPE anyway?\n\nLanguage models don't see text like you and I, instead they see a sequence of numbers (known as tokens).\nByte pair encoding (BPE) is a way of converting text into tokens. It has a couple desirable\nproperties:\n1) It's reversible and lossless, so you can convert tokens back into the original text\n2) It works on arbitrary text, even text that is not in the tokeniser's training data\n3) It compresses the text: the token sequence is shorter than the bytes corresponding to the\n   original text. 
On average, in practice, each token corresponds to about 4 bytes.\n4) It attempts to let the model see common subwords. For instance, \"ing\" is a common subword in\n   English, so BPE encodings will often split \"encoding\" into tokens like \"encod\" and \"ing\"\n   (instead of e.g. \"enc\" and \"oding\"). Because the model will then see the \"ing\" token again and\n   again in different contexts, it helps models generalise and better understand grammar.\n\n`tiktoken` contains an educational submodule that is friendlier if you want to learn more about\nthe details of BPE, including code that helps visualise the BPE procedure:\n```python\nfrom tiktoken._educational import *\n\n# Train a BPE tokeniser on a small amount of text\nenc = train_simple_encoding()\n\n# Visualise how the GPT-4 encoder encodes text\nenc = SimpleBytePairEncoding.from_tiktoken(\"cl100k_base\")\nenc.encode(\"hello world aaaaaaaaaaaa\")\n```\n\n\n## Extending tiktoken\n\nYou may wish to extend `tiktoken` to support new encodings. 
There are two ways to do this.\n\n\n**Create your `Encoding` object exactly the way you want and simply pass it around.**\n\n```python\ncl100k_base = tiktoken.get_encoding(\"cl100k_base\")\n\n# In production, load the arguments directly instead of accessing private attributes\n# See openai_public.py for examples of arguments for specific encodings\nenc = tiktoken.Encoding(\n    # If you're changing the set of special tokens, make sure to use a different name\n    # It should be clear from the name what behaviour to expect.\n    name=\"cl100k_im\",\n    pat_str=cl100k_base._pat_str,\n    mergeable_ranks=cl100k_base._mergeable_ranks,\n    special_tokens={\n        **cl100k_base._special_tokens,\n        \"<|im_start|>\": 100264,\n        \"<|im_end|>\": 100265,\n    }\n)\n```\n\n**Use the `tiktoken_ext` plugin mechanism to register your `Encoding` objects with `tiktoken`.**\n\nThis is only useful if you need `tiktoken.get_encoding` to find your encoding, otherwise prefer\noption 1.\n\nTo do this, you'll need to create a namespace package under `tiktoken_ext`.\n\nLayout your project like this, making sure to omit the `tiktoken_ext/__init__.py` file:\n```\nmy_tiktoken_extension\n├── tiktoken_ext\n│   └── my_encodings.py\n└── setup.py\n```\n\n`my_encodings.py` should be a module that contains a variable named `ENCODING_CONSTRUCTORS`.\nThis is a dictionary from an encoding name to a function that takes no arguments and returns\narguments that can be passed to `tiktoken.Encoding` to construct that encoding. For an example, see\n`tiktoken_ext/openai_public.py`. 
For precise details, see `tiktoken/registry.py`.\n\nYour `setup.py` should look something like this:\n```python\nfrom setuptools import setup, find_namespace_packages\n\nsetup(\n    name=\"my_tiktoken_extension\",\n    packages=find_namespace_packages(include=['tiktoken_ext*']),\n    install_requires=[\"tiktoken\"],\n    ...\n)\n```\n\nThen simply `pip install ./my_tiktoken_extension` and you should be able to use your\ncustom encodings! Make sure **not** to use an editable install.\n\n"
  },
  {
    "path": "pyproject.toml",
    "content": "[project]\nname = \"tiktoken\"\nversion = \"0.12.0\"\ndescription = \"tiktoken is a fast BPE tokeniser for use with OpenAI's models\"\nreadme = \"README.md\"\nlicense = { file = \"LICENSE\" }\nauthors = [{ name = \"Shantanu Jain\" }, { email = \"shantanu@openai.com\" }]\ndependencies = [\"regex\", \"requests\"]\noptional-dependencies = { blobfile = [\"blobfile>=3\"] }\nrequires-python = \">=3.9\"\n\n[project.urls]\nhomepage = \"https://github.com/openai/tiktoken\"\nrepository = \"https://github.com/openai/tiktoken\"\nchangelog = \"https://github.com/openai/tiktoken/blob/main/CHANGELOG.md\"\n\n[build-system]\nbuild-backend = \"setuptools.build_meta\"\nrequires = [\"setuptools>=62.4\", \"wheel\", \"setuptools-rust>=1.5.2\"]\n\n[tool.cibuildwheel]\nbuild-frontend = \"build\"\nbuild-verbosity = 1\n\nlinux.before-all = \"curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y\"\nlinux.environment = { PATH = \"$PATH:$HOME/.cargo/bin\" }\nmacos.before-all = \"rustup target add aarch64-apple-darwin x86_64-apple-darwin\"\nmacos.environment = { MACOSX_DEPLOYMENT_TARGET = \"10.12\" }\n\nskip = [\n  \"*-manylinux_i686\",\n  \"*-musllinux_i686\",\n  \"*-win32\",\n]\nmacos.archs = [\"x86_64\", \"arm64\"]\n# When cross-compiling on Intel, it is not possible to test arm64 wheels.\n# Warnings will be silenced with following CIBW_TEST_SKIP\ntest-skip = \"*-macosx_arm64\"\n\nbefore-test = \"pip install pytest hypothesis\"\ntest-command = \"pytest {project}/tests --import-mode=append\"\n"
  },
  {
    "path": "scripts/benchmark.py",
    "content": "import base64\nimport functools\nimport gzip\nimport json\nimport os\nimport random\nimport time\nfrom typing import Any, cast\n\nimport blobfile\n\nimport tiktoken\n\n\ndef benchmark_batch(documents: list[str]) -> None:\n    num_threads = int(os.environ[\"RAYON_NUM_THREADS\"])\n    num_bytes = sum(map(len, map(str.encode, documents)))\n    print(f\"num_threads: {num_threads}, num_bytes: {num_bytes}\")\n\n    enc = tiktoken.get_encoding(\"gpt2\")\n    enc.encode(\"warmup\")\n\n    start = time.perf_counter_ns()\n    enc.encode_ordinary_batch(documents, num_threads=num_threads)\n    end = time.perf_counter_ns()\n    print(f\"tiktoken \\t{num_bytes / (end - start) * 1e9} bytes / s\")\n\n    import transformers\n\n    hf_enc = cast(Any, transformers).GPT2TokenizerFast.from_pretrained(\"gpt2\")\n    hf_enc.model_max_length = 1e30  # silence!\n    hf_enc.encode(\"warmup\")\n\n    start = time.perf_counter_ns()\n    hf_enc(documents)\n    end = time.perf_counter_ns()\n    print(f\"huggingface \\t{num_bytes / (end - start) * 1e9} bytes / s\")\n\n\n"
  },
  {
    "path": "scripts/redact.py",
    "content": "import argparse\nimport re\nimport subprocess\nfrom pathlib import Path\n\n\ndef redact_file(path: Path, dry_run: bool) -> None:\n    if not path.exists() or path.is_dir():\n        return\n\n    text = path.read_text()\n    if not text:\n        return\n\n    first_line = text.splitlines()[0]\n    if \"redact\" in first_line:\n        if not dry_run:\n            path.unlink()\n        print(f\"Deleted {path}\")\n        return\n\n    pattern = \"|\".join(\n        r\" *\" + re.escape(x)\n        for x in [\n            \"# ===== redact-beg =====\\n\",\n            \"# ===== redact-end =====\\n\",\n            \"<!--- redact-beg -->\\n\",\n            \"<!--- redact-end -->\\n\",\n        ]\n    )\n\n    if re.search(pattern, text):\n        redacted_text = \"\".join(re.split(pattern, text)[::2])\n        if not dry_run:\n            path.write_text(redacted_text)\n        print(f\"Redacted {path}\")\n        return\n\n    print(f\"Skipped {path}\")\n\n\ndef redact(dry_run: bool) -> None:\n    tiktoken_root = Path(__file__).parent.parent\n    assert tiktoken_root.name == \"tiktoken\"\n    assert (tiktoken_root / \"pyproject.toml\").exists()\n\n    try:\n        output = subprocess.check_output([\"git\", \"ls-files\"], cwd=tiktoken_root, text=True)\n        paths = [Path(p) for p in output.splitlines()]\n    except subprocess.CalledProcessError:\n        paths = list(tiktoken_root.glob(\"**/*\"))\n\n    for path in paths:\n        redact_file(path, dry_run=dry_run)\n\n\ndef main() -> None:\n    parser = argparse.ArgumentParser()\n    parser.add_argument(\"--dry-run\", type=lambda x: not x or x[0].lower() != \"f\", default=True)\n    args = parser.parse_args()\n    redact(args.dry_run)\n    if args.dry_run:\n        print(\"Dry run, use --dry-run=false to actually redact files\")\n\n\nif __name__ == \"__main__\":\n    main()\n"
  },
  {
    "path": "scripts/wheel_download.py",
    "content": "import argparse\nimport zipfile\nfrom pathlib import Path\n\nimport requests\n\n\ndef download_artifacts(token, owner, repo, run_id, output_dir):\n    headers = {\"Authorization\": f\"token {token}\", \"Accept\": \"application/vnd.github.v3+json\"}\n\n    # Get list of artifacts\n    artifacts_url = f\"https://api.github.com/repos/{owner}/{repo}/actions/runs/{run_id}/artifacts\"\n    response = requests.get(artifacts_url, headers=headers)\n    response.raise_for_status()\n    artifacts = response.json()[\"artifacts\"]\n\n    if not artifacts:\n        print(f\"No artifacts found for run ID: {run_id}\")\n        return\n\n    output_dir = Path(output_dir)\n    output_dir.mkdir(parents=True, exist_ok=True)\n\n    print(f\"Found {len(artifacts)} artifacts\")\n    for artifact in artifacts:\n        name = artifact[\"name\"]\n        download_url = artifact[\"archive_download_url\"]\n\n        print(f\"Downloading {name}...\")\n\n        response = requests.get(download_url, headers=headers, stream=True)\n        response.raise_for_status()\n\n        temp_zip = output_dir / f\"{name}.zip\"\n        with open(temp_zip, \"wb\") as f:\n            for chunk in response.iter_content(chunk_size=8192):\n                f.write(chunk)\n        with zipfile.ZipFile(temp_zip, \"r\") as zip_ref:\n            zip_ref.extractall(output_dir)\n        temp_zip.unlink()\n        print(f\"Downloaded and extracted {name}\")\n\n\nif __name__ == \"__main__\":\n    parser = argparse.ArgumentParser(description=\"Download artifacts from a GitHub Actions run\")\n    parser.add_argument(\"--token\", required=True, help=\"GitHub Personal Access Token\")\n    parser.add_argument(\"--owner\", required=True, help=\"Repository owner\")\n    parser.add_argument(\"--repo\", required=True, help=\"Repository name\")\n    parser.add_argument(\"--run-id\", required=True, help=\"Workflow run ID\")\n    parser.add_argument(\n        \"--output-dir\", default=\"artifacts\", help=\"Output 
directory for downloaded artifacts\"\n    )\n\n    args = parser.parse_args()\n\n    download_artifacts(args.token, args.owner, args.repo, args.run_id, args.output_dir)\n"
  },
  {
    "path": "setup.py",
    "content": "from setuptools import setup\nfrom setuptools_rust import Binding, RustExtension\n\nsetup(\n    name=\"tiktoken\",\n    rust_extensions=[\n        RustExtension(\n            \"tiktoken._tiktoken\",\n            binding=Binding.PyO3,\n            # Between our use of editable installs and wanting to use Rust for performance sensitive\n            # code, it makes sense to just always use --release\n            debug=False,\n            features=[\"python\"],\n        )\n    ],\n    package_data={\"tiktoken\": [\"py.typed\"]},\n    packages=[\"tiktoken\", \"tiktoken_ext\"],\n    zip_safe=False,\n)\n"
  },
  {
    "path": "src/lib.rs",
    "content": "use std::collections::HashSet;\nuse std::num::NonZeroU64;\nuse std::thread;\n\nuse fancy_regex::Regex;\n#[cfg(feature = \"python\")]\nuse pyo3::prelude::*;\nuse rustc_hash::FxHashMap as HashMap;\n\n#[cfg(feature = \"python\")]\nmod py;\n\npub type Rank = u32;\n\nuse std::collections::BinaryHeap;\n\n#[derive(Eq, PartialEq, Clone, Copy)]\nstruct Merge {\n    start: usize,\n    rank: Rank,\n}\n\nimpl Ord for Merge {\n    #[inline]\n    fn cmp(&self, other: &Self) -> std::cmp::Ordering {\n        other\n            .rank\n            .cmp(&self.rank)\n            .then_with(|| other.start.cmp(&self.start))\n    }\n}\n\nimpl PartialOrd for Merge {\n    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {\n        Some(self.cmp(other))\n    }\n}\n\nstruct State {\n    prev: usize,\n    end: usize,\n    next_end: usize,\n    next_rank: Rank,\n    cur_rank: Rank,\n}\n\nfn _byte_pair_merge_large(ranks: &HashMap<Vec<u8>, Rank>, piece: &[u8]) -> Vec<Rank> {\n    let mut state = Vec::with_capacity(piece.len());\n    state.push(State {\n        prev: usize::MAX,\n        end: 1,\n        next_end: 2,\n        next_rank: Rank::MAX,\n        cur_rank: Rank::MAX,\n    });\n\n    let mut heap = BinaryHeap::with_capacity(piece.len());\n    for i in 0..piece.len() - 1 {\n        if let Some(&rank) = ranks.get(&piece[i..i + 2]) {\n            heap.push(Merge { start: i, rank });\n            state[i].next_rank = rank;\n        }\n        // note this is happening offset by 1\n        state.push(State {\n            prev: i,\n            end: i + 2,\n            next_end: i + 3,\n            next_rank: Rank::MAX,\n            cur_rank: Rank::MAX,\n        });\n    }\n\n    // Repeatedly find the valid merge with smallest rank. We merge the (left) token that\n    // starts at `start` and ends at `state[start].end` with the (right) token that starts at\n    // `state[start].end` and ends at `state[start].next_end`.  
We invalidate the old merges\n    // (the ones that started at `state[start].end` and ended at `state[start]`) and add the two\n    // new potential merges to the heap.\n\n    let potential_merge = {\n        #[inline(always)]\n        |state: &mut Vec<State>,\n         heap: &mut BinaryHeap<Merge>,\n         start: usize,\n         next_end_item: usize| {\n            state[start].next_end = next_end_item;\n            state[start].next_rank = Rank::MAX; // Always invalidate the old merge\n            if next_end_item <= piece.len()\n                && let Some(&rank) = ranks.get(&piece[start..next_end_item])\n            {\n                // We have a valid potential merge!\n                heap.push(Merge { start, rank });\n                state[start].next_rank = rank;\n            }\n        }\n    };\n\n    while let Some(left) = heap.pop() {\n        if left.rank == Rank::MAX {\n            break;\n        }\n        if left.rank != state[left.start].next_rank {\n            continue; // This merge was invalidated, ignore it\n        }\n\n        let left_start = left.start;\n        let right_start = state[left_start].end;\n        let right_end = state[left_start].next_end;\n        debug_assert!(right_end == state[right_start].end);\n        let right_next_end = state[right_start].next_end;\n\n        // Merge left and right into a single token\n        state[left_start].cur_rank = state[left_start].next_rank;\n        state[left_start].end = right_end;\n        potential_merge(&mut state, &mut heap, left_start, right_next_end);\n        if right_end < state.len() {\n            state[right_end].prev = left_start;\n        }\n        // Update the merge that ends at left_start\n        if left_start > 0 {\n            let prev_start = state[left_start].prev;\n            potential_merge(&mut state, &mut heap, prev_start, right_end);\n        }\n        // Invalidate the merge starting at right_start, so we ignore it when it comes off the heap\n        
state[right_start].next_rank = Rank::MAX;\n    }\n\n    let mut result = Vec::new();\n    let mut i = 0;\n    while i < state.len() {\n        if state[i].cur_rank != Rank::MAX {\n            result.push(state[i].cur_rank);\n        } else {\n            result.push(ranks[&piece[i..state[i].end]]);\n        }\n        i = state[i].end;\n    }\n    result\n}\n\nfn _byte_pair_merge(ranks: &HashMap<Vec<u8>, Rank>, piece: &[u8]) -> Vec<(usize, Rank)> {\n    // This is a vector of (start, rank).\n    // The rank is of the pair starting at position start.\n    let mut parts = Vec::with_capacity(piece.len() + 1);\n\n    // Note that we hash bytes when indexing into `ranks`, not token pairs. As long as we train BPE\n    // the way we currently do, this is equivalent. An easy way to break this would be to decouple\n    // merge priority from token index or to prevent specific token merges.\n    let mut min_rank: (Rank, usize) = (Rank::MAX, usize::MAX);\n    for i in 0..piece.len() - 1 {\n        let rank = *ranks.get(&piece[i..i + 2]).unwrap_or(&Rank::MAX);\n        if rank < min_rank.0 {\n            min_rank = (rank, i);\n        }\n        parts.push((i, rank));\n    }\n    parts.push((piece.len() - 1, Rank::MAX));\n    parts.push((piece.len(), Rank::MAX));\n\n    let get_rank = {\n        #[inline(always)]\n        |parts: &Vec<(usize, Rank)>, i: usize| {\n            if (i + 3) < parts.len() {\n                // Similar to `piece[i..i + 2]` above. 
The +3 is because we haven't yet deleted\n                // parts[i + 1], see comment in the main loop.\n                *ranks\n                    .get(&piece[parts[i].0..parts[i + 3].0])\n                    .unwrap_or(&Rank::MAX)\n            } else {\n                Rank::MAX\n            }\n        }\n    };\n\n    // If you have n parts and m merges, this does O(mn) work.\n    // We could do something with a heap and do O(m log n) work.\n    // n is often very small so considerations like cache-locality outweigh the algorithmic\n    // complexity downsides of the `parts` vector.\n    while min_rank.0 != Rank::MAX {\n        let i = min_rank.1;\n        // Update parts[i] and parts[i - 1] before removing parts[i + 1], since\n        // `parts.remove(i + 1)` will thrash the cache.\n        if i > 0 {\n            parts[i - 1].1 = get_rank(&parts, i - 1);\n        }\n        parts[i].1 = get_rank(&parts, i);\n        parts.remove(i + 1);\n\n        min_rank = (Rank::MAX, usize::MAX);\n        for (i, &(_, rank)) in parts[..parts.len() - 1].iter().enumerate() {\n            if rank < min_rank.0 {\n                min_rank = (rank, i);\n            }\n        }\n    }\n    parts\n}\n\npub fn byte_pair_encode(piece: &[u8], ranks: &HashMap<Vec<u8>, Rank>) -> Vec<Rank> {\n    let piece_len = piece.len();\n\n    if piece_len == 1 {\n        return vec![ranks[piece]];\n    }\n    if piece_len < 100 {\n        return _byte_pair_merge(ranks, piece)\n            .windows(2)\n            .map(|part| ranks[&piece[part[0].0..part[1].0]])\n            .collect();\n    }\n    _byte_pair_merge_large(ranks, piece)\n}\n\npub fn byte_pair_split<'a>(piece: &'a [u8], ranks: &HashMap<Vec<u8>, Rank>) -> Vec<&'a [u8]> {\n    assert!(piece.len() > 1);\n    _byte_pair_merge(ranks, piece)\n        .windows(2)\n        .map(|part| &piece[part[0].0..part[1].0])\n        .collect()\n}\n\n// Various performance notes:\n//\n// Regex\n// =====\n// Most of the time is spent in regex. 
The easiest way to speed this up is by using less fancy\n// regex features. For instance, using a regex parse-able by `regex` crate is 3x faster than\n// the usual regex we use.\n//\n// However, given that we're using a regex parse-able by `regex`, there isn't much difference\n// between using the `regex` crate and using the `fancy_regex` crate.\n//\n// There is an important interaction between threading, `regex` and `fancy_regex`.\n// When using `fancy_regex`, we hit `regex.find_at`. It turns out that this causes contention on\n// some mutable scratch space inside of `regex`. This absolutely kills performance. When using plain\n// old `regex`, we don't hit this, because `find_iter` has a different code path.\n// Related: https://github.com/rust-lang/regex/blob/master/PERFORMANCE.md\n// Anyway, the way we get around this is with having a (mostly) thread local clone of the regex for\n// each thread.\n//\n// Threading\n// =========\n// I tried using `rayon`. It wasn't really faster than using Python threads and releasing the GIL.\n// So goodbye `rayon`! Let thread count etc be in control of our Python users.\n//\n// Caching\n// =======\n// The reference tokeniser has an lru cache over the equivalent of `byte_pair_encode`.\n// Originally, we had one too! Without it, we were only vaguely faster than Python.\n// I used an RWLock to protect the cache. This didn't seem to hurt single threaded performance\n// noticeably, but it did affect multi-threaded performance. Weirdly, it seemed to affect\n// multi-threaded performance even when I only had readers (maybe I messed something up?).\n// Anyway, I realised that we could get rid of the cache, if we treat the set of tokens as a cache!\n// These are exactly the set of merges that are likely to be hot. And now we don't have to think\n// about interior mutability, memory use, or cloning.\n//\n// Hashing\n// =======\n// We use FxHashMap instead of the standard HashMap. 
This is maybe like a 5-10% win?\n// The current implementation ends up doing a lot of hashing of bytes. In theory, this could be made\n// to be hashing of two-tuples of ints, which looks like it may also be a couple percent faster.\n\nstruct FakeThreadId(NonZeroU64);\n\nfn hash_current_thread() -> usize {\n    // It's easier to use unsafe than to use nightly. Rust has this nice u64 thread id counter\n    // that works great for our use case of avoiding collisions in our array. Unfortunately,\n    // it's private. However, there are only so many ways you can layout a u64, so just transmute\n    // https://github.com/rust-lang/rust/issues/67939\n    const _: [u8; 8] = [0; std::mem::size_of::<std::thread::ThreadId>()];\n    const _: [u8; 8] = [0; std::mem::size_of::<FakeThreadId>()];\n    let x = unsafe {\n        std::mem::transmute::<std::thread::ThreadId, FakeThreadId>(thread::current().id()).0\n    };\n    u64::from(x) as usize\n}\n\n#[derive(Debug, Clone)]\npub struct DecodeKeyError {\n    pub token: Rank,\n}\n\nimpl std::fmt::Display for DecodeKeyError {\n    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {\n        write!(f, \"Invalid token for decoding: {}\", self.token)\n    }\n}\n\nimpl std::error::Error for DecodeKeyError {}\n\n#[derive(Debug, Clone)]\npub struct DecodeError {\n    pub message: String,\n}\n\nimpl std::fmt::Display for DecodeError {\n    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {\n        write!(f, \"Could not decode tokens: {}\", self.message)\n    }\n}\n\nimpl std::error::Error for DecodeError {}\n\n#[derive(Debug, Clone)]\npub struct EncodeError {\n    pub message: String,\n}\n\nimpl std::fmt::Display for EncodeError {\n    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {\n        write!(f, \"Could not encode string: {}\", self.message)\n    }\n}\n\nimpl std::error::Error for EncodeError {}\n\nconst MAX_NUM_THREADS: usize = 128;\n\n#[cfg_attr(feature = \"python\", 
pyclass(frozen))]\n#[derive(Clone)]\npub struct CoreBPE {\n    encoder: HashMap<Vec<u8>, Rank>,\n    special_tokens_encoder: HashMap<String, Rank>,\n    decoder: HashMap<Rank, Vec<u8>>,\n    special_tokens_decoder: HashMap<Rank, Vec<u8>>,\n    regex_tls: Vec<Regex>,\n    special_regex_tls: Vec<Regex>,\n    sorted_token_bytes: Vec<Vec<u8>>,\n}\n\nimpl CoreBPE {\n    fn _get_tl_regex(&self) -> &Regex {\n        // See performance notes above for what this is about\n        // It's also a little janky, please make a better version of it!\n        // However, it's nice that this doesn't leak memory to short-lived threads\n        &self.regex_tls[hash_current_thread() % MAX_NUM_THREADS]\n    }\n\n    fn _get_tl_special_regex(&self) -> &Regex {\n        &self.special_regex_tls[hash_current_thread() % MAX_NUM_THREADS]\n    }\n\n    /// Decodes tokens into a list of bytes.\n    ///\n    /// The bytes are not guaranteed to be a valid utf-8 string.\n    fn decode_bytes(&self, tokens: &[Rank]) -> Result<Vec<u8>, DecodeKeyError> {\n        let mut ret = Vec::with_capacity(tokens.len() * 2);\n        for &token in tokens {\n            let token_bytes = match self.decoder.get(&token) {\n                Some(bytes) => bytes,\n                None => self\n                    .special_tokens_decoder\n                    .get(&token)\n                    .ok_or(DecodeKeyError { token })?,\n            };\n            ret.extend(token_bytes);\n        }\n        Ok(ret)\n    }\n\n    pub fn encode_ordinary(&self, text: &str) -> Vec<Rank> {\n        // This is the core of the encoding logic; the other functions in here\n        // just make things complicated :-)\n        let regex = self._get_tl_regex();\n        let mut ret = vec![];\n        for mat in regex.find_iter(text) {\n            let piece = mat.unwrap().as_str().as_bytes();\n            match self.encoder.get(piece) {\n                Some(token) => ret.push(*token),\n                None => 
ret.extend(&byte_pair_encode(piece, &self.encoder)),\n            }\n        }\n        ret\n    }\n\n    pub fn encode(\n        &self,\n        text: &str,\n        allowed_special: &HashSet<&str>,\n    ) -> Result<(Vec<Rank>, usize), EncodeError> {\n        let special_regex = self._get_tl_special_regex();\n        let regex = self._get_tl_regex();\n        let mut ret = vec![];\n\n        let mut start = 0;\n        let mut last_piece_token_len = 0;\n        loop {\n            let mut next_special;\n            let mut start_find = start;\n            loop {\n                // Find the next allowed special token, if any\n                next_special = special_regex.find_from_pos(text, start_find).unwrap();\n                match next_special {\n                    Some(m) => {\n                        if allowed_special.contains(&text[m.start()..m.end()]) {\n                            break;\n                        }\n                        start_find = m.start() + 1;\n                    }\n                    None => break,\n                }\n            }\n            let end = next_special.map_or(text.len(), |m| m.start());\n\n            // Okay, here we go, compare this logic to encode_ordinary\n            for mat_res in regex.find_iter(&text[start..end]) {\n                let mat = match mat_res {\n                    Ok(m) => m,\n                    Err(e) => {\n                        return Err(EncodeError {\n                            message: format!(\"Regex error while tokenizing: {e}\"),\n                        });\n                    }\n                };\n\n                let piece = mat.as_str().as_bytes();\n                if let Some(token) = self.encoder.get(piece) {\n                    last_piece_token_len = 1;\n                    ret.push(*token);\n                    continue;\n                }\n                let tokens = byte_pair_encode(piece, &self.encoder);\n                last_piece_token_len = tokens.len();\n       
         ret.extend(&tokens);\n            }\n\n            match next_special {\n                // And here we push the special token\n                Some(m) => {\n                    let piece = m.as_str();\n                    let token = self.special_tokens_encoder[piece];\n                    ret.push(token);\n                    start = m.end();\n                    last_piece_token_len = 0;\n                }\n                None => break,\n            }\n        }\n\n        // last_piece_token_len is how many tokens came from the last regex split. This is used\n        // for determining unstable tokens, since you can't merge across (stable) regex splits\n        Ok((ret, last_piece_token_len))\n    }\n\n    fn _increase_last_piece_token_len(\n        &self,\n        tokens: Vec<Rank>,\n        mut last_piece_token_len: usize,\n    ) -> (Vec<Rank>, usize) {\n        // Unfortunately, the locations where our regex splits can be unstable.\n        // For the purposes of determining unstable tokens, unstable regex splitting\n        // is only a problem if a split that was present disappears, since this can\n        // lead to merging of tokens otherwise thought to be stable.\n        // cl100k_base makes our life hard by including the \\s*[\\r\\n]+\n        // pattern. This can e.g. 
cause \"\\n\" + \" \" to become \"\\n \\n\".\n        // Here is a quick and dirty fix:\n        {\n            let token_is_all_space = |token| {\n                self.decoder\n                    .get(token)\n                    .map(|token_bytes| {\n                        token_bytes\n                            .iter()\n                            .rev()\n                            .all(|&b| [b' ', b'\\n', b'\\t'].contains(&b))\n                    })\n                    .unwrap_or(false)\n            };\n            if last_piece_token_len > 0\n                && token_is_all_space(&tokens[tokens.len() - last_piece_token_len])\n            {\n                while (last_piece_token_len < tokens.len())\n                    && token_is_all_space(&tokens[tokens.len() - last_piece_token_len - 1])\n                {\n                    last_piece_token_len += 1;\n                }\n            }\n        }\n        debug_assert!(last_piece_token_len <= tokens.len());\n\n        (tokens, last_piece_token_len)\n    }\n\n    pub fn _encode_unstable_native(\n        &self,\n        text: &str,\n        allowed_special: &HashSet<&str>,\n    ) -> (Vec<Rank>, HashSet<Vec<Rank>>) {\n        let (tokens, last_piece_token_len) = self.encode(text, allowed_special).unwrap();\n        if last_piece_token_len == 0 {\n            // If last_piece_token_len is zero, the last token was a special token and we have\n            // no unstable bytes\n            return (tokens, HashSet::new());\n        }\n        let (mut tokens, last_piece_token_len) =\n            self._increase_last_piece_token_len(tokens, last_piece_token_len);\n\n        let unstable_bytes = self\n            .decode_bytes(&tokens[tokens.len() - last_piece_token_len..])\n            .unwrap();\n        tokens.truncate(tokens.len() - last_piece_token_len);\n\n        // TODO: we should try harder to find additional stable tokens\n        // This would reduce the amount of retokenising when determining 
completions\n        // Refer to the logic in an older version of this file\n\n        let mut completions = HashSet::new();\n        if unstable_bytes.is_empty() {\n            return (tokens, completions);\n        }\n\n        // This is the easy bit. Just find all single tokens that start with unstable_bytes\n        // (including tokens that exactly match unstable_bytes)\n        // Separating this from the loop below helps with performance in a common case.\n        let mut point = self\n            .sorted_token_bytes\n            .partition_point(|x| x.as_slice() < unstable_bytes.as_slice());\n        while point < self.sorted_token_bytes.len()\n            && self.sorted_token_bytes[point].starts_with(&unstable_bytes)\n        {\n            completions.insert(vec![\n                self.encoder[self.sorted_token_bytes[point].as_slice()],\n            ]);\n            point += 1;\n        }\n\n        // Now apply even more brute force. At every (other) possible position for the straddling\n        // token, concatenate additional bytes from that token (if any) to unstable_bytes,\n        // and retokenise the whole thing and see what we get.\n        for i in 1..unstable_bytes.len() {\n            let prefix = &unstable_bytes[..i];\n            let suffix = &unstable_bytes[i..];\n            let mut point = self\n                .sorted_token_bytes\n                .partition_point(|x| x.as_slice() < suffix);\n            // TODO: Perf optimisation if suffix starts with \" \"?\n            while point < self.sorted_token_bytes.len()\n                && self.sorted_token_bytes[point].starts_with(suffix)\n            {\n                let possibility = [prefix, self.sorted_token_bytes[point].as_slice()].concat();\n                let encoded = match std::str::from_utf8(&possibility) {\n                    // Morally, this is byte_pair_encode(&possibility, &self.encoder)\n                    // But we might have introduced a regex split which would prevent 
merges.\n                    // (particularly possible in the presence of unstable regex splits)\n                    // So convert to UTF-8 and do regex splitting.\n                    // E.g. with cl100k_base \"  !\" gets split to \" \" + \" !\",\n                    // but byte_pair_encode(\"  !\") != byte_pair_encode(\" \")\n                    Ok(s) => self.encode_ordinary(s),\n\n                    // Technically, whether or not this arm is correct depends on whether there\n                    // would be a regex split before the UTF-8 truncation point.\n                    // Probably niche enough that no one will ever notice (after all, people didn't\n                    // notice all the big holes in the previous unstable token implementation)\n                    Err(_) => byte_pair_encode(&possibility, &self.encoder),\n                    // Something like the following is intriguing but incorrect:\n                    // Err(e) => self.encode_ordinary(unsafe {\n                    //     std::str::from_utf8_unchecked(&possibility[..e.valid_up_to()])\n                    // }),\n                };\n                let mut seq = Vec::new();\n                let mut seq_len = 0;\n                for token in encoded {\n                    seq.push(token);\n                    seq_len += self.decoder[&token].len();\n                    if seq_len >= unstable_bytes.len() {\n                        break;\n                    }\n                }\n                completions.insert(seq);\n                point += 1;\n            }\n        }\n\n        // This is also not straightforward. While we generally assume that regex splits are stable,\n        // unfortunately, they are not. 
That is, if adding bytes were to make a split appear in\n        // unstable_bytes, this could make tokens possible which our logic would otherwise think\n        // would be merged.\n        // For example, with gpt2, the use of \\s+(?!\\S) means that \"\\n\\n\" could\n        // develop a split, e.g. \"\\n\\n0\" splits into \"\\n\"+\"\\n\"+\"0\", making \"\\n\" a possible token.\n        // Here is a quick and dirty fix:\n        // This isn't right if we ever remove \\s+(?!\\S)\n        if unstable_bytes.len() > 1 {\n            let last_decoded = bstr::decode_last_utf8(unstable_bytes.as_slice());\n            if unstable_bytes.len() - last_decoded.1 > 0\n                && last_decoded.0.is_some_and(|c| c.is_whitespace())\n            {\n                let mut reencoded = byte_pair_encode(\n                    &unstable_bytes[..unstable_bytes.len() - last_decoded.1],\n                    &self.encoder,\n                );\n                reencoded.extend(byte_pair_encode(\n                    &unstable_bytes[unstable_bytes.len() - last_decoded.1..],\n                    &self.encoder,\n                ));\n                completions.insert(reencoded);\n            }\n        }\n\n        (tokens, completions)\n    }\n\n    pub fn new<E, SE, NSE>(\n        encoder: E,\n        special_tokens_encoder: SE,\n        pattern: &str,\n    ) -> Result<Self, Box<dyn std::error::Error + Send + Sync>>\n    where\n        E: IntoIterator<Item = (Vec<u8>, Rank)>,\n        SE: IntoIterator<Item = (String, Rank)>,\n        NSE: IntoIterator<Item = (String, (Rank, Rank))>,\n    {\n        Self::new_internal(\n            HashMap::from_iter(encoder),\n            HashMap::from_iter(special_tokens_encoder),\n            pattern,\n        )\n    }\n\n    fn new_internal(\n        encoder: HashMap<Vec<u8>, Rank>,\n        special_tokens_encoder: HashMap<String, Rank>,\n        pattern: &str,\n    ) -> Result<Self, Box<dyn std::error::Error + Send + Sync>> {\n        let regex = 
Regex::new(pattern)?;\n\n        let special_regex = {\n            let parts = special_tokens_encoder\n                .keys()\n                .map(|s| fancy_regex::escape(s))\n                .collect::<Vec<_>>();\n            Regex::new(&parts.join(\"|\"))?\n        };\n\n        let decoder: HashMap<Rank, Vec<u8>> =\n            encoder.iter().map(|(k, v)| (*v, k.clone())).collect();\n\n        assert!(\n            encoder.len() == decoder.len(),\n            \"Encoder and decoder must be of equal length. Encoder length: {}, decoder length: {}.\\nMaybe you had duplicate token indices in your encoder?\",\n            encoder.len(),\n            decoder.len()\n        );\n\n        let special_tokens_decoder: HashMap<Rank, Vec<u8>> = special_tokens_encoder\n            .iter()\n            .map(|(k, v)| (*v, k.as_bytes().to_vec()))\n            .collect();\n\n        // Clone because I don't know how to tell Rust I'm not going to change the map\n        let mut sorted_token_bytes: Vec<Vec<u8>> = encoder.keys().cloned().collect();\n        sorted_token_bytes.sort();\n\n        Ok(Self {\n            encoder,\n            special_tokens_encoder,\n            decoder,\n            special_tokens_decoder,\n            regex_tls: (0..MAX_NUM_THREADS).map(|_| regex.clone()).collect(),\n            special_regex_tls: (0..MAX_NUM_THREADS)\n                .map(|_| special_regex.clone())\n                .collect(),\n            sorted_token_bytes,\n        })\n    }\n\n    pub fn special_tokens(&self) -> HashSet<&str> {\n        self.special_tokens_encoder\n            .keys()\n            .map(|s| s.as_str())\n            .collect()\n    }\n\n    pub fn encode_with_special_tokens(&self, text: &str) -> Vec<Rank> {\n        let allowed_special = self.special_tokens();\n        self.encode(text, &allowed_special).unwrap().0\n    }\n}\n\n#[cfg(test)]\nmod tests {\n    use fancy_regex::Regex;\n    use rustc_hash::FxHashMap as HashMap;\n\n    use crate::{Rank, 
byte_pair_split};\n\n    fn setup_ranks() -> HashMap<Vec<u8>, Rank> {\n        HashMap::from_iter([(b\"ab\".to_vec(), 0), (b\"cd\".to_vec(), 1)])\n    }\n\n    #[test]\n    fn test_simple_characters() {\n        let ranks = setup_ranks();\n        let res = byte_pair_split(b\"abcd\", &ranks);\n        assert_eq!(res, vec![b\"ab\", b\"cd\"]);\n    }\n\n    #[test]\n    fn test_repeated_characters() {\n        let ranks = setup_ranks();\n        let res = byte_pair_split(b\"abab\", &ranks);\n        assert_eq!(res, vec![b\"ab\", b\"ab\"]);\n    }\n}\n"
  },
  {
    "path": "src/py.rs",
    "content": "use std::collections::HashSet;\n\nuse pyo3::{\n    IntoPyObjectExt, PyResult, exceptions,\n    prelude::*,\n    pybacked::PyBackedStr,\n    types::{PyBytes, PyList},\n};\nuse rustc_hash::FxHashMap as HashMap;\n\nuse crate::{CoreBPE, Rank, byte_pair_encode};\n\n#[pymethods]\nimpl CoreBPE {\n    #[new]\n    fn py_new(\n        encoder: HashMap<Vec<u8>, Rank>,\n        special_tokens_encoder: HashMap<String, Rank>,\n        pattern: &str,\n    ) -> PyResult<Self> {\n        Self::new_internal(encoder, special_tokens_encoder, pattern)\n            .map_err(|e| PyErr::new::<exceptions::PyValueError, _>(e.to_string()))\n    }\n\n    // ====================\n    // Encoding\n    // ====================\n\n    #[pyo3(name = \"encode_ordinary\")]\n    fn py_encode_ordinary(&self, py: Python, text: &str) -> Vec<Rank> {\n        py.detach(|| self.encode_ordinary(text))\n    }\n\n    #[pyo3(name = \"encode\")]\n    fn py_encode(\n        &self,\n        py: Python,\n        text: &str,\n        allowed_special: HashSet<PyBackedStr>,\n    ) -> PyResult<Vec<Rank>> {\n        py.detach(|| {\n            let allowed_special: HashSet<&str> =\n                allowed_special.iter().map(|s| s.as_ref()).collect();\n            match self.encode(text, &allowed_special) {\n                Ok((tokens, _)) => Ok(tokens),\n                Err(e) => Err(PyErr::new::<exceptions::PyValueError, _>(e.message)),\n            }\n        })\n    }\n\n    fn encode_to_tiktoken_buffer(\n        &self,\n        py: Python,\n        text: &str,\n        allowed_special: HashSet<PyBackedStr>,\n    ) -> PyResult<Py<PyAny>> {\n        let tokens_res = py.detach(|| {\n            let allowed_special: HashSet<&str> =\n                allowed_special.iter().map(|s| s.as_ref()).collect();\n            self.encode(text, &allowed_special)\n        });\n\n        let tokens = match tokens_res {\n            Ok((tokens, _)) => tokens,\n            Err(e) => return 
Err(PyErr::new::<exceptions::PyValueError, _>(e.message)),\n        };\n\n        let buffer = TiktokenBuffer { tokens };\n        buffer.into_py_any(py)\n    }\n\n    fn _encode_bytes(&self, py: Python, bytes: &[u8]) -> Vec<Rank> {\n        py.detach(|| {\n            match std::str::from_utf8(bytes) {\n                // Straightforward case\n                Ok(text) => self.encode_ordinary(text),\n                // Oops, don't actually have UTF-8. But we need to do the regex splitting in\n                // Unicode space, so we make our best guess at where we would have splits\n                Err(e) => {\n                    let text = unsafe { std::str::from_utf8_unchecked(&bytes[..e.valid_up_to()]) };\n                    let (tokens, last_piece_token_len) =\n                        self.encode(text, &HashSet::new()).unwrap();\n                    let (mut tokens, last_piece_token_len) =\n                        self._increase_last_piece_token_len(tokens, last_piece_token_len);\n\n                    let mut unstable_bytes;\n                    if !tokens.is_empty() && last_piece_token_len > 0 {\n                        // Lop off the tokens from the last piece and run BPE on the remaining bytes\n                        // This likely matches what models see better, e.g. 
if you assume we're\n                        // dealing with truncated UTF-8 bytes.\n                        // Niche, but note this may not be correct if we'd have had a regex\n                        // split between the valid UTF-8 and the invalid bytes.\n                        unstable_bytes = self\n                            .decode_bytes(&tokens[tokens.len() - last_piece_token_len..])\n                            .unwrap();\n                        unstable_bytes.extend_from_slice(&bytes[e.valid_up_to()..]);\n\n                        tokens.truncate(tokens.len() - last_piece_token_len);\n                    } else {\n                        unstable_bytes = bytes[e.valid_up_to()..].to_vec();\n                    }\n\n                    if !unstable_bytes.is_empty() {\n                        match self.encoder.get(&unstable_bytes) {\n                            Some(token) => tokens.push(*token),\n                            None => {\n                                tokens.extend(&byte_pair_encode(&unstable_bytes, &self.encoder))\n                            }\n                        }\n                    }\n                    tokens\n                }\n            }\n        })\n    }\n\n    #[pyo3(name = \"encode_with_unstable\")]\n    fn py_encode_with_unstable(\n        &self,\n        py: Python,\n        text: &str,\n        allowed_special: HashSet<PyBackedStr>,\n    ) -> PyResult<(Vec<Rank>, Py<PyList>)> {\n        let (tokens, completions): (Vec<Rank>, HashSet<Vec<Rank>>) = py.detach(|| {\n            let allowed_special: HashSet<&str> =\n                allowed_special.iter().map(|s| s.as_ref()).collect();\n            self._encode_unstable_native(text, &allowed_special)\n        });\n        let py_completions = PyList::new(py, completions.into_iter())?;\n        Ok((tokens, py_completions.into()))\n    }\n\n    fn encode_single_token(&self, piece: &[u8]) -> PyResult<Rank> {\n        if let Some(token) = self.encoder.get(piece).copied() {\n 
           return Ok(token);\n        }\n        if let Ok(piece_str) = std::str::from_utf8(piece) {\n            if let Some(token) = self.special_tokens_encoder.get(piece_str).copied() {\n                return Ok(token);\n            }\n        }\n        Err(PyErr::new::<exceptions::PyKeyError, _>(piece.to_owned()))\n    }\n\n    fn encode_single_piece(&self, piece: &[u8]) -> Vec<Rank> {\n        if let Some(token) = self.encoder.get(piece) {\n            return vec![*token];\n        }\n        byte_pair_encode(piece, &self.encoder)\n    }\n\n    // ====================\n    // Decoding\n    // ====================\n\n    #[pyo3(name = \"decode_bytes\")]\n    fn py_decode_bytes(&self, py: Python, tokens: Vec<Rank>) -> Result<Py<PyBytes>, PyErr> {\n        match py.detach(|| self.decode_bytes(&tokens)) {\n            Ok(bytes) => Ok(PyBytes::new(py, &bytes).into()),\n            Err(e) => Err(pyo3::exceptions::PyKeyError::new_err(format!(\"{}\", e))),\n        }\n    }\n\n    fn decode_single_token_bytes(&self, py: Python, token: Rank) -> PyResult<Py<PyBytes>> {\n        if let Some(bytes) = self.decoder.get(&token) {\n            return Ok(PyBytes::new(py, bytes).into());\n        }\n        if let Some(bytes) = self.special_tokens_decoder.get(&token) {\n            return Ok(PyBytes::new(py, bytes).into());\n        }\n        Err(PyErr::new::<exceptions::PyKeyError, _>(token.to_string()))\n    }\n\n    // ====================\n    // Miscellaneous\n    // ====================\n\n    fn token_byte_values(&self, py: Python) -> Vec<Py<PyBytes>> {\n        self.sorted_token_bytes\n            .iter()\n            .map(|x| PyBytes::new(py, x).into())\n            .collect()\n    }\n}\n\n#[pyclass(frozen)]\nstruct TiktokenBuffer {\n    tokens: Vec<Rank>,\n}\n\n#[pymethods]\nimpl TiktokenBuffer {\n    // Based on https://github.com/PyO3/pyo3/blob/v0.22.2/tests/test_buffer_protocol.rs#L25\n    unsafe fn __getbuffer__(\n        slf: Bound<'_, Self>,\n        view: 
*mut pyo3::ffi::Py_buffer,\n        flags: std::os::raw::c_int,\n    ) -> PyResult<()> {\n        if view.is_null() {\n            return Err(pyo3::exceptions::PyBufferError::new_err(\"View is null\"));\n        }\n        if (flags & pyo3::ffi::PyBUF_WRITABLE) == pyo3::ffi::PyBUF_WRITABLE {\n            return Err(pyo3::exceptions::PyBufferError::new_err(\n                \"Object is not writable\",\n            ));\n        }\n        unsafe {\n            let view_ref = &mut *view;\n            view_ref.obj = slf.clone().into_any().into_ptr();\n\n            let data = &slf.borrow().tokens;\n            view_ref.buf = data.as_ptr() as *mut std::os::raw::c_void;\n            view_ref.len = (data.len() * std::mem::size_of::<Rank>()) as isize;\n            view_ref.readonly = 1;\n            view_ref.itemsize = std::mem::size_of::<Rank>() as isize;\n            view_ref.format = if (flags & pyo3::ffi::PyBUF_FORMAT) == pyo3::ffi::PyBUF_FORMAT {\n                let msg = std::ffi::CString::new(\"I\").unwrap();\n                msg.into_raw()\n            } else {\n                std::ptr::null_mut()\n            };\n            view_ref.ndim = 1;\n            view_ref.shape = if (flags & pyo3::ffi::PyBUF_ND) == pyo3::ffi::PyBUF_ND {\n                &mut view_ref.len\n            } else {\n                std::ptr::null_mut()\n            };\n            view_ref.strides = if (flags & pyo3::ffi::PyBUF_STRIDES) == pyo3::ffi::PyBUF_STRIDES {\n                &mut view_ref.itemsize\n            } else {\n                std::ptr::null_mut()\n            };\n            view_ref.suboffsets = std::ptr::null_mut();\n            view_ref.internal = std::ptr::null_mut();\n        }\n\n        Ok(())\n    }\n\n    unsafe fn __releasebuffer__(&self, view: *mut pyo3::ffi::Py_buffer) {\n        // Note that Py_buffer doesn't have a Drop impl\n        unsafe {\n            let view_ref = &mut *view;\n            if !view_ref.format.is_null() {\n                
std::mem::drop(std::ffi::CString::from_raw(view_ref.format));\n            }\n        }\n    }\n}\n\n#[pymodule(gil_used = false)]\nfn _tiktoken(_py: Python, m: &Bound<PyModule>) -> PyResult<()> {\n    m.add_class::<CoreBPE>()?;\n    Ok(())\n}\n"
  },
  {
    "path": "tests/__init__.py",
    "content": ""
  },
  {
    "path": "tests/test_encoding.py",
    "content": "# Note that there are more actual tests, they're just not currently public :-)\n\nfrom typing import Callable\n\nimport hypothesis\nimport hypothesis.strategies as st\nimport pytest\n\nimport tiktoken\n\nfrom .test_helpers import ENCODING_FACTORIES, MAX_EXAMPLES\n\n\ndef test_simple():\n    enc = tiktoken.get_encoding(\"gpt2\")\n    assert enc.encode(\"hello world\") == [31373, 995]\n    assert enc.decode([31373, 995]) == \"hello world\"\n    assert enc.encode(\"hello <|endoftext|>\", allowed_special=\"all\") == [31373, 220, 50256]\n\n    enc = tiktoken.get_encoding(\"cl100k_base\")\n    assert enc.encode(\"hello world\") == [15339, 1917]\n    assert enc.decode([15339, 1917]) == \"hello world\"\n    assert enc.encode(\"hello <|endoftext|>\", allowed_special=\"all\") == [15339, 220, 100257]\n\n    for enc_name in tiktoken.list_encoding_names():\n        enc = tiktoken.get_encoding(enc_name)\n        for token in range(min(10_000, enc.max_token_value - 1)):\n            assert enc.encode_single_token(enc.decode_single_token_bytes(token)) == token\n\n\ndef test_simple_repeated():\n    enc = tiktoken.get_encoding(\"gpt2\")\n    assert enc.encode(\"0\") == [15]\n    assert enc.encode(\"00\") == [405]\n    assert enc.encode(\"000\") == [830]\n    assert enc.encode(\"0000\") == [2388]\n    assert enc.encode(\"00000\") == [20483]\n    assert enc.encode(\"000000\") == [10535]\n    assert enc.encode(\"0000000\") == [24598]\n    assert enc.encode(\"00000000\") == [8269]\n    assert enc.encode(\"000000000\") == [10535, 830]\n    assert enc.encode(\"0000000000\") == [8269, 405]\n    assert enc.encode(\"00000000000\") == [8269, 830]\n    assert enc.encode(\"000000000000\") == [8269, 2388]\n    assert enc.encode(\"0000000000000\") == [8269, 20483]\n    assert enc.encode(\"00000000000000\") == [8269, 10535]\n    assert enc.encode(\"000000000000000\") == [8269, 24598]\n    assert enc.encode(\"0000000000000000\") == [25645]\n    assert 
enc.encode(\"00000000000000000\") == [8269, 10535, 830]\n\n\ndef test_large_repeated():\n    enc = tiktoken.get_encoding(\"o200k_base\")\n\n    # Large inputs should be handled without raising.\n    tokens = enc.encode(\"x\" * 1_000_000)\n    assert tokens\n\n\ndef test_simple_regex():\n    enc = tiktoken.get_encoding(\"cl100k_base\")\n    assert enc.encode(\"rer\") == [38149]\n    assert enc.encode(\"'rer\") == [2351, 81]\n    assert enc.encode(\"today\\n \") == [31213, 198, 220]\n    assert enc.encode(\"today\\n \\n\") == [31213, 27907]\n    assert enc.encode(\"today\\n  \\n\") == [31213, 14211]\n\n\ndef test_basic_encode():\n    enc = tiktoken.get_encoding(\"r50k_base\")\n    assert enc.encode(\"hello world\") == [31373, 995]\n\n    enc = tiktoken.get_encoding(\"p50k_base\")\n    assert enc.encode(\"hello world\") == [31373, 995]\n\n    enc = tiktoken.get_encoding(\"cl100k_base\")\n    assert enc.encode(\"hello world\") == [15339, 1917]\n    assert enc.encode(\" \\x850\") == [220, 126, 227, 15]\n\n\ndef test_encode_empty():\n    enc = tiktoken.get_encoding(\"r50k_base\")\n    assert enc.encode(\"\") == []\n\n\ndef test_encode_bytes():\n    enc = tiktoken.get_encoding(\"cl100k_base\")\n    assert enc._encode_bytes(b\" \\xec\\x8b\\xa4\\xed\") == [62085]\n    for i in range(10):\n        bytestring = b\"\\x80\" * i\n        assert enc.decode_bytes(enc._encode_bytes(bytestring)) == bytestring\n\n\n@pytest.mark.parametrize(\"make_enc\", ENCODING_FACTORIES)\n@hypothesis.given(bytestring=st.binary())\n@hypothesis.settings(deadline=None, max_examples=MAX_EXAMPLES)\ndef test_hyp_encode_bytes(make_enc: Callable[[], tiktoken.Encoding], bytestring: bytes):\n    enc = make_enc()\n    assert enc.decode_bytes(enc._encode_bytes(bytestring)) == bytestring\n\n\ndef test_encode_surrogate_pairs():\n    enc = tiktoken.get_encoding(\"cl100k_base\")\n\n    assert enc.encode(\"👍\") == [9468, 239, 235]\n    # surrogate pair gets converted to codepoint\n    assert 
enc.encode(\"\\ud83d\\udc4d\") == [9468, 239, 235]\n\n    # lone surrogate just gets replaced\n    assert enc.encode(\"\\ud83d\") == enc.encode(\"�\")\n\n\n@pytest.mark.parametrize(\"make_enc\", ENCODING_FACTORIES)\ndef test_catastrophically_repetitive(make_enc: Callable[[], tiktoken.Encoding]):\n    enc = make_enc()\n    for c in [\"^\", \"0\", \"a\", \"'s\", \" \", \"\\n\"]:\n        big_value = c * 10_000\n        assert big_value == enc.decode(enc.encode(big_value))\n\n        big_value = \" \" + big_value\n        assert big_value == enc.decode(enc.encode(big_value))\n\n        big_value = big_value + \"\\n\"\n        assert big_value == enc.decode(enc.encode(big_value))\n\n\n# ====================\n# Roundtrip\n# ====================\n\n\n@pytest.mark.parametrize(\"make_enc\", ENCODING_FACTORIES)\ndef test_basic_roundtrip(make_enc):\n    enc = make_enc()\n    for value in (\n        \"hello\",\n        \"hello \",\n        \"hello  \",\n        \" hello\",\n        \" hello \",\n        \" hello  \",\n        \"hello world\",\n        \"请考试我的软件！12345\",\n    ):\n        assert value == enc.decode(enc.encode(value))\n        assert value == enc.decode(enc.encode_ordinary(value))\n\n\n@pytest.mark.parametrize(\"make_enc\", ENCODING_FACTORIES)\n@hypothesis.given(text=st.text())\n@hypothesis.settings(deadline=None, max_examples=MAX_EXAMPLES)\ndef test_hyp_roundtrip(make_enc: Callable[[], tiktoken.Encoding], text):\n    enc = make_enc()\n\n    assert text == enc.decode(enc.encode(text))\n\n\n@pytest.mark.parametrize(\"make_enc\", ENCODING_FACTORIES)\ndef test_single_token_roundtrip(make_enc: Callable[[], tiktoken.Encoding]):\n    enc = make_enc()\n\n    for token in range(enc.n_vocab):\n        try:\n            token_bytes = enc.decode_single_token_bytes(token)\n        except KeyError:\n            continue\n        assert enc.encode_single_token(token_bytes) == token\n\n\n# ====================\n# Special tokens\n# ====================\n\n\ndef 
test_special_token():\n    enc = tiktoken.get_encoding(\"cl100k_base\")\n\n    eot = enc.encode_single_token(\"<|endoftext|>\")\n    assert eot == enc.eot_token\n    fip = enc.encode_single_token(\"<|fim_prefix|>\")\n    fim = enc.encode_single_token(\"<|fim_middle|>\")\n\n    text = \"<|endoftext|> hello <|fim_prefix|>\"\n    assert eot not in enc.encode(text, disallowed_special=())\n    with pytest.raises(ValueError):\n        enc.encode(text)\n    with pytest.raises(ValueError):\n        enc.encode(text, disallowed_special=\"all\")\n    with pytest.raises(ValueError):\n        enc.encode(text, disallowed_special={\"<|endoftext|>\"})\n    with pytest.raises(ValueError):\n        enc.encode(text, disallowed_special={\"<|fim_prefix|>\"})\n\n    text = \"<|endoftext|> hello <|fim_prefix|> there <|fim_middle|>\"\n    tokens = enc.encode(text, disallowed_special=())\n    assert eot not in tokens\n    assert fip not in tokens\n    assert fim not in tokens\n\n    tokens = enc.encode(text, allowed_special=\"all\", disallowed_special=())\n    assert eot in tokens\n    assert fip in tokens\n    assert fim in tokens\n\n    tokens = enc.encode(text, allowed_special=\"all\", disallowed_special=\"all\")\n    assert eot in tokens\n    assert fip in tokens\n    assert fim in tokens\n\n    tokens = enc.encode(text, allowed_special={\"<|fim_prefix|>\"}, disallowed_special=())\n    assert eot not in tokens\n    assert fip in tokens\n    assert fim not in tokens\n\n    tokens = enc.encode(text, allowed_special={\"<|endoftext|>\"}, disallowed_special=())\n    assert eot in tokens\n    assert fip not in tokens\n    assert fim not in tokens\n\n    tokens = enc.encode(text, allowed_special={\"<|fim_middle|>\"}, disallowed_special=())\n    assert eot not in tokens\n    assert fip not in tokens\n    assert fim in tokens\n\n\n@pytest.mark.parametrize(\"make_enc\", ENCODING_FACTORIES)\n@hypothesis.given(text=st.text())\n@hypothesis.settings(deadline=None, max_examples=MAX_EXAMPLES)\ndef 
test_hyp_special_ordinary(make_enc, text: str):\n    enc = make_enc()\n    assert enc.encode_ordinary(text) == enc.encode(text, disallowed_special=())\n\n\n# ====================\n# Batch encoding\n# ====================\n\n\n@pytest.mark.parametrize(\"make_enc\", ENCODING_FACTORIES)\ndef test_batch_encode(make_enc: Callable[[], tiktoken.Encoding]):\n    enc = make_enc()\n    text1 = \"hello world\"\n    text2 = \"goodbye world\"\n\n    assert enc.encode_batch([text1]) == [enc.encode(text1)]\n    assert enc.encode_batch([text1, text2]) == [enc.encode(text1), enc.encode(text2)]\n\n    assert enc.encode_ordinary_batch([text1]) == [enc.encode_ordinary(text1)]\n    assert enc.encode_ordinary_batch([text1, text2]) == [\n        enc.encode_ordinary(text1),\n        enc.encode_ordinary(text2),\n    ]\n\n\n@pytest.mark.parametrize(\"make_enc\", ENCODING_FACTORIES)\n@hypothesis.given(batch=st.lists(st.text()))\n@hypothesis.settings(deadline=None, max_examples=MAX_EXAMPLES)\ndef test_hyp_batch_roundtrip(make_enc: Callable[[], tiktoken.Encoding], batch):\n    enc = make_enc()\n\n    encoded = enc.encode_batch(batch, allowed_special=\"all\")\n    assert encoded == [enc.encode(t, allowed_special=\"all\") for t in batch]\n    decoded = enc.decode_batch(encoded)\n    assert decoded == batch\n"
  },
  {
    "path": "tests/test_helpers.py",
    "content": "import bisect\nimport functools\nimport os\n\nimport pytest\n\nimport tiktoken\n\nMAX_EXAMPLES: int = int(os.environ.get(\"TIKTOKEN_MAX_EXAMPLES\", \"100\"))\n\nENCODINGS = [\"r50k_base\", \"cl100k_base\"]\nSOME_ENCODINGS = [\"cl100k_base\"]\n\n\nENCODING_FACTORIES = [\n    pytest.param(functools.partial(tiktoken.get_encoding, name), id=name) for name in ENCODINGS\n]\nSOME_ENCODING_FACTORIES = [\n    pytest.param(functools.partial(tiktoken.get_encoding, name), id=name) for name in SOME_ENCODINGS\n]\n\n\n"
  },
  {
    "path": "tests/test_misc.py",
    "content": "import subprocess\nimport sys\n\nimport tiktoken\n\n\ndef test_encoding_for_model():\n    enc = tiktoken.encoding_for_model(\"gpt2\")\n    assert enc.name == \"gpt2\"\n    enc = tiktoken.encoding_for_model(\"text-davinci-003\")\n    assert enc.name == \"p50k_base\"\n    enc = tiktoken.encoding_for_model(\"text-davinci-edit-001\")\n    assert enc.name == \"p50k_edit\"\n    enc = tiktoken.encoding_for_model(\"gpt-3.5-turbo-0301\")\n    assert enc.name == \"cl100k_base\"\n    enc = tiktoken.encoding_for_model(\"gpt-4\")\n    assert enc.name == \"cl100k_base\"\n    enc = tiktoken.encoding_for_model(\"gpt-4o\")\n    assert enc.name == \"o200k_base\"\n    enc = tiktoken.encoding_for_model(\"gpt-oss-120b\")\n    assert enc.name == \"o200k_harmony\"\n\n\ndef test_optional_blobfile_dependency():\n    prog = \"\"\"\nimport tiktoken\nimport sys\nassert \"blobfile\" not in sys.modules\n\"\"\"\n    subprocess.check_call([sys.executable, \"-c\", prog])\n"
  },
  {
    "path": "tests/test_offsets.py",
    "content": "from typing import Callable\n\nimport hypothesis\nimport pytest\nfrom hypothesis import strategies as st\n\nimport tiktoken\n\nfrom .test_helpers import MAX_EXAMPLES, SOME_ENCODING_FACTORIES\n\n\ndef _common_prefix_len(a, b):\n    i = 0\n    while i < len(a) and i < len(b) and a[i] == b[i]:\n        i += 1\n    return i\n\n\ndef _token_offsets_reference(enc, tokens):\n    text = enc.decode(tokens, errors=\"strict\")\n    res = []\n    for i in range(len(tokens)):\n        prefix = enc.decode(tokens[:i], errors=\"ignore\")\n        res.append(_common_prefix_len(text, prefix))\n    return res\n\n\n@pytest.mark.parametrize(\"make_enc\", SOME_ENCODING_FACTORIES)\n@hypothesis.given(data=st.data())\n@hypothesis.settings(deadline=None, max_examples=MAX_EXAMPLES)\ndef test_hyp_offsets(make_enc: Callable[[], tiktoken.Encoding], data):\n    enc = make_enc()\n\n    tokens_st = st.lists(\n        st.integers(0, enc.n_vocab - 1).filter(\n            lambda x: x in enc._special_tokens.values() or x in enc._mergeable_ranks.values()\n        ),\n        min_size=1,\n        max_size=20,\n    )\n    tokens = data.draw(tokens_st)\n\n    # This is a dumb hack to make sure that our tokens are a valid UTF-8 string\n    # We could potentially drop this, see the TODO in decode_with_offsets\n    tokens = enc.encode(enc.decode(tokens, errors=\"ignore\"), allowed_special=\"all\")\n    assert enc.decode_with_offsets(tokens)[1] == _token_offsets_reference(enc, tokens)\n\n\ndef test_basic_offsets():\n    enc = tiktoken.get_encoding(\"cl100k_base\")\n\n    prompt = \"hello world\"\n    p, o = enc.decode_with_offsets(enc.encode(prompt))\n    assert p == prompt\n    assert o == [0, 5]\n\n    prompt = \"hello world<|endoftext|> green cow\"\n    p, o = enc.decode_with_offsets(enc.encode(prompt, allowed_special=\"all\"))\n    assert p == prompt\n    assert o == [0, 5, 11, 24, 30]\n\n    prompt = \"我非常渴望与人工智能一起工作\"\n    p, o = enc.decode_with_offsets(enc.encode(prompt))\n    assert p 
== prompt\n    assert o == [0, 1, 2, 3, 3, 4, 4, 5, 6, 7, 8, 8, 9, 10, 11, 12, 13]\n\n    # contains the interesting tokens b'\\xe0\\xae\\xbf\\xe0\\xae' and b'\\xe0\\xaf\\x8d\\xe0\\xae'\n    # in which \\xe0 is the start of a 3-byte UTF-8 character\n    prompt = \"நடிகர் சூர்யா\"\n    p, o = enc.decode_with_offsets(enc.encode(prompt))\n    assert p == prompt\n    assert o == [0, 0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 8, 8, 9, 9, 10, 11, 12, 12]\n\n    # contains the interesting token b'\\xa0\\xe9\\x99\\xa4'\n    # in which \\xe9 is the start of a 3-byte UTF-8 character and \\xa0 is a continuation byte\n    prompt = \" Ġ除\"\n    p, o = enc.decode_with_offsets(enc.encode(prompt))\n    assert p == prompt\n    assert o == [0, 1]\n"
  },
  {
    "path": "tests/test_pickle.py",
    "content": "import tiktoken\n\n\ndef test_pickle():\n    import pickle\n\n    enc_old = tiktoken.get_encoding(\"r50k_base\")\n    enc_new = pickle.loads(pickle.dumps(enc_old))\n    assert enc_old.encode(\"hello world\") == enc_new.encode(\"hello world\")\n\n    enc_old = tiktoken.Encoding(\n        name=\"custom_enc\",\n        pat_str=enc_old._pat_str,\n        mergeable_ranks=enc_old._mergeable_ranks,\n        special_tokens={\"<|pickle|>\": 100_000},\n    )\n    enc_new = pickle.loads(pickle.dumps(enc_old))\n    assert enc_old.encode(\"hello world\") == enc_new.encode(\"hello world\")\n    assert (\n        enc_old.encode(\"<|pickle|>\", allowed_special=\"all\")\n        == enc_new.encode(\"<|pickle|>\", allowed_special=\"all\")\n        == [100_000]\n    )\n"
  },
  {
    "path": "tests/test_simple_public.py",
    "content": "import subprocess\nimport sys\n\nimport tiktoken\n\n\ndef test_simple():\n    # Note that there are more actual tests, they're just not currently public :-)\n    enc = tiktoken.get_encoding(\"gpt2\")\n    assert enc.encode(\"hello world\") == [31373, 995]\n    assert enc.decode([31373, 995]) == \"hello world\"\n    assert enc.encode(\"hello <|endoftext|>\", allowed_special=\"all\") == [31373, 220, 50256]\n\n    enc = tiktoken.get_encoding(\"cl100k_base\")\n    assert enc.encode(\"hello world\") == [15339, 1917]\n    assert enc.decode([15339, 1917]) == \"hello world\"\n    assert enc.encode(\"hello <|endoftext|>\", allowed_special=\"all\") == [15339, 220, 100257]\n\n    for enc_name in tiktoken.list_encoding_names():\n        enc = tiktoken.get_encoding(enc_name)\n        for token in range(10_000):\n            assert enc.encode_single_token(enc.decode_single_token_bytes(token)) == token\n\n\ndef test_encoding_for_model():\n    enc = tiktoken.encoding_for_model(\"gpt2\")\n    assert enc.name == \"gpt2\"\n    enc = tiktoken.encoding_for_model(\"text-davinci-003\")\n    assert enc.name == \"p50k_base\"\n    enc = tiktoken.encoding_for_model(\"text-davinci-edit-001\")\n    assert enc.name == \"p50k_edit\"\n    enc = tiktoken.encoding_for_model(\"gpt-3.5-turbo-0301\")\n    assert enc.name == \"cl100k_base\"\n\n\ndef test_optional_blobfile_dependency():\n    prog = \"\"\"\nimport tiktoken\nimport sys\nassert \"blobfile\" not in sys.modules\n\"\"\"\n    subprocess.check_call([sys.executable, \"-c\", prog])\n"
  },
  {
    "path": "tiktoken/__init__.py",
    "content": "# This is the public API of tiktoken\nfrom .core import Encoding as Encoding\nfrom .model import encoding_for_model as encoding_for_model\nfrom .model import encoding_name_for_model as encoding_name_for_model\nfrom .registry import get_encoding as get_encoding\nfrom .registry import list_encoding_names as list_encoding_names\n\n__version__ = \"0.12.0\"\n"
  },
  {
    "path": "tiktoken/_educational.py",
    "content": "\"\"\"This is an educational implementation of the byte pair encoding algorithm.\"\"\"\n\nfrom __future__ import annotations\n\nimport collections\n\nimport regex\n\nimport tiktoken\n\n\nclass SimpleBytePairEncoding:\n    def __init__(self, *, pat_str: str, mergeable_ranks: dict[bytes, int]) -> None:\n        \"\"\"Creates an Encoding object.\"\"\"\n        # A regex pattern string that is used to split the input text\n        self.pat_str = pat_str\n        # A dictionary mapping token bytes to their ranks. The ranks correspond to merge priority\n        self.mergeable_ranks = mergeable_ranks\n\n        self._decoder = {token: token_bytes for token_bytes, token in mergeable_ranks.items()}\n        self._pat = regex.compile(pat_str)\n\n    def encode(self, text: str, visualise: str | None = \"colour\") -> list[int]:\n        \"\"\"Encodes a string into tokens.\n\n        >>> enc.encode(\"hello world\")\n        [388, 372]\n        \"\"\"\n        # Use the regex to split the text into (approximately) words\n        words = self._pat.findall(text)\n        tokens = []\n        for word in words:\n            # Turn each word into tokens, using the byte pair encoding algorithm\n            word_bytes = word.encode(\"utf-8\")\n            word_tokens = bpe_encode(self.mergeable_ranks, word_bytes, visualise=visualise)\n            tokens.extend(word_tokens)\n        return tokens\n\n    def decode_bytes(self, tokens: list[int]) -> bytes:\n        \"\"\"Decodes a list of tokens into bytes.\n\n        >>> enc.decode_bytes([388, 372])\n        b'hello world'\n        \"\"\"\n        return b\"\".join(self._decoder[token] for token in tokens)\n\n    def decode(self, tokens: list[int]) -> str:\n        \"\"\"Decodes a list of tokens into a string.\n\n        Decoded bytes are not guaranteed to be valid UTF-8. 
In that case, we replace\n        the invalid bytes with the replacement character \"�\".\n\n        >>> enc.decode([388, 372])\n        'hello world'\n        \"\"\"\n        return self.decode_bytes(tokens).decode(\"utf-8\", errors=\"replace\")\n\n    def decode_tokens_bytes(self, tokens: list[int]) -> list[bytes]:\n        \"\"\"Decodes a list of tokens into a list of bytes.\n\n        Useful for visualising how a string is tokenised.\n\n        >>> enc.decode_tokens_bytes([388, 372])\n        [b'hello', b' world']\n        \"\"\"\n        return [self._decoder[token] for token in tokens]\n\n    @staticmethod\n    def train(training_data: str, vocab_size: int, pat_str: str):\n        \"\"\"Train a BPE tokeniser on some data!\"\"\"\n        mergeable_ranks = bpe_train(data=training_data, vocab_size=vocab_size, pat_str=pat_str)\n        return SimpleBytePairEncoding(pat_str=pat_str, mergeable_ranks=mergeable_ranks)\n\n    @staticmethod\n    def from_tiktoken(encoding):\n        if isinstance(encoding, str):\n            encoding = tiktoken.get_encoding(encoding)\n        return SimpleBytePairEncoding(\n            pat_str=encoding._pat_str, mergeable_ranks=encoding._mergeable_ranks\n        )\n\n\ndef bpe_encode(\n    mergeable_ranks: dict[bytes, int], input: bytes, visualise: str | None = \"colour\"\n) -> list[int]:\n    parts = [bytes([b]) for b in input]\n    while True:\n        # See the intermediate merges play out!\n        if visualise:\n            if visualise in [\"colour\", \"color\"]:\n                visualise_tokens(parts)\n            elif visualise == \"simple\":\n                print(parts)\n\n        # Iterate over all pairs and find the pair we want to merge the most\n        min_idx = None\n        min_rank = None\n        for i, pair in enumerate(zip(parts[:-1], parts[1:])):\n            rank = mergeable_ranks.get(pair[0] + pair[1])\n            if rank is not None and (min_rank is None or rank < min_rank):\n                min_idx = i\n     
           min_rank = rank\n\n        # If there were no pairs we could merge, we're done!\n        if min_rank is None:\n            break\n        assert min_idx is not None\n\n        # Otherwise, merge that pair and leave the rest unchanged. Then repeat.\n        parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2 :]\n\n    if visualise:\n        print()\n\n    tokens = [mergeable_ranks[part] for part in parts]\n    return tokens\n\n\ndef bpe_train(\n    data: str, vocab_size: int, pat_str: str, visualise: str | None = \"colour\"\n) -> dict[bytes, int]:\n    # First, add tokens for each individual byte value\n    if vocab_size < 2**8:\n        raise ValueError(\"vocab_size must be at least 256, so we can encode all bytes\")\n    ranks = {}\n    for i in range(2**8):\n        ranks[bytes([i])] = i\n\n    # Splinter up our data into lists of bytes\n    # data = \"Hello world\"\n    # words = [\n    #     [b'H', b'e', b'l', b'l', b'o'],\n    #     [b' ', b'w', b'o', b'r', b'l', b'd']\n    # ]\n    words: list[list[bytes]] = [\n        [bytes([b]) for b in word.encode(\"utf-8\")] for word in regex.findall(pat_str, data)\n    ]\n\n    # Now, use our data to figure out which merges we should make\n    while len(ranks) < vocab_size:\n        # Find the most common pair. This will become our next token\n        stats = collections.Counter()\n        for piece in words:\n            for pair in zip(piece[:-1], piece[1:]):\n                stats[pair] += 1\n\n        most_common_pair = max(stats, key=lambda x: stats[x])\n        token_bytes = most_common_pair[0] + most_common_pair[1]\n        token = len(ranks)\n        # Add the new token!\n        ranks[token_bytes] = token\n\n        # Now merge that most common pair in all the words. 
That is, update our training data\n        # to reflect our decision to make that pair into a new token.\n        new_words = []\n        for word in words:\n            new_word = []\n            i = 0\n            while i < len(word) - 1:\n                if (word[i], word[i + 1]) == most_common_pair:\n                    # We found our pair! Merge it\n                    new_word.append(token_bytes)\n                    i += 2\n                else:\n                    new_word.append(word[i])\n                    i += 1\n            if i == len(word) - 1:\n                new_word.append(word[i])\n            new_words.append(new_word)\n        words = new_words\n\n        # See the intermediate merges play out!\n        if visualise:\n            print(f\"The current most common pair is {most_common_pair[0]} + {most_common_pair[1]}\")\n            print(f\"So we made {token_bytes} our {len(ranks)}th token\")\n            if visualise in [\"colour\", \"color\"]:\n                print(\"Now the first fifty words in our training data look like:\")\n                visualise_tokens([token for word in words[:50] for token in word])\n            elif visualise == \"simple\":\n                print(\"Now the first twenty words in our training data look like:\")\n                for word in words[:20]:\n                    print(word)\n            print(\"\\n\")\n\n    return ranks\n\n\ndef visualise_tokens(token_values: list[bytes]) -> None:\n    background = [f\"\\u001b[48;5;{i}m\" for i in [167, 179, 185, 77, 80, 68, 134]]\n    # If token boundaries do not occur at unicode character boundaries, it's unclear how best to\n    # visualise the token. 
Here, we'll just use the unicode replacement character to represent some\n    # fraction of a character.\n    unicode_token_values = [x.decode(\"utf-8\", errors=\"replace\") for x in token_values]\n\n    running_length = 0\n    last_color = None\n    for token in unicode_token_values:\n        color = background[running_length % len(background)]\n        if color == last_color:\n            color = background[(running_length + 1) % len(background)]\n            assert color != last_color\n        last_color = color\n        running_length += len(token)\n        print(color + token, end=\"\")\n    print(\"\\u001b[0m\")\n\n\ndef train_simple_encoding():\n    gpt2_pattern = (\n        r\"\"\"'s|'t|'re|'ve|'m|'ll|'d| ?[\\p{L}]+| ?[\\p{N}]+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+\"\"\"\n    )\n    with open(__file__) as f:\n        data = f.read()\n\n    enc = SimpleBytePairEncoding.train(data, vocab_size=600, pat_str=gpt2_pattern)\n\n    print(\"This is the sequence of merges performed in order to encode 'hello world':\")\n    tokens = enc.encode(\"hello world\")\n    assert enc.decode(tokens) == \"hello world\"\n    assert enc.decode_bytes(tokens) == b\"hello world\"\n    assert enc.decode_tokens_bytes(tokens) == [b\"hello\", b\" world\"]\n\n    return enc\n"
  },
  {
    "path": "tiktoken/core.py",
    "content": "from __future__ import annotations\n\nimport functools\nfrom concurrent.futures import ThreadPoolExecutor\nfrom typing import TYPE_CHECKING, AbstractSet, Collection, Literal, NoReturn, Sequence\n\nfrom tiktoken import _tiktoken\n\nif TYPE_CHECKING:\n    import re\n\n    import numpy as np\n    import numpy.typing as npt\n\n\nclass Encoding:\n    def __init__(\n        self,\n        name: str,\n        *,\n        pat_str: str,\n        mergeable_ranks: dict[bytes, int],\n        special_tokens: dict[str, int],\n        explicit_n_vocab: int | None = None,\n    ):\n        \"\"\"Creates an Encoding object.\n\n        See openai_public.py for examples of how to construct an Encoding object.\n\n        Args:\n            name: The name of the encoding. It should be clear from the name of the encoding\n                what behaviour to expect, in particular, encodings with different special tokens\n                should have different names.\n            pat_str: A regex pattern string that is used to split the input text.\n            mergeable_ranks: A dictionary mapping mergeable token bytes to their ranks. The ranks\n                must correspond to merge priority.\n            special_tokens: A dictionary mapping special token strings to their token values.\n            explicit_n_vocab: The number of tokens in the vocabulary. 
If provided, it is checked\n                that the number of mergeable tokens and special tokens is equal to this number.\n        \"\"\"\n        self.name = name\n\n        self._pat_str = pat_str\n        self._mergeable_ranks = mergeable_ranks\n        self._special_tokens = special_tokens\n\n        self.max_token_value = max(\n            max(mergeable_ranks.values()), max(special_tokens.values(), default=0)\n        )\n        if explicit_n_vocab:\n            assert len(mergeable_ranks) + len(special_tokens) == explicit_n_vocab\n            assert self.max_token_value == explicit_n_vocab - 1\n\n        self._core_bpe = _tiktoken.CoreBPE(mergeable_ranks, special_tokens, pat_str)\n\n    def __repr__(self) -> str:\n        return f\"<Encoding {self.name!r}>\"\n\n    # ====================\n    # Encoding\n    # ====================\n\n    def encode_ordinary(self, text: str) -> list[int]:\n        \"\"\"Encodes a string into tokens, ignoring special tokens.\n\n        This is equivalent to `encode(text, disallowed_special=())` (but slightly faster).\n\n        ```\n        >>> enc.encode_ordinary(\"hello world\")\n        [31373, 995]\n        \"\"\"\n        try:\n            return self._core_bpe.encode_ordinary(text)\n        except UnicodeEncodeError:\n            # See comment in encode\n            text = text.encode(\"utf-16\", \"surrogatepass\").decode(\"utf-16\", \"replace\")\n            return self._core_bpe.encode_ordinary(text)\n\n    def encode(\n        self,\n        text: str,\n        *,\n        allowed_special: Literal[\"all\"] | AbstractSet[str] = set(),  # noqa: B006\n        disallowed_special: Literal[\"all\"] | Collection[str] = \"all\",\n    ) -> list[int]:\n        \"\"\"Encodes a string into tokens.\n\n        Special tokens are artificial tokens used to unlock capabilities from a model,\n        such as fill-in-the-middle. 
So we want to be careful about accidentally encoding special\n        tokens, since they can be used to trick a model into doing something we don't want it to do.\n\n        Hence, by default, encode will raise an error if it encounters text that corresponds\n        to a special token. This can be controlled on a per-token level using the `allowed_special`\n        and `disallowed_special` parameters. In particular:\n        - Setting `disallowed_special` to () will prevent this function from raising errors and\n          cause all text corresponding to special tokens to be encoded as natural text.\n        - Setting `allowed_special` to \"all\" will cause this function to treat all text\n          corresponding to special tokens to be encoded as special tokens.\n\n        ```\n        >>> enc.encode(\"hello world\")\n        [31373, 995]\n        >>> enc.encode(\"<|endoftext|>\", allowed_special={\"<|endoftext|>\"})\n        [50256]\n        >>> enc.encode(\"<|endoftext|>\", allowed_special=\"all\")\n        [50256]\n        >>> enc.encode(\"<|endoftext|>\")\n        # Raises ValueError\n        >>> enc.encode(\"<|endoftext|>\", disallowed_special=())\n        [27, 91, 437, 1659, 5239, 91, 29]\n        ```\n        \"\"\"\n        if allowed_special == \"all\":\n            allowed_special = self.special_tokens_set\n        if disallowed_special == \"all\":\n            disallowed_special = self.special_tokens_set - allowed_special\n        if disallowed_special:\n            if not isinstance(disallowed_special, frozenset):\n                disallowed_special = frozenset(disallowed_special)\n            if match := _special_token_regex(disallowed_special).search(text):\n                raise_disallowed_special_token(match.group())\n\n        try:\n            return self._core_bpe.encode(text, allowed_special)\n        except UnicodeEncodeError:\n            # BPE operates on bytes, but the regex operates on unicode. 
If we pass a str that is\n            # invalid UTF-8 to Rust, it will rightfully complain. Here we do a quick and dirty\n            # fixup for any surrogate pairs that may have sneaked their way into the text.\n            # Technically, this introduces a place where encode + decode doesn't roundtrip a Python\n            # string, but given that this is input we want to support, maybe that's okay.\n            # Also we use errors=\"replace\" to handle weird things like lone surrogates.\n            text = text.encode(\"utf-16\", \"surrogatepass\").decode(\"utf-16\", \"replace\")\n            return self._core_bpe.encode(text, allowed_special)\n\n    def encode_to_numpy(\n        self,\n        text: str,\n        *,\n        allowed_special: Literal[\"all\"] | AbstractSet[str] = set(),  # noqa: B006\n        disallowed_special: Literal[\"all\"] | Collection[str] = \"all\",\n    ) -> npt.NDArray[np.uint32]:\n        \"\"\"Encodes a string into tokens, returning a numpy array.\n\n        Avoids the overhead of copying the token buffer into a Python list.\n        \"\"\"\n        if allowed_special == \"all\":\n            allowed_special = self.special_tokens_set\n        if disallowed_special == \"all\":\n            disallowed_special = self.special_tokens_set - allowed_special\n        if disallowed_special:\n            if not isinstance(disallowed_special, frozenset):\n                disallowed_special = frozenset(disallowed_special)\n            if match := _special_token_regex(disallowed_special).search(text):\n                raise_disallowed_special_token(match.group())\n\n        import numpy as np\n\n        buffer = self._core_bpe.encode_to_tiktoken_buffer(text, allowed_special)\n        return np.frombuffer(buffer, dtype=np.uint32)\n\n    def encode_ordinary_batch(self, text: list[str], *, num_threads: int = 8) -> list[list[int]]:\n        \"\"\"Encodes a list of strings into tokens, in parallel, ignoring special tokens.\n\n        This is 
equivalent to `encode_batch(text, disallowed_special=())` (but slightly faster).\n\n        ```\n        >>> enc.encode_ordinary_batch([\"hello world\", \"goodbye world\"])\n        [[31373, 995], [11274, 16390, 995]]\n        ```\n        \"\"\"\n        encoder = functools.partial(self.encode_ordinary)\n        with ThreadPoolExecutor(num_threads) as e:\n            return list(e.map(encoder, text))\n\n    def encode_batch(\n        self,\n        text: list[str],\n        *,\n        num_threads: int = 8,\n        allowed_special: Literal[\"all\"] | AbstractSet[str] = set(),  # noqa: B006\n        disallowed_special: Literal[\"all\"] | Collection[str] = \"all\",\n    ) -> list[list[int]]:\n        \"\"\"Encodes a list of strings into tokens, in parallel.\n\n        See `encode` for more details on `allowed_special` and `disallowed_special`.\n\n        ```\n        >>> enc.encode_batch([\"hello world\", \"goodbye world\"])\n        [[31373, 995], [11274, 16390, 995]]\n        ```\n        \"\"\"\n        if allowed_special == \"all\":\n            allowed_special = self.special_tokens_set\n        if disallowed_special == \"all\":\n            disallowed_special = self.special_tokens_set - allowed_special\n        if not isinstance(disallowed_special, frozenset):\n            disallowed_special = frozenset(disallowed_special)\n\n        encoder = functools.partial(\n            self.encode, allowed_special=allowed_special, disallowed_special=disallowed_special\n        )\n        with ThreadPoolExecutor(num_threads) as e:\n            return list(e.map(encoder, text))\n\n    def encode_with_unstable(\n        self,\n        text: str,\n        *,\n        allowed_special: Literal[\"all\"] | AbstractSet[str] = set(),  # noqa: B006\n        disallowed_special: Literal[\"all\"] | Collection[str] = \"all\",\n    ) -> tuple[list[int], list[list[int]]]:\n        \"\"\"Encodes a string into stable tokens and possible completion sequences.\n\n        Note that the stable 
tokens will only represent a substring of `text`.\n\n        See `encode` for more details on `allowed_special` and `disallowed_special`.\n\n        This API should itself be considered unstable.\n\n        ```\n        >>> enc.encode_with_unstable(\"hello fanta\")\n        ([31373], [(277, 4910), (5113, 265), ..., (8842,)])\n\n        >>> text = \"...\"\n        >>> stable_tokens, completions = enc.encode_with_unstable(text)\n        >>> assert text.encode().startswith(enc.decode_bytes(stable_tokens))\n        >>> assert all(enc.decode_bytes(stable_tokens + seq).startswith(text.encode()) for seq in completions)\n        ```\n        \"\"\"\n        if allowed_special == \"all\":\n            allowed_special = self.special_tokens_set\n        if disallowed_special == \"all\":\n            disallowed_special = self.special_tokens_set - allowed_special\n        if disallowed_special:\n            if not isinstance(disallowed_special, frozenset):\n                disallowed_special = frozenset(disallowed_special)\n            if match := _special_token_regex(disallowed_special).search(text):\n                raise_disallowed_special_token(match.group())\n\n        return self._core_bpe.encode_with_unstable(text, allowed_special)\n\n    def encode_single_token(self, text_or_bytes: str | bytes) -> int:\n        \"\"\"Encodes text corresponding to a single token to its token value.\n\n        NOTE: this will encode all special tokens.\n\n        Raises `KeyError` if the token is not in the vocabulary.\n\n        ```\n        >>> enc.encode_single_token(\"hello\")\n        31373\n        ```\n        \"\"\"\n        if isinstance(text_or_bytes, str):\n            text_or_bytes = text_or_bytes.encode(\"utf-8\")\n        return self._core_bpe.encode_single_token(text_or_bytes)\n\n    # ====================\n    # Decoding\n    # ====================\n\n    def decode_bytes(self, tokens: Sequence[int]) -> bytes:\n        \"\"\"Decodes a list of tokens into bytes.\n\n        
```\n        >>> enc.decode_bytes([31373, 995])\n        b'hello world'\n        ```\n        \"\"\"\n        return self._core_bpe.decode_bytes(tokens)\n\n    def decode(self, tokens: Sequence[int], errors: str = \"replace\") -> str:\n        \"\"\"Decodes a list of tokens into a string.\n\n        WARNING: the default behaviour of this function is lossy, since decoded bytes are not\n        guaranteed to be valid UTF-8. You can control this behaviour using the `errors` parameter,\n        for instance, setting `errors=strict`.\n\n        ```\n        >>> enc.decode([31373, 995])\n        'hello world'\n        ```\n        \"\"\"\n        return self._core_bpe.decode_bytes(tokens).decode(\"utf-8\", errors=errors)\n\n    def decode_single_token_bytes(self, token: int) -> bytes:\n        \"\"\"Decodes a token into bytes.\n\n        NOTE: this will decode all special tokens.\n\n        Raises `KeyError` if the token is not in the vocabulary.\n\n        ```\n        >>> enc.decode_single_token_bytes(31373)\n        b'hello'\n        ```\n        \"\"\"\n        return self._core_bpe.decode_single_token_bytes(token)\n\n    def decode_tokens_bytes(self, tokens: Sequence[int]) -> list[bytes]:\n        \"\"\"Decodes a list of tokens into a list of bytes.\n\n        Useful for visualising tokenisation.\n        >>> enc.decode_tokens_bytes([31373, 995])\n        [b'hello', b' world']\n        \"\"\"\n        return [self.decode_single_token_bytes(token) for token in tokens]\n\n    def decode_with_offsets(self, tokens: Sequence[int]) -> tuple[str, list[int]]:\n        \"\"\"Decodes a list of tokens into a string and a list of offsets.\n\n        Each offset is the index into text corresponding to the start of each token.\n        If UTF-8 character boundaries do not line up with token boundaries, the offset is the index\n        of the first character that contains bytes from the token.\n\n        This will currently raise if given tokens that decode to invalid UTF-8; this 
behaviour may\n        change in the future to be more permissive.\n\n        >>> enc.decode_with_offsets([31373, 995])\n        ('hello world', [0, 5])\n        \"\"\"\n        token_bytes = self.decode_tokens_bytes(tokens)\n\n        text_len = 0\n        offsets = []\n        for token in token_bytes:\n            offsets.append(max(0, text_len - (0x80 <= token[0] < 0xC0)))\n            text_len += sum(1 for c in token if not 0x80 <= c < 0xC0)\n\n        # TODO: assess correctness for errors=\"ignore\" and errors=\"replace\"\n        text = b\"\".join(token_bytes).decode(\"utf-8\", errors=\"strict\")\n        return text, offsets\n\n    def decode_batch(\n        self, batch: Sequence[Sequence[int]], *, errors: str = \"replace\", num_threads: int = 8\n    ) -> list[str]:\n        \"\"\"Decodes a batch (list of lists of tokens) into a list of strings.\"\"\"\n        decoder = functools.partial(self.decode, errors=errors)\n        with ThreadPoolExecutor(num_threads) as e:\n            return list(e.map(decoder, batch))\n\n    def decode_bytes_batch(\n        self, batch: Sequence[Sequence[int]], *, num_threads: int = 8\n    ) -> list[bytes]:\n        \"\"\"Decodes a batch (list of lists of tokens) into a list of bytes.\"\"\"\n        with ThreadPoolExecutor(num_threads) as e:\n            return list(e.map(self.decode_bytes, batch))\n\n    # ====================\n    # Miscellaneous\n    # ====================\n\n    def token_byte_values(self) -> list[bytes]:\n        \"\"\"Returns the list of all token byte values.\"\"\"\n        return self._core_bpe.token_byte_values()\n\n    @property\n    def eot_token(self) -> int:\n        return self._special_tokens[\"<|endoftext|>\"]\n\n    @functools.cached_property\n    def special_tokens_set(self) -> set[str]:\n        return set(self._special_tokens.keys())\n\n    def is_special_token(self, token: int) -> bool:\n        assert isinstance(token, int)\n        return token in self._special_token_values\n\n    
@property\n    def n_vocab(self) -> int:\n        \"\"\"For backwards compatibility. Prefer to use `enc.max_token_value + 1`.\"\"\"\n        return self.max_token_value + 1\n\n    # ====================\n    # Private\n    # ====================\n\n    def _encode_single_piece(self, text_or_bytes: str | bytes) -> list[int]:\n        \"\"\"Encodes text corresponding to bytes without a regex split.\n\n        NOTE: this will not encode any special tokens.\n\n        ```\n        >>> enc._encode_single_piece(\"helloqqqq\")\n        [31373, 38227, 38227]\n        ```\n        \"\"\"\n        if isinstance(text_or_bytes, str):\n            text_or_bytes = text_or_bytes.encode(\"utf-8\")\n        return self._core_bpe.encode_single_piece(text_or_bytes)\n\n    def _encode_only_native_bpe(self, text: str) -> list[int]:\n        \"\"\"Encodes a string into tokens, but does the regex splitting in Python.\"\"\"\n        # We need specifically `regex` in order to compile pat_str due to e.g. \\p\n        import regex\n\n        _unused_pat = regex.compile(self._pat_str)\n        ret = []\n        for piece in regex.findall(_unused_pat, text):\n            ret.extend(self._core_bpe.encode_single_piece(piece.encode(\"utf-8\")))\n        return ret\n\n    def _encode_bytes(self, text: bytes) -> list[int]:\n        return self._core_bpe._encode_bytes(text)\n\n    def __getstate__(self) -> object:\n        import tiktoken.registry\n\n        # As an optimisation, pickle registered encodings by reference\n        if self is tiktoken.registry.ENCODINGS.get(self.name):\n            return self.name\n        return {\n            \"name\": self.name,\n            \"pat_str\": self._pat_str,\n            \"mergeable_ranks\": self._mergeable_ranks,\n            \"special_tokens\": self._special_tokens,\n        }\n\n    def __setstate__(self, value: object) -> None:\n        import tiktoken.registry\n\n        if isinstance(value, str):\n            self.__dict__ = 
tiktoken.registry.get_encoding(value).__dict__\n            return\n        self.__init__(**value)\n\n\n@functools.lru_cache(maxsize=128)\ndef _special_token_regex(tokens: frozenset[str]) -> re.Pattern[str]:\n    try:\n        import regex as re\n    except ImportError:\n        import re\n    inner = \"|\".join(re.escape(token) for token in tokens)\n    return re.compile(f\"({inner})\")\n\n\ndef raise_disallowed_special_token(token: str) -> NoReturn:\n    raise ValueError(\n        f\"Encountered text corresponding to disallowed special token {token!r}.\\n\"\n        \"If you want this text to be encoded as a special token, \"\n        f\"pass it to `allowed_special`, e.g. `allowed_special={{{token!r}, ...}}`.\\n\"\n        f\"If you want this text to be encoded as normal text, disable the check for this token \"\n        f\"by passing `disallowed_special=(enc.special_tokens_set - {{{token!r}}})`.\\n\"\n        \"To disable this check for all special tokens, pass `disallowed_special=()`.\\n\"\n    )\n"
  },
  {
    "path": "tiktoken/load.py",
    "content": "from __future__ import annotations\n\nimport base64\nimport hashlib\nimport os\n\n\ndef read_file(blobpath: str) -> bytes:\n    if \"://\" not in blobpath:\n        with open(blobpath, \"rb\", buffering=0) as f:\n            return f.read()\n\n    if blobpath.startswith((\"http://\", \"https://\")):\n        # avoiding blobfile for public files helps avoid auth issues, like MFA prompts.\n        import requests\n\n        resp = requests.get(blobpath)\n        resp.raise_for_status()\n        return resp.content\n\n    try:\n        import blobfile\n    except ImportError as e:\n        raise ImportError(\n            \"blobfile is not installed. Please install it by running `pip install blobfile`.\"\n        ) from e\n    return blobfile.read_bytes(blobpath)\n\n\ndef check_hash(data: bytes, expected_hash: str) -> bool:\n    actual_hash = hashlib.sha256(data).hexdigest()\n    return actual_hash == expected_hash\n\n\ndef read_file_cached(blobpath: str, expected_hash: str | None = None) -> bytes:\n    user_specified_cache = True\n    if \"TIKTOKEN_CACHE_DIR\" in os.environ:\n        cache_dir = os.environ[\"TIKTOKEN_CACHE_DIR\"]\n    elif \"DATA_GYM_CACHE_DIR\" in os.environ:\n        cache_dir = os.environ[\"DATA_GYM_CACHE_DIR\"]\n    else:\n        import tempfile\n\n        cache_dir = os.path.join(tempfile.gettempdir(), \"data-gym-cache\")\n        user_specified_cache = False\n\n    if cache_dir == \"\":\n        # disable caching\n        return read_file(blobpath)\n\n    cache_key = hashlib.sha1(blobpath.encode()).hexdigest()\n\n    cache_path = os.path.join(cache_dir, cache_key)\n    if os.path.exists(cache_path):\n        with open(cache_path, \"rb\", buffering=0) as f:\n            data = f.read()\n        if expected_hash is None or check_hash(data, expected_hash):\n            return data\n\n        # the cached file does not match the hash, remove it and re-fetch\n        try:\n            os.remove(cache_path)\n        except OSError:\n  
          pass\n\n    contents = read_file(blobpath)\n    if expected_hash and not check_hash(contents, expected_hash):\n        raise ValueError(\n            f\"Hash mismatch for data downloaded from {blobpath} (expected {expected_hash}). \"\n            f\"This may indicate a corrupted download. Please try again.\"\n        )\n\n    import uuid\n\n    try:\n        os.makedirs(cache_dir, exist_ok=True)\n        tmp_filename = cache_path + \".\" + str(uuid.uuid4()) + \".tmp\"\n        with open(tmp_filename, \"wb\") as f:\n            f.write(contents)\n        os.rename(tmp_filename, cache_path)\n    except OSError:\n        # don't raise if we can't write to the default cache, e.g. issue #75\n        if user_specified_cache:\n            raise\n\n    return contents\n\n\ndef data_gym_to_mergeable_bpe_ranks(\n    vocab_bpe_file: str,\n    encoder_json_file: str,\n    vocab_bpe_hash: str | None = None,\n    encoder_json_hash: str | None = None,\n    clobber_one_byte_tokens: bool = False,\n) -> dict[bytes, int]:\n    # NB: do not add caching to this function\n    rank_to_intbyte = [b for b in range(2**8) if chr(b).isprintable() and chr(b) != \" \"]\n\n    data_gym_byte_to_byte = {chr(b): b for b in rank_to_intbyte}\n    n = 0\n    for b in range(2**8):\n        if b not in rank_to_intbyte:\n            rank_to_intbyte.append(b)\n            data_gym_byte_to_byte[chr(2**8 + n)] = b\n            n += 1\n    assert len(rank_to_intbyte) == 2**8\n\n    # vocab_bpe contains the merges along with associated ranks\n    vocab_bpe_contents = read_file_cached(vocab_bpe_file, vocab_bpe_hash).decode()\n    bpe_merges = [tuple(merge_str.split()) for merge_str in vocab_bpe_contents.split(\"\\n\")[1:-1]]\n\n    def decode_data_gym(value: str) -> bytes:\n        return bytes(data_gym_byte_to_byte[b] for b in value)\n\n    # add the single byte tokens\n    # if clobber_one_byte_tokens is True, we'll replace these with ones from the encoder json\n    bpe_ranks = {bytes([b]): i for 
i, b in enumerate(rank_to_intbyte)}\n    del rank_to_intbyte\n\n    # add the merged tokens\n    n = len(bpe_ranks)\n    for first, second in bpe_merges:\n        bpe_ranks[decode_data_gym(first) + decode_data_gym(second)] = n\n        n += 1\n\n    import json\n\n    # check that the encoder file matches the merges file\n    # this sanity check is important since tiktoken assumes that ranks are ordered the same\n    # as merge priority\n    encoder_json = json.loads(read_file_cached(encoder_json_file, encoder_json_hash))\n    encoder_json_loaded = {decode_data_gym(k): v for k, v in encoder_json.items()}\n    # drop these two special tokens if present, since they're not mergeable bpe tokens\n    encoder_json_loaded.pop(b\"<|endoftext|>\", None)\n    encoder_json_loaded.pop(b\"<|startoftext|>\", None)\n\n    if clobber_one_byte_tokens:\n        for k in encoder_json_loaded:\n            if len(k) == 1:\n                bpe_ranks[k] = encoder_json_loaded[k]\n\n    assert bpe_ranks == encoder_json_loaded\n\n    return bpe_ranks\n\n\ndef dump_tiktoken_bpe(bpe_ranks: dict[bytes, int], tiktoken_bpe_file: str) -> None:\n    try:\n        import blobfile\n    except ImportError as e:\n        raise ImportError(\n            \"blobfile is not installed. 
Please install it by running `pip install blobfile`.\"\n        ) from e\n    with blobfile.BlobFile(tiktoken_bpe_file, \"wb\") as f:\n        for token, rank in sorted(bpe_ranks.items(), key=lambda x: x[1]):\n            f.write(base64.b64encode(token) + b\" \" + str(rank).encode() + b\"\\n\")\n\n\ndef load_tiktoken_bpe(tiktoken_bpe_file: str, expected_hash: str | None = None) -> dict[bytes, int]:\n    # NB: do not add caching to this function\n    contents = read_file_cached(tiktoken_bpe_file, expected_hash)\n    ret = {}\n    for line in contents.splitlines():\n        if not line:\n            continue\n        try:\n            token, rank = line.split()\n            ret[base64.b64decode(token)] = int(rank)\n        except Exception as e:\n            raise ValueError(f\"Error parsing line {line!r} in {tiktoken_bpe_file}\") from e\n    return ret\n"
  },
  {
    "path": "tiktoken/model.py",
    "content": "from __future__ import annotations\n\nfrom .core import Encoding\nfrom .registry import get_encoding\n\n# TODO: these will likely be replaced by an API endpoint\nMODEL_PREFIX_TO_ENCODING: dict[str, str] = {\n    \"o1-\": \"o200k_base\",\n    \"o3-\": \"o200k_base\",\n    \"o4-mini-\": \"o200k_base\",\n    # chat\n    \"gpt-5-\": \"o200k_base\",\n    \"gpt-4.5-\": \"o200k_base\",\n    \"gpt-4.1-\": \"o200k_base\",\n    \"chatgpt-4o-\": \"o200k_base\",\n    \"gpt-4o-\": \"o200k_base\",  # e.g., gpt-4o-2024-05-13\n    \"gpt-4-\": \"cl100k_base\",  # e.g., gpt-4-0314, etc., plus gpt-4-32k\n    \"gpt-3.5-turbo-\": \"cl100k_base\",  # e.g, gpt-3.5-turbo-0301, -0401, etc.\n    \"gpt-35-turbo-\": \"cl100k_base\",  # Azure deployment name\n    \"gpt-oss-\": \"o200k_harmony\",\n    # fine-tuned\n    \"ft:gpt-4o\": \"o200k_base\",\n    \"ft:gpt-4\": \"cl100k_base\",\n    \"ft:gpt-3.5-turbo\": \"cl100k_base\",\n    \"ft:davinci-002\": \"cl100k_base\",\n    \"ft:babbage-002\": \"cl100k_base\",\n}\n\nMODEL_TO_ENCODING: dict[str, str] = {\n    # reasoning\n    \"o1\": \"o200k_base\",\n    \"o3\": \"o200k_base\",\n    \"o4-mini\": \"o200k_base\",\n    # chat\n    \"gpt-5\": \"o200k_base\",\n    \"gpt-4.1\": \"o200k_base\",\n    \"gpt-4o\": \"o200k_base\",\n    \"gpt-4\": \"cl100k_base\",\n    \"gpt-3.5-turbo\": \"cl100k_base\",\n    \"gpt-3.5\": \"cl100k_base\",  # Common shorthand\n    \"gpt-35-turbo\": \"cl100k_base\",  # Azure deployment name\n    # base\n    \"davinci-002\": \"cl100k_base\",\n    \"babbage-002\": \"cl100k_base\",\n    # embeddings\n    \"text-embedding-ada-002\": \"cl100k_base\",\n    \"text-embedding-3-small\": \"cl100k_base\",\n    \"text-embedding-3-large\": \"cl100k_base\",\n    # DEPRECATED MODELS\n    # text (DEPRECATED)\n    \"text-davinci-003\": \"p50k_base\",\n    \"text-davinci-002\": \"p50k_base\",\n    \"text-davinci-001\": \"r50k_base\",\n    \"text-curie-001\": \"r50k_base\",\n    \"text-babbage-001\": \"r50k_base\",\n    
\"text-ada-001\": \"r50k_base\",\n    \"davinci\": \"r50k_base\",\n    \"curie\": \"r50k_base\",\n    \"babbage\": \"r50k_base\",\n    \"ada\": \"r50k_base\",\n    # code (DEPRECATED)\n    \"code-davinci-002\": \"p50k_base\",\n    \"code-davinci-001\": \"p50k_base\",\n    \"code-cushman-002\": \"p50k_base\",\n    \"code-cushman-001\": \"p50k_base\",\n    \"davinci-codex\": \"p50k_base\",\n    \"cushman-codex\": \"p50k_base\",\n    # edit (DEPRECATED)\n    \"text-davinci-edit-001\": \"p50k_edit\",\n    \"code-davinci-edit-001\": \"p50k_edit\",\n    # old embeddings (DEPRECATED)\n    \"text-similarity-davinci-001\": \"r50k_base\",\n    \"text-similarity-curie-001\": \"r50k_base\",\n    \"text-similarity-babbage-001\": \"r50k_base\",\n    \"text-similarity-ada-001\": \"r50k_base\",\n    \"text-search-davinci-doc-001\": \"r50k_base\",\n    \"text-search-curie-doc-001\": \"r50k_base\",\n    \"text-search-babbage-doc-001\": \"r50k_base\",\n    \"text-search-ada-doc-001\": \"r50k_base\",\n    \"code-search-babbage-code-001\": \"r50k_base\",\n    \"code-search-ada-code-001\": \"r50k_base\",\n    # open source\n    \"gpt2\": \"gpt2\",\n    \"gpt-2\": \"gpt2\",  # Maintains consistency with gpt-4\n}\n\n\ndef encoding_name_for_model(model_name: str) -> str:\n    \"\"\"Returns the name of the encoding used by a model.\n\n    Raises a KeyError if the model name is not recognised.\n    \"\"\"\n    encoding_name = None\n    if model_name in MODEL_TO_ENCODING:\n        encoding_name = MODEL_TO_ENCODING[model_name]\n    else:\n        # Check if the model matches a known prefix\n        # Prefix matching avoids needing library updates for every model version release\n        # Note that this can match on non-existent models (e.g., gpt-3.5-turbo-FAKE)\n        for model_prefix, model_encoding_name in MODEL_PREFIX_TO_ENCODING.items():\n            if model_name.startswith(model_prefix):\n                return model_encoding_name\n\n    if encoding_name is None:\n        raise 
KeyError(\n            f\"Could not automatically map {model_name} to a tokeniser. \"\n            \"Please use `tiktoken.get_encoding` to explicitly get the tokeniser you expect.\"\n        ) from None\n\n    return encoding_name\n\n\ndef encoding_for_model(model_name: str) -> Encoding:\n    \"\"\"Returns the encoding used by a model.\n\n    Raises a KeyError if the model name is not recognised.\n    \"\"\"\n    return get_encoding(encoding_name_for_model(model_name))\n"
  },
  {
    "path": "tiktoken/py.typed",
    "content": ""
  },
  {
    "path": "tiktoken/registry.py",
    "content": "from __future__ import annotations\n\nimport functools\nimport importlib\nimport pkgutil\nimport threading\nfrom typing import Any, Callable, Sequence\n\nimport tiktoken_ext\n\nimport tiktoken\nfrom tiktoken.core import Encoding\n\n_lock = threading.RLock()\nENCODINGS: dict[str, Encoding] = {}\nENCODING_CONSTRUCTORS: dict[str, Callable[[], dict[str, Any]]] | None = None\n\n\n@functools.lru_cache\ndef _available_plugin_modules() -> Sequence[str]:\n    # tiktoken_ext is a namespace package\n    # submodules inside tiktoken_ext will be inspected for ENCODING_CONSTRUCTORS attributes\n    # - we use namespace package pattern so `pkgutil.iter_modules` is fast\n    # - it's a separate top-level package because namespace subpackages of non-namespace\n    #   packages don't quite do what you want with editable installs\n    mods = []\n    plugin_mods = pkgutil.iter_modules(tiktoken_ext.__path__, tiktoken_ext.__name__ + \".\")\n    for _, mod_name, _ in plugin_mods:\n        mods.append(mod_name)\n    return mods\n\n\ndef _find_constructors() -> None:\n    global ENCODING_CONSTRUCTORS\n    with _lock:\n        if ENCODING_CONSTRUCTORS is not None:\n            return\n        ENCODING_CONSTRUCTORS = {}\n\n        try:\n            for mod_name in _available_plugin_modules():\n                mod = importlib.import_module(mod_name)\n                try:\n                    constructors = mod.ENCODING_CONSTRUCTORS\n                except AttributeError as e:\n                    raise ValueError(\n                        f\"tiktoken plugin {mod_name} does not define ENCODING_CONSTRUCTORS\"\n                    ) from e\n                for enc_name, constructor in constructors.items():\n                    if enc_name in ENCODING_CONSTRUCTORS:\n                        raise ValueError(\n                            f\"Duplicate encoding name {enc_name} in tiktoken plugin {mod_name}\"\n                        )\n                    
ENCODING_CONSTRUCTORS[enc_name] = constructor\n        except Exception:\n            # Ensure we idempotently raise errors\n            ENCODING_CONSTRUCTORS = None\n            raise\n\n\n\n\ndef get_encoding(encoding_name: str) -> Encoding:\n    if not isinstance(encoding_name, str):\n        raise ValueError(f\"Expected a string in get_encoding, got {type(encoding_name)}\")\n\n    if encoding_name in ENCODINGS:\n        return ENCODINGS[encoding_name]\n\n    with _lock:\n        if encoding_name in ENCODINGS:\n            return ENCODINGS[encoding_name]\n\n        if ENCODING_CONSTRUCTORS is None:\n            _find_constructors()\n            assert ENCODING_CONSTRUCTORS is not None\n\n        if encoding_name not in ENCODING_CONSTRUCTORS:\n            raise ValueError(\n                f\"Unknown encoding {encoding_name}.\\n\"\n                f\"Plugins found: {_available_plugin_modules()}\\n\"\n                f\"tiktoken version: {tiktoken.__version__} (are you on latest?)\"\n            )\n\n        constructor = ENCODING_CONSTRUCTORS[encoding_name]\n        enc = Encoding(**constructor())\n        ENCODINGS[encoding_name] = enc\n        return enc\n\n\ndef list_encoding_names() -> list[str]:\n    with _lock:\n        if ENCODING_CONSTRUCTORS is None:\n            _find_constructors()\n            assert ENCODING_CONSTRUCTORS is not None\n        return list(ENCODING_CONSTRUCTORS)\n"
  },
  {
    "path": "tiktoken_ext/openai_public.py",
    "content": "from tiktoken.load import data_gym_to_mergeable_bpe_ranks, load_tiktoken_bpe\n\nENDOFTEXT = \"<|endoftext|>\"\nFIM_PREFIX = \"<|fim_prefix|>\"\nFIM_MIDDLE = \"<|fim_middle|>\"\nFIM_SUFFIX = \"<|fim_suffix|>\"\nENDOFPROMPT = \"<|endofprompt|>\"\n\n# The pattern in the original GPT-2 release is:\n# r\"\"\"'s|'t|'re|'ve|'m|'ll|'d| ?[\\p{L}]+| ?[\\p{N}]+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)|\\s+\"\"\"\n# This is equivalent, but executes faster:\nr50k_pat_str = (\n    r\"\"\"'(?:[sdmt]|ll|ve|re)| ?\\p{L}++| ?\\p{N}++| ?[^\\s\\p{L}\\p{N}]++|\\s++$|\\s+(?!\\S)|\\s\"\"\"\n)\n\n\ndef gpt2():\n    mergeable_ranks = data_gym_to_mergeable_bpe_ranks(\n        vocab_bpe_file=\"https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/vocab.bpe\",\n        encoder_json_file=\"https://openaipublic.blob.core.windows.net/gpt-2/encodings/main/encoder.json\",\n        vocab_bpe_hash=\"1ce1664773c50f3e0cc8842619a93edc4624525b728b188a9e0be33b7726adc5\",\n        encoder_json_hash=\"196139668be63f3b5d6574427317ae82f612a97c5d1cdaf36ed2256dbf636783\",\n    )\n    return {\n        \"name\": \"gpt2\",\n        \"explicit_n_vocab\": 50257,\n        \"pat_str\": r50k_pat_str,\n        \"mergeable_ranks\": mergeable_ranks,\n        \"special_tokens\": {ENDOFTEXT: 50256},\n    }\n\n\ndef r50k_base():\n    mergeable_ranks = load_tiktoken_bpe(\n        \"https://openaipublic.blob.core.windows.net/encodings/r50k_base.tiktoken\",\n        expected_hash=\"306cd27f03c1a714eca7108e03d66b7dc042abe8c258b44c199a7ed9838dd930\",\n    )\n    return {\n        \"name\": \"r50k_base\",\n        \"explicit_n_vocab\": 50257,\n        \"pat_str\": r50k_pat_str,\n        \"mergeable_ranks\": mergeable_ranks,\n        \"special_tokens\": {ENDOFTEXT: 50256},\n    }\n\n\ndef p50k_base():\n    mergeable_ranks = load_tiktoken_bpe(\n        \"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken\",\n        
expected_hash=\"94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069\",\n    )\n    return {\n        \"name\": \"p50k_base\",\n        \"explicit_n_vocab\": 50281,\n        \"pat_str\": r50k_pat_str,\n        \"mergeable_ranks\": mergeable_ranks,\n        \"special_tokens\": {ENDOFTEXT: 50256},\n    }\n\n\ndef p50k_edit():\n    mergeable_ranks = load_tiktoken_bpe(\n        \"https://openaipublic.blob.core.windows.net/encodings/p50k_base.tiktoken\",\n        expected_hash=\"94b5ca7dff4d00767bc256fdd1b27e5b17361d7b8a5f968547f9f23eb70d2069\",\n    )\n    special_tokens = {ENDOFTEXT: 50256, FIM_PREFIX: 50281, FIM_MIDDLE: 50282, FIM_SUFFIX: 50283}\n    return {\n        \"name\": \"p50k_edit\",\n        \"pat_str\": r50k_pat_str,\n        \"mergeable_ranks\": mergeable_ranks,\n        \"special_tokens\": special_tokens,\n    }\n\n\ndef cl100k_base():\n    mergeable_ranks = load_tiktoken_bpe(\n        \"https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken\",\n        expected_hash=\"223921b76ee99bde995b7ff738513eef100fb51d18c93597a113bcffe865b2a7\",\n    )\n    special_tokens = {\n        ENDOFTEXT: 100257,\n        FIM_PREFIX: 100258,\n        FIM_MIDDLE: 100259,\n        FIM_SUFFIX: 100260,\n        ENDOFPROMPT: 100276,\n    }\n    return {\n        \"name\": \"cl100k_base\",\n        \"pat_str\": r\"\"\"'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}++|\\p{N}{1,3}+| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*+|\\s++$|\\s*[\\r\\n]|\\s+(?!\\S)|\\s\"\"\",\n        \"mergeable_ranks\": mergeable_ranks,\n        \"special_tokens\": special_tokens,\n    }\n\n\ndef o200k_base():\n    mergeable_ranks = load_tiktoken_bpe(\n        \"https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken\",\n        expected_hash=\"446a9538cb6c348e3516120d7c08b09f57c36495e2acfffe59a5bf8b0cfb1a2d\",\n    )\n    special_tokens = {ENDOFTEXT: 199999, ENDOFPROMPT: 200018}\n    # This regex could be made more efficient. 
If I was the one working on this encoding, I would\n    # have done a few other things differently too, e.g. I think you can allocate tokens more\n    # efficiently across languages.\n    pat_str = \"|\".join(\n        [\n            r\"\"\"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?\"\"\",\n            r\"\"\"[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?\"\"\",\n            r\"\"\"\\p{N}{1,3}\"\"\",\n            r\"\"\" ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*\"\"\",\n            r\"\"\"\\s*[\\r\\n]+\"\"\",\n            r\"\"\"\\s+(?!\\S)\"\"\",\n            r\"\"\"\\s+\"\"\",\n        ]\n    )\n    return {\n        \"name\": \"o200k_base\",\n        \"pat_str\": pat_str,\n        \"mergeable_ranks\": mergeable_ranks,\n        \"special_tokens\": special_tokens,\n    }\n\n\ndef o200k_harmony():\n    base_enc = o200k_base()\n    name = \"o200k_harmony\"\n    pat_str = base_enc[\"pat_str\"]\n    mergeable_ranks = base_enc[\"mergeable_ranks\"]\n    special_tokens = {\n        **base_enc[\"special_tokens\"],\n        \"<|startoftext|>\": 199998,\n        \"<|endoftext|>\": 199999,\n        \"<|reserved_200000|>\": 200000,\n        \"<|reserved_200001|>\": 200001,\n        \"<|return|>\": 200002,\n        \"<|constrain|>\": 200003,\n        \"<|reserved_200004|>\": 200004,\n        \"<|channel|>\": 200005,\n        \"<|start|>\": 200006,\n        \"<|end|>\": 200007,\n        \"<|message|>\": 200008,\n        \"<|reserved_200009|>\": 200009,\n        \"<|reserved_200010|>\": 200010,\n        \"<|reserved_200011|>\": 200011,\n        \"<|call|>\": 200012,\n    } | {f\"<|reserved_{i}|>\": i for i in range(200013, 201088)}\n    return {\n        \"name\": name,\n        \"pat_str\": pat_str,\n        \"mergeable_ranks\": mergeable_ranks,\n        \"special_tokens\": special_tokens,\n    }\n\n\nENCODING_CONSTRUCTORS = {\n    \"gpt2\": 
gpt2,\n    \"r50k_base\": r50k_base,\n    \"p50k_base\": p50k_base,\n    \"p50k_edit\": p50k_edit,\n    \"cl100k_base\": cl100k_base,\n    \"o200k_base\": o200k_base,\n    \"o200k_harmony\": o200k_harmony,\n}\n"
  }
]