[
  {
    "path": ".github/workflows/build-site.yaml",
    "content": "name: Build site and push to gh-pages\n\non:\n  push:\n    branches:\n      - main\n\njobs:\n  build:\n    name: Build site\n    runs-on: ubuntu-latest\n\n    steps:\n    - uses: actions/checkout@v2\n\n    - name: Configuring build Environment\n      run: |\n        sudo apt-get update\n        python -m pip install -U pip\n\n    - name: Setup Ruby\n      uses: ruby/setup-ruby@v1\n      with:\n        ruby-version: '3.0'\n\n    - name: Installing dependencies\n      run: |\n        python -m pip install -r docs/requirements.txt\n        gem install jekyll jekyll-remote-theme jekyll-sass-converter\n\n    - name: Build and deploy site\n      if: github.ref == 'refs/heads/main'\n      run: |\n        git remote set-url origin https://x-access-token:${{ secrets.MLC_GITHUB_TOKEN }}@github.com/$GITHUB_REPOSITORY\n        git config --global user.email \"mlc-gh-actions-bot@nomail\"\n        git config --global user.name \"mlc-gh-actions-bot\"\n\n        ./scripts/gh_deploy_site.sh"
  },
  {
    "path": ".github/workflows/build.yaml",
    "content": "name: Build\n\non:\n  pull_request:\n    branches:\n      - main\n  push:\n    branches:\n      - main\n  workflow_dispatch:\n\nconcurrency:\n  group: ${{ github.workflow }}-${{ github.ref }}\n  cancel-in-progress: true\n\njobs:\n  build:\n    runs-on: ubuntu-latest\n    timeout-minutes: 10\n\n    steps:\n      - name: Checkout code\n        uses: actions/checkout@v4\n\n      - name: Set up Node.js\n        uses: actions/setup-node@v4\n        with:\n          node-version-file: \".nvmrc\"\n          cache: npm\n\n      - name: Install dependencies\n        run: npm ci\n\n      - name: Build package\n        run: npm run build\n\n      - name: Validate package contents\n        run: npm pack --dry-run\n"
  },
  {
    "path": ".github/workflows/linter.yaml",
    "content": "name: Linter\n\non:\n  push:\n    branches:\n      - main\n  pull_request:\n    branches:\n      - main\n\njobs:\n  lint:\n    runs-on: ubuntu-latest\n    steps:\n      - name: Checkout code\n        uses: actions/checkout@v3\n\n      - name: Set up Node.js\n        uses: actions/setup-node@v3\n        with:\n          node-version-file: \".nvmrc\"\n\n      - name: Install dependencies\n        run: npm install\n\n      - name: Run lint\n        run: npm run lint\n"
  },
  {
    "path": ".github/workflows/security.yaml",
    "content": "name: Security\n\non:\n  pull_request:\n    branches:\n      - main\n  push:\n    branches:\n      - main\n  schedule:\n    - cron: \"25 5 * * 1\"\n  workflow_dispatch:\n\npermissions:\n  contents: read\n\njobs:\n  dependency-review:\n    if: github.event_name == 'pull_request'\n    runs-on: ubuntu-latest\n    permissions:\n      contents: read\n      pull-requests: read\n\n    steps:\n      - name: Dependency review\n        uses: actions/dependency-review-action@v4\n        with:\n          fail-on-severity: high\n\n  npm-audit:\n    runs-on: ubuntu-latest\n    timeout-minutes: 20\n\n    steps:\n      - name: Checkout code\n        uses: actions/checkout@v4\n\n      - name: Set up Node.js\n        uses: actions/setup-node@v4\n        with:\n          node-version-file: \".nvmrc\"\n          cache: npm\n\n      - name: Install dependencies\n        run: npm ci\n\n      - name: Run npm audit (production dependencies)\n        run: npm audit --omit=dev --audit-level=high\n\n  codeql:\n    if: github.event_name != 'pull_request'\n    runs-on: ubuntu-latest\n    timeout-minutes: 30\n    permissions:\n      actions: read\n      contents: read\n      security-events: write\n\n    steps:\n      - name: Checkout code\n        uses: actions/checkout@v4\n\n      - name: Initialize CodeQL\n        uses: github/codeql-action/init@v3\n        with:\n          languages: javascript-typescript\n\n      - name: Autobuild\n        uses: github/codeql-action/autobuild@v3\n\n      - name: Analyze\n        uses: github/codeql-action/analyze@v3\n"
  },
  {
    "path": ".github/workflows/tests.yaml",
    "content": "name: Tests\n\non:\n  pull_request:\n    branches:\n      - main\n  push:\n    branches:\n      - main\n  workflow_dispatch:\n\nconcurrency:\n  group: ${{ github.workflow }}-${{ github.ref }}\n  cancel-in-progress: true\n\njobs:\n  test:\n    runs-on: ubuntu-latest\n    timeout-minutes: 10\n\n    steps:\n      - name: Checkout code\n        uses: actions/checkout@v4\n\n      - name: Set up Node.js\n        uses: actions/setup-node@v4\n        with:\n          node-version-file: \".nvmrc\"\n          cache: npm\n\n      - name: Install dependencies\n        run: npm ci\n\n      - name: Run test suite\n        env:\n          CI: \"true\"\n        run: npm run test -- --ci\n\n      - name: Upload coverage artifact\n        if: always()\n        uses: actions/upload-artifact@v4\n        with:\n          name: coverage-${{ github.run_id }}\n          path: coverage\n          if-no-files-found: ignore\n"
  },
  {
    "path": ".gitignore",
    "content": "scratch/\ndist/\nparams/\n*.bak\n# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n*.S\n# C extensions\n*.so\n\n\n*.ll\n.npm\n# Distribution / packaging\n.Python\nenv/\nbuild/\nbuild-*/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\npip-wheel-metadata/\nshare/python-wheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n.conda/\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Generated by python/gen_requirements.py\npython/requirements/*.txt\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.nox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n*.py,cover\n.hypothesis/\n.pytest_cache/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\ndb.sqlite3-journal\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\ndocs/_staging/\n\n# PyBuilder\ntarget/\n/target/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# IPython\nprofile_default/\nipython_config.py\n\n# pyenv\n.python-version\n\n# pipenv\n#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.\n#   However, in case of collaboration, if having platform-specific dependencies or dependencies\n#   having no cross-platform support, pipenv may install dependencies that don't work, or not\n#   install all needed dependencies.\n#Pipfile.lock\n\n# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow\n__pypackages__/\n\n# Celery stuff\ncelerybeat-schedule\ncelerybeat.pid\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n*~\n*.pyc\n*~\nconfig.mk\nconfig.cmake\nWin32\n*.dir\nperf\n*.wasm\n.emscripten\n\n## IOS\nDerivedData/\n\n## Java\n*.class\njvm/*/target/\njvm/*/*/target/\njvm/native/*/generated\njvm/native/src/main/native/org_apache_tvm_native_c_api.h\n*.worksheet\n*.idea\n*.iml\n*.classpath\n*.project\n*.settings\n*/node_modules/\n\n## Various settings\n*.pbxuser\n!default.pbxuser\n*.mode1v3\n!default.mode1v3\n*.mode2v3\n!default.mode2v3\n*.perspectivev3\n!default.perspectivev3\nxcuserdata/\n.pkl_memoize_*\n\n.emscripten*\n.m2\n\n# Compiled Dynamic libraries\n*.so\n*.dylib\n*.dll\n\n# Compiled Object files\n*.slo\n*.lo\n*.o\n*.obj\n\n# Precompiled Headers\n*.gch\n*.pch\n\n# Compiled Static libraries\n*.lai\n*.la\n*.a\n*.lib\n\n# Executables\n*.exe\n*.out\n*.app\n\n## Other\n*.moved-aside\n*.xccheckout\n*.xcscmblueprint\n.DS_Store\ntags\ncscope*\n*.lock\n\n# vim temporary files\n*.swp\n*.swo\n\n# TVM generated code\nperf\n.bash_history\n# *.json\n*.params\n*.ro\n*.onnx\n*.h5\nsynset.txt\ncat.jpg\ncat.png\ndocs.tgz\ncat.png\n*.mlmodel\ntvm_u.*\ntvm_t.*\n# Mac OS X\n.DS_Store\n\n# Jetbrain\n.idea\n.ipython\n.jupyter\n.nv\n.pylint.d\n.python_history\n.pytest_cache\n.local\ncmake-build-debug\n\n# Visual Studio\n.vs\n\n# Visual Studio Code\n.vscode\n\n# tmp file\n.nfs*\n\n# keys\n*.pem\n*.p12\n*.pfx\n*.cer\n*.crt\n*.der\n\n# patch sentinel\npatched.txt\n\n# Python type checking\n.mypy_cache/\n.pyre/\n\n# pipenv files\nPipfile\nPipfile.lock\n\n# conda package artifacts\nconda/Dockerfile.cuda*\nconda/pkg\n.node_repl_history\n# nix files\n.envrc\n*.nix\n\n# Docker files\n.sudo_as_admin_successful\n\n# Downloaded models/datasets\n.tvm_test_data\n.dgl\n.caffe2\n\n# Local docs 
build\n_docs/\njvm/target\n.config/configstore/\n.ci-py-scripts/\n\n# Generated Hexagon files\nsrc/runtime/hexagon/rpc/hexagon_rpc.h\nsrc/runtime/hexagon/rpc/hexagon_rpc_skel.c\nsrc/runtime/hexagon/rpc/hexagon_rpc_stub.c\n\n# Local tvm-site checkout\ntvm-site/\n\n# Generated docs files\ngallery/how_to/work_with_microtvm/micro_tvmc.py\n\n# Test sample data files\n!tests/python/ci/sample_prs/*.json\n\n# Used in CI to communicate between Python and Jenkins\n.docker-image-names/\n\n# Printed TIR code on disk\n*.tir\n\n# GDB history file\n.gdb_history\n\n3rdparty\ndist\ntvm_home\nnode_modules\nlib\n.parcel-cache\n\n**/.next\ncoverage"
  },
  {
    "path": ".gitmodules",
    "content": ""
  },
  {
    "path": ".husky/pre-commit",
    "content": "npx lint-staged\n"
  },
  {
    "path": ".lintstagedrc.json",
    "content": "{\n  \"./**/*.{js,ts,jsx,tsx,json}\": [\"eslint --fix\", \"prettier --write\"]\n}\n"
  },
  {
    "path": ".nvmrc",
    "content": "v24.11.1"
  },
  {
    "path": ".prettierignore",
    "content": "dist\ndebug\nlib\nbuild\nnode_modules\n3rdparty\n.eslintrc.cjs\n**/.next"
  },
  {
    "path": ".prettierrc",
    "content": "{\n  \"trailingComma\": \"all\"\n}\n"
  },
  {
    "path": "CONTRIBUTING.md",
    "content": "# Contributing to WebLLM\n\nThank you for your interest in contributing to WebLLM. This guide helps contributors get set up quickly and make high-impact changes that are easy to review and merge.\n\n## Ways To Contribute\n\nWe welcome contributions across the project, including:\n\n- Bug reports with clear reproduction steps\n- Bug fixes and reliability improvements\n- New features and API improvements\n- Performance and memory optimizations\n- Tests and test coverage improvements\n- Documentation updates and tutorials\n- New or improved examples in `examples/`\n- Model integration and configuration improvements\n- Code review and issue triage support\n\nIf you are unsure where to start, look for open issues in the repository and propose a plan in the issue thread before implementation.\n\n## Community Principles\n\nWebLLM is part of a broader open-source ecosystem and follows collaborative, public-first development norms.\n\n- Keep technical discussion in public, archivable channels (issues and pull requests)\n- Use clear technical reasoning and seek consensus on non-trivial changes\n- For major design changes, start with an issue or RFC-style proposal before coding\n- Review other contributors' PRs when possible\n\nAdditional reference: Apache TVM community guidelines\n\n- https://tvm.apache.org/docs/contribute/community.html\n\n## Development Setup\n\n### Prerequisites\n\n- Node.js (see `.nvmrc` for the required version)\n- npm\n- Git\n\nOptional:\n\n- Python 3 (for docs build)\n- Emscripten/toolchain setup\n\n### Local Setup\n\n```bash\ngit clone https://github.com/mlc-ai/web-llm.git\ncd web-llm\nnpm install\n```\n\n### Build, Lint, and Test\n\n```bash\nnpm run build\nnpm run lint\nnpm test\n```\n\nNotes:\n\n- `npm test` runs Jest with coverage thresholds.\n- For quick iteration on a single test file, you can run:\n\n```bash\nnpx jest --coverage=false tests/<file>.test.ts\n```\n\n### Auto-formatting\n\nIf lint or style checks fail, 
run:\n\n```bash\nnpm run format\n```\n\nPre-commit hooks (Husky + lint-staged) are configured in this repo.\n\n## Testing Changes In Examples\n\nTo test local package changes inside an example app:\n\n1. Edit `examples/<example>/package.json` and set `\"@mlc-ai/web-llm\"` to `\"../..\"` (or `\"file:../..\"` if needed).\n2. Install and run the example.\n\n```bash\ncd examples/<example>\nnpm install\nnpm run start\n```\n\n## Documentation Contributions\n\nDocs are in `docs/` and built with Sphinx.\n\n```bash\ncd docs\npip3 install -r requirements.txt\nmake html\n```\n\nOpen the built docs from `docs/_build/html`.\n\n## Pull Request Guidelines\n\nBefore opening a PR:\n\n1. Keep the change scoped to one problem or feature.\n2. Add or update tests for behavior changes.\n3. Update docs/examples for user-facing changes.\n4. Run `npm run lint` and `npm test` locally.\n5. Include a clear PR description with:\n   - Problem statement\n   - Proposed solution\n   - Validation steps and results\n   - Backward-compatibility considerations\n\nDuring review:\n\n- Respond to comments with concrete follow-ups\n- Prefer additional tests over assumptions\n- Keep commit history understandable (small, logical commits)\n\n## Reporting Bugs and Requesting Features\n\n- Use GitHub Issues for bug reports and feature requests.\n- Include environment details, expected vs. actual behavior, and minimal reproduction steps.\n- For substantial feature additions, open an issue first to align on design and scope.\n\n## Security Reporting\n\nPlease do not report security vulnerabilities in public issues. Report vulnerabilities via email to `mlc-llm-private@googlegroups.com`.\n\nReference:\n\n- https://github.com/mlc-ai/web-llm/blob/main/SECURITY.md\n\n## License\n\nBy contributing, you agree that your contributions are provided under the repository's Apache-2.0 license.\n"
  },
  {
    "path": "LICENSE",
    "content": "                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [yyyy] [name of copyright owner]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n\n------------------------------------------------------------------------------------\nThis product bundles various third-party components under other open source licenses.\nThis section summarizes those components and their licenses. See licenses/\nfor text of these licenses.\n\nApache Software Foundation License 2.0\n--------------------------------------\n\nsrc/openai_api_protocols\n"
  },
  {
    "path": "README.md",
    "content": "<div align=\"center\" id=\"top\">\n\n# WebLLM\n[![NPM Package](https://img.shields.io/badge/NPM_Package-Published-cc3534)](https://www.npmjs.com/package/@mlc-ai/web-llm)\n[![\"WebLLM Chat Deployed\"](https://img.shields.io/badge/WebLLM_Chat-Deployed-%2332a852)](https://chat.webllm.ai/)\n[![Join Discord](https://img.shields.io/badge/Join-Discord-7289DA?logo=discord&logoColor=white)](https://discord.gg/9Xpy2HGBuD)\n[![Related Repository: WebLLM Chat](https://img.shields.io/badge/Related_Repo-WebLLM_Chat-fafbfc?logo=github)](https://github.com/mlc-ai/web-llm-chat/)\n[![Related Repository: MLC LLM](https://img.shields.io/badge/Related_Repo-MLC_LLM-fafbfc?logo=github)](https://github.com/mlc-ai/mlc-llm/)\n\n**High-Performance In-Browser LLM Inference Engine.**\n\n\n[Documentation](https://webllm.mlc.ai/docs/) | [Blogpost](https://blog.mlc.ai/2024/06/13/webllm-a-high-performance-in-browser-llm-inference-engine) | [Paper](https://arxiv.org/abs/2412.15803) | [Examples](examples)\n\n</div>\n\n## Overview\nWebLLM is a high-performance in-browser LLM inference engine that brings language model inference directly onto web browsers with hardware acceleration.\nEverything runs inside the browser with no server support and is accelerated with WebGPU.\n\nWebLLM is **fully compatible with [OpenAI API](https://platform.openai.com/docs/api-reference/chat).**\nThat is, you can use the same OpenAI API on **any open source models** locally, with functionalities\nincluding streaming, JSON-mode, function-calling (WIP), etc.\n\nWe can bring a lot of fun opportunities to build AI assistants for everyone and enable privacy while enjoying GPU acceleration.\n\nYou can use WebLLM as a base [npm package](https://www.npmjs.com/package/@mlc-ai/web-llm) and build your own web application on top of it by following the examples below. 
This project is a companion project of [MLC LLM](https://github.com/mlc-ai/mlc-llm), which enables universal deployment of LLM across hardware environments.\n\n<div align=\"center\">\n\n**[Check out WebLLM Chat to try it out!](https://chat.webllm.ai/)**\n\n</div>\n\n## Key Features\n- **In-Browser Inference**: WebLLM is a high-performance, in-browser language model inference engine that leverages WebGPU for hardware acceleration, enabling powerful LLM operations directly within web browsers without server-side processing.\n\n- [**Full OpenAI API Compatibility**](#full-openai-compatibility): Seamlessly integrate your app with WebLLM using OpenAI API with functionalities such as streaming, JSON-mode, logit-level control, seeding, and more.\n\n- **Structured JSON Generation**: WebLLM supports state-of-the-art JSON mode structured generation, implemented in the WebAssembly portion of the model library for optimal performance. Check [WebLLM JSON Playground](https://huggingface.co/spaces/mlc-ai/WebLLM-JSON-Playground) on HuggingFace to try generating JSON output with custom JSON schema.\n\n- [**Extensive Model Support**](#built-in-models): WebLLM natively supports a range of models including Llama 3, Phi 3, Gemma, Mistral, Qwen(通义千问), and many others, making it versatile for various AI tasks. 
For the complete supported model list, check [MLC Models](https://mlc.ai/models).\n\n- [**Custom Model Integration**](#custom-models): Easily integrate and deploy custom models in MLC format, allowing you to adapt WebLLM to specific needs and scenarios, enhancing flexibility in model deployment.\n\n- **Plug-and-Play Integration**: Easily integrate WebLLM into your projects using package managers like NPM and Yarn, or directly via CDN, complete with comprehensive [examples](./examples/) and a modular design for connecting with UI components.\n\n- **Streaming & Real-Time Interactions**: Supports streaming chat completions, allowing real-time output generation which enhances interactive applications like chatbots and virtual assistants.\n\n- **Web Worker & Service Worker Support**: Optimize UI performance and manage the lifecycle of models efficiently by offloading computations to separate worker threads or service workers.\n\n- **Chrome Extension Support**: Extend the functionality of web browsers through custom Chrome extensions using WebLLM, with examples available for building both basic and advanced extensions.\n\n## Built-in Models\n\nCheck the complete list of available models on [MLC Models](https://mlc.ai/models). 
WebLLM supports a subset of these available models and the list can be accessed at [`prebuiltAppConfig.model_list`](https://github.com/mlc-ai/web-llm/blob/main/src/config.ts#L293).\n\nHere are the primary families of models currently supported:\n\n- **Llama**: Llama 3, Llama 2, Hermes-2-Pro-Llama-3\n- **Phi**: Phi 3, Phi 2, Phi 1.5\n- **Gemma**: Gemma-2B\n- **Mistral**: Mistral-7B-v0.3, Hermes-2-Pro-Mistral-7B, NeuralHermes-2.5-Mistral-7B, OpenHermes-2.5-Mistral-7B\n- **Qwen (通义千问)**: Qwen2 0.5B, 1.5B, 7B\n\nIf you need more models, [request a new model by opening an issue](https://github.com/mlc-ai/web-llm/issues/new/choose) or check [Custom Models](#custom-models) for how to compile and use your own models with WebLLM.\n\n## Jumpstart with Examples\n\nLearn how to use WebLLM to integrate large language models into your application and generate chat completions through this simple Chatbot example: \n\n[![Example Chatbot on JSFiddle](https://img.shields.io/badge/Example-JSFiddle-blue?logo=jsfiddle&logoColor=white)](https://jsfiddle.net/neetnestor/4nmgvsa2/)\n[![Example Chatbot on Codepen](https://img.shields.io/badge/Example-Codepen-gainsboro?logo=codepen)](https://codepen.io/neetnestor/pen/vYwgZaG)\n\nFor an advanced example of a larger, more complicated project, check [WebLLM Chat](https://github.com/mlc-ai/web-llm-chat/blob/main/app/client/webllm.ts).\n\nMore examples for different use cases are available in the [examples](./examples/) folder.\n\n## Get Started\n\nWebLLM offers a minimalist and modular interface to access the chatbot in the browser.\nThe package is designed in a modular way to hook into any UI component.\n\n### Installation\n\n#### Package Manager\n\n```sh\n# npm\nnpm install @mlc-ai/web-llm\n# yarn\nyarn add @mlc-ai/web-llm\n# or pnpm\npnpm install @mlc-ai/web-llm\n```\n\nThen import the module in your code.\n\n```typescript\n// Import everything\nimport * as webllm from \"@mlc-ai/web-llm\";\n// Or only import what you need\nimport { 
CreateMLCEngine } from \"@mlc-ai/web-llm\";\n```\n\n#### CDN Delivery\n\nThanks to [jsdelivr.com](https://www.jsdelivr.com/package/npm/@mlc-ai/web-llm), WebLLM can be imported directly through a URL and works out-of-the-box on cloud development platforms like [jsfiddle.net](https://jsfiddle.net/), [Codepen.io](https://codepen.io/), and [Scribbler](https://scribbler.live):\n\n```javascript\nimport * as webllm from \"https://esm.run/@mlc-ai/web-llm\";\n```\nIt can also be dynamically imported as:\n```javascript\nconst webllm = await import (\"https://esm.run/@mlc-ai/web-llm\");\n```\n\n### Create MLCEngine\n\nMost operations in WebLLM are invoked through the `MLCEngine` interface. You can create an `MLCEngine` instance and load the model by calling the `CreateMLCEngine()` factory function.\n\n(Note that loading a model requires downloading it, which can take a significant amount of time on the very first run, when nothing has been cached yet. You should properly handle this asynchronous call.)\n\n```typescript\nimport { CreateMLCEngine } from \"@mlc-ai/web-llm\";\n\n// Callback function to update model loading progress\nconst initProgressCallback = (initProgress) => {\n  console.log(initProgress);\n}\nconst selectedModel = \"Llama-3.1-8B-Instruct-q4f32_1-MLC\";\n\nconst engine = await CreateMLCEngine(\n  selectedModel,\n  { initProgressCallback: initProgressCallback }, // engineConfig\n);\n```\n\nUnder the hood, this factory function performs two steps: it first creates an engine instance (synchronous) and then loads the model (asynchronous). 
You can also do them separately in your application.\n\n```typescript\nimport { MLCEngine } from \"@mlc-ai/web-llm\";\n\n// This is a synchronous call that returns immediately\nconst engine = new MLCEngine({\n  initProgressCallback: initProgressCallback\n});\n\n// This is an asynchronous call and can take a long time to finish\nawait engine.reload(selectedModel);\n```\n\n### Chat Completion\nAfter successfully initializing the engine, you can now invoke chat completions using OpenAI-style chat APIs through the `engine.chat.completions` interface. For the full list of parameters and their descriptions, check the [section below](#full-openai-compatibility) and the [OpenAI API reference](https://platform.openai.com/docs/api-reference/chat/create).\n\n(Note: The `model` parameter is not supported and will be ignored here. Instead, call `CreateMLCEngine(model)` or `engine.reload(model)` as shown in the [Create MLCEngine](#create-mlcengine) section above.)\n\n\n```typescript\nconst messages = [\n  { role: \"system\", content: \"You are a helpful AI assistant.\" },\n  { role: \"user\", content: \"Hello!\" },\n]\n\nconst reply = await engine.chat.completions.create({\n  messages,\n});\nconsole.log(reply.choices[0].message);\nconsole.log(reply.usage);\n```\n\n### Streaming\n\nWebLLM also supports streaming chat completion generation. 
To use it, simply pass `stream: true` to the `engine.chat.completions.create` call.\n\n```typescript\nconst messages = [\n  { role: \"system\", content: \"You are a helpful AI assistant.\" },\n  { role: \"user\", content: \"Hello!\" },\n]\n\n// Chunks is an AsyncGenerator object\nconst chunks = await engine.chat.completions.create({\n  messages,\n  temperature: 1,\n  stream: true, // <-- Enable streaming\n  stream_options: { include_usage: true },\n});\n\nlet reply = \"\";\nfor await (const chunk of chunks) {\n  reply += chunk.choices[0]?.delta.content || \"\";\n  console.log(reply);\n  if (chunk.usage) {\n    console.log(chunk.usage); // only last chunk has usage\n  }\n}\n\nconst fullReply = await engine.getMessage();\nconsole.log(fullReply);\n```\n\n## Advanced Usage\n\n### Using Workers\n\nYou can put the heavy computation in a worker script to optimize your application performance. To do so, you need to:\n\n1. Create a handler in the worker thread that communicates with the frontend while handling the requests.\n2. Create a Worker Engine in your main application, which under the hood sends messages to the handler in the worker thread.\n\nFor detailed implementations of different kinds of Workers, check the following sections.\n\n#### Dedicated Web Worker\n\nWebLLM comes with API support for WebWorker so you can hook\nthe generation process into a separate worker thread so that\nthe computing in the worker thread won't disrupt the UI.\n\nWe create a handler in the worker thread that communicates with the frontend while handling the requests.\n\n```typescript\n// worker.ts\nimport { WebWorkerMLCEngineHandler } from \"@mlc-ai/web-llm\";\n\n// A handler that resides in the worker thread\nconst handler = new WebWorkerMLCEngineHandler();\nself.onmessage = (msg: MessageEvent) => {\n  handler.onmessage(msg);\n};\n```\n\nIn the main logic, we create a `WebWorkerMLCEngine` that\nimplements the same `MLCEngineInterface`. 
The rest of the logic remains the same.\n\n```typescript\n// main.ts\nimport { CreateWebWorkerMLCEngine } from \"@mlc-ai/web-llm\";\n\nasync function main() {\n  // Use a WebWorkerMLCEngine instead of MLCEngine here\n  const engine = await CreateWebWorkerMLCEngine(\n    new Worker(\n      new URL(\"./worker.ts\", import.meta.url), \n      {\n        type: \"module\",\n      }\n    ),\n    selectedModel,\n    { initProgressCallback }, // engineConfig\n  );\n\n  // everything else remains the same\n}\n```\n\n### Use Service Worker\n\nWebLLM comes with API support for ServiceWorker so you can hook the generation process\ninto a service worker to avoid reloading the model in every page visit and optimize\nyour application's offline experience.\n\n(Note, Service Worker's life cycle is managed by the browser and can be killed any time without notifying the webapp. `ServiceWorkerMLCEngine` will try to keep the service worker thread alive by periodically sending heartbeat events, but your application should also include proper error handling. Check `keepAliveMs` and `missedHeatbeat` in [`ServiceWorkerMLCEngine`](https://github.com/mlc-ai/web-llm/blob/main/src/service_worker.ts#L234) for more details.)\n\nWe create a handler in the worker thread that communicates with the frontend while handling the requests.\n\n\n```typescript\n// sw.ts\nimport { ServiceWorkerMLCEngineHandler } from \"@mlc-ai/web-llm\";\n\nlet handler: ServiceWorkerMLCEngineHandler;\n\nself.addEventListener(\"activate\", function (event) {\n  handler = new ServiceWorkerMLCEngineHandler();\n  console.log(\"Service Worker is ready\");\n});\n```\n\nThen in the main logic, we register the service worker and create the engine using\n`CreateServiceWorkerMLCEngine` function. 
The rest of the logic remains the same.\n\n```typescript\n// main.ts\nimport { MLCEngineInterface, CreateServiceWorkerMLCEngine } from \"@mlc-ai/web-llm\";\n\nif (\"serviceWorker\" in navigator) {\n  navigator.serviceWorker.register(\n    new URL(\"sw.ts\", import.meta.url),  // worker script\n    { type: \"module\" },\n  );\n}\n\nconst engine: MLCEngineInterface =\n  await CreateServiceWorkerMLCEngine(\n    selectedModel,\n    { initProgressCallback }, // engineConfig\n  );\n```\n\nYou can find a complete example on how to run WebLLM in service worker in [examples/service-worker](examples/service-worker/).\n\n### Chrome Extension\nYou can also find examples of building Chrome extension with WebLLM in [examples/chrome-extension](examples/chrome-extension/) and [examples/chrome-extension-webgpu-service-worker](examples/chrome-extension-webgpu-service-worker/). The latter one leverages service worker, so the extension is persistent in the background. Additionally, you can explore another full project of a Chrome extension, WebLLM Assistant, which leverages WebLLM [here](https://github.com/mlc-ai/web-llm-assistant).\n\n## Full OpenAI Compatibility\nWebLLM is designed to be fully compatible with [OpenAI API](https://platform.openai.com/docs/api-reference/chat). 
Thus, besides building a simple chatbot, you can also have the following functionalities with WebLLM:\n\n- [streaming](examples/streaming): return output as chunks in real-time in the form of an AsyncGenerator\n- [json-mode](examples/json-mode): efficiently ensure output is in JSON format, see [OpenAI Reference](https://platform.openai.com/docs/guides/text-generation/chat-completions-api) for more.\n- [seed-to-reproduce](examples/seed-to-reproduce): use seeding to ensure a reproducible output with fields `seed`.\n- [function-calling](examples/function-calling) (WIP): function calling with fields `tools` and `tool_choice` (with preliminary support); or manual function calling without `tools` or `tool_choice` (keeps the most flexibility).\n\n## Custom Models\n\nWebLLM works as a companion project of [MLC LLM](https://github.com/mlc-ai/mlc-llm) and it supports custom models in MLC format. \nIt reuses the model artifact and builds the flow of MLC LLM. To compile and use your own models with WebLLM, please check out\n[MLC LLM document](https://llm.mlc.ai/docs/deploy/webllm.html)\non how to compile and deploy new model weights and libraries to WebLLM. \n\nHere, we go over the high-level idea. There are two elements of the WebLLM package that enable new models and weight variants.\n\n- `model`: Contains a URL to model artifacts, such as weights and meta-data.\n- `model_lib`: A URL to the web assembly library (i.e. 
wasm file) that contains the executables to accelerate the model computations.\n\nBoth are customizable in WebLLM.\n\n```typescript\nimport { CreateMLCEngine } from \"@mlc-ai/web-llm\";\n\nasync function main() {\n  const appConfig = {\n    \"model_list\": [\n      {\n        \"model\": \"/url/to/my/llama\",\n        \"model_id\": \"MyLlama-3b-v1-q4f32_0\",\n        \"model_lib\": \"/url/to/myllama3b.wasm\",\n      }\n    ],\n  };\n  // override default\n  const chatOpts = {\n    \"repetition_penalty\": 1.01\n  };\n\n  // load a prebuilt model\n  // with a chat option override and app config\n  // under the hood, it will load the model from myLlamaUrl\n  // and cache it in the browser cache\n  // The chat will also load the model library from \"/url/to/myllama3b.wasm\",\n  // assuming that it is compatible with the model in myLlamaUrl.\n  const engine = await CreateMLCEngine(\n    \"MyLlama-3b-v1-q4f32_0\",\n    { appConfig }, // engineConfig\n    chatOpts,\n  );\n}\n```\n\nIn many cases, we only want to supply the model weight variant, but\nnot necessarily a new model (e.g. `NeuralHermes-Mistral` can reuse `Mistral`'s\nmodel library). For examples of how a model library can be shared by different model variants,\nsee `webllm.prebuiltAppConfig`.\n\n## Build WebLLM Package From Source\n\nNOTE: you don't need to build from source unless you would like to modify the WebLLM package.\nTo use the npm package, simply follow [Get Started](#get-started) or any of the [examples](examples) instead.\n\nTo build from source, simply run:\n\n```bash\nnpm install\nnpm run build\n```\n\nThen, to test the effects of your code change in an example, inside `examples/get-started/package.json`, change from `\"@mlc-ai/web-llm\": \"^0.2.82\"` to `\"@mlc-ai/web-llm\": \"../..\"`.\n\nThen run:\n\n```bash\ncd examples/get-started\nnpm install\nnpm start\n```\n\nNote that sometimes you would need to switch between `file:../..` and `../..` to trigger npm to recognize new changes. 
In the worst case, you can run:\n\n```bash\ncd examples/get-started\nrm -rf node_modules dist package-lock.json .parcel-cache\nnpm install\nnpm start\n```\n\n### In case you need to build TVMjs from source\n\nWebLLM's runtime largely depends on TVMjs: https://github.com/apache/tvm/tree/main/web\n\nWhile it is also available as an npm package: https://www.npmjs.com/package/@mlc-ai/web-runtime, you can build it from source if needed by following the steps below.\n\n1. Install [emscripten](https://emscripten.org). It is an LLVM-based compiler that compiles C/C++ source code to WebAssembly.\n    - Follow the [installation instruction](https://emscripten.org/docs/getting_started/downloads.html#installation-instructions-using-the-emsdk-recommended) to install the latest emsdk.\n    - Source `emsdk_env.sh` by `source path/to/emsdk_env.sh`, so that `emcc` is reachable from PATH and the command `emcc` works.\n\n    We can verify the successful installation by trying out `emcc` terminal.\n\n    Note: We recently found that using the latest `emcc` version may run into issues during runtime. Use `./emsdk install 3.1.56` instead of `./emsdk install latest` for now as a workaround. The error may look like\n    ```\n    Init error, LinkError: WebAssembly.instantiate(): Import #6 module=\"wasi_snapshot_preview1\"\n    function=\"proc_exit\": function import requires a callable\n    ```\n\n2. In `./package.json`, change from `\"@mlc-ai/web-runtime\": \"0.18.0-dev2\",` to `\"@mlc-ai/web-runtime\": \"file:./tvm_home/web\",`.\n\n3. Setup necessary environment\n\n   Prepare all the necessary dependencies for web build:\n\n   ```shell\n   ./scripts/prep_deps.sh\n   ```\n\n   In this step, if `$TVM_SOURCE_DIR` is not defined in the environment, we will execute the following line to build `tvmjs` dependency:\n   ```shell\n   git clone https://github.com/mlc-ai/relax 3rdparty/tvm-unity --recursive\n   ```\n\n   This clones the current HEAD of `mlc-ai/relax`. 
However, it may not always be the correct branch or commit to clone. To build a specific npm version from source, refer to the version bump PR, which states which branch (i.e. `mlc-ai/relax` or `apache/tvm`) and which commit the current WebLLM version depends on. For instance, version 0.2.52, according to its version bump PR https://github.com/mlc-ai/web-llm/pull/521, is built by checking out the following commit https://github.com/apache/tvm/commit/e6476847753c80e054719ac47bc2091c888418b6 in `apache/tvm`, rather than the HEAD of `mlc-ai/relax`.\n\n   Besides, `--recursive` is necessary and important. Otherwise, you may encounter errors like `fatal error: 'dlpack/dlpack.h' file not found`.\n\n4. Build WebLLM Package\n\n   ```shell\n   npm run build\n   ```\n\n5. Validate some of the sub-packages\n\n   You can then go to the subfolders in [examples](examples) to validate some of the sub-packages.\n   We use Parcel v2 for bundling. However, Parcel sometimes fails to track changes in parent\n   directories. When you make a change in the WebLLM package, try editing the `package.json`\n   of the subfolder and saving it, which will trigger Parcel to rebuild.\n\n## Links\n\n- [Demo App: WebLLM Chat](https://chat.webllm.ai/)\n- If you want to run LLMs on a native runtime, check out [MLC-LLM](https://github.com/mlc-ai/mlc-llm)\n- You might also be interested in [Web Stable Diffusion](https://github.com/mlc-ai/web-stable-diffusion/).\n\n## Acknowledgement\n\nThis project was initiated by members from CMU Catalyst, UW SAMPL, SJTU, OctoML, and the MLC community. We would love to continue developing and supporting the open-source ML community.\n\nThis project is only possible thanks to the open-source ecosystems whose shoulders we stand on. We want to thank the Apache TVM community and developers of the TVM Unity effort. The open-source ML community members made these models publicly available. PyTorch and Hugging Face communities make these models accessible. 
We would like to thank the teams behind Vicuna, SentencePiece, LLaMA, and Alpaca. We also would like to thank the WebAssembly, Emscripten, and WebGPU communities. Finally, thanks to Dawn and WebGPU developers.\n\n## Citation\nIf you find this project to be useful, please cite:\n\n```\n@misc{ruan2024webllmhighperformanceinbrowserllm,\n      title={WebLLM: A High-Performance In-Browser LLM Inference Engine}, \n      author={Charlie F. Ruan and Yucheng Qin and Xun Zhou and Ruihang Lai and Hongyi Jin and Yixin Dong and Bohan Hou and Meng-Shiun Yu and Yiyan Zhai and Sudeep Agarwal and Hangrui Cao and Siyuan Feng and Tianqi Chen},\n      year={2024},\n      eprint={2412.15803},\n      archivePrefix={arXiv},\n      primaryClass={cs.LG},\n      url={https://arxiv.org/abs/2412.15803}, \n}\n```\n\n## Contributors\n\n<a href=\"https://github.com/mlc-ai/web-llm/graphs/contributors\">\n  <img alt=\"contributors\" src=\"https://contrib.rocks/image?repo=mlc-ai/web-llm\"/>\n</a>\n\n<p align=\"right\">\n  <a href=\"#top\">⬆ Back to Top ⬆</a>\n</p>\n"
  },
  {
    "path": "SECURITY.md",
    "content": "# Security Policy\n\n## Reporting a Vulnerability\n\nFor security concerns or vulnerability reports, please send an email to `mlc-llm-private@googlegroups.com`.\n"
  },
  {
    "path": "cleanup-index-js.sh",
    "content": "# Remove instances of string \"const{createRequire:createRequire}=await import('module');\"\n# This is required to allow background workers packaged with Parcel for the chrome extension\n# to run the `ChatModule`.\nsed -e s/\"const{createRequire:createRequire}=await import('module');\"//g -i.backup lib/index.js\nsed -e s/\"const{createRequire:createRequire}=await import('module');\"//g -i.backup lib/index.js.map\n\n# Replace scriptDirectory init that Parcel cannot resolve (\"new URL('./', import.meta.url)\") with a plain relative string\nsed -e s~\"require(\\\\\\\"url\\\\\\\").fileURLToPath(new URL(\\\\\\\"\\\\.\\\\/\\\\\\\",import.meta.url))\"~\"\\\\\\\"./\\\\\\\"\"~g -i.backup lib/index.js\nsed -e s~\"require(\\\\\\\"url\\\\\\\").fileURLToPath(new URL(\\\\\\\"\\\\.\\\\/\\\\\\\",import.meta.url))\"~'\\\\\\\".\\\\\\\"'~g -i.backup lib/index.js.map\n\n# Replace string \"new (require('u' + 'rl').URL)('file:' + __filename).href\" with \"MLC_DUMMY_PATH\"\n# This is required for building nextJS projects -- its compile time would complain about `require()`\n# See https://github.com/mlc-ai/web-llm/issues/383 and the fixing PR's description for more.\nsed -e s/\"new (require('u' + 'rl').URL)('file:' + __filename).href\"/\"\\\"MLC_DUMMY_PATH\\\"\"/g -i.backup lib/index.js\n# Replace with \\\"MLC_DUMMY_PATH\\\"\nsed -e s/\"new (require('u' + 'rl').URL)('file:' + __filename).href\"/'\\\\\\\"MLC_DUMMY_PATH\\\\\\\"'/g -i.backup lib/index.js.map\n\n# Replace \"import require$$3 from 'perf_hooks';\" with a string \"const require$$3 = \"MLC_DUMMY_REQUIRE_VAR\"\"\n# This is to prevent `perf_hooks` not found error\n# For more see https://github.com/mlc-ai/web-llm/issues/258 and https://github.com/mlc-ai/web-llm/issues/127\nsed -e s/\"import require\\$\\$3 from 'perf_hooks';\"/\"const require\\$\\$3 = \\\"MLC_DUMMY_REQUIRE_VAR\\\"\"/g -i.backup lib/index.js\n# Similarly replace `const performanceNode = require(\\\"perf_hooks\\\")` with `const performanceNode = 
\\\"MLC_DUMMY_REQUIRE_VAR\\\"`\nsed -e s/'require(\\\\\\\"perf_hooks\\\\\\\")'/'\\\\\\\"MLC_DUMMY_REQUIRE_VAR\\\\\\\"'/g -i.backup lib/index.js.map\n\n# Below is added when we include dependency @mlc-ai/web-runtime, rather than using local tvm_home\n# Replace \"import require$$4 from 'ws'\" with a string \"const require$$3 = \"MLC_DUMMY_REQUIRE_VAR\"\"\n# This is to prevent error `Cannot find module 'ws'`\nsed -e s/\"import require\\$\\$4 from 'ws';\"/\"const require\\$\\$4 = \\\"MLC_DUMMY_REQUIRE_VAR\\\"\"/g -i.backup lib/index.js\n# Similarly replace `const WebSocket = require(\\\"ws\\\")` with `const WebSocket = \\\"MLC_DUMMY_REQUIRE_VAR\\\"`\nsed -e s/'require(\\\\\\\"ws\\\\\\\")'/'\\\\\\\"MLC_DUMMY_REQUIRE_VAR\\\\\\\"'/g -i.backup lib/index.js.map\n\n# Cleanup backup files\nrm -f lib/index.js.backup\nrm -f lib/index.js.map.backup\n"
  },
  {
    "path": "docs/Makefile",
    "content": "# Minimal makefile for Sphinx documentation\n#\n\n# You can set these variables from the command line, and also\n# from the environment for the first two.\nSPHINXOPTS    ?=\nSPHINXBUILD   ?= python -m sphinx\nSOURCEDIR     = .\nBUILDDIR      = _build\n\n# Put it first so that \"make\" without argument is like \"make help\".\nhelp:\n\t@$(SPHINXBUILD) -M help \"$(SOURCEDIR)\" \"$(BUILDDIR)\" $(SPHINXOPTS) $(O)\n\n.PHONY: help Makefile\n\n# Catch-all target: route all unknown targets to Sphinx using the new\n# \"make mode\" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).\n%: Makefile\n\t@$(SPHINXBUILD) -M $@ \"$(SOURCEDIR)\" \"$(BUILDDIR)\" $(SPHINXOPTS) $(O)\n"
  },
  {
    "path": "docs/README.md",
    "content": "# WebLLM Documentation\n\nThe documentation is built with [Sphinx](https://www.sphinx-doc.org/en/master/).\n\n## Dependencies\n\nRun the following command in this directory to install dependencies first:\n\n```bash\npip3 install -r requirements.txt\n```\n\n## Build the Documentation\n\nThen you can build the documentation by running:\n\n```bash\nmake html\n```\n\n## View the Documentation\n\nRun the following command to start a simple HTTP server:\n\n```bash\ncd _build/html\npython3 -m http.server\n```\n\nThen you can view the documentation in your browser at `http://localhost:8000` (the port can be customized by appending the port number to the python command above, e.g. `python3 -m http.server 8080`).\n"
  },
  {
    "path": "docs/conf.py",
    "content": "# -*- coding: utf-8 -*-\nimport os\nimport sys\n\nimport tlcpack_sphinx_addon\n\n# -- General configuration ------------------------------------------------\n\nsys.path.insert(0, os.path.abspath(\"../python\"))\nsys.path.insert(0, os.path.abspath(\"../\"))\nautodoc_mock_imports = [\"torch\"]\n\n# General information about the project.\nproject = \"web-llm\"\nauthor = \"WebLLM Contributors\"\ncopyright = \"2023, %s\" % author\n\n# Version information.\n\nversion = \"0.2.82\"\nrelease = \"0.2.82\"\n\nextensions = [\n    \"sphinx_tabs.tabs\",\n    \"sphinx_toolbox.collapse\",\n    \"sphinxcontrib.httpdomain\",\n    \"sphinx.ext.autodoc\",\n    \"sphinx.ext.napoleon\",\n    \"sphinx_reredirects\",\n]\n\nredirects = {\"get_started/try_out\": \"../index.html#getting-started\"}\n\nsource_suffix = [\".rst\"]\n\nlanguage = \"en\"\n\nexclude_patterns = [\"_build\", \"Thumbs.db\", \".DS_Store\"]\n\n# The name of the Pygments (syntax highlighting) style to use.\npygments_style = \"sphinx\"\n\n# A list of ignored prefixes for module index sorting.\n# If true, `todo` and `todoList` produce output, else they produce nothing.\ntodo_include_todos = False\n\n# -- Options for HTML output ----------------------------------------------\n\n# The theme is set by the make target\nimport sphinx_rtd_theme\n\nhtml_theme = \"sphinx_rtd_theme\"\nhtml_theme_path = [sphinx_rtd_theme.get_html_theme_path()]\n\ntemplates_path = []\n\nhtml_static_path = []\n\nfooter_copyright = \"© 2023 MLC LLM\"\nfooter_note = \" \"\n\nhtml_logo = \"_static/img/mlc-logo-with-text-landscape.svg\"\n\nhtml_theme_options = {\n    \"logo_only\": True,\n}\n\nheader_links = [\n    (\"Home\", \"https://webllm.mlc.ai/\"),\n    (\"GitHub\", \"https://github.com/mlc-ai/web-llm\"),\n    (\"Discord\", \"https://discord.gg/9Xpy2HGBuD\"),\n]\n\nheader_dropdown = {\n    \"name\": \"Other Resources\",\n    \"items\": [\n        (\"WebLLM Chat\", \"https://chat.webllm.ai/\"),\n        (\"MLC Course\", 
\"https://mlc.ai/\"),\n        (\"MLC Blog\", \"https://blog.mlc.ai/\"),\n        (\"MLC LLM\", \"https://llm.mlc.ai/\"),\n    ],\n}\n\nhtml_context = {\n    \"footer_copyright\": footer_copyright,\n    \"footer_note\": footer_note,\n    \"header_links\": header_links,\n    \"header_dropdown\": header_dropdown,\n    \"display_github\": True,\n    \"github_user\": \"mlc-ai\",\n    \"github_repo\": \"web-llm\",\n    \"github_version\": \"main/docs/\",\n    \"theme_vcs_pageview_mode\": \"edit\",\n    # \"header_logo\": \"/path/to/logo\",\n    # \"header_logo_link\": \"\",\n    # \"version_selecter\": \"\",\n}\n\n\n# add additional overrides\ntemplates_path += [tlcpack_sphinx_addon.get_templates_path()]\nhtml_static_path += [tlcpack_sphinx_addon.get_static_path()]\n"
  },
  {
    "path": "docs/developer/add_models.rst",
    "content": "Adding Models\n=============\n\nWebLLM allows you to compile custom language models using `MLC-LLM <https://llm.mlc.ai/>`_ and then serve the compiled model through WebLLM.\n\nFor instructions on how to compile and add custom models to WebLLM, please refer to the `MLC-LLM documentation <https://llm.mlc.ai/docs/deploy/webllm.html>`_. "
  },
  {
    "path": "docs/developer/building_from_source.rst",
    "content": "Building From Source\n====================\n\nClone the Repository\n---------------------\n.. code-block:: bash\n\n   git clone https://github.com/mlc-ai/web-llm.git\n   cd web-llm\n\nInstall Dependencies\n---------------------\n.. code-block:: bash\n\n   npm install\n\nBuild the Project\n-----------------\n.. code-block:: bash\n\n   npm run build\n\nTest Changes\n------------\n\nTo test your changes, you can reuse an existing example or create a new example that specifically tests the new functionality you wish to provide.\n\nTo test the effects of your code change in an example, inside ``examples/<example>/package.json``, change ``\"@mlc-ai/web-llm\": \"^0.2.xx\"`` to ``\"@mlc-ai/web-llm\": \"../..\"`` to let it reference your local code. Note that sometimes you may need to switch between ``\"file:../..\"`` and ``\"../..\"`` to trigger npm to recognize new changes.\n\n.. code-block:: bash\n\n   cd examples/<example>\n   # Modify package.json as described\n   npm install\n   npm start\n"
  },
  {
    "path": "docs/index.rst",
    "content": "👋 Welcome to WebLLM\n====================\n\n`GitHub <https://github.com/mlc-ai/web-llm>`_ | `WebLLM Chat <https://chat.webllm.ai/>`_ | `NPM <https://www.npmjs.com/package/@mlc-ai/web-llm>`_ | `Discord <https://discord.gg/9Xpy2HGBuD>`_\n\nWebLLM is a high-performance in-browser language model inference engine that brings large language models (LLMs) to web browsers with hardware acceleration. With WebGPU support, it allows developers to build AI-powered applications directly within the browser environment, removing the need for server-side processing and ensuring privacy.\n\nIt provides a specialized runtime for the web backend of MLCEngine, leverages\n`WebGPU <https://www.w3.org/TR/webgpu/>`_ for local acceleration, offers an OpenAI-compatible API,\nand provides built-in support for web workers to separate heavy computation from the UI flow.\n\nKey Features\n------------\n- 🌐 In-Browser Inference: Run LLMs directly in the browser\n- 🚀 WebGPU Acceleration: Leverage hardware acceleration for optimal performance\n- 🔄 OpenAI API Compatibility: Seamless integration with standard AI workflows\n- 📦 Multiple Model Support: Works with Llama, Phi, Gemma, Mistral, and more\n\nStart exploring WebLLM by `chatting with WebLLM Chat <https://chat.webllm.ai/>`_, and start building webapps with high-performance local LLM inference with the following guides and tutorials.\n\n.. toctree::\n   :maxdepth: 2\n   :caption: User Guide\n\n   user/get_started.rst\n   user/basic_usage.rst\n   user/advanced_usage.rst\n   user/api_reference.rst\n\n.. toctree::\n   :maxdepth: 2\n   :caption: Developer Guide\n\n   developer/building_from_source.rst\n   developer/add_models.rst\n"
  },
  {
    "path": "docs/make.bat",
    "content": "@ECHO OFF\n\npushd %~dp0\n\nREM Command file for Sphinx documentation\n\nif \"%SPHINXBUILD%\" == \"\" (\n\tset SPHINXBUILD=sphinx-build\n)\nset SOURCEDIR=.\nset BUILDDIR=_build\n\n%SPHINXBUILD% >NUL 2>NUL\nif errorlevel 9009 (\n\techo.\n\techo.The 'sphinx-build' command was not found. Make sure you have Sphinx\n\techo.installed, then set the SPHINXBUILD environment variable to point\n\techo.to the full path of the 'sphinx-build' executable. Alternatively you\n\techo.may add the Sphinx directory to PATH.\n\techo.\n\techo.If you don't have Sphinx installed, grab it from\n\techo.https://www.sphinx-doc.org/\n\texit /b 1\n)\n\nif \"%1\" == \"\" goto help\n\n%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%\ngoto end\n\n:help\n%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%\n\n:end\npopd\n"
  },
  {
    "path": "docs/requirements.txt",
    "content": "sphinx-tabs == 3.4.1\nsphinx-rtd-theme\nsphinx == 5.2.3\nsphinx-toolbox == 3.4.0\ntlcpack-sphinx-addon==0.2.2\nsphinxcontrib_httpdomain==1.8.1\nsphinxcontrib-napoleon==0.7\nsphinx-reredirects==0.1.2\n"
  },
  {
    "path": "docs/user/advanced_usage.rst",
    "content": "Advanced Use Cases\n==================\n\nUsing Workers\n-------------\n\nYou can put the heavy computation in a worker script to optimize your application performance. To do so, you need to:\n\n1. Create a handler in the worker thread that communicates with the frontend while handling the requests.\n2. Create a worker engine in your main application that sends messages to the handler in the worker thread under the hood.\n\nFor detailed implementations of different kinds of workers, look at the following sections.\n\nUsing Web Workers\n^^^^^^^^^^^^^^^^^\nWebLLM comes with API support for `Web Workers <https://developer.mozilla.org/en-US/docs/Web/API/Web_Workers_API/Using_web_workers>`_ so you can offload the computation-heavy generation work into a separate worker thread. WebLLM has implemented cross-thread communication through messages under the hood, so manual implementation is not required.\n\nIn the worker script, import and instantiate a ``WebWorkerMLCEngineHandler``, which handles communication with other scripts and processes incoming requests.\n\n.. code-block:: typescript\n\n   // worker.ts\n   import { WebWorkerMLCEngineHandler } from \"@mlc-ai/web-llm\";\n\n   const handler = new WebWorkerMLCEngineHandler();\n   self.onmessage = (msg: MessageEvent) => {\n       handler.onmessage(msg);\n   };\n\nIn the main script, import and instantiate a ``WebWorkerMLCEngine`` that implements the same ``MLCEngineInterface`` and exposes the same APIs. Then, simply use it as you would a normal ``MLCEngine``.\n\n.. 
code-block:: typescript\n\n   import { CreateWebWorkerMLCEngine } from \"@mlc-ai/web-llm\";\n\n   async function runWorker() {\n       const engine = await CreateWebWorkerMLCEngine(\n           new Worker(new URL(\"./worker.ts\", import.meta.url), { type: \"module\" }),\n           \"Llama-3.1-8B-Instruct\"\n       );\n\n       const messages = [{ role: \"user\", content: \"How does WebLLM use workers?\" }];\n       const reply = await engine.chat.completions.create({ messages });\n       console.log(reply.choices[0].message.content);\n   }\n\n   runWorker();\n\n\nUnder the hood, ``WebWorkerMLCEngine`` does **not** perform any computation. It translates all calls into messages and sends them to the ``WebWorkerMLCEngineHandler`` for processing. The worker thread receives these messages and processes the actual computation using a hidden engine, and returns the result to the main thread using messages.\n\nService Workers\n^^^^^^^^^^^^^^^\nWebLLM also supports offloading computation using `Service Workers <https://developer.mozilla.org/en-US/docs/Web/API/Service_Worker_API>`_. This allows you to avoid reloading the model between page refreshes and optimize your application's offline experience.\n\n(Note, the lifecycle of a Service Worker is managed by the browser and can be killed any time without notifying the web application. WebLLM's ``ServiceWorkerMLCEngine`` attempts to keep the service worker thread alive by periodically sending heartbeat events. However, the script could still be killed at any time by Chrome, and your application should include proper error handling. Check `keepAliveMs` and `missedHeartbeat` in `ServiceWorkerMLCEngine <https://github.com/mlc-ai/web-llm/blob/main/src/service_worker.ts#L218>`_ for more details.)\n\nIn the worker script, import and instantiate ``ServiceWorkerMLCEngineHandler``, which handles communication with page scripts and processes incoming requests.\n\n.. 
code-block:: typescript\n\n   // sw.ts\n   import { ServiceWorkerMLCEngineHandler } from \"@mlc-ai/web-llm\";\n\n   self.addEventListener(\"activate\", () => {\n       const handler = new ServiceWorkerMLCEngineHandler();\n       console.log(\"Service Worker activated!\");\n   });\n\n\nThen, in the main page script, register the service worker and instantiate the engine using the ``CreateServiceWorkerMLCEngine`` factory function that implements the same ``MLCEngineInterface`` and exposes the same APIs. Then, simply use it as you would a normal ``MLCEngine``.\n\n.. code-block:: typescript\n\n    // main.ts\n    import { MLCEngineInterface, CreateServiceWorkerMLCEngine } from \"@mlc-ai/web-llm\";\n\n    if (\"serviceWorker\" in navigator) {\n    navigator.serviceWorker.register(\n        new URL(\"sw.ts\", import.meta.url),  // worker script\n        { type: \"module\" },\n    );\n    }\n\n    const engine: MLCEngineInterface =\n    await CreateServiceWorkerMLCEngine(\n        selectedModel,\n        { initProgressCallback }, // engineConfig\n    );\n\nSimilar to the ``WebWorkerMLCEngine`` above, the ``ServiceWorkerMLCEngine`` is also a proxy and does not perform any actual computation. Instead, it forwards all calls to the service worker thread and receives the result through messages.\n\nChrome Extension\n----------------\n\nWebLLM can be used in Chrome extensions to empower local LLM inference. You can find examples of building Chrome extension using WebLLM in `examples/chrome-extension <https://github.com/mlc-ai/web-llm/blob/main/examples/chrome-extension>`_ and `examples/chrome-extension-webgpu-service-worker <https://github.com/mlc-ai/web-llm/blob/main/examples/chrome-extension-webgpu-service-worker>`_. 
The latter leverages Service Worker, so the extension is persistent in the background.\n\nAdditionally, we have a full Chrome extension project, `WebLLM Assistant <https://github.com/mlc-ai/web-llm-assistant>`_, which leverages WebLLM to provide a personal web browsing copilot assistant experience. Feel free to check it out and contribute if you are interested.\n\n\nAdditional Customization\n------------------------\n\nUsing IndexedDB Cache\n^^^^^^^^^^^^^^^^^^^^^\n\nBy default, WebLLM caches model artifacts using the `Cache API <https://developer.mozilla.org/en-US/docs/Web/API/Cache>`_ for faster subsequent model loads. You can alternatively use `IndexedDB caching <https://developer.mozilla.org/en-US/docs/Web/API/IndexedDB_API>`_ by setting the ``useIndexedDBCache`` field in ``appConfig`` of ``MLCEngineConfig`` to ``true``.\n\n.. code-block:: typescript\n\n   const engine = await CreateMLCEngine(\"Llama-3.1-8B-Instruct\", {\n       appConfig: {\n           useIndexedDBCache: true,\n           models: [\n               { model_id: \"Llama-3.1-8B\", model_path: \"/models/llama3\" },\n           ],\n       },\n   });\n\nCustomizing Token Behavior\n^^^^^^^^^^^^^^^^^^^^^^^^^^\n\nYou can modify ``logit_bias`` in ``GenerationConfig`` to control token likelihood. Setting a token's bias to a positive value increases its likelihood of being generated, while a negative value decreases it. A large negative value (e.g., -100) can effectively prevent the token from being generated.\n\n.. code-block:: typescript\n\n   const messages = [\n       { role: \"user\", content: \"Describe WebLLM in detail.\" },\n   ];\n\n   const response = await engine.chatCompletion({\n       messages,\n       logit_bias: { \"50256\": -100 }, // Example: Prevent specific token generation\n   });\n"
  },
  {
    "path": "docs/user/api_reference.rst",
    "content": ".. _api-reference:\n\nWebLLM API Reference\n====================\n\nThe ``MLCEngine`` class is the core interface of WebLLM. It enables model loading, chat completions, embeddings, and other operations. Below, we document its methods, along with the associated configuration interfaces.\n\nInterfaces\n----------\n\nThe following interfaces are used as parameters or configurations within ``MLCEngine`` methods. They are linked to their respective methods for reference.\n\nMLCEngineConfig\n^^^^^^^^^^^^^^^\n\nOptional configurations for ``CreateMLCEngine()`` and ``CreateWebWorkerMLCEngine()``.\n\n\n- **Fields**:\n    - ``appConfig``: Configure the app, including the list of models and whether to use IndexedDB cache.\n    - ``initProgressCallback``: A callback for showing model loading progress.\n    - ``logitProcessorRegistry``: A registry for stateful logit processors (see ``webllm.LogitProcessor``).\n\n\n- **Usage**:\n    - ``appConfig``: Contains application-specific settings, including:\n        - Model configurations.\n        - IndexedDB caching preferences.\n    - ``initProgressCallback``: Allows developers to visualize model loading progress by implementing a callback.\n    - ``logitProcessorRegistry``: A ``Map`` object for registering custom logit processors. Only applies to ``MLCEngine``.\n\n\n.. note:: All fields are optional, and ``logitProcessorRegistry`` is only used in ``MLCEngine``.\n\n\nExample:\n\n.. 
code-block:: typescript\n\n   const engine = await CreateMLCEngine(\"Llama-3.1-8B-Instruct\", {\n       appConfig: { /* app-specific config */ },\n       initProgressCallback: (progress) => console.log(progress),\n   });\n\n\nGenerationConfig\n^^^^^^^^^^^^^^^^\n\nConfigurations for a single generation task, primarily used in chat completions.\n\n- **Fields**:\n    - ``repetition_penalty``, ``ignore_eos``: Parameters specific to MLC models.\n    - ``top_p``, ``temperature``, ``max_tokens``, ``stop``: Common parameters shared with OpenAI APIs.\n    - ``frequency_penalty``, ``presence_penalty``: Tune repetition behavior following OpenAI semantics.\n    - ``logit_bias``, ``n``, ``logprobs``, ``top_logprobs``: Advanced sampling controls.\n    - ``response_format``, ``enable_thinking``, ``enable_latency_breakdown``: Additional OpenAI-style request features.\n\n- **Usage**:\n    - Fields like ``repetition_penalty`` and ``ignore_eos`` give explicit control over repetition handling and whether the model stops at the EOS token, respectively.\n    - Common parameters shared with OpenAI APIs (e.g., ``temperature``, ``top_p``) ensure compatibility while still falling back to the values configured during ``MLCEngine.reload()`` when omitted.\n    - ``frequency_penalty`` and ``presence_penalty`` mirror OpenAI's bounds ``[-2, 2]``; providing only one will default the other to ``0``.\n    - ``response_format`` (for JSON or other schema outputs), ``enable_thinking``, and ``enable_latency_breakdown`` pass through directly to the engine and surface enhanced telemetry or structured responses when the underlying model supports them.\n\n\nExample:\n\n.. 
code-block:: typescript\n\n   const messages = [\n       { role: \"system\", content: \"You are a helpful assistant.\" },\n       { role: \"user\", content: \"Explain WebLLM.\" },\n   ];\n\n   const response = await engine.chatCompletion({\n       messages,\n       top_p: 0.9,\n       temperature: 0.8,\n       max_tokens: 150,\n   });\n\nChatConfig\n^^^^^^^^^^\n\nModel's baseline configuration loaded from ``mlc-chat-config.json`` when ``MLCEngine.reload()`` runs. ``ChatOptions`` (and therefore the ``chatOpts`` argument to ``reload``) can override any subset of these fields.\n\n- **Fields** (subset):\n    - ``tokenizer_files``, ``tokenizer_info``: Files and parameters required to initialize the tokenizer.\n    - ``conv_template``, ``conv_config``: Conversation templates that define prompts, separators, and role formatting.\n    - ``context_window_size``, ``sliding_window_size``, ``attention_sink_size``: KV-cache and memory settings.\n    - Default generation knobs such as ``repetition_penalty``, ``frequency_penalty``, ``presence_penalty``, ``top_p``, and ``temperature``.\n\n- **Usage**:\n    - Loaded automatically for each model; provides defaults that ``GenerationConfig`` falls back to when fields are omitted.\n    - Override selected values per model load by supplying ``chatOpts`` (``Partial<ChatConfig>``) to ``MLCEngine.reload()``.\n\n\nExample:\n\n.. 
code-block:: typescript\n\n   await engine.reload(\"Llama-3.1-8B-Instruct\", {\n       temperature: 0.7,\n       repetition_penalty: 1.1,\n       context_window_size: 4096,\n   });\n\nChatCompletionRequest\n^^^^^^^^^^^^^^^^^^^^^\n\nDefines the structure for chat completion requests.\n\n- **Base Interface**: ``ChatCompletionRequestBase``\n    - Contains parameters such as ``messages``, ``stream``, ``frequency_penalty``, and ``presence_penalty``.\n- **Sub-interfaces**:\n    - ``ChatCompletionRequestNonStreaming``: For non-streaming completions.\n    - ``ChatCompletionRequestStreaming``: For streaming completions.\n\n- **Usage**:\n    - Combines settings from ``GenerationConfig`` and ``ChatCompletionRequestBase`` to provide complete control over chat behavior.\n    - The ``stream`` parameter enables streaming responses, improving interactivity in conversational agents.\n    - The ``logit_bias`` feature allows controlling token generation probabilities, providing a mechanism to restrict or encourage specific outputs.\n\n\nExample:\n\n.. code-block:: typescript\n\n   const response = await engine.chatCompletion({\n       messages: [\n           { role: \"user\", content: \"Tell me about WebLLM.\" },\n       ],\n       stream: true,\n   });\n\nModel Loading\n-------------\n\n``MLCEngine.reload(modelId: string | string[], chatOpts?: ChatOptions | ChatOptions[]): Promise<void>``\n\nLoads the specified model(s) into the engine. Uses ``MLCEngineConfig`` during initialization.\n\n- Parameters:\n    - ``modelId``: Identifier(s) for the model(s) to load.\n    - ``chatOpts``: Configuration for generation (see ``ChatConfig``).\n\nExample:\n\n.. code-block:: typescript\n\n   await engine.reload([\"Llama-3.1-8B\", \"Gemma-2B\"], [\n       { temperature: 0.7 },\n       { top_p: 0.9 },\n   ]);\n\n``MLCEngine.unload(): Promise<void>``\n\nUnloads all loaded models and clears their associated configurations.\n\nExample:\n\n.. 
code-block:: typescript\n\n   await engine.unload();\n\n----\n\nChat Completions\n----------------\n\n``MLCEngine.chat.completions.create(request: ChatCompletionRequest): Promise<ChatCompletion | AsyncIterable<ChatCompletionChunk>>``\n\nGenerates chat-based completions using a specified request configuration.\n\n- Parameters:\n    - ``request``: A ``ChatCompletionRequest`` instance.\n\nExample:\n\n.. code-block:: typescript\n\n   const response = await engine.chat.completions.create({\n       messages: [\n           { role: \"system\", content: \"You are a helpful AI assistant.\" },\n           { role: \"user\", content: \"What is WebLLM?\" },\n       ],\n       temperature: 0.8,\n       stream: false,\n   });\n\n----\n\nUtility Methods\n^^^^^^^^^^^^^^^\n\n``MLCEngine.getMessage(modelId?: string): Promise<string>``\n\nRetrieves the current output message from the specified model.\n\n- Parameters:\n    - ``modelId``: (Optional) Identifier of model to query. Omitting modelId only works when the engine currently has a single model loaded.\n\n``MLCEngine.resetChat(keepStats?: boolean, modelId?: string): Promise<void>``\n\nResets the chat history and optionally retains usage statistics.\n\n- Parameters:\n    - ``keepStats``: (Optional) If true, retains usage statistics.\n    - ``modelId``: (Optional) Identifier of the model to reset. Omitting modelId only works when the engine currently has a single model loaded.\n\nGPU Information\n----------------\n\nThe following methods provide detailed information about the GPU used for WebLLM computations.\n\n``MLCEngine.getGPUVendor(): Promise<string>``\n\nRetrieves the vendor name of the GPU used for computations. This is useful for understanding hardware capabilities during inference.\n\n- **Returns**: A string indicating the GPU vendor (e.g., \"Intel\", \"NVIDIA\").\n\nExample:\n\n.. 
code-block:: typescript\n\n   const gpuVendor = await engine.getGPUVendor();\n   console.log(`GPU Vendor: ${gpuVendor}`);\n\n``MLCEngine.getMaxStorageBufferBindingSize(): Promise<number>``\n\nReturns the maximum storage buffer size supported by the GPU. This is important when working with larger models that require significant memory for processing.\n\n- **Returns**: A number representing the maximum size in bytes.\n\nExample:\n\n.. code-block:: typescript\n\n   const maxBufferSize = await engine.getMaxStorageBufferBindingSize();\n   console.log(`Max Storage Buffer Binding Size: ${maxBufferSize}`);\n"
  },
  {
    "path": "docs/user/basic_usage.rst",
    "content": "Basic Usage\n================\n\nModel Records in WebLLM\n-----------------------\n\nEach of the models available in WebLLM is registered as an instance of\n``ModelRecord`` and can be accessed at\n`webllm.prebuiltAppConfig.model_list <https://github.com/mlc-ai/web-llm/blob/main/src/config.ts#L313>`__.\n\nCreating an MLCEngine\n---------------------\n\nWebLLM APIs are exposed through the ``MLCEngine`` interface. You can create an ``MLCEngine`` instance and load the model by calling the ``CreateMLCEngine()`` factory function.\n\n(Note that loading models requires downloading and it can take a significant amount of time for the very first run without previous caching. You should properly handle this asynchronous call.)\n\n``MLCEngine`` can be instantiated in two ways:\n\n1. Using the factory function ``CreateMLCEngine``.\n2. Instantiating the ``MLCEngine`` class directly and using ``reload()`` to load models.\n\n.. code-block:: typescript\n\n   import { CreateMLCEngine, MLCEngine } from \"@mlc-ai/web-llm\";\n\n   // Initialize with a progress callback\n   const initProgressCallback = (progress) => {\n       console.log(\"Model loading progress:\", progress);\n   };\n\n   // Using CreateMLCEngine\n   const engine = await CreateMLCEngine(\"Llama-3.1-8B-Instruct\", { initProgressCallback });\n\n   // Direct instantiation\n   const engineInstance = new MLCEngine({ initProgressCallback });\n   await engineInstance.reload(\"Llama-3.1-8B-Instruct\");\n\nUnder the hood, the factory function ``CreateMLCEngine`` performs two steps: it first creates an engine instance (synchronous) and then loads the model (asynchronous). You can also do them separately in your application.\n\n.. 
code-block:: typescript\n\n    import { MLCEngine } from \"@mlc-ai/web-llm\";\n\n    // This is a synchronous call that returns immediately\n    const engine = new MLCEngine({\n        initProgressCallback: initProgressCallback\n    });\n\n    // This is an asynchronous call and can take a long time to finish\n    await engine.reload(selectedModel);\n\n\nChat Completion\n---------------\n\nChat completions can be invoked using OpenAI-style chat APIs through the ``engine.chat.completions`` interface of an initialized ``MLCEngine``. For the full list of parameters and their descriptions, check :ref:`api-reference`.\n\n(Note: Since the model is determined during ``MLCEngine`` instantiation, the ``model`` parameter is not supported and will be **ignored**. Instead, call ``CreateMLCEngine(model)`` or ``engine.reload(model)`` to reinitialize the engine to use a specific model.)\n\n.. code-block:: typescript\n\n    const messages = [\n        { role: \"system\", content: \"You are a helpful AI assistant.\" },\n        { role: \"user\", content: \"Hello!\" }\n    ];\n\n    const reply = await engine.chat.completions.create({\n        messages,\n    });\n\n    console.log(reply.choices[0].message);\n    console.log(reply.usage);\n\n\nStreaming Chat Completion\n-------------------------\n\nStreaming chat completion can be enabled by passing the ``stream: true`` parameter to the ``engine.chat.completions.create`` call configuration. Check :ref:`api-reference` for the full list of parameters.\n\n.. 
code-block:: typescript\n\n    const messages = [\n        { role: \"system\", content: \"You are a helpful AI assistant.\" },\n        { role: \"user\", content: \"Hello!\" },\n    ]\n\n    // chunks is an AsyncGenerator object\n    const chunks = await engine.chat.completions.create({\n        messages,\n        temperature: 1,\n        stream: true, // <-- Enable streaming\n        stream_options: { include_usage: true },\n    });\n\n    let reply = \"\";\n    for await (const chunk of chunks) {\n        reply += chunk.choices[0]?.delta.content || \"\";\n        console.log(reply);\n        if (chunk.usage) {\n            console.log(chunk.usage); // only last chunk has usage\n        }\n    }\n\n    const fullReply = await engine.getMessage();\n    console.log(fullReply);\n\n\nChatbot Examples\n----------------\n\nLearn how to use WebLLM to integrate large language models into your applications and generate chat completions through this simple Chatbot example:\n\n- `Example in JSFiddle <https://jsfiddle.net/neetnestor/4nmgvsa2/>`_\n- `Example in CodePen <https://codepen.io/neetnestor/pen/vYwgZaG>`_\n\nFor an advanced example of a larger, more complicated project, look at `WebLLM Chat <https://github.com/mlc-ai/web-llm-chat/blob/main/app/client/webllm.ts>`_.\n\nMore examples for different use cases are available in the `WebLLM examples folder <https://github.com/mlc-ai/web-llm/tree/main/examples>`_.\n\n\n"
  },
  {
    "path": "docs/user/get_started.rst",
    "content": "Getting Started with WebLLM\n===========================\n\nThis guide will help you set up WebLLM in your project, install necessary dependencies, and verify your setup.\n\n\nWebLLM Chat\n-----------\n\nIf you want to experience AI Chat supported by local LLM inference and understand how WebLLM works, try out `WebLLM Chat <https://chat.webllm.ai/>`__, which provides a great example\nof integrating WebLLM into a full web application.\n\nA WebGPU-compatible browser is needed to run WebLLM-powered web applications.\nYou can download the latest Google Chrome and use `WebGPU Report <https://webgpureport.org/>`__\nto verify the functionality of WebGPU on your browser.\n\nInstallation\n------------\n\nWebLLM offers a minimalist and modular interface to access the chatbot in the browser. The package is designed in a modular way to hook to any of the UI components.\n\nWebLLM is available as an `npm package <https://www.npmjs.com/package/@mlc-ai/web-llm>`_ and is also CDN-delivered. Therefore, you can install WebLLM using Node.js package managers like npm, yarn, or pnpm, or directly import the package via CDN.\n\nUsing Package Managers\n^^^^^^^^^^^^^^^^^^^^^^\nInstall WebLLM via your preferred package manager:\n\n.. code-block:: bash\n\n   # npm\n   npm install @mlc-ai/web-llm\n   # yarn\n   yarn add @mlc-ai/web-llm\n   # pnpm\n   pnpm install @mlc-ai/web-llm\n\nImport WebLLM into your project:\n\n.. code-block:: javascript\n\n   // Import everything\n   import * as webllm from \"@mlc-ai/web-llm\";\n\n   // Or only import what you need\n   import { CreateMLCEngine } from \"@mlc-ai/web-llm\";\n\nUsing CDN\n^^^^^^^^^\nThanks to `jsdelivr.com <https://www.jsdelivr.com/package/npm/@mlc-ai/web-llm>`_, WebLLM can be imported directly through URL and work out-of-the-box on cloud development platforms like `jsfiddle.net <https://jsfiddle.net/>`_, `Codepen.io <https://codepen.io/>`_, and `Scribbler <https://scribbler.live/>`_:\n\n.. 
code-block:: javascript\n\n   import * as webllm from \"https://esm.run/@mlc-ai/web-llm\";\n\nThis method is especially useful for online environments like CodePen, JSFiddle, or local experiments.\n\nVerifying Installation\n^^^^^^^^^^^^^^^^^^^^^^\nRun the following script to verify the installation:\n\n.. code-block:: javascript\n\n   import { CreateMLCEngine } from \"@mlc-ai/web-llm\";\n   console.log(\"WebLLM loaded successfully!\");\n\n\nOnline IDE Sandbox\n------------------\n\nInstead of setting up WebLLM locally, you can also try it on online JavaScript IDE sandboxes like:\n\n- `Example in JSFiddle <https://jsfiddle.net/neetnestor/4nmgvsa2/>`_\n- `Example in CodePen <https://codepen.io/neetnestor/pen/vYwgZaG>`_\n\n\n"
  },
  {
    "path": "eslint.config.cjs",
    "content": "const {\n    defineConfig,\n    globalIgnores,\n} = require(\"eslint/config\");\n\nconst tsParser = require(\"@typescript-eslint/parser\");\nconst typescriptEslint = require(\"@typescript-eslint/eslint-plugin\");\nconst js = require(\"@eslint/js\");\n\nconst {\n    FlatCompat,\n} = require(\"@eslint/eslintrc\");\n\nconst compat = new FlatCompat({\n    baseDirectory: __dirname,\n    recommendedConfig: js.configs.recommended,\n    allConfig: js.configs.all\n});\n\nmodule.exports = defineConfig([{\n    extends: compat.extends(\n        \"eslint:recommended\",\n        \"plugin:@typescript-eslint/recommended\",\n        \"plugin:prettier/recommended\",\n    ),\n\n    languageOptions: {\n        parser: tsParser,\n    },\n\n    plugins: {\n        \"@typescript-eslint\": typescriptEslint,\n    },\n\n    rules: {\n        \"@typescript-eslint/no-explicit-any\": \"off\",\n        \"@typescript-eslint/no-empty-function\": \"off\",\n        \"@typescript-eslint/no-non-null-assertion\": \"off\",\n    },\n}, {\n    files: [\"examples/**/*.js\", \"examples/**/*.ts\"],\n\n    \"rules\": {\n        \"no-undef\": \"off\",\n        \"@typescript-eslint/no-unused-vars\": \"off\",\n    },\n}, globalIgnores([\n    \"**/dist\",\n    \"**/debug\",\n    \"**/lib\",\n    \"**/build\",\n    \"**/node_modules\",\n    \"**/3rdparty\",\n    \"**/.eslintrc.cjs\",\n    \"**/.next\",\n])]);\n"
  },
  {
    "path": "examples/.gitignore",
    "content": "package-lock.json\n"
  },
  {
    "path": "examples/README.md",
    "content": "# Awesome WebLLM\n\nThis page contains a curated list of examples, tutorials, and blogs about WebLLM use cases.\nPlease send a pull request if you find things that belong here.\n\n## Example Projects\n\nNote that all examples below run in-browser and use WebGPU as a backend.\n\n#### Project List\n\n- [get-started](get-started): minimum get started example with chat completion.\n\n  [![Open in JSFiddle](https://img.shields.io/badge/open-JSFiddle-blue?logo=jsfiddle&logoColor=white)](https://jsfiddle.net/neetnestor/yac9gbwf/)\n  [![Open in Codepen](https://img.shields.io/badge/open-codepen-gainsboro?logo=codepen)](https://codepen.io/neetnestor/pen/NWVdgey)\n\n- [simple-chat-js](simple-chat-js): a minimum and complete chat bot app in vanilla JavaScript.\n\n  [![Open in JSFiddle](https://img.shields.io/badge/open-JSFiddle-blue?logo=jsfiddle&logoColor=white)](https://jsfiddle.net/neetnestor/4nmgvsa2/)\n  [![Open in Codepen](https://img.shields.io/badge/open-codepen-gainsboro?logo=codepen)](https://codepen.io/neetnestor/pen/vYwgZaG)\n\n- [simple-chat-ts](simple-chat-ts): a minimum and complete chat bot app in TypeScript.\n- [get-started-web-worker](get-started-web-worker): same as get-started, but using web worker.\n- [next-simple-chat](next-simple-chat): a minimum and complete chat bot app with [Next.js](https://nextjs.org/).\n- [multi-round-chat](multi-round-chat): while APIs are functional, we internally optimize so that multi round chat usage can reuse KV cache\n- [text-completion](text-completion): demonstrates API `engine.completions.create()`, which is pure text completion with no conversation, as opposed to `engine.chat.completions.create()`\n- [embeddings](embeddings): demonstrates API `engine.embeddings.create()`, integration with `EmbeddingsInterface` and `MemoryVectorStore` of [Langchain.js](https://js.langchain.com), and RAG with Langchain.js using WebLLM for both LLM and Embedding in a single engine\n- [multi-models](multi-models): demonstrates 
loading multiple models in a single engine concurrently\n\n#### Advanced OpenAI API Capabilities\n\nThese examples demonstrate various capabilities via WebLLM's OpenAI-like API.\n\n- [streaming](streaming): return output as chunks in real-time in the form of an AsyncGenerator\n- [json-mode](json-mode): efficiently ensure output is in JSON format, see [OpenAI Reference](https://platform.openai.com/docs/guides/text-generation/chat-completions-api) for more.\n- [json-schema](json-schema): besides guaranteeing output to be in JSON, ensure output to adhere to a specific JSON schema specified by the user\n- [seed-to-reproduce](seed-to-reproduce): use seeding to ensure reproducible output with the `seed` field.\n- [function-calling](function-calling) (WIP): function calling with fields `tools` and `tool_choice` (with preliminary support).\n- [vision-model](vision-model): process request with image input using Vision Language Model (e.g. Phi3.5-vision)\n\n#### Chrome Extension\n\n- [chrome-extension](chrome-extension): chrome extension that does not have a persistent background\n- [chrome-extension-webgpu-service-worker](chrome-extension-webgpu-service-worker): chrome extension using service worker, hence having a persistent background\n\n#### Others\n\n- [logit-processor](logit-processor): while `logit_bias` is supported, we additionally support stateful logit processing where users can specify their own rules. We also expose low-level API `forwardTokensAndSample()`.\n- [cache-usage](cache-usage): demonstrates how WebLLM supports both the [Cache API](https://developer.mozilla.org/en-US/docs/Web/API/Cache) and [IndexedDB cache](https://developer.mozilla.org/en-US/docs/Web/API/IndexedDB_API), and\n  users can pick with `appConfig.useIndexedDBCache`. 
Also demonstrates various cache utils such as checking\n  whether a model is cached, deleting a model's weights from cache, deleting a model library wasm from cache, etc.\n- [simple-chat-upload](simple-chat-upload): demonstrates how to upload local models to WebLLM instead of downloading via a URL link\n\n## Demo Spaces\n\n- [web-llm-embed](https://huggingface.co/spaces/matthoffner/web-llm-embed): document chat prototype using react-llm with transformers.js embeddings\n- [DeVinci](https://x6occ-biaaa-aaaai-acqzq-cai.icp0.io/): AI chat app based on WebLLM and hosted on decentralized cloud platform\n"
  },
  {
    "path": "examples/abort-reload/README.md",
    "content": "# WebLLM Get Started App\n\nThis folder provides a demo for cancelling model fetching after calling `engine.reload()`.\n\n```bash\nnpm install\nnpm start\n```\n\nNote: if you would like to hack the WebLLM core package,\nyou can change the web-llm dependency to `\"file:../..\"` and follow the build-from-source\ninstructions in the project to build WebLLM locally. This option is only recommended\nif you would like to hack the WebLLM core package.\n"
  },
  {
    "path": "examples/abort-reload/package.json",
    "content": "{\n  \"name\": \"get-started\",\n  \"version\": \"0.1.0\",\n  \"private\": true,\n  \"scripts\": {\n    \"start\": \"parcel src/get_started.html  --port 8887\",\n    \"build\": \"parcel build src/get_started.html --dist-dir lib\"\n  },\n  \"devDependencies\": {\n    \"buffer\": \"^5.7.1\",\n    \"parcel\": \"^2.8.3\",\n    \"process\": \"^0.11.10\",\n    \"tslib\": \"^2.3.1\",\n    \"typescript\": \"^4.9.5\",\n    \"url\": \"^0.11.3\"\n  },\n  \"dependencies\": {\n    \"@mlc-ai/web-llm\": \"^0.2.82\"\n  }\n}\n"
  },
  {
    "path": "examples/abort-reload/src/get_started.html",
    "content": "<!doctype html>\n<html>\n  <script>\n    webLLMGlobal = {};\n  </script>\n  <body>\n    <h2>WebLLM Test Page</h2>\n    Open console to see output\n    <br />\n    <br />\n    <label id=\"init-label\"> </label>\n\n    <h3>Prompt</h3>\n    <label id=\"prompt-label\"> </label>\n\n    <h3>Response</h3>\n    <label id=\"generate-label\"> </label>\n    <br />\n    <label id=\"stats-label\"> </label>\n\n    <script type=\"module\" src=\"./get_started.js\"></script>\n  </body>\n</html>\n"
  },
  {
    "path": "examples/abort-reload/src/get_started.js",
    "content": "import * as webllm from \"@mlc-ai/web-llm\";\nimport { error } from \"loglevel\";\n\nlet engine;\n\nfunction setLabel(id, text) {\n  const label = document.getElementById(id);\n  if (label == null) {\n    throw Error(\"Cannot find label \" + id);\n  }\n  label.innerText = text;\n}\n\nasync function main() {\n  const initProgressCallback = (report) => {\n    console.log(report.text);\n    setLabel(\"init-label\", report.text);\n  };\n  // Option 1: If we do not specify appConfig, we use `prebuiltAppConfig` defined in `config.ts`\n  const selectedModel = \"Llama-3.1-8B-Instruct-q4f32_1-MLC\";\n  engine = new webllm.MLCEngine({\n    initProgressCallback,\n  });\n  engine.reload(selectedModel);\n}\nmain();\nsetTimeout(() => {\n  console.log(\"calling unload\");\n  engine.unload().catch((err) => {\n    console.log(err);\n  });\n}, 5000);\n"
  },
  {
    "path": "examples/cache-usage/README.md",
    "content": "# WebLLM Cache Usage\n\nWebLLM supports both the Cache API and IndexedDB, which you can specify via `AppConfig.useIndexedDBCache`.\nThis folder provides an example on how Cache and IndexedDB Cache are used in WebLLM. We also\ndemonstrate the utility cache functions such as deleting models, checking if models are in cache, etc.\n\nFor more information about the two caches, see: https://developer.mozilla.org/en-US/docs/Web/API/Storage_API/Storage_quotas_and_eviction_criteria#what_technologies_store_data_in_the_browser.\n\nTo inspect the downloaded artifacts in your browser, open up developer console, go to application,\nand you will find the artifacts under either `IndexedDB` or `Cache storage`.\n\nTo run the example, you can do the following steps under this folder:\n\n```bash\nnpm install\nnpm start\n```\n\nNote: if you would like to hack the WebLLM core package,\nyou can change the web-llm dependency to `\"file:../..\"` and follow the build-from-source\ninstructions in the project to build WebLLM locally. This option is only recommended\nif you would like to hack the WebLLM core package.\n"
  },
  {
    "path": "examples/cache-usage/package.json",
    "content": "{\n  \"name\": \"cache-usage\",\n  \"version\": \"0.1.0\",\n  \"private\": true,\n  \"scripts\": {\n    \"start\": \"parcel src/cache_usage.html  --port 8888\",\n    \"build\": \"parcel build src/cache_usage.html --dist-dir lib\"\n  },\n  \"devDependencies\": {\n    \"buffer\": \"^5.7.1\",\n    \"parcel\": \"^2.8.3\",\n    \"process\": \"^0.11.10\",\n    \"tslib\": \"^2.3.1\",\n    \"typescript\": \"^4.9.5\",\n    \"url\": \"^0.11.3\"\n  },\n  \"dependencies\": {\n    \"@mlc-ai/web-llm\": \"^0.2.82\"\n  }\n}\n"
  },
  {
    "path": "examples/cache-usage/src/cache_usage.html",
    "content": "<!doctype html>\n<html>\n  <script>\n    webLLMGlobal = {};\n  </script>\n\n  <body>\n    <h2>WebLLM Test Page</h2>\n    Open console to see output\n    <br />\n    <br />\n    <label id=\"init-label\"> </label>\n\n    <h3>Prompt</h3>\n    <label id=\"prompt-label\"> </label>\n\n    <h3>Response</h3>\n    <label id=\"generate-label\"> </label>\n    <br />\n    <label id=\"stats-label\"> </label>\n\n    <script type=\"module\" src=\"./cache_usage.ts\"></script>\n  </body>\n</html>\n"
  },
  {
    "path": "examples/cache-usage/src/cache_usage.ts",
    "content": "import * as webllm from \"@mlc-ai/web-llm\";\n\nfunction setLabel(id: string, text: string) {\n  const label = document.getElementById(id);\n  if (label == null) {\n    throw Error(\"Cannot find label \" + id);\n  }\n  label.innerText = text;\n}\n\nconst initProgressCallback = (report: webllm.InitProgressReport) => {\n  setLabel(\"init-label\", report.text);\n};\n\nasync function main() {\n  const appConfig = webllm.prebuiltAppConfig;\n  // CHANGE THIS TO SEE EFFECTS OF BOTH, CODE BELOW DO NOT NEED TO CHANGE\n  appConfig.useIndexedDBCache = true;\n\n  if (appConfig.useIndexedDBCache) {\n    console.log(\"Using IndexedDB Cache\");\n  } else {\n    console.log(\"Using Cache API\");\n  }\n\n  // 1. This triggers downloading and caching the model with either Cache or IndexedDB Cache\n  const selectedModel = \"phi-2-q4f16_1-MLC\";\n  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(\n    selectedModel,\n    { initProgressCallback: initProgressCallback, appConfig: appConfig },\n  );\n\n  const request: webllm.ChatCompletionRequest = {\n    stream: false,\n    messages: [\n      {\n        role: \"user\",\n        content: \"Write an analogy between mathematics and a lighthouse.\",\n      },\n    ],\n    n: 1,\n  };\n  let reply = await engine.chat.completions.create(request);\n  console.log(reply);\n\n  // 2. Check whether model weights are cached\n  let modelCached = await webllm.hasModelInCache(selectedModel, appConfig);\n  console.log(\"hasModelInCache: \", modelCached);\n  if (!modelCached) {\n    throw Error(\"Expect hasModelInCache() to be true, but got: \" + modelCached);\n  }\n\n  // 3. We reload, and we should see this time it is much faster because the weights are cached.\n  console.log(\"Reload model start\");\n  await engine.reload(selectedModel);\n  console.log(\"Reload model end\");\n  reply = await engine.chat.completions.create(request);\n  console.log(reply);\n\n  // 4. 
Delete every thing about this model from cache\n  // You can also delete only the model library wasm, only the model weights, or only the config file\n  await webllm.deleteModelAllInfoInCache(selectedModel, appConfig);\n  modelCached = await webllm.hasModelInCache(selectedModel, appConfig);\n  console.log(\"After deletion, hasModelInCache: \", modelCached);\n  if (modelCached) {\n    throw Error(\n      \"Expect hasModelInCache() to be false, but got: \" + modelCached,\n    );\n  }\n\n  // 5. If we reload, we should expect the model to start downloading again\n  console.log(\"Reload model start\");\n  await engine.reload(selectedModel);\n  console.log(\"Reload model end\");\n  reply = await engine.chat.completions.create(request);\n  console.log(reply);\n}\n\nmain();\n"
  },
  {
    "path": "examples/chrome-extension/README.md",
    "content": "# WebLLM Chrome Extension\n\n![Chrome Extension](https://github.com/mlc-ai/mlc-llm/assets/11940172/0d94cc73-eff1-4128-a6e4-70dc879f04e0)\n\nTo run the extension, do the following steps under this folder\n\n```bash\nnpm install\nnpm run build\n```\n\nThis will create a new directory at `chrome-extension/dist/`. To load the extension into Chrome, go to Extensions > Manage Extensions and select Load Unpacked. Add the `chrome-extension/dist/` directory. You can now pin the extension to your toolbar and use the drop-down menu to chat with your favorite model!\n"
  },
  {
    "path": "examples/chrome-extension/package.json",
    "content": "{\n  \"name\": \"chrome-extension\",\n  \"version\": \"1.0.1\",\n  \"description\": \"\",\n  \"private\": true,\n  \"scripts\": {\n    \"build\": \"parcel build src/manifest.json --config @parcel/config-webextension\"\n  },\n  \"author\": \"\",\n  \"license\": \"ISC\",\n  \"devDependencies\": {\n    \"@parcel/config-webextension\": \"^2.9.3\",\n    \"@types/chrome\": \"^0.0.242\",\n    \"buffer\": \"^6.0.3\",\n    \"parcel\": \"^2.9.3\",\n    \"process\": \"^0.11.10\",\n    \"url\": \"^0.11.1\"\n  },\n  \"dependencies\": {\n    \"@mlc-ai/web-llm\": \"^0.2.82\",\n    \"progressbar.js\": \"^1.1.0\"\n  }\n}\n"
  },
  {
    "path": "examples/chrome-extension/src/content.js",
    "content": "// Only the content script is able to access the DOM\nchrome.runtime.onConnect.addListener(function (port) {\n  port.onMessage.addListener(function (msg) {\n    port.postMessage({ contents: document.body.innerText });\n  });\n});\n"
  },
  {
    "path": "examples/chrome-extension/src/example.html",
    "content": "In the year 2154, humanity had colonized several planets in the distant reaches\nof the galaxy. The planet of Xylophia-IV was one of the most remote and\ninhospitable, with temperatures often dropping to -200 degrees Celsius. Despite\nthese harsh conditions, a team of scientists had established a research station\non the planet to study the unique geological formations and exotic flora and\nfauna. One day, while conducting a routine survey of the planet's surface, the\nteam discovered an strange object buried deep in the ice. As they examined it\ncloser, they realized it was a small, metallic capsule with a glowing blue\nsymbol etched onto its surface. The team's leader, a brilliant scientist named\nDr. Maria Rodriguez, was immediately intrigued by the capsule's mysterious\norigins. She ordered her team to bring it back to the research station for\nfurther analysis. After weeks of studying the capsule, the team finally cracked\nthe code to the symbol etched onto its surface. It was a message from an alien\nrace, warning Earth of an impending attack from an unknown threat. The team was\nshocked and dismayed by the news, but they knew they had to act quickly to warn\nthe rest of humanity. They transmitted the message to the nearest space station,\nwhich relayed it to Earth's government. As the threat of attack loomed near, the\nteam remained on high alert, ready to face whatever dangers lay ahead. They had\nuncovered a secrets of the universe, and now they were determined to protect\ntheir planet and its inhabitants at all costs.\n"
  },
  {
    "path": "examples/chrome-extension/src/manifest.json",
    "content": "{\n  \"manifest_version\": 3,\n  \"name\": \"MLCBot\",\n  \"version\": \"0.1.1\",\n  \"description\": \"Chat with your browser\",\n  \"icons\": {\n    \"16\": \"icons/icon-16.png\",\n    \"32\": \"icons/icon-32.png\",\n    \"64\": \"icons/icon-64.png\",\n    \"128\": \"icons/icon-128.png\"\n  },\n  \"content_security_policy\": {\n    \"extension_pages\": \"style-src-elem 'self' https://cdnjs.cloudflare.com; font-src 'self' https://cdnjs.cloudflare.com; script-src 'self' 'wasm-unsafe-eval'; default-src 'self' data:; connect-src 'self' data: http://localhost:8000 https://huggingface.co https://cdn-lfs.huggingface.co https://cdn-lfs-us-1.huggingface.co https://raw.githubusercontent.com https://cdn-lfs-us-1.hf.co https://cas-bridge.xethub.hf.co\"\n  },\n  \"action\": {\n    \"default_title\": \"MLCBot\",\n    \"default_popup\": \"popup.html\"\n  },\n  \"content_scripts\": [\n    {\n      \"matches\": [\"<all_urls>\"],\n      \"js\": [\"content.js\"]\n    }\n  ],\n  \"permissions\": [\"storage\", \"tabs\", \"webNavigation\", \"activeTab\", \"scripting\"],\n  \"host_permissions\": [\"http://*/\", \"https://*/\"]\n}\n"
  },
  {
    "path": "examples/chrome-extension/src/manifest_v2.json",
    "content": "{\n  \"manifest_version\": 2,\n  \"name\": \"MLCBot\",\n  \"version\": \"0.1.0\",\n  \"description\": \"Chat with your browser\",\n  \"icons\": {\n    \"16\": \"icons/icon-16.png\",\n    \"32\": \"icons/icon-32.png\",\n    \"64\": \"icons/icon-64.png\",\n    \"128\": \"icons/icon-128.png\"\n  },\n  \"content_security_policy\": \"style-src-elem 'self' https://cdnjs.cloudflare.com; font-src 'self' https://cdnjs.cloudflare.com; script-src 'self' 'unsafe-eval' 'wasm-unsafe-eval'; default-src 'self' data:; connect-src 'self' data: http://localhost:8000 https://huggingface.co https://cdn-lfs.huggingface.co https://raw.githubusercontent.com https://cdn-lfs-us-1.hf.co https://cas-bridge.xethub.hf.co\",\n  \"browser_action\": {\n    \"default_popup\": \"popup.html\"\n  },\n  \"content_scripts\": [\n    {\n      \"matches\": [\"<all_urls>\"],\n      \"js\": [\"content.js\"]\n    }\n  ],\n  \"permissions\": [\"storage\", \"tabs\", \"webNavigation\", \"activeTab\"]\n}\n"
  },
  {
    "path": "examples/chrome-extension/src/popup.css",
    "content": "*,\n*::before,\n*::after {\n  margin: 0;\n  padding: 0;\n  box-sizing: border-box;\n}\n\nhtml {\n  font-family:\n    -apple-system,\n    BlinkMacSystemFont,\n    Segoe UI,\n    Helvetica,\n    Arial,\n    sans-serif;\n  color: #222;\n}\n\nbody {\n  margin: 0;\n  padding: 0.5rem;\n  background-color: #778da9;\n  width: 335px;\n  font-size: small;\n}\n\np {\n  margin: 0;\n}\n\n/* LOADING BAR */\n#loadingContainer {\n  margin-bottom: 15px;\n  width: 315px;\n  height: 8px;\n}\n\n/* INPUT AREA */\n#query-input {\n  border: 1px solid #ccc;\n  border-radius: 4px;\n}\n\n.input-container {\n  display: flex;\n  flex-direction: row;\n  align-items: center;\n}\n\n.input-container input {\n  width: 100%;\n  outline: none;\n  padding: 0.5rem;\n  margin-right: 0.5rem;\n}\n\n/* BUTTON */\n.btn {\n  background-color: #1b263b;\n  color: white;\n  font-size: small;\n  cursor: pointer;\n  border-radius: 4px;\n  border: none;\n  padding: 0.5rem;\n}\n\n.btn:hover {\n  background-color: #d0d0d0;\n}\n\n.btn:disabled {\n  background-color: #a7a7a7;\n  color: rgb(255, 255, 255);\n  cursor: default;\n}\n\n.btn img {\n  width: 1rem;\n  height: 1rem;\n}\n\n/* LOADING */\n\n.stage {\n  display: flex;\n  justify-content: center;\n  align-items: center;\n  position: relative;\n  margin: 0 -5%;\n  overflow: hidden;\n}\n\n#loading-indicator {\n  display: none;\n  color: white;\n  margin-top: 0.5rem;\n}\n\n.dot-flashing {\n  position: relative;\n  width: 10px;\n  height: 10px;\n  border-radius: 5px;\n  background-color: #1b263b;\n  color: #1b263b;\n  animation: dot-flashing 0.4s infinite linear alternate;\n  animation-delay: 0.2s;\n}\n\n.dot-flashing::before,\n.dot-flashing::after {\n  content: \"\";\n  display: inline-block;\n  position: absolute;\n  top: 0;\n}\n\n.dot-flashing::before {\n  left: -15px;\n  width: 10px;\n  height: 10px;\n  border-radius: 5px;\n  background-color: #1b263b;\n  color: #1b263b;\n  animation: dot-flashing 0.4s infinite alternate;\n  animation-delay: 
0s;\n}\n\n.dot-flashing::after {\n  left: 15px;\n  width: 10px;\n  height: 10px;\n  border-radius: 5px;\n  background-color: #1b263b;\n  color: #1b263b;\n  animation: dot-flashing 0.4s infinite alternate;\n  animation-delay: 0.4s;\n}\n\n@keyframes dot-flashing {\n  0% {\n    background-color: #1b263b;\n  }\n\n  50%,\n  100% {\n    background-color: #415a77;\n  }\n}\n\n/* ANSWERS */\n#queriesAnswersContainer {\n  display: block;\n  color: white;\n  margin-top: 0.5rem;\n}\n\n#answer {\n  color: #333333;\n}\n\n#answerWrapper {\n  display: none;\n  background-color: #ffd166;\n  border-radius: 8px;\n  padding: 0.5rem;\n  margin-top: 0.5rem;\n}\n\n.queriesAnswers {\n  border-radius: 8px;\n  background-color: #ffd166;\n  padding: 0.5rem;\n  color: #333333;\n}\n\n#lastQuery {\n  color: rgb(188, 188, 188);\n}\n\n#lastAnswer {\n  color: white;\n  margin-top: 0.5rem;\n}\n\n#lastRequest {\n  padding: 0.5rem;\n  margin-top: 0.5rem;\n  background-color: #333333;\n  border-radius: 4px;\n}\n\n/* ANSWER OPTIONS */\n.timeStamp {\n  color: #9a8c98;\n}\n\n.copyRow {\n  display: flex;\n  flex-direction: row;\n  align-items: end;\n  justify-content: space-between;\n  color: #a7a7a7;\n  margin-top: 0.5rem;\n}\n\n.copyText {\n  display: none;\n  color: #a7a7a7;\n  margin-right: 0.5rem;\n}\n\n.copyButton {\n  color: #415a77;\n  background-color: transparent;\n  border: none;\n  cursor: pointer;\n  padding: 0;\n  margin-left: 0.5rem;\n}\n\n.copyButton:hover {\n  color: #5e80a7;\n  background-color: transparent;\n}\n\n.removeButton {\n  color: #415a77;\n  background-color: transparent;\n  border: none;\n  cursor: pointer;\n  padding: 0;\n}\n\n.removeButton:hover {\n  color: #5e80a7;\n  background-color: transparent;\n}\n"
  },
  {
    "path": "examples/chrome-extension/src/popup.html",
    "content": "<!doctype html>\n<html lang=\"en\">\n  <head>\n    <meta charset=\"UTF-8\" />\n    <title>Chatbot</title>\n    <link rel=\"stylesheet\" href=\"popup.css\" />\n    <link\n      rel=\"stylesheet\"\n      href=\"https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css\"\n    />\n  </head>\n  <body>\n    <select id=\"model-selection\"></select>\n    <div id=\"loadingBox\">\n      <p id=\"init-label\">Initializing model...</p>\n      <div id=\"loadingContainer\"></div>\n    </div>\n    <p id=\"model-name\"></p>\n    <div class=\"input-container form-group\">\n      <input\n        type=\"search\"\n        id=\"query-input\"\n        placeholder=\"What's on your mind?\"\n      />\n      <button id=\"submit-button\" class=\"btn\">\n        <i class=\"fa fa-comments\"></i>\n      </button>\n    </div>\n\n    <div class=\"stage\">\n      <div id=\"loading-indicator\" class=\"dot-flashing\"></div>\n    </div>\n\n    <div id=\"answerWrapper\">\n      <div id=\"answer\"></div>\n      <div class=\"copyRow\">\n        <span id=\"timestamp\"></span>\n        <button\n          id=\"copyAnswer\"\n          class=\"btn copyButton\"\n          title=\"Copy the Answer to the Clipboard\"\n        >\n          <i class=\"fa-solid fa-copy fa-lg\"></i>\n        </button>\n      </div>\n    </div>\n\n    <script type=\"module\" src=\"./popup.ts\"></script>\n  </body>\n</html>\n"
  },
  {
    "path": "examples/chrome-extension/src/popup.ts",
    "content": "\"use strict\";\n\n// This code is partially adapted from the openai-chatgpt-chrome-extension repo:\n// https://github.com/jessedi0n/openai-chatgpt-chrome-extension\n\nimport \"./popup.css\";\n\nimport {\n  MLCEngineInterface,\n  InitProgressReport,\n  CreateMLCEngine,\n  ChatCompletionMessageParam,\n  prebuiltAppConfig,\n} from \"@mlc-ai/web-llm\";\nimport { ProgressBar, Line } from \"progressbar.js\";\n\n// modified setLabel to not throw error\nfunction setLabel(id: string, text: string) {\n  const label = document.getElementById(id);\n  if (label != null) {\n    label.innerText = text;\n  }\n}\n\nfunction getElementAndCheck(id: string): HTMLElement {\n  const element = document.getElementById(id);\n  if (element == null) {\n    throw Error(\"Cannot find element \" + id);\n  }\n  return element;\n}\n\nconst sleep = (ms: number) => new Promise((r) => setTimeout(r, ms));\n\nconst queryInput = getElementAndCheck(\"query-input\")!;\nconst submitButton = getElementAndCheck(\"submit-button\")!;\nconst modelName = getElementAndCheck(\"model-name\");\n\nlet context = \"\";\nlet modelDisplayName = \"\";\n\n// throws runtime.lastError if you refresh extension AND try to access a webpage that is already open\nfetchPageContents();\n\n(<HTMLButtonElement>submitButton).disabled = true;\n\nlet progressBar: ProgressBar = new Line(\"#loadingContainer\", {\n  strokeWidth: 4,\n  easing: \"easeInOut\",\n  duration: 1400,\n  color: \"#ffd166\",\n  trailColor: \"#eee\",\n  trailWidth: 1,\n  svgStyle: { width: \"100%\", height: \"100%\" },\n});\n\nlet isLoadingParams = true;\n\nlet initProgressCallback = (report: InitProgressReport) => {\n  setLabel(\"init-label\", report.text);\n  progressBar.animate(report.progress, {\n    duration: 50,\n  });\n  if (report.progress == 1.0) {\n    enableInputs();\n  }\n};\n\n// initially selected model\nlet selectedModel = \"Qwen2-0.5B-Instruct-q4f16_1-MLC\";\n\n// populate model-selection\nconst modelSelector = getElementAndCheck(\n  
\"model-selection\",\n) as HTMLSelectElement;\nfor (let i = 0; i < prebuiltAppConfig.model_list.length; ++i) {\n  const model = prebuiltAppConfig.model_list[i];\n  const opt = document.createElement(\"option\");\n  opt.value = model.model_id;\n  opt.innerHTML = model.model_id;\n  opt.selected = false;\n\n  // set initial selection as the initially selected model\n  if (model.model_id == selectedModel) {\n    opt.selected = true;\n  }\n\n  modelSelector.appendChild(opt);\n}\n\nmodelName.innerText = \"Loading initial model...\";\nconst engine: MLCEngineInterface = await CreateMLCEngine(selectedModel, {\n  initProgressCallback: initProgressCallback,\n});\nmodelName.innerText = \"Now chatting with \" + modelDisplayName;\n\nlet chatHistory: ChatCompletionMessageParam[] = [];\n\nfunction enableInputs() {\n  if (isLoadingParams) {\n    sleep(500);\n    isLoadingParams = false;\n  }\n\n  // remove loading bar and loading bar descriptors, if exists\n  const initLabel = document.getElementById(\"init-label\");\n  initLabel?.remove();\n  const loadingBarContainer = document.getElementById(\"loadingContainer\")!;\n  loadingBarContainer?.remove();\n  queryInput.focus();\n\n  const modelNameArray = selectedModel.split(\"-\");\n  modelDisplayName = modelNameArray[0];\n  let j = 1;\n  while (j < modelNameArray.length && modelNameArray[j][0] != \"q\") {\n    modelDisplayName = modelDisplayName + \"-\" + modelNameArray[j];\n    j++;\n  }\n}\n\nlet requestInProgress = false;\n\n// Disable submit button if input field is empty\nqueryInput.addEventListener(\"keyup\", () => {\n  if (\n    (<HTMLInputElement>queryInput).value === \"\" ||\n    requestInProgress ||\n    isLoadingParams\n  ) {\n    (<HTMLButtonElement>submitButton).disabled = true;\n  } else {\n    (<HTMLButtonElement>submitButton).disabled = false;\n  }\n});\n\n// If user presses enter, click submit button\nqueryInput.addEventListener(\"keyup\", (event) => {\n  if (event.code === \"Enter\") {\n    event.preventDefault();\n 
   submitButton.click();\n  }\n});\n\n// Listen for clicks on submit button\nasync function handleClick() {\n  requestInProgress = true;\n  (<HTMLButtonElement>submitButton).disabled = true;\n\n  // Get the message from the input field\n  const message = (<HTMLInputElement>queryInput).value;\n  console.log(\"message\", message);\n  // Clear the answer\n  document.getElementById(\"answer\")!.innerHTML = \"\";\n  // Hide the answer\n  document.getElementById(\"answerWrapper\")!.style.display = \"none\";\n  // Show the loading indicator\n  document.getElementById(\"loading-indicator\")!.style.display = \"block\";\n\n  // Generate response\n  let inp = message;\n  if (context.length > 0) {\n    inp =\n      \"Use only the following context when answering the question at the end. Don't use any other knowledge.\\n\" +\n      context +\n      \"\\n\\nQuestion: \" +\n      message +\n      \"\\n\\nHelpful Answer: \";\n  }\n  console.log(\"Input:\", inp);\n  chatHistory.push({ role: \"user\", content: inp });\n\n  let curMessage = \"\";\n  const completion = await engine.chat.completions.create({\n    stream: true,\n    messages: chatHistory,\n  });\n  for await (const chunk of completion) {\n    const curDelta = chunk.choices[0].delta.content;\n    if (curDelta) {\n      curMessage += curDelta;\n    }\n    updateAnswer(curMessage);\n  }\n  const response = await engine.getMessage();\n  chatHistory.push({ role: \"assistant\", content: await engine.getMessage() });\n  console.log(\"response\", response);\n\n  requestInProgress = false;\n  (<HTMLButtonElement>submitButton).disabled = false;\n}\nsubmitButton.addEventListener(\"click\", handleClick);\n\n// listen for changes in modelSelector\nasync function handleSelectChange() {\n  if (isLoadingParams) {\n    return;\n  }\n\n  modelName.innerText = \"\";\n\n  const initLabel = document.createElement(\"p\");\n  initLabel.id = \"init-label\";\n  initLabel.innerText = \"Initializing model...\";\n  const loadingContainer = 
document.createElement(\"div\");\n  loadingContainer.id = \"loadingContainer\";\n\n  const loadingBox = getElementAndCheck(\"loadingBox\");\n  loadingBox.appendChild(initLabel);\n  loadingBox.appendChild(loadingContainer);\n\n  isLoadingParams = true;\n  (<HTMLButtonElement>submitButton).disabled = true;\n\n  if (requestInProgress) {\n    engine.interruptGenerate();\n  }\n  engine.resetChat();\n  chatHistory = [];\n  await engine.unload();\n\n  selectedModel = modelSelector.value;\n\n  progressBar = new Line(\"#loadingContainer\", {\n    strokeWidth: 4,\n    easing: \"easeInOut\",\n    duration: 1400,\n    color: \"#ffd166\",\n    trailColor: \"#eee\",\n    trailWidth: 1,\n    svgStyle: { width: \"100%\", height: \"100%\" },\n  });\n\n  initProgressCallback = (report: InitProgressReport) => {\n    setLabel(\"init-label\", report.text);\n    progressBar.animate(report.progress, {\n      duration: 50,\n    });\n    if (report.progress == 1.0) {\n      enableInputs();\n    }\n  };\n\n  engine.setInitProgressCallback(initProgressCallback);\n\n  requestInProgress = true;\n  modelName.innerText = \"Reloading with new model...\";\n  await engine.reload(selectedModel);\n  requestInProgress = false;\n  modelName.innerText = \"Now chatting with \" + modelDisplayName;\n}\nmodelSelector.addEventListener(\"change\", handleSelectChange);\n\n// Listen for messages from the background script\nchrome.runtime.onMessage.addListener(({ answer, error }) => {\n  if (answer) {\n    updateAnswer(answer);\n  }\n});\n\nfunction updateAnswer(answer: string) {\n  // Show answer\n  document.getElementById(\"answerWrapper\")!.style.display = \"block\";\n  const answerWithBreaks = answer.replace(/\\n/g, \"<br>\");\n  document.getElementById(\"answer\")!.innerHTML = answerWithBreaks;\n  // Add event listener to copy button\n  document.getElementById(\"copyAnswer\")!.addEventListener(\"click\", () => {\n    // Get the answer text\n    const answerText = answer;\n    // Copy the answer text to the 
clipboard\n    navigator.clipboard\n      .writeText(answerText)\n      .then(() => console.log(\"Answer text copied to clipboard\"))\n      .catch((err) => console.error(\"Could not copy text: \", err));\n  });\n  const options: Intl.DateTimeFormatOptions = {\n    month: \"short\",\n    day: \"2-digit\",\n    hour: \"2-digit\",\n    minute: \"2-digit\",\n    second: \"2-digit\",\n  };\n  const time = new Date().toLocaleString(\"en-US\", options);\n  // Update timestamp\n  document.getElementById(\"timestamp\")!.innerText = time;\n  // Hide loading indicator\n  document.getElementById(\"loading-indicator\")!.style.display = \"none\";\n}\n\nfunction fetchPageContents() {\n  chrome.tabs.query({ currentWindow: true, active: true }, function (tabs) {\n    const port = chrome.tabs.connect(tabs[0].id, { name: \"channelName\" });\n    port.postMessage({});\n    port.onMessage.addListener(function (msg) {\n      console.log(\"Page contents:\", msg.contents);\n      context = msg.contents;\n    });\n  });\n}\n"
  },
  {
    "path": "examples/chrome-extension-webgpu-service-worker/README.md",
    "content": "# WebLLM Chrome Extension using WebGPU Running on Service Worker\n\n![Chrome Extension](https://github.com/mlc-ai/mlc-llm/assets/11940172/0d94cc73-eff1-4128-a6e4-70dc879f04e0)\n\n> [!WARNING]  \n> Service worker support in WebGPU is enabled by default in [Chrome 124](https://chromiumdash.appspot.com/commit/8d78510e4aca5ac3cd8ee4a33e96b404eaa43246).\n> If you are using Chrome 123, go to `chrome://flags/#enable-experimental-web-platform-features`, enable the `#enable-experimental-web-platform-features` flag, and **relaunch the browser**.\n\nThis example shows how we can create a Chrome extension using WebGPU and service worker.\n\n- The project structure is as follows:\n  - `manifest.json`: A required file that lists important information about the structure and behavior of that extension. Here we are using manifest V3.\n  - `popup.ts`: Script of the extension pop-up window.\n  - `background.ts`: Script of the service worker. An extension service worker is loaded when it is needed, and unloaded when it goes dormant.\n  - `content.js`: Content script that interacts with DOM.\n- Run\n\n  ```bash\n  npm install\n  npm run build\n  ```\n\n  This will create a new directory at `./dist/`. To load the extension into Chrome, go to Extensions > Manage Extensions and select Load Unpacked. Add the `./dist/` directory. You can now pin the extension to your toolbar and use it to chat with your favorite model!\n\n**Note**: This example disables chatting using the contents of the active tab by default.\nTo enable it, set `useContext` in `popup.ts` to `true`. More info about this feature can be found\n[here](https://github.com/mlc-ai/web-llm/pull/190).\nHowever, if the web content is too large, it might run into issues. We recommend using `example.html` to\ntest this feature.\n"
  },
  {
    "path": "examples/chrome-extension-webgpu-service-worker/package.json",
    "content": "{\n  \"name\": \"chrome-extension\",\n  \"version\": \"1.0.0\",\n  \"description\": \"\",\n  \"private\": true,\n  \"scripts\": {\n    \"build\": \"parcel build src/manifest.json --config @parcel/config-webextension\"\n  },\n  \"author\": \"\",\n  \"license\": \"ISC\",\n  \"devDependencies\": {\n    \"@parcel/config-webextension\": \"^2.9.3\",\n    \"@types/chrome\": \"^0.0.242\",\n    \"buffer\": \"^6.0.3\",\n    \"parcel\": \"^2.9.3\",\n    \"process\": \"^0.11.10\",\n    \"url\": \"^0.11.1\"\n  },\n  \"dependencies\": {\n    \"@mlc-ai/web-llm\": \"^0.2.82\",\n    \"progressbar.js\": \"^1.1.0\"\n  }\n}\n"
  },
  {
    "path": "examples/chrome-extension-webgpu-service-worker/src/background.ts",
    "content": "import { ExtensionServiceWorkerMLCEngineHandler } from \"@mlc-ai/web-llm\";\n\n// Hookup an engine to a service worker handler\nlet handler;\n\nchrome.runtime.onConnect.addListener(function (port) {\n  console.assert(port.name === \"web_llm_service_worker\");\n  if (handler === undefined) {\n    handler = new ExtensionServiceWorkerMLCEngineHandler(port);\n  } else {\n    handler.setPort(port);\n  }\n  port.onMessage.addListener(handler.onmessage.bind(handler));\n});\n"
  },
  {
    "path": "examples/chrome-extension-webgpu-service-worker/src/content.js",
    "content": "// Only the content script is able to access the DOM\nchrome.runtime.onConnect.addListener(function (port) {\n  port.onMessage.addListener(function (msg) {\n    port.postMessage({ contents: document.body.innerHTML });\n  });\n});\n"
  },
  {
    "path": "examples/chrome-extension-webgpu-service-worker/src/example.html",
    "content": "In the year 2154, humanity had colonized several planets in the distant reaches\nof the galaxy. The planet of Xylophia-IV was one of the most remote and\ninhospitable, with temperatures often dropping to -200 degrees Celsius. Despite\nthese harsh conditions, a team of scientists had established a research station\non the planet to study the unique geological formations and exotic flora and\nfauna. One day, while conducting a routine survey of the planet's surface, the\nteam discovered an strange object buried deep in the ice. As they examined it\ncloser, they realized it was a small, metallic capsule with a glowing blue\nsymbol etched onto its surface. The team's leader, a brilliant scientist named\nDr. Maria Rodriguez, was immediately intrigued by the capsule's mysterious\norigins. She ordered her team to bring it back to the research station for\nfurther analysis. After weeks of studying the capsule, the team finally cracked\nthe code to the symbol etched onto its surface. It was a message from an alien\nrace, warning Earth of an impending attack from an unknown threat. The team was\nshocked and dismayed by the news, but they knew they had to act quickly to warn\nthe rest of humanity. They transmitted the message to the nearest space station,\nwhich relayed it to Earth's government. As the threat of attack loomed near, the\nteam remained on high alert, ready to face whatever dangers lay ahead. They had\nuncovered a secrets of the universe, and now they were determined to protect\ntheir planet and its inhabitants at all costs.\n"
  },
  {
    "path": "examples/chrome-extension-webgpu-service-worker/src/manifest.json",
    "content": "{\n  \"manifest_version\": 3,\n  \"name\": \"MLCBot\",\n  \"version\": \"0.1.0\",\n  \"description\": \"Chat with your browser\",\n  \"icons\": {\n    \"16\": \"icons/icon-16.png\",\n    \"32\": \"icons/icon-32.png\",\n    \"64\": \"icons/icon-64.png\",\n    \"128\": \"icons/icon-128.png\"\n  },\n  \"content_security_policy\": {\n    \"extension_pages\": \"style-src-elem 'self' https://cdnjs.cloudflare.com; font-src 'self' https://cdnjs.cloudflare.com; script-src 'self' 'wasm-unsafe-eval'; default-src 'self' data:; connect-src 'self' data: http://localhost:8000 https://huggingface.co https://cdn-lfs.huggingface.co https://cdn-lfs-us-1.huggingface.co https://raw.githubusercontent.com https://cdn-lfs-us-1.hf.co https://cas-bridge.xethub.hf.co\"\n  },\n  \"action\": {\n    \"default_title\": \"MLCBot\",\n    \"default_popup\": \"popup.html\"\n  },\n  \"content_scripts\": [\n    {\n      \"matches\": [\"<all_urls>\"],\n      \"js\": [\"content.js\"]\n    }\n  ],\n  \"background\": {\n    \"service_worker\": \"background.ts\",\n    \"type\": \"module\"\n  },\n  \"permissions\": [\"storage\", \"tabs\", \"webNavigation\"]\n}\n"
  },
  {
    "path": "examples/chrome-extension-webgpu-service-worker/src/popup.css",
    "content": "*,\n*::before,\n*::after {\n  margin: 0;\n  padding: 0;\n  box-sizing: border-box;\n}\n\nhtml {\n  font-family:\n    -apple-system,\n    BlinkMacSystemFont,\n    Segoe UI,\n    Helvetica,\n    Arial,\n    sans-serif;\n  color: #222;\n}\n\nbody {\n  margin: 0;\n  padding: 0.5rem;\n  background-color: #778da9;\n  width: 320px;\n  font-size: small;\n}\n\np {\n  margin: 0;\n}\n\n/* LOADING BAR */\n#loadingContainer {\n  margin-bottom: 15px;\n  width: 300px;\n  height: 8px;\n}\n\n/* INPUT AREA */\n#query-input {\n  border: 1px solid #ccc;\n  border-radius: 4px;\n}\n\n.input-container {\n  display: flex;\n  flex-direction: row;\n  align-items: center;\n}\n\n.input-container input {\n  width: 100%;\n  outline: none;\n  padding: 0.5rem;\n  margin-right: 0.5rem;\n}\n\n/* SUBMIT BUTTON */\n.btn {\n  background-color: #1b263b;\n  color: white;\n  font-size: small;\n  cursor: pointer;\n  border-radius: 4px;\n  border: none;\n  padding: 0.5rem;\n}\n\n.btn:hover {\n  background-color: #d0d0d0;\n}\n\n.btn:disabled {\n  background-color: #a7a7a7;\n  color: rgb(255, 255, 255);\n  cursor: default;\n}\n\n.btn img {\n  width: 1rem;\n  height: 1rem;\n}\n\n/* LOADING */\n\n.stage {\n  display: flex;\n  justify-content: center;\n  align-items: center;\n  position: relative;\n  margin: 0 -5%;\n  overflow: hidden;\n}\n\n#loading-indicator {\n  display: none;\n  color: white;\n  margin-top: 0.5rem;\n}\n\n.dot-flashing {\n  position: relative;\n  width: 10px;\n  height: 10px;\n  border-radius: 5px;\n  background-color: #1b263b;\n  color: #1b263b;\n  animation: dot-flashing 0.4s infinite linear alternate;\n  animation-delay: 0.2s;\n}\n\n.dot-flashing::before,\n.dot-flashing::after {\n  content: \"\";\n  display: inline-block;\n  position: absolute;\n  top: 0;\n}\n\n.dot-flashing::before {\n  left: -15px;\n  width: 10px;\n  height: 10px;\n  border-radius: 5px;\n  background-color: #1b263b;\n  color: #1b263b;\n  animation: dot-flashing 0.4s infinite alternate;\n  
animation-delay: 0s;\n}\n\n.dot-flashing::after {\n  left: 15px;\n  width: 10px;\n  height: 10px;\n  border-radius: 5px;\n  background-color: #1b263b;\n  color: #1b263b;\n  animation: dot-flashing 0.4s infinite alternate;\n  animation-delay: 0.4s;\n}\n\n@keyframes dot-flashing {\n  0% {\n    background-color: #1b263b;\n  }\n\n  50%,\n  100% {\n    background-color: #415a77;\n  }\n}\n\n/* ANSWERS */\n#queriesAnswersContainer {\n  display: block;\n  color: white;\n  margin-top: 0.5rem;\n}\n\n#answer {\n  color: #333333;\n}\n\n#answerWrapper {\n  display: none;\n  background-color: #ffd166;\n  border-radius: 8px;\n  padding: 0.5rem;\n  margin-top: 0.5rem;\n}\n\n.queriesAnswers {\n  border-radius: 8px;\n  background-color: #ffd166;\n  padding: 0.5rem;\n  color: #333333;\n}\n\n#lastQuery {\n  color: rgb(188, 188, 188);\n}\n\n#lastAnswer {\n  color: white;\n  margin-top: 0.5rem;\n}\n\n#lastRequest {\n  padding: 0.5rem;\n  margin-top: 0.5rem;\n  background-color: #333333;\n  border-radius: 4px;\n}\n\n/* ANSWER OPTIONS */\n.timeStamp {\n  color: #9a8c98;\n}\n\n.copyRow {\n  display: flex;\n  flex-direction: row;\n  align-items: end;\n  justify-content: space-between;\n  color: #a7a7a7;\n  margin-top: 0.5rem;\n}\n\n.copyText {\n  display: none;\n  color: #a7a7a7;\n  margin-right: 0.5rem;\n}\n\n.copyButton {\n  color: #415a77;\n  background-color: transparent;\n  border: none;\n  cursor: pointer;\n  padding: 0;\n  margin-left: 0.5rem;\n}\n\n.copyButton:hover {\n  color: #5e80a7;\n  background-color: transparent;\n}\n\n.removeButton {\n  color: #415a77;\n  background-color: transparent;\n  border: none;\n  cursor: pointer;\n  padding: 0;\n}\n\n.removeButton:hover {\n  color: #5e80a7;\n  background-color: transparent;\n}\n"
  },
  {
    "path": "examples/chrome-extension-webgpu-service-worker/src/popup.html",
    "content": "<!doctype html>\n<html lang=\"en\">\n  <head>\n    <meta charset=\"UTF-8\" />\n    <title>Chatbot</title>\n    <link rel=\"stylesheet\" href=\"popup.css\" />\n    <link\n      rel=\"stylesheet\"\n      href=\"https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css\"\n    />\n  </head>\n  <body>\n    <div id=\"loadingContainer\"></div>\n\n    <div class=\"input-container form-group\">\n      <input\n        type=\"search\"\n        id=\"query-input\"\n        placeholder=\"What's on your mind?\"\n      />\n      <button id=\"submit-button\" class=\"btn\">\n        <i class=\"fa fa-comments\"></i>\n      </button>\n    </div>\n\n    <div class=\"stage\">\n      <div id=\"loading-indicator\" class=\"dot-flashing\"></div>\n    </div>\n\n    <div id=\"answerWrapper\">\n      <div id=\"answer\"></div>\n      <div class=\"copyRow\">\n        <span id=\"timestamp\"></span>\n        <button\n          id=\"copyAnswer\"\n          class=\"btn copyButton\"\n          title=\"Copy the Answer to the Clipboard\"\n        >\n          <i class=\"fa-solid fa-copy fa-lg\"></i>\n        </button>\n      </div>\n    </div>\n\n    <script type=\"module\" src=\"./popup.ts\"></script>\n  </body>\n</html>\n"
  },
  {
    "path": "examples/chrome-extension-webgpu-service-worker/src/popup.ts",
    "content": "\"use strict\";\n\n// This code is partially adapted from the openai-chatgpt-chrome-extension repo:\n// https://github.com/jessedi0n/openai-chatgpt-chrome-extension\n\nimport \"./popup.css\";\n\nimport {\n  ChatCompletionMessageParam,\n  CreateExtensionServiceWorkerMLCEngine,\n  MLCEngineInterface,\n  InitProgressReport,\n} from \"@mlc-ai/web-llm\";\nimport { ProgressBar, Line } from \"progressbar.js\";\n\n/***************** UI elements *****************/\n// Whether or not to use the content from the active tab as the context\nconst useContext = false;\nconst sleep = (ms: number) => new Promise((r) => setTimeout(r, ms));\n\nconst queryInput = document.getElementById(\"query-input\")!;\nconst submitButton = document.getElementById(\"submit-button\")!;\n\nlet isLoadingParams = false;\n\n(<HTMLButtonElement>submitButton).disabled = true;\n\nconst progressBar: ProgressBar = new Line(\"#loadingContainer\", {\n  strokeWidth: 4,\n  easing: \"easeInOut\",\n  duration: 1400,\n  color: \"#ffd166\",\n  trailColor: \"#eee\",\n  trailWidth: 1,\n  svgStyle: { width: \"100%\", height: \"100%\" },\n});\n\n/***************** Web-LLM MLCEngine Configuration *****************/\nconst initProgressCallback = (report: InitProgressReport) => {\n  progressBar.animate(report.progress, {\n    duration: 50,\n  });\n  if (report.progress == 1.0) {\n    enableInputs();\n  }\n};\n\nconst engine: MLCEngineInterface = await CreateExtensionServiceWorkerMLCEngine(\n  \"Qwen2-0.5B-Instruct-q4f16_1-MLC\",\n  { initProgressCallback: initProgressCallback },\n);\nconst chatHistory: ChatCompletionMessageParam[] = [];\n\nisLoadingParams = true;\n\nfunction enableInputs() {\n  if (isLoadingParams) {\n    sleep(500);\n    (<HTMLButtonElement>submitButton).disabled = false;\n    const loadingBarContainer = document.getElementById(\"loadingContainer\")!;\n    loadingBarContainer.remove();\n    queryInput.focus();\n    isLoadingParams = false;\n  }\n}\n\n/***************** Event Listeners 
*****************/\n\n// Disable submit button if input field is empty\nqueryInput.addEventListener(\"keyup\", () => {\n  if ((<HTMLInputElement>queryInput).value === \"\") {\n    (<HTMLButtonElement>submitButton).disabled = true;\n  } else {\n    (<HTMLButtonElement>submitButton).disabled = false;\n  }\n});\n\n// If user presses enter, click submit button\nqueryInput.addEventListener(\"keyup\", (event) => {\n  if (event.code === \"Enter\") {\n    event.preventDefault();\n    submitButton.click();\n  }\n});\n\n// Listen for clicks on submit button\nasync function handleClick() {\n  // Get the message from the input field\n  const message = (<HTMLInputElement>queryInput).value;\n  console.log(\"message\", message);\n  chatHistory.push({ role: \"user\", content: message });\n\n  // Clear the answer\n  document.getElementById(\"answer\")!.innerHTML = \"\";\n  // Hide the answer\n  document.getElementById(\"answerWrapper\")!.style.display = \"none\";\n  // Show the loading indicator\n  document.getElementById(\"loading-indicator\")!.style.display = \"block\";\n\n  // Send the chat completion message to the engine\n  let curMessage = \"\";\n  const completion = await engine.chat.completions.create({\n    stream: true,\n    messages: chatHistory,\n  });\n\n  // Update the answer as the model generates more text\n  for await (const chunk of completion) {\n    const curDelta = chunk.choices[0].delta.content;\n    if (curDelta) {\n      curMessage += curDelta;\n    }\n    updateAnswer(curMessage);\n  }\n  chatHistory.push({ role: \"assistant\", content: await engine.getMessage() });\n}\n\nsubmitButton.addEventListener(\"click\", handleClick);\n\nfunction updateAnswer(answer: string) {\n  // Show answer\n  document.getElementById(\"answerWrapper\")!.style.display = \"block\";\n  const answerWithBreaks = answer.replace(/\\n/g, \"<br>\");\n  document.getElementById(\"answer\")!.innerHTML = answerWithBreaks;\n  // Add event listener to copy button\n  
document.getElementById(\"copyAnswer\")!.addEventListener(\"click\", () => {\n    // Get the answer text\n    const answerText = answer;\n    // Copy the answer text to the clipboard\n    navigator.clipboard\n      .writeText(answerText)\n      .then(() => console.log(\"Answer text copied to clipboard\"))\n      .catch((err) => console.error(\"Could not copy text: \", err));\n  });\n  const options: Intl.DateTimeFormatOptions = {\n    month: \"short\",\n    day: \"2-digit\",\n    hour: \"2-digit\",\n    minute: \"2-digit\",\n    second: \"2-digit\",\n  };\n  const time = new Date().toLocaleString(\"en-US\", options);\n  // Update timestamp\n  document.getElementById(\"timestamp\")!.innerText = time;\n  // Hide loading indicator\n  document.getElementById(\"loading-indicator\")!.style.display = \"none\";\n}\n\nfunction fetchPageContents() {\n  chrome.tabs.query({ currentWindow: true, active: true }, function (tabs) {\n    if (tabs[0]?.id) {\n      const port = chrome.tabs.connect(tabs[0].id, { name: \"channelName\" });\n      port.postMessage({});\n      port.onMessage.addListener(function (msg) {\n        console.log(\"Page contents:\", msg.contents);\n        chrome.runtime.sendMessage({ context: msg.contents });\n      });\n    }\n  });\n}\n\n// Grab the page contents when the popup is opened\nwindow.onload = function () {\n  if (useContext) {\n    fetchPageContents();\n  }\n};\n"
  },
  {
    "path": "examples/embeddings/README.md",
    "content": "# WebLLM Embeddings Example\n\nThis folder provides a minimal demo showing the WebLLM embeddings API in a webapp setting.\nTo try it out, run the following steps under this folder:\n\n```bash\nnpm install\nnpm start\n```\n\nNote: if you would like to hack on the WebLLM core package,\nyou can change the web-llm dependency to `\"file:../..\"` and follow the build-from-source\ninstructions in the project to build WebLLM locally. This option is only recommended\nif you would like to hack on the WebLLM core package.\n"
  },
  {
    "path": "examples/embeddings/package.json",
    "content": "{\n  \"name\": \"embeddings-example\",\n  \"version\": \"0.1.0\",\n  \"private\": true,\n  \"scripts\": {\n    \"start\": \"parcel src/embeddings.html  --port 8885\",\n    \"build\": \"parcel build src/embeddings.html --dist-dir lib\"\n  },\n  \"devDependencies\": {\n    \"buffer\": \"^5.7.1\",\n    \"parcel\": \"^2.8.3\",\n    \"process\": \"^0.11.10\",\n    \"tslib\": \"^2.3.1\",\n    \"typescript\": \"^4.9.5\",\n    \"url\": \"^0.11.3\"\n  },\n  \"dependencies\": {\n    \"@mlc-ai/web-llm\": \"^0.2.82\",\n    \"langchain\": \"0.2.15\"\n  }\n}\n"
  },
  {
    "path": "examples/embeddings/src/embeddings.html",
    "content": "<!doctype html>\n<html>\n  <script>\n    webLLMGlobal = {};\n  </script>\n  <body>\n    <h2>WebLLM Test Page</h2>\n    Open console to see output\n    <br />\n    <br />\n    <label id=\"init-label\"> </label>\n\n    <h3>Prompt</h3>\n    <label id=\"prompt-label\"> </label>\n\n    <h3>Response</h3>\n    <label id=\"generate-label\"> </label>\n    <br />\n    <label id=\"stats-label\"> </label>\n\n    <script type=\"module\" src=\"./embeddings.ts\"></script>\n  </body>\n</html>\n"
  },
  {
    "path": "examples/embeddings/src/embeddings.ts",
    "content": "import * as webllm from \"@mlc-ai/web-llm\";\nimport { MemoryVectorStore } from \"langchain/vectorstores/memory\";\nimport type { EmbeddingsInterface } from \"@langchain/core/embeddings\";\nimport type { Document } from \"@langchain/core/documents\";\nimport { formatDocumentsAsString } from \"langchain/util/document\";\nimport { PromptTemplate } from \"@langchain/core/prompts\";\nimport {\n  RunnableSequence,\n  RunnablePassthrough,\n} from \"@langchain/core/runnables\";\n\nfunction setLabel(id: string, text: string) {\n  const label = document.getElementById(id);\n  if (label == null) {\n    throw Error(\"Cannot find label \" + id);\n  }\n  label.innerText = text;\n}\n\nconst initProgressCallback = (report: webllm.InitProgressReport) => {\n  setLabel(\"init-label\", report.text);\n};\n\n// For integration with Langchain\nclass WebLLMEmbeddings implements EmbeddingsInterface {\n  engine: webllm.MLCEngineInterface;\n  modelId: string;\n  constructor(engine: webllm.MLCEngineInterface, modelId: string) {\n    this.engine = engine;\n    this.modelId = modelId;\n  }\n\n  async _embed(texts: string[]): Promise<number[][]> {\n    const reply = await this.engine.embeddings.create({\n      input: texts,\n      model: this.modelId,\n    });\n    const result: number[][] = [];\n    for (let i = 0; i < texts.length; i++) {\n      result.push(reply.data[i].embedding);\n    }\n    return result;\n  }\n\n  async embedQuery(document: string): Promise<number[]> {\n    return this._embed([document]).then((embeddings) => embeddings[0]);\n  }\n\n  async embedDocuments(documents: string[]): Promise<number[][]> {\n    return this._embed(documents);\n  }\n}\n\n// Prepare inputs\nconst documents_og = [\"The Data Cloud!\", \"Mexico City of Course!\"];\nconst queries_og = [\"what is snowflake?\", \"Where can I get the best tacos?\"];\nconst documents: string[] = [];\nconst queries: string[] = [];\nconst query_prefix =\n  \"Represent this sentence for searching relevant 
passages: \";\n// Process according to Snowflake model\ndocuments_og.forEach(function (item, index) {\n  documents[index] = `[CLS] ${item} [SEP]`;\n});\nqueries_og.forEach(function (item, index) {\n  queries[index] = `[CLS] ${query_prefix}${item} [SEP]`;\n});\nconsole.log(\"Formatted documents: \", documents);\nconsole.log(\"Formatted queries: \", queries);\n\n// Using webllm's API\nasync function webllmAPI() {\n  // b4 means the max batch size is compiled as 4. That is, the model can process 4 inputs in a\n  // batch. If given more than 4, the model will forward multiple times. The larger the max batch\n  // size, the more memory it consumes.\n  // const selectedModel = \"snowflake-arctic-embed-m-q0f32-MLC-b32\";\n  const selectedModel = \"snowflake-arctic-embed-m-q0f32-MLC-b4\";\n  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(\n    selectedModel,\n    {\n      initProgressCallback: initProgressCallback,\n      logLevel: \"INFO\", // specify the log level\n    },\n  );\n\n  const docReply = await engine.embeddings.create({ input: documents });\n  console.log(docReply);\n  console.log(docReply.usage);\n\n  const queryReply = await engine.embeddings.create({ input: queries });\n  console.log(queryReply);\n  console.log(queryReply.usage);\n\n  // Calculate similarity (we use langchain here, but any method works)\n  const vectorStore = await MemoryVectorStore.fromExistingIndex(\n    new WebLLMEmbeddings(engine, selectedModel),\n  );\n  // See score\n  for (let i = 0; i < queries_og.length; i++) {\n    console.log(`Similarity with: ${queries_og[i]}`);\n    for (let j = 0; j < documents_og.length; j++) {\n      const similarity = vectorStore.similarity(\n        queryReply.data[i].embedding,\n        docReply.data[j].embedding,\n      );\n      console.log(`${documents_og[j]}: ${similarity}`);\n    }\n  }\n}\n\n// Alternatively, integrating with Langchain's API\nasync function langchainAPI() {\n  // b4 means the max batch size is compiled as 4. 
That is, the model can process 4 inputs in a\n  // batch. If given more than 4, the model will forward multiple times. The larger the max batch\n  // size, the more memory it consumes.\n  // const selectedModel = \"snowflake-arctic-embed-m-q0f32-MLC-b32\";\n  const selectedModel = \"snowflake-arctic-embed-m-q0f32-MLC-b4\";\n  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(\n    selectedModel,\n    {\n      initProgressCallback: initProgressCallback,\n      logLevel: \"INFO\", // specify the log level\n    },\n  );\n\n  const vectorStore = await MemoryVectorStore.fromExistingIndex(\n    new WebLLMEmbeddings(engine, selectedModel),\n  );\n  const document0: Document = {\n    pageContent: documents[0],\n    metadata: {},\n  };\n  const document1: Document = {\n    pageContent: documents[1],\n    metadata: {},\n  };\n  await vectorStore.addDocuments([document0, document1]);\n\n  const similaritySearchResults0 = await vectorStore.similaritySearch(\n    queries[0],\n    1,\n  );\n  for (const doc of similaritySearchResults0) {\n    console.log(`* ${doc.pageContent}`);\n  }\n\n  const similaritySearchResults1 = await vectorStore.similaritySearch(\n    queries[1],\n    1,\n  );\n  for (const doc of similaritySearchResults1) {\n    console.log(`* ${doc.pageContent}`);\n  }\n}\n\n// RAG with Langchain.js using WebLLM for both LLM and Embedding in a single engine\n// Followed https://js.langchain.com/v0.1/docs/expression_language/cookbook/retrieval/\n// There are many possible ways to achieve RAG (e.g. degree of integration with Langchain,\n// using WebWorker, etc.). We provide a minimal example here.\nasync function simpleRAG() {\n  // 0. 
Load both embedding model and LLM to a single WebLLM Engine\n  const embeddingModelId = \"snowflake-arctic-embed-m-q0f32-MLC-b4\";\n  const llmModelId = \"gemma-2-2b-it-q4f32_1-MLC-1k\";\n  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(\n    [embeddingModelId, llmModelId],\n    {\n      initProgressCallback: initProgressCallback,\n      logLevel: \"INFO\", // specify the log level\n    },\n  );\n\n  const vectorStore = await MemoryVectorStore.fromTexts(\n    [\"mitochondria is the powerhouse of the cell\"],\n    [{ id: 1 }],\n    new WebLLMEmbeddings(engine, embeddingModelId),\n  );\n  const retriever = vectorStore.asRetriever();\n\n  const prompt =\n    PromptTemplate.fromTemplate(`Answer the question based only on the following context:\n  {context}\n  \n  Question: {question}`);\n\n  const chain = RunnableSequence.from([\n    {\n      context: retriever.pipe(formatDocumentsAsString),\n      question: new RunnablePassthrough(),\n    },\n    prompt,\n  ]);\n\n  const formattedPrompt = (\n    await chain.invoke(\"What is the powerhouse of the cell?\")\n  ).toString();\n  const reply = await engine.chat.completions.create({\n    messages: [{ role: \"user\", content: formattedPrompt }],\n    model: llmModelId,\n  });\n\n  console.log(reply.choices[0].message.content);\n\n  /*\n    \"The powerhouse of the cell is the mitochondria.\"\n  */\n}\n\n// Select one to run\n// webllmAPI();\n// langchainAPI();\nsimpleRAG();\n"
  },
  {
    "path": "examples/function-calling/README.md",
    "content": "### OpenAI API Demos - Function calling\n\nThis folder contains examples of the two main ways of using function calling with WebLLM.\n\n`function-calling-manual` demonstrates how you can use function calling with Llama3.1 and Hermes2\nwithout using the `tools`, `tool_choice`, and `tool_call` fields. This is the most flexible approach: you can follow\nthe instructions given by the model's publisher and iterate on top of them yourself. However, you need to do the parsing on your own, which differs for each model. For instance, Hermes2 models use `<tool_call>` and `</tool_call>` to wrap a tool call, which may be very different from other models' formats.\n\n`function-calling-openai` conforms to the OpenAI function calling usage, leveraging the `tools`, `tool_choice`, and `tool_call`\nfields. This is easier to use, but sacrifices flexibility since we use a pre-defined system prompt\nfor this.\n"
  },
  {
    "path": "examples/function-calling/function-calling-manual/README.md",
    "content": "### Demos - Function calling\n\nRun `npm install` first, followed by `npm start`.\n\nNote: if you would like to hack on the WebLLM core package,\nyou can change the web-llm dependency to `\"file:../../..\"` and follow the build-from-source\ninstructions in the project to build WebLLM locally. This option is only recommended\nif you would like to hack on the WebLLM core package.\n"
  },
  {
    "path": "examples/function-calling/function-calling-manual/package.json",
    "content": "{\n  \"name\": \"openai-api\",\n  \"version\": \"0.1.0\",\n  \"private\": true,\n  \"scripts\": {\n    \"start\": \"parcel src/function_calling_manual.html  --port 8888\",\n    \"build\": \"parcel build src/function_calling_manual.html --dist-dir lib\"\n  },\n  \"devDependencies\": {\n    \"buffer\": \"^5.7.1\",\n    \"parcel\": \"^2.8.3\",\n    \"process\": \"^0.11.10\",\n    \"tslib\": \"^2.3.1\",\n    \"typescript\": \"^4.9.5\",\n    \"url\": \"^0.11.3\"\n  },\n  \"dependencies\": {\n    \"@mlc-ai/web-llm\": \"^0.2.82\"\n  }\n}\n"
  },
  {
    "path": "examples/function-calling/function-calling-manual/src/function_calling_manual.html",
    "content": "<!doctype html>\n<html>\n  <script>\n    webLLMGlobal = {};\n  </script>\n\n  <body>\n    <h2>WebLLM Test Page</h2>\n    Open console to see output\n    <br />\n    <br />\n    <label id=\"init-label\"> </label>\n    <label id=\"generate-label\"> </label>\n\n    <script type=\"module\" src=\"./function_calling_manual.ts\"></script>\n  </body>\n</html>\n"
  },
  {
    "path": "examples/function-calling/function-calling-manual/src/function_calling_manual.ts",
    "content": "/* eslint-disable no-useless-escape */\nimport * as webllm from \"@mlc-ai/web-llm\";\n\n// Common helper methods\nfunction setLabel(id: string, text: string) {\n  const label = document.getElementById(id);\n  if (label == null) {\n    throw Error(\"Cannot find label \" + id);\n  }\n  label.innerText = text;\n}\n\nconst initProgressCallback = (report: webllm.InitProgressReport) => {\n  setLabel(\"init-label\", report.text);\n};\n\n// Same example as https://huggingface.co/NousResearch/Hermes-2-Theta-Llama-3-8B#prompt-format-for-function-calling\nasync function hermes2_example() {\n  // 0. Setups\n  // Most manual function calling models specify the tools inside the system prompt\n  const system_prompt = `You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools: <tools> {\"type\": \"function\", \"function\": {\"name\": \"get_stock_fundamentals\", \"description\": \"get_stock_fundamentals(symbol: str) -> dict - Get fundamental data for a given stock symbol using yfinance API.\\\\n\\\\n    Args:\\\\n        symbol (str): The stock symbol.\\\\n\\\\n    Returns:\\\\n        dict: A dictionary containing fundamental data.\\\\n            Keys:\\\\n                - \\'symbol\\': The stock symbol.\\\\n                - \\'company_name\\': The long name of the company.\\\\n                - \\'sector\\': The sector to which the company belongs.\\\\n                - \\'industry\\': The industry to which the company belongs.\\\\n                - \\'market_cap\\': The market capitalization of the company.\\\\n                - \\'pe_ratio\\': The forward price-to-earnings ratio.\\\\n                - \\'pb_ratio\\': The price-to-book ratio.\\\\n                - \\'dividend_yield\\': The dividend yield.\\\\n                - \\'eps\\': The trailing 
earnings per share.\\\\n                - \\'beta\\': The beta value of the stock.\\\\n                - \\'52_week_high\\': The 52-week high price of the stock.\\\\n                - \\'52_week_low\\': The 52-week low price of the stock.\", \"parameters\": {\"type\": \"object\", \"properties\": {\"symbol\": {\"type\": \"string\"}}, \"required\": [\"symbol\"]}}}  </tools> Use the following pydantic model json schema for each tool call you will make: {\"properties\": {\"arguments\": {\"title\": \"Arguments\", \"type\": \"object\"}, \"name\": {\"title\": \"Name\", \"type\": \"string\"}}, \"required\": [\"arguments\", \"name\"], \"title\": \"FunctionCall\", \"type\": \"object\"} For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:\\n<tool_call>\\n{\"arguments\": <args-dict>, \"name\": <function-name>}\\n</tool_call>`;\n  // Same formatting for Hermes-2-Pro-Llama-3, Hermes-2-Theta-Llama-3\n  // const selectedModel = \"Hermes-2-Theta-Llama-3-8B-q4f16_1-MLC\";\n  const selectedModel = \"Hermes-2-Pro-Llama-3-8B-q4f16_1-MLC\";\n  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(\n    selectedModel,\n    { initProgressCallback: initProgressCallback, logLevel: \"INFO\" },\n  );\n  const seed = 0;\n\n  // 1. 
First request, expect to generate tool call\n  const messages: webllm.ChatCompletionMessageParam[] = [\n    { role: \"system\", content: system_prompt },\n    {\n      role: \"user\",\n      content: \"Fetch the stock fundamentals data for Tesla (TSLA)\",\n    },\n  ];\n  const request1: webllm.ChatCompletionRequest = {\n    stream: false, // works with either streaming or non-streaming; code below assumes non-streaming\n    messages: messages,\n    seed: seed,\n  };\n  const reply1 = await engine.chat.completions.create(request1);\n  const response1 = reply1.choices[0].message.content;\n  console.log(reply1.usage);\n  console.log(\"Response 1: \" + response1);\n  messages.push({ role: \"assistant\", content: response1 });\n  // <tool_call>\\n{\"arguments\": {\"symbol\": \"TSLA\"}, \"name\": \"get_stock_fundamentals\"}\\n</tool_call>\n\n  // 2. Call function on your own to get tool response\n  const tool_response = `<tool_response>\\n{\"name\": \"get_stock_fundamentals\", \"content\": {'symbol': 'TSLA', 'company_name': 'Tesla, Inc.', 'sector': 'Consumer Cyclical', 'industry': 'Auto Manufacturers', 'market_cap': 611384164352, 'pe_ratio': 49.604652, 'pb_ratio': 9.762013, 'dividend_yield': None, 'eps': 4.3, 'beta': 2.427, '52_week_high': 299.29, '52_week_low': 152.37}}\\n</tool_response>`;\n  messages.push({ role: \"tool\", content: tool_response, tool_call_id: \"0\" });\n\n  // 3. Get natural language response\n  const request2: webllm.ChatCompletionRequest = {\n    stream: false, // works with either streaming or non-streaming; code below assumes non-streaming\n    messages: messages,\n    seed: seed,\n  };\n  const reply2 = await engine.chat.completions.create(request2);\n  const response2 = reply2.choices[0].message.content;\n  messages.push({ role: \"assistant\", content: response2 });\n  console.log(reply2.usage);\n  console.log(\"Response 2: \" + response2);\n\n  // 4. 
Another function call\n  messages.push({\n    role: \"user\",\n    content: \"Now do another one with NVIDIA, symbol being NVDA.\",\n  });\n  const request3: webllm.ChatCompletionRequest = {\n    stream: false, // works with either streaming or non-streaming; code below assumes non-streaming\n    messages: messages,\n    seed: seed,\n  };\n  const reply3 = await engine.chat.completions.create(request3);\n  const response3 = reply3.choices[0].message.content;\n  messages.push({ role: \"assistant\", content: response3 });\n  console.log(reply3.usage);\n  console.log(\"Response 3: \" + response3);\n  // <tool_call>\\n{\"arguments\": {\"symbol\": \"NVDA\"}, \"name\": \"get_stock_fundamentals\"}\\n</tool_call>\n}\n\n// Similar example to https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#user-defined-custom-tool-calling\nasync function llama3_1_example() {\n  // Follows example, but tweaks the formatting with <function>\n  const system_prompt = `Cutting Knowledge Date: December 2023\nToday Date: 23 Jul 2024\n# Tool Instructions\n- When looking for real time information use relevant functions if available\nYou have access to the following functions:\n\n{\n    \"type\": \"function\",\n    \"function\": {\n        \"name\": \"get_current_temperature\",\n        \"description\": \"Get the current temperature at a location.\",\n        \"parameters\": {\n            \"type\": \"object\",\n            \"properties\": {\n                \"location\": {\n                    \"type\": \"string\",\n                    \"description\": \"The location to get the temperature for, in the format \\\"City, Country\\\"\"\n                }\n            },\n            \"required\": [\n                \"location\"\n            ]\n        },\n        \"return\": {\n            \"type\": \"number\",\n            \"description\": \"The current temperature at the specified location in the specified units, as a float.\"\n        }\n    }\n}\n{\n    \"type\": \"function\",\n  
  \"function\": {\n        \"name\": \"send_message\",\n        \"description\": \"Send a message to a recipient.\",\n        \"parameters\": {\n            \"type\": \"object\",\n            \"properties\": {\n                \"recipient\": {\n                    \"type\": \"string\",\n                    \"description\": \"Name of the recipient of the message\"\n                }\n                \"content\": {\n                    \"type\": \"string\",\n                    \"description\": \"Content of the message\"\n                }\n            },\n            \"required\": [\n                \"recipient\",\n                \"content\"\n            ]\n        },\n        \"return\": {\n            \"type\": \"None\"\n        }\n    }\n}\nIf a you choose to call a function ONLY reply in the following format:\n    <function>{\"name\": function name, \"parameters\": dictionary of argument name and its value}</function>\nHere is an example,\n    <function>{\"name\": \"example_function_name\", \"parameters\": {\"example_name\": \"example_value\"}}</function>\nReminder:\n- Function calls MUST follow the specified format and use BOTH <function> and </function>\n- Required parameters MUST be specified\n- Only call one function at a time\n- When calling a function, do NOT add any other words, ONLY the function calling\n- Put the entire function call reply on one line\n- Always add your sources when using search results to answer the user query\nYou are a helpful Assistant.`;\n\n  const selectedModel = \"Llama-3.1-8B-Instruct-q4f16_1-MLC\";\n  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(\n    selectedModel,\n    { initProgressCallback: initProgressCallback, logLevel: \"INFO\" },\n  );\n  const seed = 0;\n\n  // 1. 
First request, expect to generate tool call to get temperature of Paris\n  const messages: webllm.ChatCompletionMessageParam[] = [\n    { role: \"system\", content: system_prompt },\n    {\n      role: \"user\",\n      content: \"Hey, what's the temperature in Paris right now?\",\n    },\n  ];\n  const request1: webllm.ChatCompletionRequest = {\n    stream: false, // works with either streaming or non-streaming; code below assumes non-streaming\n    messages: messages,\n    seed: seed,\n  };\n  const reply1 = await engine.chat.completions.create(request1);\n  const response1 = reply1.choices[0].message.content;\n  console.log(reply1.usage);\n  console.log(\"Response 1: \" + response1);\n  messages.push({ role: \"assistant\", content: response1 });\n  // <function>{\"name\": \"get_current_temperature\", \"parameters\": {\"location\": \"Paris, France\"}}</function>\n\n  // 2. Call function on your own to get tool response\n  const tool_response = `{\"output\": 22.5}`;\n  messages.push({ role: \"tool\", content: tool_response, tool_call_id: \"0\" });\n\n  // 3. Get natural language response\n  const request2: webllm.ChatCompletionRequest = {\n    stream: false, // works with either streaming or non-streaming; code below assumes non-streaming\n    messages: messages,\n    seed: seed,\n  };\n  const reply2 = await engine.chat.completions.create(request2);\n  const response2 = reply2.choices[0].message.content;\n  messages.push({ role: \"assistant\", content: response2 });\n  console.log(reply2.usage);\n  console.log(\"Response 2: \" + response2);\n  // The current temperature in Paris is 22.5°C.\n\n  // 4. 
Make another request, expect model to call `send_message`\n  messages.push({\n    role: \"user\",\n    content: \"Send a message to Tom to tell him this information.\",\n  });\n  const request3: webllm.ChatCompletionRequest = {\n    stream: false, // works with either streaming or non-streaming; code below assumes non-streaming\n    messages: messages,\n    seed: seed,\n  };\n  const reply3 = await engine.chat.completions.create(request3);\n  const response3 = reply3.choices[0].message.content;\n  messages.push({ role: \"assistant\", content: response3 });\n  console.log(reply3.usage);\n  console.log(\"Response 3: \" + response3);\n  // <function>{\"name\": \"send_message\", \"parameters\": {\"recipient\": \"Tom\", \"content\": \"The current temperature in Paris is 22.5°C.\"}}</function>\n\n  // 5. Call API, which has no return value, so simply prompt model again\n  const tool_response2 = `{\"output\": None}`;\n  messages.push({ role: \"tool\", content: tool_response2, tool_call_id: \"1\" });\n  const request4: webllm.ChatCompletionRequest = {\n    stream: false, // works with either streaming or non-streaming; code below assumes non-streaming\n    messages: messages,\n    seed: seed,\n  };\n  const reply4 = await engine.chat.completions.create(request4);\n  const response4 = reply4.choices[0].message.content;\n  console.log(reply4.usage);\n  console.log(\"Response 4: \" + response4);\n  // The message has been sent to Tom.\n}\n\n// Pick one to run\n// hermes2_example();\nllama3_1_example();\n"
  },
  {
    "path": "examples/function-calling/function-calling-openai/README.md",
    "content": "### Demos - Function calling\n\nRun `npm install` first, followed by `npm start`.\n\nNote: if you would like to hack on the WebLLM core package,\nyou can change the web-llm dependency to `\"file:../../..\"` and follow the build-from-source\ninstructions in the project to build WebLLM locally. This option is only recommended\nif you would like to hack on the WebLLM core package.\n"
  },
  {
    "path": "examples/function-calling/function-calling-openai/package.json",
    "content": "{\n  \"name\": \"openai-api\",\n  \"version\": \"0.1.0\",\n  \"private\": true,\n  \"scripts\": {\n    \"start\": \"parcel src/function_calling_openai.html  --port 8888\",\n    \"build\": \"parcel build src/function_calling_openai.html --dist-dir lib\"\n  },\n  \"devDependencies\": {\n    \"buffer\": \"^5.7.1\",\n    \"parcel\": \"^2.8.3\",\n    \"process\": \"^0.11.10\",\n    \"tslib\": \"^2.3.1\",\n    \"typescript\": \"^4.9.5\",\n    \"url\": \"^0.11.3\"\n  },\n  \"dependencies\": {\n    \"@mlc-ai/web-llm\": \"^0.2.82\"\n  }\n}\n"
  },
  {
    "path": "examples/function-calling/function-calling-openai/src/function_calling_openai.html",
    "content": "<!doctype html>\n<html>\n  <script>\n    webLLMGlobal = {};\n  </script>\n\n  <body>\n    <h2>WebLLM Test Page</h2>\n    Open console to see output\n    <br />\n    <br />\n    <label id=\"init-label\"> </label>\n    <label id=\"generate-label\"> </label>\n\n    <script type=\"module\" src=\"./function_calling_openai.ts\"></script>\n  </body>\n</html>\n"
  },
  {
    "path": "examples/function-calling/function-calling-openai/src/function_calling_openai.ts",
    "content": "import * as webllm from \"@mlc-ai/web-llm\";\n\nfunction setLabel(id: string, text: string) {\n  const label = document.getElementById(id);\n  if (label == null) {\n    throw Error(\"Cannot find label \" + id);\n  }\n  label.innerText = text;\n}\n\nasync function main() {\n  const initProgressCallback = (report: webllm.InitProgressReport) => {\n    setLabel(\"init-label\", report.text);\n  };\n  const selectedModel = \"Hermes-2-Pro-Llama-3-8B-q4f16_1-MLC\";\n  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(\n    selectedModel,\n    { initProgressCallback: initProgressCallback },\n  );\n\n  const tools: Array<webllm.ChatCompletionTool> = [\n    {\n      type: \"function\",\n      function: {\n        name: \"get_current_weather\",\n        description: \"Get the current weather in a given location\",\n        parameters: {\n          type: \"object\",\n          properties: {\n            location: {\n              type: \"string\",\n              description: \"The city and state, e.g. 
San Francisco, CA\",\n            },\n            unit: { type: \"string\", enum: [\"celsius\", \"fahrenheit\"] },\n          },\n          required: [\"location\"],\n        },\n      },\n    },\n  ];\n\n  const request: webllm.ChatCompletionRequest = {\n    stream: true, // works with stream as well, where the last chunk returns tool_calls\n    stream_options: { include_usage: true },\n    messages: [\n      {\n        role: \"user\",\n        content:\n          \"What is the current weather in celsius in Pittsburgh and Tokyo?\",\n      },\n    ],\n    tool_choice: \"auto\",\n    tools: tools,\n  };\n\n  if (!request.stream) {\n    const reply0 = await engine.chat.completions.create(request);\n    console.log(reply0.choices[0]);\n    console.log(reply0.usage);\n  } else {\n    // If streaming, the last chunk returns tool calls\n    const asyncChunkGenerator = await engine.chat.completions.create(request);\n    let message = \"\";\n    let lastChunk: webllm.ChatCompletionChunk | undefined;\n    let usageChunk: webllm.ChatCompletionChunk | undefined;\n    for await (const chunk of asyncChunkGenerator) {\n      console.log(chunk);\n      message += chunk.choices[0]?.delta?.content || \"\";\n      setLabel(\"generate-label\", message);\n      if (!chunk.usage) {\n        lastChunk = chunk;\n      }\n      usageChunk = chunk;\n    }\n    console.log(lastChunk!.choices[0].delta);\n    console.log(usageChunk!.usage);\n  }\n}\n\nmain();\n"
  },
  {
    "path": "examples/get-started/README.md",
    "content": "# WebLLM Get Started App\n\nThis folder provides a minimal demo showing the WebLLM API in a webapp setting.\nTo try it out, run the following steps under this folder:\n\n```bash\nnpm install\nnpm start\n```\n\nNote: if you would like to hack on the WebLLM core package,\nyou can change the web-llm dependency to `\"file:../..\"` and follow the build-from-source\ninstructions in the project to build WebLLM locally. This option is only recommended\nif you would like to hack on the WebLLM core package.\n"
  },
  {
    "path": "examples/get-started/package.json",
    "content": "{\n  \"name\": \"get-started\",\n  \"version\": \"0.1.0\",\n  \"private\": true,\n  \"scripts\": {\n    \"start\": \"parcel src/get_started.html  --port 8888\",\n    \"build\": \"parcel build src/get_started.html --dist-dir lib\"\n  },\n  \"devDependencies\": {\n    \"buffer\": \"^5.7.1\",\n    \"parcel\": \"^2.8.3\",\n    \"process\": \"^0.11.10\",\n    \"tslib\": \"^2.3.1\",\n    \"typescript\": \"^4.9.5\",\n    \"url\": \"^0.11.3\"\n  },\n  \"dependencies\": {\n    \"@mlc-ai/web-llm\": \"^0.2.82\"\n  }\n}\n"
  },
  {
    "path": "examples/get-started/src/get_started.html",
    "content": "<!doctype html>\n<html>\n  <script>\n    webLLMGlobal = {};\n  </script>\n  <body>\n    <h2>WebLLM Test Page</h2>\n    Open console to see output\n    <br />\n    <br />\n    <label id=\"init-label\"> </label>\n\n    <h3>Prompt</h3>\n    <label id=\"prompt-label\"> </label>\n\n    <h3>Response</h3>\n    <label id=\"generate-label\"> </label>\n    <br />\n    <label id=\"stats-label\"> </label>\n\n    <script type=\"module\" src=\"./get_started.ts\"></script>\n  </body>\n</html>\n"
  },
  {
    "path": "examples/get-started/src/get_started.ts",
    "content": "import * as webllm from \"@mlc-ai/web-llm\";\n\nfunction setLabel(id: string, text: string) {\n  const label = document.getElementById(id);\n  if (label == null) {\n    throw Error(\"Cannot find label \" + id);\n  }\n  label.innerText = text;\n}\n\nasync function main() {\n  const initProgressCallback = (report: webllm.InitProgressReport) => {\n    setLabel(\"init-label\", report.text);\n  };\n  // Option 1: If we do not specify appConfig, we use `prebuiltAppConfig` defined in `config.ts`\n  const selectedModel = \"Llama-3.1-8B-Instruct-q4f32_1-MLC\";\n  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(\n    selectedModel,\n    {\n      initProgressCallback: initProgressCallback,\n      logLevel: \"INFO\", // specify the log level\n    },\n    // customize kv cache, use either context_window_size or sliding_window_size (with attention sink)\n    {\n      context_window_size: 2048,\n      // sliding_window_size: 1024,\n      // attention_sink_size: 4,\n    },\n  );\n\n  // Option 2: Specify your own model other than the prebuilt ones\n  // const appConfig: webllm.AppConfig = {\n  //   model_list: [\n  //     {\n  //       model: \"https://huggingface.co/mlc-ai/Llama-3.1-8B-Instruct-q4f32_1-MLC\",\n  //       model_id: \"Llama-3.1-8B-Instruct-q4f32_1-MLC\",\n  //       model_lib:\n  //         webllm.modelLibURLPrefix +\n  //         webllm.modelVersion +\n  //         \"/Llama-3_1-8B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n  //       overrides: {\n  //         context_window_size: 2048,\n  //       },\n  //     },\n  //   ],\n  // };\n  // const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(\n  //   selectedModel,\n  //   { appConfig: appConfig, initProgressCallback: initProgressCallback },\n  // );\n\n  // Option 3: Instantiate MLCEngine() and call reload() separately\n  // const engine: webllm.MLCEngineInterface = new webllm.MLCEngine({\n  //   appConfig: appConfig, // if do not specify, we use 
webllm.prebuiltAppConfig\n  //   initProgressCallback: initProgressCallback,\n  // });\n  // await engine.reload(selectedModel);\n\n  const reply0 = await engine.chat.completions.create({\n    messages: [{ role: \"user\", content: \"List three US states.\" }],\n    // below configurations are all optional\n    n: 3,\n    temperature: 1.5,\n    max_tokens: 256,\n    // 46510 and 7188 are \"California\", and 8421 and 51325 are \"Texas\" in Llama-3.1-8B-Instruct\n    // So we would have a higher chance of seeing the latter two, but never the first in the answer\n    logit_bias: {\n      \"46510\": -100,\n      \"7188\": -100,\n      \"8421\": 5,\n      \"51325\": 5,\n    },\n    logprobs: true,\n    top_logprobs: 2,\n  });\n  console.log(reply0);\n  console.log(reply0.usage);\n\n  // To change model, either create a new engine via `CreateMLCEngine()`, or call `engine.reload(modelId)`\n}\n\nmain();\n"
  },
  {
    "path": "examples/get-started-latency-breakdown/README.md",
    "content": "# WebLLM Get Started App\n\nThis folder provides a minimum demo to show WebLLM API in a webapp setting with\ncollection of latency statistics for individual token sampling steps.\nTo try it out, you can do the following steps under this folder\n\n```bash\nnpm install\nnpm start\n```\n\nNote if you would like to hack WebLLM core package.\nYou can change web-llm dependencies as `\"file:../..\"`, and follow the build from source\ninstruction in the project to build webllm locally. This option is only recommended\nif you would like to hack WebLLM core package.\n"
  },
  {
    "path": "examples/get-started-latency-breakdown/package.json",
    "content": "{\n  \"name\": \"get-started-latency-breakdown\",\n  \"version\": \"0.1.0\",\n  \"private\": true,\n  \"scripts\": {\n    \"start\": \"parcel src/get_started_latency_breakdown.html  --port 8888\",\n    \"build\": \"parcel build src/get_started_latency_breakdown.html --dist-dir lib\"\n  },\n  \"devDependencies\": {\n    \"buffer\": \"^5.7.1\",\n    \"parcel\": \"^2.8.3\",\n    \"process\": \"^0.11.10\",\n    \"tslib\": \"^2.3.1\",\n    \"typescript\": \"^4.9.5\",\n    \"url\": \"^0.11.3\"\n  },\n  \"dependencies\": {\n    \"@mlc-ai/web-llm\": \"^0.2.82\"\n  }\n}\n"
  },
  {
    "path": "examples/get-started-latency-breakdown/src/get_started_latency_breakdown.html",
    "content": "<!doctype html>\n<html>\n  <script>\n    webLLMGlobal = {};\n  </script>\n  <body>\n    <h2>WebLLM Test Page</h2>\n    Open console to see output\n    <br />\n    <br />\n    <label id=\"init-label\"> </label>\n\n    <h3>Prompt</h3>\n    <label id=\"prompt-label\"> </label>\n\n    <h3>Response</h3>\n    <label id=\"generate-label\"> </label>\n    <br />\n    <label id=\"stats-label\"> </label>\n\n    <script type=\"module\" src=\"./get_started_latency_breakdown.ts\"></script>\n  </body>\n</html>\n"
  },
  {
    "path": "examples/get-started-latency-breakdown/src/get_started_latency_breakdown.ts",
    "content": "import * as webllm from \"@mlc-ai/web-llm\";\n\nfunction setLabel(id: string, text: string) {\n  const label = document.getElementById(id);\n  if (label == null) {\n    throw Error(\"Cannot find label \" + id);\n  }\n  label.innerText = text;\n}\n\ntype LatencyBreakdown = {\n  logitProcessorTime: number[];\n  logitBiasTime: number[];\n  penaltyTime: number[];\n  sampleTime: number[];\n  totalTime: number[];\n  grammarBitmaskTime: number[];\n};\nfunction computeStats(\n  latency_breakdown: LatencyBreakdown,\n): Record<string, any> {\n  function _computeStats(arr: number[]) {\n    if (!arr.length) return undefined;\n    const sorted = [...arr].sort((a, b) => a - b);\n    const sum = arr.reduce((a, b) => a + b, 0);\n    const avg = sum / arr.length;\n    const min = sorted[0];\n    const max = sorted[sorted.length - 1];\n    const p99 = sorted[Math.floor(0.99 * (sorted.length - 1))];\n    return { avg, min, max, p99 };\n  }\n\n  const latencyStats: Record<string, any> = {};\n  for (const key of Object.keys(latency_breakdown)) {\n    const arr = (latency_breakdown as any)[key];\n    if (Array.isArray(arr) && arr.length > 0) {\n      latencyStats[key] = _computeStats(arr);\n    }\n  }\n  return latencyStats;\n}\n\nasync function main() {\n  const initProgressCallback = (report: webllm.InitProgressReport) => {\n    setLabel(\"init-label\", report.text);\n  };\n  // Option 1: If we do not specify appConfig, we use `prebuiltAppConfig` defined in `config.ts`\n  const selectedModel = \"Qwen3-0.6B-q0f32-MLC\";\n  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(\n    selectedModel,\n    {\n      initProgressCallback: initProgressCallback,\n      logLevel: \"INFO\", // specify the log level\n    },\n    // customize kv cache, use either context_window_size or sliding_window_size (with attention sink)\n    {\n      context_window_size: 2048,\n      // sliding_window_size: 1024,\n      // attention_sink_size: 4,\n    },\n  );\n\n  const 
latencyBreakdown: LatencyBreakdown = {\n    logitProcessorTime: [],\n    logitBiasTime: [],\n    penaltyTime: [],\n    sampleTime: [],\n    totalTime: [],\n    grammarBitmaskTime: [],\n  };\n\n  const decodeTokensPerS: number[] = [];\n  const completionTokens: number[] = [];\n  const e2eLatencyS: number[] = [];\n  const timePerOutputTokenS: number[] = [];\n\n  const numTrials = 20;\n  for (let i = 0; i < numTrials; i++) {\n    console.log(`Trial ${i + 1} / ${numTrials}`);\n    const reply0 = await engine.chat.completions.create({\n      messages: [{ role: \"user\", content: \"List twenty US states.\" }],\n      // below configurations are all optional\n      n: 1,\n      temperature: 0,\n      max_tokens: 2048,\n      // 46510 and 7188 are \"California\", and 8421 and 51325 are \"Texas\" in Llama-3.1-8B-Instruct\n      // So we would have a higher chance of seeing the latter two, but never the first in the answer\n      // logit_bias: {\n      //   \"46510\": -100,\n      //   \"7188\": -100,\n      //   \"8421\": 5,\n      //   \"41325\": 5,\n      // },\n      top_p: 0.8,\n      logprobs: true,\n      top_logprobs: 2,\n      frequency_penalty: 1.2,\n      presence_penalty: 1.0,\n      repetition_penalty: 1.1,\n    });\n\n    const logitProcessorTime =\n      reply0.usage?.extra.latencyBreakdown?.logitProcessorTime;\n    const logitBiasTime = reply0.usage?.extra.latencyBreakdown?.logitBiasTime;\n    const penaltyTime = reply0.usage?.extra.latencyBreakdown?.penaltyTime;\n    const sampleTime = reply0.usage?.extra.latencyBreakdown?.sampleTime;\n    const totalTime = reply0.usage?.extra.latencyBreakdown?.totalTime;\n    const grammarBitmaskTime =\n      reply0.usage?.extra.latencyBreakdown?.grammarBitmaskTime;\n\n    latencyBreakdown.logitProcessorTime.push(...(logitProcessorTime || []));\n    latencyBreakdown.logitBiasTime.push(...(logitBiasTime || []));\n    latencyBreakdown.penaltyTime.push(...(penaltyTime || []));\n    
latencyBreakdown.sampleTime.push(...(sampleTime || []));\n    latencyBreakdown.totalTime.push(...(totalTime || []));\n    latencyBreakdown.grammarBitmaskTime.push(...(grammarBitmaskTime || []));\n\n    decodeTokensPerS.push(reply0.usage?.extra.decode_tokens_per_s || 0);\n    e2eLatencyS.push(reply0.usage?.extra.e2e_latency_s || 0);\n    timePerOutputTokenS.push(reply0.usage?.extra.time_per_output_token_s || 0);\n    completionTokens.push(reply0.usage?.completion_tokens || 0);\n  }\n\n  const latencyStats: { [key: string]: number } =\n    computeStats(latencyBreakdown);\n  console.log(\"Latency stats: \", latencyStats);\n  console.log(\"Decode tokens per second: \", decodeTokensPerS);\n  console.log(\"Completion tokens: \", completionTokens);\n  console.log(\"E2E latency (s): \", e2eLatencyS);\n  console.log(\"Time per output token (s): \", timePerOutputTokenS);\n\n  // To change model, either create a new engine via `CreateMLCEngine()`, or call `engine.reload(modelId)`\n}\n\nmain();\n"
  },
  {
    "path": "examples/get-started-web-worker/README.md",
    "content": "# WebLLM Get Started with WebWorker\n\nThis folder provides a minimum demo to show WebLLM API using\n[WebWorker](https://developer.mozilla.org/en-US/docs/Web/API/Web_Workers_API/Using_web_workers).\nThe main benefit of web worker is that all ML workloads runs on a separate thread as a result\nwill less likely block the UI.\n\nTo try it out, you can do the following steps under this folder\n\n```bash\nnpm install\nnpm start\n```\n\nNote if you would like to hack WebLLM core package.\nYou can change web-llm dependencies as `\"file:../..\"`, and follow the build from source\ninstruction in the project to build webllm locally. This option is only recommended\nif you would like to hack WebLLM core package.\n"
  },
  {
    "path": "examples/get-started-web-worker/package.json",
    "content": "{\n  \"name\": \"get-started-web-worker\",\n  \"version\": \"0.1.0\",\n  \"private\": true,\n  \"scripts\": {\n    \"start\": \"parcel src/get_started.html  --port 8885\",\n    \"build\": \"parcel build src/get_started.html --dist-dir lib\"\n  },\n  \"devDependencies\": {\n    \"buffer\": \"^6.0.3\",\n    \"parcel\": \"^2.8.3\",\n    \"process\": \"^0.11.10\",\n    \"tslib\": \"^2.3.1\",\n    \"typescript\": \"^4.9.5\",\n    \"url\": \"^0.11.3\"\n  },\n  \"dependencies\": {\n    \"@mlc-ai/web-llm\": \"^0.2.82\"\n  }\n}\n"
  },
  {
    "path": "examples/get-started-web-worker/src/get_started.html",
    "content": "<!doctype html>\n<html>\n  <script>\n    webLLMGlobal = {};\n  </script>\n  <body>\n    <h2>WebLLM Test Page</h2>\n    Open console to see output\n    <br />\n    <br />\n    <label id=\"init-label\"> </label>\n\n    <h3>Prompt</h3>\n    <label id=\"prompt-label\"> </label>\n\n    <h3>Response</h3>\n    <label id=\"generate-label\"> </label>\n    <br />\n    <label id=\"stats-label\"> </label>\n\n    <script type=\"module\" src=\"./main.ts\"></script>\n  </body>\n</html>\n"
  },
  {
    "path": "examples/get-started-web-worker/src/main.ts",
    "content": "import * as webllm from \"@mlc-ai/web-llm\";\n\nfunction setLabel(id: string, text: string) {\n  const label = document.getElementById(id);\n  if (label == null) {\n    throw Error(\"Cannot find label \" + id);\n  }\n  label.innerText = text;\n}\n\n// There are two demonstrations, pick one to run\n\n/**\n * Chat completion (OpenAI style) without streaming, where we get the entire response at once.\n */\nasync function mainNonStreaming() {\n  const initProgressCallback = (report: webllm.InitProgressReport) => {\n    setLabel(\"init-label\", report.text);\n  };\n  const selectedModel = \"Llama-3.1-8B-Instruct-q4f32_1-MLC\";\n\n  const engine: webllm.MLCEngineInterface =\n    await webllm.CreateWebWorkerMLCEngine(\n      new Worker(new URL(\"./worker.ts\", import.meta.url), { type: \"module\" }),\n      selectedModel,\n      { initProgressCallback: initProgressCallback },\n    );\n\n  const request: webllm.ChatCompletionRequest = {\n    messages: [\n      {\n        role: \"system\",\n        content:\n          \"You are a helpful, respectful and honest assistant. \" +\n          \"Be as happy as you can when speaking please. 
\",\n      },\n      { role: \"user\", content: \"Provide me three US states.\" },\n      { role: \"assistant\", content: \"California, New York, Pennsylvania.\" },\n      { role: \"user\", content: \"Two more please!\" },\n    ],\n    n: 3,\n    temperature: 1.5,\n    max_tokens: 256,\n  };\n\n  const reply0 = await engine.chat.completions.create(request);\n  console.log(reply0);\n\n  console.log(reply0.usage);\n}\n\n/**\n * Chat completion (OpenAI style) with streaming, where delta is sent while generating response.\n */\nasync function mainStreaming() {\n  const initProgressCallback = (report: webllm.InitProgressReport) => {\n    setLabel(\"init-label\", report.text);\n  };\n  const selectedModel = \"Llama-3.1-8B-Instruct-q4f32_1-MLC\";\n\n  const engine: webllm.MLCEngineInterface =\n    await webllm.CreateWebWorkerMLCEngine(\n      new Worker(new URL(\"./worker.ts\", import.meta.url), { type: \"module\" }),\n      selectedModel,\n      { initProgressCallback: initProgressCallback },\n    );\n\n  const request: webllm.ChatCompletionRequest = {\n    stream: true,\n    stream_options: { include_usage: true },\n    messages: [\n      {\n        role: \"system\",\n        content:\n          \"You are a helpful, respectful and honest assistant. \" +\n          \"Be as happy as you can when speaking please. 
\",\n      },\n      { role: \"user\", content: \"Provide me three US states.\" },\n      { role: \"assistant\", content: \"California, New York, Pennsylvania.\" },\n      { role: \"user\", content: \"Two more please!\" },\n    ],\n    temperature: 1.5,\n    max_tokens: 256,\n  };\n\n  const asyncChunkGenerator = await engine.chat.completions.create(request);\n  let message = \"\";\n  for await (const chunk of asyncChunkGenerator) {\n    console.log(chunk);\n    message += chunk.choices[0]?.delta?.content || \"\";\n    setLabel(\"generate-label\", message);\n    if (chunk.usage) {\n      console.log(chunk.usage); // only last chunk has usage\n    }\n    // engine.interruptGenerate();  // works with interrupt as well\n  }\n  console.log(\"Final message:\\n\", await engine.getMessage()); // the concatenated message\n}\n\n// Run one of the function below\n// mainNonStreaming();\nmainStreaming();\n"
  },
  {
    "path": "examples/get-started-web-worker/src/worker.ts",
    "content": "import { WebWorkerMLCEngineHandler } from \"@mlc-ai/web-llm\";\n\n// Hookup an engine to a worker handler\nconst handler = new WebWorkerMLCEngineHandler();\nself.onmessage = (msg: MessageEvent) => {\n  handler.onmessage(msg);\n};\n"
  },
  {
    "path": "examples/json-mode/README.md",
    "content": "### OpenAI API Demos\n\nRun `npm install` first, followed by `npm start`.\n\nNote if you would like to hack WebLLM core package,\nyou can change web-llm dependencies as `\"file:../..\"`, and follow the build from source\ninstruction in the project to build webllm locally. This option is only recommended\nif you would like to hack WebLLM core package.\n"
  },
  {
    "path": "examples/json-mode/package.json",
    "content": "{\n  \"name\": \"openai-api\",\n  \"version\": \"0.1.0\",\n  \"private\": true,\n  \"scripts\": {\n    \"start\": \"parcel src/json_mode.html  --port 8888\",\n    \"build\": \"parcel build src/json_mode.html --dist-dir lib\"\n  },\n  \"devDependencies\": {\n    \"buffer\": \"^5.7.1\",\n    \"parcel\": \"^2.8.3\",\n    \"process\": \"^0.11.10\",\n    \"tslib\": \"^2.3.1\",\n    \"typescript\": \"^4.9.5\",\n    \"url\": \"^0.11.3\"\n  },\n  \"dependencies\": {\n    \"@mlc-ai/web-llm\": \"^0.2.82\"\n  }\n}\n"
  },
  {
    "path": "examples/json-mode/src/json_mode.html",
    "content": "<!doctype html>\n<html>\n  <script>\n    webLLMGlobal = {};\n  </script>\n\n  <body>\n    <h2>WebLLM Test Page</h2>\n    Open console to see output.\n    <br />\n    <br />\n    <label id=\"init-label\"> </label>\n\n    <script type=\"module\" src=\"./json_mode.ts\"></script>\n  </body>\n</html>\n"
  },
  {
    "path": "examples/json-mode/src/json_mode.ts",
    "content": "import * as webllm from \"@mlc-ai/web-llm\";\n\nfunction setLabel(id: string, text: string) {\n  const label = document.getElementById(id);\n  if (label == null) {\n    throw Error(\"Cannot find label \" + id);\n  }\n  label.innerText = text;\n}\n\nasync function main() {\n  const initProgressCallback = (report: webllm.InitProgressReport) => {\n    setLabel(\"init-label\", report.text);\n  };\n  // Pick any one of these models to start trying -- most models in WebLLM support grammar\n  const selectedModel = \"Llama-3.2-3B-Instruct-q4f16_1-MLC\";\n  // const selectedModel = \"Qwen2.5-1.5B-Instruct-q4f16_1-MLC\";\n  // const selectedModel = \"Phi-3.5-mini-instruct-q4f16_1-MLC\";\n  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(\n    selectedModel,\n    { initProgressCallback: initProgressCallback },\n  );\n  // Note that you'd need to prompt the model to answer in JSON either in\n  // user's message or the system prompt\n  const request: webllm.ChatCompletionRequest = {\n    stream: false, // works with streaming, logprobs, top_logprobs as well\n    messages: [\n      {\n        role: \"user\",\n        content: \"Write a short JSON file introducing yourself.\",\n      },\n    ],\n    n: 2,\n    max_tokens: 128,\n    response_format: { type: \"json_object\" } as webllm.ResponseFormat,\n  };\n\n  const reply0 = await engine.chatCompletion(request);\n  console.log(reply0);\n  console.log(\"First reply's last choice:\\n\" + (await engine.getMessage()));\n  console.log(reply0.usage);\n}\n\nmain();\n"
  },
  {
    "path": "examples/json-schema/README.md",
    "content": "### OpenAI API Demos\n\nRun `npm install` first, followed by `npm start`.\n\nNote if you would like to hack WebLLM core package,\nyou can change web-llm dependencies as `\"file:../..\"`, and follow the build from source\ninstruction in the project to build webllm locally. This option is only recommended\nif you would like to hack WebLLM core package.\n"
  },
  {
    "path": "examples/json-schema/package.json",
    "content": "{\n  \"name\": \"openai-api\",\n  \"version\": \"0.1.0\",\n  \"private\": true,\n  \"scripts\": {\n    \"start\": \"parcel src/json_schema.html  --port 8885\",\n    \"build\": \"parcel build src/json_schema.html --dist-dir lib\"\n  },\n  \"devDependencies\": {\n    \"buffer\": \"^5.7.1\",\n    \"parcel\": \"^2.8.3\",\n    \"process\": \"^0.11.10\",\n    \"tslib\": \"^2.3.1\",\n    \"typescript\": \"^4.9.5\",\n    \"url\": \"^0.11.3\"\n  },\n  \"dependencies\": {\n    \"@mlc-ai/web-llm\": \"^0.2.82\"\n  }\n}\n"
  },
  {
    "path": "examples/json-schema/src/json_schema.html",
    "content": "<!doctype html>\n<html>\n  <script>\n    webLLMGlobal = {};\n  </script>\n\n  <body>\n    <h2>WebLLM Test Page</h2>\n    Open console to see output.\n    <br />\n    <br />\n    <label id=\"init-label\"> </label>\n\n    <script type=\"module\" src=\"./json_schema.ts\"></script>\n  </body>\n</html>\n"
  },
  {
    "path": "examples/json-schema/src/json_schema.ts",
    "content": "import * as webllm from \"@mlc-ai/web-llm\";\nimport { Type, Static } from \"@sinclair/typebox\";\n\nfunction setLabel(id: string, text: string) {\n  const label = document.getElementById(id);\n  if (label == null) {\n    throw Error(\"Cannot find label \" + id);\n  }\n  label.innerText = text;\n}\n\nasync function simpleStructuredTextExample() {\n  // There are several options of providing such a schema\n  // 1. You can directly define a schema in string\n  const schema1 = `{\n        \"properties\": {\n            \"size\": {\"title\": \"Size\", \"type\": \"integer\"}, \n            \"is_accepted\": {\"title\": \"Is Accepted\", \"type\": \"boolean\"}, \n            \"num\": {\"title\": \"Num\", \"type\": \"number\"}\n        },\n        \"required\": [\"size\", \"is_accepted\", \"num\"], \n        \"title\": \"Schema\", \"type\": \"object\"\n    }`;\n\n  // 2. You can use 3rdparty libraries like typebox to create a schema\n  const T = Type.Object({\n    size: Type.Integer(),\n    is_accepted: Type.Boolean(),\n    num: Type.Number(),\n  });\n  type T = Static<typeof T>;\n  const schema2 = JSON.stringify(T);\n  console.log(schema2);\n  // {\"type\":\"object\",\"properties\":{\"size\":{\"type\":\"integer\"},\"is_accepted\":{\"type\":\"boolean\"},\n  // \"num\":{\"type\":\"number\"}},\"required\":[\"size\",\"is_accepted\",\"num\"]}\n\n  const initProgressCallback = (report: webllm.InitProgressReport) => {\n    setLabel(\"init-label\", report.text);\n  };\n\n  // Pick any one of these models to start trying -- most models in WebLLM support grammar\n  // const selectedModel = \"Llama-3.2-3B-Instruct-q4f16_1-MLC\";\n  // const selectedModel = \"Qwen2.5-1.5B-Instruct-q4f16_1-MLC\";\n  const selectedModel = \"Phi-3.5-mini-instruct-q4f16_1-MLC\";\n  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(\n    selectedModel,\n    { initProgressCallback: initProgressCallback, logLevel: \"INFO\" },\n  );\n\n  // Note that you'd need to prompt 
the model to answer in JSON either in\n  // user's message or the system prompt\n  const request: webllm.ChatCompletionRequest = {\n    stream: false, // works with streaming, logprobs, top_logprobs as well\n    messages: [\n      {\n        role: \"user\",\n        content:\n          \"Generate a json containing three fields: an integer field named size, a \" +\n          \"boolean field named is_accepted, and a float field named num.\",\n      },\n    ],\n    max_tokens: 128,\n    response_format: {\n      type: \"json_object\",\n      schema: schema2,\n    } as webllm.ResponseFormat,\n  };\n\n  const reply0 = await engine.chatCompletion(request);\n  console.log(reply0);\n  console.log(\"Output:\\n\" + (await engine.getMessage()));\n  console.log(reply0.usage);\n}\n\n// The json schema and prompt is taken from\n// https://github.com/sgl-project/sglang/tree/main?tab=readme-ov-file#json-decoding\nasync function harryPotterExample() {\n  const T = Type.Object({\n    name: Type.String(),\n    house: Type.Enum({\n      Gryffindor: \"Gryffindor\",\n      Hufflepuff: \"Hufflepuff\",\n      Ravenclaw: \"Ravenclaw\",\n      Slytherin: \"Slytherin\",\n    }),\n    blood_status: Type.Enum({\n      \"Pure-blood\": \"Pure-blood\",\n      \"Half-blood\": \"Half-blood\",\n      \"Muggle-born\": \"Muggle-born\",\n    }),\n    occupation: Type.Enum({\n      Student: \"Student\",\n      Professor: \"Professor\",\n      \"Ministry of Magic\": \"Ministry of Magic\",\n      Other: \"Other\",\n    }),\n    wand: Type.Object({\n      wood: Type.String(),\n      core: Type.String(),\n      length: Type.Number(),\n    }),\n    alive: Type.Boolean(),\n    patronus: Type.String(),\n  });\n\n  type T = Static<typeof T>;\n  const schema = JSON.stringify(T);\n  console.log(schema);\n\n  const initProgressCallback = (report: webllm.InitProgressReport) => {\n    setLabel(\"init-label\", report.text);\n  };\n\n  // Pick any one of these models to start trying -- most models in WebLLM support 
grammar\n  const selectedModel = \"Llama-3.2-3B-Instruct-q4f16_1-MLC\";\n  // const selectedModel = \"Qwen2.5-1.5B-Instruct-q4f16_1-MLC\";\n  // const selectedModel = \"Phi-3.5-mini-instruct-q4f16_1-MLC\";\n\n  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(\n    selectedModel,\n    { initProgressCallback: initProgressCallback, logLevel: \"INFO\" },\n  );\n\n  // Note that you'd need to prompt the model to answer in JSON either in\n  // user's message or the system prompt\n  const request: webllm.ChatCompletionRequest = {\n    stream: false,\n    messages: [\n      {\n        role: \"user\",\n        content:\n          \"Hermione Granger is a character in Harry Potter. Please fill in the following information about this character in JSON format.\" +\n          \"Name is a string of character name. House is one of Gryffindor, Hufflepuff, Ravenclaw, Slytherin. Blood status is one of Pure-blood, Half-blood, Muggle-born. Occupation is one of Student, Professor, Ministry of Magic, Other. Wand is an object with wood, core, and length. Alive is a boolean. 
Patronus is a string.\",\n      },\n    ],\n    max_tokens: 128,\n    response_format: {\n      type: \"json_object\",\n      schema: schema,\n    } as webllm.ResponseFormat,\n  };\n\n  const reply = await engine.chatCompletion(request);\n  console.log(reply);\n  console.log(\"Output:\\n\" + (await engine.getMessage()));\n  console.log(reply.usage);\n  console.log(reply.usage!.extra);\n}\n\nasync function functionCallingExample() {\n  const T = Type.Object({\n    tool_calls: Type.Array(\n      Type.Object({\n        arguments: Type.Any(),\n        name: Type.String(),\n      }),\n    ),\n  });\n  type T = Static<typeof T>;\n  const schema = JSON.stringify(T);\n  console.log(schema);\n\n  const tools: Array<webllm.ChatCompletionTool> = [\n    {\n      type: \"function\",\n      function: {\n        name: \"get_current_weather\",\n        description: \"Get the current weather in a given location\",\n        parameters: {\n          type: \"object\",\n          properties: {\n            location: {\n              type: \"string\",\n              description: \"The city and state, e.g. San Francisco, CA\",\n            },\n            unit: { type: \"string\", enum: [\"celsius\", \"fahrenheit\"] },\n          },\n          required: [\"location\"],\n        },\n      },\n    },\n  ];\n\n  const initProgressCallback = (report: webllm.InitProgressReport) => {\n    setLabel(\"init-label\", report.text);\n  };\n\n  const selectedModel = \"Hermes-2-Pro-Llama-3-8B-q4f16_1-MLC\";\n  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(\n    selectedModel,\n    {\n      initProgressCallback: initProgressCallback,\n    },\n  );\n\n  const request: webllm.ChatCompletionRequest = {\n    stream: false,\n    messages: [\n      {\n        role: \"system\",\n        content: `You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. 
Don't make assumptions about what values to plug into functions. Here are the available tools: <tools> ${JSON.stringify(\n          tools,\n        )} </tools>. Do not stop calling functions until the task has been accomplished or you've reached max iteration of 10.\n      Calling multiple functions at once can overload the system and increase cost so call one function at a time please.\n      If you plan to continue with analysis, always call another function.\n      Return a valid json object (using double quotes) in the following schema: ${JSON.stringify(\n        schema,\n      )}.`,\n      },\n      {\n        role: \"user\",\n        content:\n          \"What is the current weather in celsius in Pittsburgh and Tokyo?\",\n      },\n    ],\n    response_format: {\n      type: \"json_object\",\n      schema: schema,\n    } as webllm.ResponseFormat,\n  };\n\n  const reply = await engine.chat.completions.create(request);\n  console.log(reply.choices[0].message.content);\n\n  console.log(reply.usage);\n}\n\nasync function ebnfGrammarExample() {\n  // You can directly define an EBNFGrammar string with ResponseFormat.grammar\n  const jsonGrammarStr = String.raw`\nroot ::= basic_array | basic_object\nbasic_any ::= basic_number | basic_string | basic_boolean | basic_null | basic_array | basic_object\nbasic_integer ::= (\"0\" | \"-\"? [1-9] [0-9]*) \".0\"?\nbasic_number ::= (\"0\" | \"-\"? [1-9] [0-9]*) (\".\" [0-9]+)? ([eE] [+-]? 
[0-9]+)?\nbasic_string ::= (([\\\"] basic_string_1 [\\\"]))\nbasic_string_1 ::= \"\" | [^\"\\\\\\x00-\\x1F] basic_string_1 | \"\\\\\" escape basic_string_1\nescape ::= [\"\\\\/bfnrt] | \"u\" [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9] [A-Fa-f0-9]\nbasic_boolean ::= \"true\" | \"false\"\nbasic_null ::= \"null\"\nbasic_array ::= \"[\" (\"\" | ws basic_any (ws \",\" ws basic_any)*) ws \"]\"\nbasic_object ::= \"{\" (\"\" | ws basic_string ws \":\" ws basic_any ( ws \",\" ws basic_string ws \":\" ws basic_any)*) ws \"}\"\nws ::= [ \\n\\t]*\n`;\n\n  const initProgressCallback = (report: webllm.InitProgressReport) => {\n    setLabel(\"init-label\", report.text);\n  };\n\n  // Pick any one of these models to start trying -- most models in WebLLM support grammar\n  const selectedModel = \"Llama-3.2-3B-Instruct-q4f16_1-MLC\";\n  // const selectedModel = \"Qwen2.5-1.5B-Instruct-q4f16_1-MLC\";\n  // const selectedModel = \"Phi-3.5-mini-instruct-q4f16_1-MLC\";\n  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(\n    selectedModel,\n    { initProgressCallback: initProgressCallback, logLevel: \"INFO\" },\n  );\n\n  // Note that you'd need to prompt the model to answer in JSON either in\n  // user's message or the system prompt\n  const request: webllm.ChatCompletionRequest = {\n    stream: false, // works with streaming, logprobs, top_logprobs as well\n    messages: [\n      {\n        role: \"user\",\n        content: \"Introduce yourself in JSON\",\n      },\n    ],\n    max_tokens: 128,\n    response_format: {\n      type: \"grammar\",\n      grammar: jsonGrammarStr,\n    } as webllm.ResponseFormat,\n  };\n\n  const reply0 = await engine.chatCompletion(request);\n  console.log(reply0);\n  console.log(\"Output:\\n\" + (await engine.getMessage()));\n  console.log(reply0.usage);\n}\n\nasync function main() {\n  // await simpleStructuredTextExample();\n  await harryPotterExample();\n  // await functionCallingExample();\n  // await 
ebnfGrammarExample();\n}\n\nmain();\n"
  },
  {
    "path": "examples/logit-processor/README.md",
    "content": "# WebLLM Logit Processor and Low-Level API Example\n\nThis folder explains the usage of `LogitProcessor`, demonstrating how it can be used to\nmanipulate the raw logits before sampling the token (e.g. setting certain tokens to `inf` or `-inf`).\nWe demonstrate how to use it with and without a web worker, which can be toggled with `USE_WEB_WORKER`\nin `logit_processor.ts` (see `worker.ts` on how `LogitProcessor` plays a role there).\n\nWe also demonstrate the usage of a low-level API `forwardTokenAndSample()`, which, unlike `chat.completions.create()`\nthat assumes the usage is for autoregressive chatting, here we have more fine-grained control.\n\nSee `my_logit_processor.ts` on how to customize your own logit processor. Here we make the logit\nof token 0 `100.0` manually, large enough that we should expect to always sample token 0, which\nis indeed the case if we observe the console log. We also demonstarte that a LogitProcessor can be\nstateful, and the state can also be cleaned with `LogitProcessor.resetState()`.\n\nTo try it out, you can do the following steps under this folder\n\n```bash\nnpm install\nnpm start\n```\n\nNote if you would like to hack WebLLM core package, you can change web-llm dependencies as `\"file:../..\"`, and follow the build from source instruction in the project to build webllm locally. This option is only recommended if you would like to hack WebLLM core package.\n"
  },
  {
    "path": "examples/logit-processor/package.json",
    "content": "{\n  \"name\": \"logit-processor\",\n  \"version\": \"0.1.0\",\n  \"private\": true,\n  \"scripts\": {\n    \"start\": \"parcel src/logit_processor.html  --port 8885\",\n    \"build\": \"parcel build src/logit_processor.html --dist-dir lib\"\n  },\n  \"devDependencies\": {\n    \"buffer\": \"^5.7.1\",\n    \"parcel\": \"^2.8.3\",\n    \"process\": \"^0.11.10\",\n    \"tslib\": \"^2.3.1\",\n    \"typescript\": \"^4.9.5\",\n    \"url\": \"^0.11.3\"\n  },\n  \"dependencies\": {\n    \"@mlc-ai/web-llm\": \"^0.2.82\"\n  }\n}\n"
  },
  {
    "path": "examples/logit-processor/src/logit_processor.html",
    "content": "<!doctype html>\n<html>\n  <script>\n    webLLMGlobal = {};\n  </script>\n\n  <body>\n    <h2>WebLLM Logit Processor Test Page</h2>\n    Open console to see the effect of your logit processor.\n    <br />\n    <br />\n    <label id=\"init-label\"> </label>\n\n    <script type=\"module\" src=\"./logit_processor.ts\"></script>\n  </body>\n</html>\n"
  },
  {
    "path": "examples/logit-processor/src/logit_processor.ts",
    "content": "import * as webllm from \"@mlc-ai/web-llm\";\nimport { MyLogitProcessor } from \"./my_logit_processor\";\n\nconst USE_WEB_WORKER = true; // Toggle this to use Logit Processor without a web worker\nconst AUTOREGRESS_LIMIT = 32; // How many tokens to generate for this test\n\nfunction setLabel(id: string, text: string) {\n  const label = document.getElementById(id);\n  if (label == null) {\n    throw Error(\"Cannot find label \" + id);\n  }\n  label.innerText = text;\n}\n\nasync function main() {\n  const initProgressCallback = (report: webllm.InitProgressReport) => {\n    setLabel(\"init-label\", report.text);\n  };\n  // Instantiate myLogitProcessor, registering in the logitProcessorRegistry\n  const myLogitProcessor = new MyLogitProcessor();\n  const logitProcessorRegistry = new Map<string, webllm.LogitProcessor>();\n  logitProcessorRegistry.set(\"phi-2-q4f32_1-MLC\", myLogitProcessor);\n\n  let engine: webllm.MLCEngineInterface;\n\n  // Depending on whether we use a web worker, the code is slightly different\n  if (USE_WEB_WORKER) {\n    // see worker.ts on how LogitProcessor plays a role there\n    engine = await webllm.CreateWebWorkerMLCEngine(\n      new Worker(new URL(\"./worker.ts\", import.meta.url), { type: \"module\" }),\n      \"phi-2-q4f32_1-MLC\",\n      { initProgressCallback: initProgressCallback },\n    );\n  } else {\n    engine = await webllm.CreateMLCEngine(\"phi-2-q4f32_1-MLC\", {\n      initProgressCallback: initProgressCallback,\n      logitProcessorRegistry: logitProcessorRegistry,\n    });\n  }\n\n  // Below we demonstrate the usage of a low-level API `forwardTokensAndSample()`\n  const prompt: Array<number> = [42];\n  let nextToken = await engine.forwardTokensAndSample(\n    prompt,\n    /*isPrefill=*/ true,\n  );\n  console.log(nextToken);\n\n  let counter = prompt.length;\n  while (counter < AUTOREGRESS_LIMIT) {\n    counter += 1;\n    nextToken = await engine.forwardTokensAndSample(\n      [nextToken],\n      
/*isPrefill=*/ false,\n    );\n    console.log(nextToken);\n  }\n\n  // By calling `engine.resetChat()`, we triggers MyLogitProcessor.resetState()\n  engine.resetChat();\n  counter = prompt.length;\n  nextToken = await engine.forwardTokensAndSample(prompt, /*isPrefill=*/ true);\n  console.log(nextToken);\n  while (counter < AUTOREGRESS_LIMIT) {\n    counter += 1;\n    nextToken = await engine.forwardTokensAndSample(\n      [nextToken],\n      /*isPrefill=*/ false,\n    );\n    console.log(nextToken);\n  }\n\n  // `forwardTokensAndSample()` is made compatible with registering runtime stats.\n  console.log(await engine.runtimeStatsText());\n}\n\nmain();\n"
  },
  {
    "path": "examples/logit-processor/src/my_logit_processor.ts",
    "content": "import * as webllm from \"@mlc-ai/web-llm\";\n\n// Define LogitProcessor\nexport class MyLogitProcessor implements webllm.LogitProcessor {\n  private tokenSequence: Array<number> = [];\n\n  processLogits(logits: Float32Array): Float32Array {\n    logits[0] = 100.0; // should be enough so that we always sample token 0 below\n    return logits;\n  }\n\n  processSampledToken(token: number): void {\n    this.tokenSequence.push(token);\n    console.log(\"processSampledToken: \" + this.tokenSequence.length);\n  }\n\n  resetState(): void {\n    this.tokenSequence = [];\n    console.log(\"resetState\");\n  }\n}\n"
  },
  {
    "path": "examples/logit-processor/src/worker.ts",
    "content": "// Serve the chat workload through web worker\nimport * as webllm from \"@mlc-ai/web-llm\";\nimport { MyLogitProcessor } from \"./my_logit_processor\";\n\nconsole.log(\"Use web worker for logit processor\");\n\nconst myLogitProcessor = new MyLogitProcessor();\nconst logitProcessorRegistry = new Map<string, webllm.LogitProcessor>();\nlogitProcessorRegistry.set(\"phi-2-q4f32_1-MLC\", myLogitProcessor);\n\nconst handler = new webllm.WebWorkerMLCEngineHandler();\nhandler.setLogitProcessorRegistry(logitProcessorRegistry);\nself.onmessage = (msg: MessageEvent) => {\n  handler.onmessage(msg);\n};\n"
  },
  {
    "path": "examples/multi-models/README.md",
    "content": "# WebLLM Get Started App\n\nThis folder provides a minimal demo to show WebLLM API in a webapp setting.\nTo try it out, you can do the following steps under this folder\n\n```bash\nnpm install\nnpm start\n```\n\nNote if you would like to hack WebLLM core package,\nyou can change web-llm dependencies as `\"file:../..\"`, and follow the build from source\ninstruction in the project to build webllm locally. This option is only recommended\nif you would like to hack WebLLM core package.\n"
  },
  {
    "path": "examples/multi-models/package.json",
    "content": "{\n  \"name\": \"get-started\",\n  \"version\": \"0.1.0\",\n  \"private\": true,\n  \"scripts\": {\n    \"start\": \"parcel src/multi_models.html  --port 8888\",\n    \"build\": \"parcel build src/multi_models.html --dist-dir lib\"\n  },\n  \"devDependencies\": {\n    \"buffer\": \"^5.7.1\",\n    \"parcel\": \"^2.8.3\",\n    \"process\": \"^0.11.10\",\n    \"tslib\": \"^2.3.1\",\n    \"typescript\": \"^4.9.5\",\n    \"url\": \"^0.11.3\"\n  },\n  \"dependencies\": {\n    \"@mlc-ai/web-llm\": \"^0.2.82\"\n  }\n}\n"
  },
  {
    "path": "examples/multi-models/src/main.ts",
    "content": "/**\n * This example demonstrates loading multiple models in the same engine concurrently.\n * sequentialGeneration() shows inference each model one at a time.\n * parallelGeneration() shows inference both models at the same time.\n * This example uses WebWorkerMLCEngine, but the same idea applies to MLCEngine and\n * ServiceWorkerMLCEngine as well.\n */\n\nimport * as webllm from \"@mlc-ai/web-llm\";\n\nfunction setLabel(id: string, text: string) {\n  const label = document.getElementById(id);\n  if (label == null) {\n    throw Error(\"Cannot find label \" + id);\n  }\n  label.innerText = text;\n}\n\nconst initProgressCallback = (report: webllm.InitProgressReport) => {\n  setLabel(\"init-label\", report.text);\n};\n\n// Prepare request for each model, same for both methods\nconst selectedModel1 = \"Phi-3.5-mini-instruct-q4f32_1-MLC-1k\";\nconst selectedModel2 = \"gemma-2-2b-it-q4f32_1-MLC-1k\";\nconst prompt1 = \"Tell me about California in 3 short sentences.\";\nconst prompt2 = \"Tell me about New York City in 3 short sentences.\";\nsetLabel(\"prompt-label-1\", `(with model ${selectedModel1})\\n` + prompt1);\nsetLabel(\"prompt-label-2\", `(with model ${selectedModel2})\\n` + prompt2);\n\nconst request1: webllm.ChatCompletionRequestStreaming = {\n  stream: true,\n  stream_options: { include_usage: true },\n  messages: [{ role: \"user\", content: prompt1 }],\n  model: selectedModel1, // without specifying it, error will throw due to ambiguity\n  max_tokens: 128,\n};\n\nconst request2: webllm.ChatCompletionRequestStreaming = {\n  stream: true,\n  stream_options: { include_usage: true },\n  messages: [{ role: \"user\", content: prompt2 }],\n  model: selectedModel2, // without specifying it, error will throw due to ambiguity\n  max_tokens: 128,\n};\n\n/**\n * Chat completion (OpenAI style) with streaming, with two models in the pipeline.\n */\nasync function sequentialGeneration() {\n  const engine = await webllm.CreateWebWorkerMLCEngine(\n    new 
Worker(new URL(\"./worker.ts\", import.meta.url), { type: \"module\" }),\n    [selectedModel1, selectedModel2],\n    { initProgressCallback: initProgressCallback },\n  );\n\n  const asyncChunkGenerator1 = await engine.chat.completions.create(request1);\n  let message1 = \"\";\n  for await (const chunk of asyncChunkGenerator1) {\n    // console.log(chunk);\n    message1 += chunk.choices[0]?.delta?.content || \"\";\n    setLabel(\"generate-label-1\", message1);\n    if (chunk.usage) {\n      console.log(chunk.usage); // only last chunk has usage\n    }\n    // engine.interruptGenerate();  // works with interrupt as well\n  }\n  const asyncChunkGenerator2 = await engine.chat.completions.create(request2);\n  let message2 = \"\";\n  for await (const chunk of asyncChunkGenerator2) {\n    // console.log(chunk);\n    message2 += chunk.choices[0]?.delta?.content || \"\";\n    setLabel(\"generate-label-2\", message2);\n    if (chunk.usage) {\n      console.log(chunk.usage); // only last chunk has usage\n    }\n    // engine.interruptGenerate();  // works with interrupt as well\n  }\n\n  // without specifying from which model to get message, error will throw due to ambiguity\n  console.log(\"Final message 1:\\n\", await engine.getMessage(selectedModel1));\n  console.log(\"Final message 2:\\n\", await engine.getMessage(selectedModel2));\n}\n\n/**\n * Chat completion (OpenAI style) with streaming, with two models in the pipeline.\n */\nasync function parallelGeneration() {\n  const engine = await webllm.CreateWebWorkerMLCEngine(\n    new Worker(new URL(\"./worker.ts\", import.meta.url), { type: \"module\" }),\n    [selectedModel1, selectedModel2],\n    { initProgressCallback: initProgressCallback },\n  );\n\n  // We can serve the two requests concurrently\n  async function getModel1Response() {\n    let message1 = \"\";\n    const asyncChunkGenerator1 = await engine.chat.completions.create(request1);\n    for await (const chunk of asyncChunkGenerator1) {\n      // 
console.log(chunk);\n      message1 += chunk.choices[0]?.delta?.content || \"\";\n      setLabel(\"generate-label-1\", message1);\n      if (chunk.usage) {\n        console.log(chunk.usage); // only last chunk has usage\n      }\n      // engine.interruptGenerate();  // works with interrupt as well\n    }\n  }\n\n  async function getModel2Response() {\n    let message2 = \"\";\n    const asyncChunkGenerator2 = await engine.chat.completions.create(request2);\n    for await (const chunk of asyncChunkGenerator2) {\n      // console.log(chunk);\n      message2 += chunk.choices[0]?.delta?.content || \"\";\n      setLabel(\"generate-label-2\", message2);\n      if (chunk.usage) {\n        console.log(chunk.usage); // only last chunk has usage\n      }\n      // engine.interruptGenerate();  // works with interrupt as well\n    }\n  }\n\n  await Promise.all([getModel1Response(), getModel2Response()]);\n  // Note: concurrent requests to the same model are executed sequentially in FCFS,\n  // unlike to different models like above\n  // Fore more, see https://github.com/mlc-ai/web-llm/pull/549\n  // await Promise.all([getModel1Response(), getModel1Response()]);\n\n  // without specifying from which model to get message, error will throw due to ambiguity\n  console.log(\"Final message 1:\\n\", await engine.getMessage(selectedModel1));\n  console.log(\"Final message 2:\\n\", await engine.getMessage(selectedModel2));\n}\n\n// Pick one to run\nsequentialGeneration();\n// parallelGeneration();\n"
  },
  {
    "path": "examples/multi-models/src/multi_models.html",
    "content": "<!doctype html>\n<html>\n  <script>\n    webLLMGlobal = {};\n  </script>\n  <body>\n    <h2>WebLLM Test Page</h2>\n    Open console to see output\n    <br />\n    <br />\n    <label id=\"init-label\"> </label>\n\n    <h3>Prompt 1</h3>\n    <label id=\"prompt-label-1\"> </label>\n\n    <h3>Response from model 1</h3>\n    <label id=\"generate-label-1\"> </label>\n    <br />\n\n    <h3>Prompt 2</h3>\n    <label id=\"prompt-label-2\"> </label>\n\n    <h3>Response from model 2</h3>\n    <label id=\"generate-label-2\"> </label>\n    <br />\n    <label id=\"stats-label\"> </label>\n\n    <script type=\"module\" src=\"./main.ts\"></script>\n  </body>\n</html>\n"
  },
  {
    "path": "examples/multi-models/src/worker.ts",
    "content": "import { WebWorkerMLCEngineHandler } from \"@mlc-ai/web-llm\";\n\n// Hookup an engine to a worker handler\nconst handler = new WebWorkerMLCEngineHandler();\nself.onmessage = (msg: MessageEvent) => {\n  handler.onmessage(msg);\n};\n"
  },
  {
    "path": "examples/multi-round-chat/README.md",
    "content": "### OpenAI API Demos\n\nRun `npm install` first, followed by `npm start`.\n\nNote if you would like to hack WebLLM core package,\nyou can change web-llm dependencies as `\"file:../..\"`, and follow the build from source\ninstruction in the project to build webllm locally. This option is only recommended\nif you would like to hack WebLLM core package.\n"
  },
  {
    "path": "examples/multi-round-chat/package.json",
    "content": "{\n  \"name\": \"openai-api\",\n  \"version\": \"0.1.0\",\n  \"private\": true,\n  \"scripts\": {\n    \"start\": \"parcel src/multi_round_chat.html  --port 8888\",\n    \"build\": \"parcel build src/multi_round_chat.html --dist-dir lib\"\n  },\n  \"devDependencies\": {\n    \"buffer\": \"^5.7.1\",\n    \"parcel\": \"^2.8.3\",\n    \"process\": \"^0.11.10\",\n    \"tslib\": \"^2.3.1\",\n    \"typescript\": \"^4.9.5\",\n    \"url\": \"^0.11.3\"\n  },\n  \"dependencies\": {\n    \"@mlc-ai/web-llm\": \"^0.2.82\"\n  }\n}\n"
  },
  {
    "path": "examples/multi-round-chat/src/multi_round_chat.html",
    "content": "<!doctype html>\n<html>\n  <script>\n    webLLMGlobal = {};\n  </script>\n\n  <body>\n    <h2>WebLLM Test Page</h2>\n    Open console to see output\n    <br />\n    <br />\n    <label id=\"init-label\"> </label>\n\n    <script type=\"module\" src=\"./multi_round_chat.ts\"></script>\n  </body>\n</html>\n"
  },
  {
    "path": "examples/multi-round-chat/src/multi_round_chat.ts",
    "content": "import * as webllm from \"@mlc-ai/web-llm\";\n\nfunction setLabel(id: string, text: string) {\n  const label = document.getElementById(id);\n  if (label == null) {\n    throw Error(\"Cannot find label \" + id);\n  }\n  label.innerText = text;\n}\n\n/**\n * We demonstrate multiround chatting. Though users are required to maintain chat history, internally\n * we compare provided `messages` with the internal chat history. If it matches, we will reuse KVs\n * and hence save computation -- essentially an implicit internal optimization.\n */\nasync function main() {\n  const initProgressCallback = (report: webllm.InitProgressReport) => {\n    setLabel(\"init-label\", report.text);\n  };\n  const selectedModel = \"Llama-3.1-8B-Instruct-q4f32_1-MLC\";\n  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(\n    selectedModel,\n    { initProgressCallback: initProgressCallback },\n  );\n\n  // Round 0\n  const messages: webllm.ChatCompletionMessageParam[] = [\n    {\n      role: \"system\",\n      content:\n        \"You are a helpful, respectful and honest assistant. \" +\n        \"Be as happy as you can when speaking please. \",\n    },\n    { role: \"user\", content: \"Provide me three US states.\" },\n  ];\n\n  const request0: webllm.ChatCompletionRequest = {\n    stream: false, // can be streaming, same behavior\n    messages: messages,\n  };\n\n  const reply0 = await engine.chat.completions.create(request0);\n  const replyMessage0 = await engine.getMessage();\n  console.log(reply0);\n  console.log(replyMessage0);\n  console.log(reply0.usage);\n\n  // Round 1\n  // Append generated response to messages\n  messages.push({ role: \"assistant\", content: replyMessage0 });\n  // Append new user input\n  messages.push({ role: \"user\", content: \"Two more please!\" });\n  // Below line would cause an internal reset (clear KV cache, etc.) 
since the history no longer\n  // matches the new request\n  // messages[0].content = \"Another system prompt\";\n\n  const request1: webllm.ChatCompletionRequest = {\n    stream: false, // can be streaming, same behavior\n    messages: messages,\n  };\n\n  const reply1 = await engine.chat.completions.create(request1);\n  const replyMessage1 = await engine.getMessage();\n  console.log(reply1);\n  console.log(replyMessage1);\n  console.log(reply1.usage);\n\n  // If we used multiround chat, request1 should only prefill a small number of tokens\n  const prefillTokens0 = reply0.usage?.prompt_tokens;\n  const prefillTokens1 = reply1.usage?.prompt_tokens;\n  console.log(\"Requset 0 prompt tokens: \", prefillTokens0);\n  console.log(\"Requset 1 prompt tokens: \", prefillTokens1);\n  if (\n    prefillTokens0 === undefined ||\n    prefillTokens1 === undefined ||\n    prefillTokens1 > prefillTokens0\n  ) {\n    throw Error(\"Multi-round chat is not triggered as expected.\");\n  }\n}\n\nmain();\n"
  },
  {
    "path": "examples/next-simple-chat/.gitignore",
    "content": "# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.\n\n# dependencies\n/node_modules\n/.pnp\n.pnp.js\n\n# testing\n/coverage\n\n# next.js\n/.next/\n/out/\n\n# production\n/build\n\n# misc\n.DS_Store\n*.pem\n\n# debug\nnpm-debug.log*\nyarn-debug.log*\nyarn-error.log*\n\n# local env files\n.env*.local\n\n# vercel\n.vercel\n\n# typescript\n*.tsbuildinfo\nnext-env.d.ts\n"
  },
  {
    "path": "examples/next-simple-chat/README.md",
    "content": "This is a [Next.js](https://nextjs.org/) project using web-llm.\n\n## Getting Started\n\nFirst, install web-llm from source.\n\nThen, run the development server:\n\n```bash\nnpm run dev\n# or\nyarn dev\n# or\npnpm dev\n```\n\nOpen [http://localhost:3000](http://localhost:3000) with your browser to see the result.\n"
  },
  {
    "path": "examples/next-simple-chat/next.config.js",
    "content": "/** @type {import('next').NextConfig} */\nconst nextConfig = {\n  reactStrictMode: true,\n\n  webpack: (config, { isServer }) => {\n    // Fixes npm packages that depend on `fs` module\n    if (!isServer) {\n      config.resolve.fallback = {\n        ...config.resolve.fallback, // if you miss it, all the other options in fallback, specified\n        // by next.js will be dropped. Doesn't make much sense, but how it is\n        fs: false, // the solution\n        module: false,\n        perf_hooks: false,\n      };\n    }\n\n    return config;\n  },\n};\n\nmodule.exports = nextConfig;\n"
  },
  {
    "path": "examples/next-simple-chat/package.json",
    "content": "{\n  \"name\": \"next-simple-chat\",\n  \"version\": \"0.1.0\",\n  \"private\": true,\n  \"scripts\": {\n    \"dev\": \"next dev\",\n    \"build\": \"next build\",\n    \"start\": \"next start\",\n    \"lint\": \"next lint\"\n  },\n  \"dependencies\": {\n    \"@mlc-ai/web-llm\": \"^0.2.82\",\n    \"@types/node\": \"20.3.3\",\n    \"@types/react\": \"18.2.14\",\n    \"@types/react-dom\": \"18.2.6\",\n    \"autoprefixer\": \"10.4.14\",\n    \"eslint\": \"8.44.0\",\n    \"eslint-config-next\": \"13.4.7\",\n    \"next\": \"^13.5.6\",\n    \"postcss\": \"8.4.24\",\n    \"react\": \"18.2.0\",\n    \"react-dom\": \"18.2.0\",\n    \"tailwindcss\": \"3.3.2\",\n    \"typescript\": \"5.1.6\"\n  }\n}\n"
  },
  {
    "path": "examples/next-simple-chat/postcss.config.js",
    "content": "module.exports = {\n  plugins: {\n    tailwindcss: {},\n    autoprefixer: {},\n  },\n};\n"
  },
  {
    "path": "examples/next-simple-chat/src/pages/_app.tsx",
    "content": "import \"~/styles/globals.css\";\nimport type { AppProps } from \"next/app\";\n\nexport default function App({ Component, pageProps }: AppProps) {\n  return <Component {...pageProps} />;\n}\n"
  },
  {
    "path": "examples/next-simple-chat/src/pages/_document.tsx",
    "content": "import { Html, Head, Main, NextScript } from \"next/document\";\n\nexport default function Document() {\n  return (\n    <Html lang=\"en\">\n      <Head />\n      <body>\n        <Main />\n        <NextScript />\n      </body>\n    </Html>\n  );\n}\n"
  },
  {
    "path": "examples/next-simple-chat/src/pages/api/hello.ts",
    "content": "// Next.js API route support: https://nextjs.org/docs/api-routes/introduction\nimport type { NextApiRequest, NextApiResponse } from \"next\";\n\ntype Data = {\n  name: string;\n};\n\nexport default function handler(\n  req: NextApiRequest,\n  res: NextApiResponse<Data>,\n) {\n  res.status(200).json({ name: \"John Doe\" });\n}\n"
  },
  {
    "path": "examples/next-simple-chat/src/pages/index.tsx",
    "content": "import Head from \"next/head\";\nimport ChatComponent from \"~/utils/chat_component\";\nimport { Inter } from \"next/font/google\";\n\nconst inter = Inter({ subsets: [\"latin\"] });\n\nexport default function Home() {\n  return (\n    <>\n      <Head>\n        <title>Example App</title>\n        <meta\n          name=\"description\"\n          content=\"Example app for web llm next compatibility\"\n        />\n        <link rel=\"icon\" href=\"/favicon.ico\" />\n      </Head>\n      <main\n        className={`flex min-h-screen flex-col items-center justify-between p-24 ${inter.className}`}\n      >\n        <ChatComponent />\n      </main>\n    </>\n  );\n}\n"
  },
  {
    "path": "examples/next-simple-chat/src/styles/globals.css",
    "content": "@tailwind base;\n@tailwind components;\n@tailwind utilities;\n\n:root {\n  --foreground-rgb: 0, 0, 0;\n  --background-start-rgb: 214, 219, 220;\n  --background-end-rgb: 255, 255, 255;\n}\n\n@media (prefers-color-scheme: dark) {\n  :root {\n    --foreground-rgb: 255, 255, 255;\n    --background-start-rgb: 0, 0, 0;\n    --background-end-rgb: 0, 0, 0;\n  }\n}\n\nbody {\n  color: rgb(var(--foreground-rgb));\n  background: linear-gradient(\n      to bottom,\n      transparent,\n      rgb(var(--background-end-rgb))\n    )\n    rgb(var(--background-start-rgb));\n}\n\na {\n  color: inherit;\n  text-decoration: none;\n}\n\n* {\n  box-sizing: border-box;\n}\n\nchatui-chat {\n  height: 100;\n}\n\n.chatui {\n  display: flex;\n  flex-flow: column wrap;\n  justify-content: space-between;\n  width: 100%;\n  max-width: 867px;\n  margin: 25px 10px;\n  height: 600px;\n  border: 2px solid #ddd;\n  border-radius: 5px;\n  box-shadow: 0 15px 15px -5px rgba(0, 0, 0, 0.2);\n}\n\ns .chatui-header {\n  display: flex;\n  justify-content: space-between;\n  padding: 10px;\n  border-bottom: 2px solid #ddd;\n  background: #eee;\n  color: #666;\n}\n\n.chatui-chat {\n  flex: 1;\n  overflow-y: auto;\n  padding: 10px;\n}\n\n.chatui-chat::-webkit-scrollbar {\n  width: 6px;\n}\n\n.chatui-chat::-webkit-scrollbar-track {\n  background: #ddd;\n}\n\n.chatui-chat::-webkit-scrollbar-thumb {\n  background: #bdbdbd;\n}\n\n.msg {\n  display: flex;\n  align-items: flex-end;\n  margin-bottom: 10px;\n}\n\n.msg:last-of-type {\n  margin: 0;\n}\n\n.msg-bubble {\n  max-width: 450px;\n  padding: 15px;\n  border-radius: 15px;\n  background: #ececec;\n}\n\n.left-msg .msg-bubble {\n  border-bottom-left-radius: 0;\n}\n\n.error-msg .msg-bubble {\n  border-bottom-left-radius: 0;\n  color: #f15959;\n}\n\n.init-msg .msg-bubble {\n  border-bottom-left-radius: 0;\n}\n\n.right-msg {\n  flex-direction: row-reverse;\n}\n\n.right-msg .msg-bubble {\n  background: #579ffb;\n  color: #fff;\n  
border-bottom-right-radius: 0;\n}\n\n.chatui-inputarea {\n  display: flex;\n  padding: 10px;\n  border-top: 2px solid #ddd;\n  background: #eee;\n}\n\n.chatui-inputarea * {\n  padding: 10px;\n  border: none;\n  border-radius: 3px;\n  font-size: 1em;\n}\n\n.chatui-input {\n  flex: 1;\n  background: #ddd;\n}\n\n.chatui-btn {\n  margin-left: 10px;\n  background: #579ffb;\n  color: #fff;\n  font-weight: bold;\n  cursor: pointer;\n  padding: 10px;\n}\n\n.chatui-btn:hover {\n  background: #577bfb;\n}\n\n.chatui-chat {\n  background-color: #fcfcfe;\n}\n"
  },
  {
    "path": "examples/next-simple-chat/src/utils/chat_component.tsx",
    "content": "import { useState } from \"react\";\nimport { MLCEngine } from \"@mlc-ai/web-llm\";\nimport ChatUI from \"~/utils/chat_ui\";\n\nconst ChatComponent = () => {\n  const [messages, setMessages] = useState<{ kind: string; text: string }[]>(\n    [],\n  );\n  const [prompt, setPrompt] = useState(\"\");\n  const [runtimeStats, setRuntimeStats] = useState(\"\");\n  const [chat_ui] = useState(new ChatUI(new MLCEngine()));\n  const updateMessage = (kind: string, text: string, append: boolean) => {\n    if (kind == \"init\") {\n      text = \"[System Initalize] \" + text;\n    }\n    const msgCopy = [...messages];\n    if (msgCopy.length == 0 || append) {\n      setMessages([...msgCopy, { kind, text }]);\n    } else {\n      msgCopy[msgCopy.length - 1] = { kind, text };\n      setMessages([...msgCopy]);\n    }\n  };\n  return (\n    <div className=\"flex flex-col items-center\">\n      <button\n        className=\"chatui-btn\"\n        onClick={() => {\n          chat_ui.asyncInitChat(updateMessage).catch((error) => {\n            console.log(error);\n          });\n        }}\n      >\n        Download Model\n      </button>\n\n      <div className=\"chatui\">\n        <div className=\"chatui-chat\" id=\"chatui-chat\">\n          {messages.map((value, index) => (\n            <div key={index} className={`msg ${value.kind}-msg`}>\n              <div className=\"msg-bubble\">\n                <div className=\"msg-text\">${value.text}</div>\n              </div>\n            </div>\n          ))}\n        </div>\n\n        <div className=\"chatui-inputarea\">\n          <input\n            id=\"chatui-input\"\n            type=\"text\"\n            className=\"chatui-input\"\n            placeholder=\"Enter your message...\"\n            onKeyDown={(event) => {\n              if (event.key === \"Enter\") {\n                chat_ui\n                  .onGenerate(prompt, updateMessage, setRuntimeStats)\n                  .catch((error) => console.log(error));\n   
           }\n            }}\n            value={prompt}\n            onChange={(event) => setPrompt(event.target.value)}\n          />\n          <button\n            className=\"chatui-btn\"\n            onClick={() => {\n              chat_ui\n                .onGenerate(prompt, updateMessage, setRuntimeStats)\n                .catch((error) => console.log(error));\n            }}\n          >\n            Send\n          </button>\n        </div>\n      </div>\n\n      <div className=\"chatui-extra-control\">\n        <button\n          className=\"chatui-btn\"\n          onClick={() => {\n            chat_ui\n              .onReset(() => {\n                setMessages([]);\n              })\n              .catch((error) => console.log(error));\n          }}\n        >\n          Reset Chat\n        </button>\n        <label id=\"chatui-info-label\">{runtimeStats}</label>\n      </div>\n    </div>\n  );\n};\n\nexport default ChatComponent;\n"
  },
  {
    "path": "examples/next-simple-chat/src/utils/chat_ui.ts",
    "content": "import {\n  MLCEngineInterface,\n  ChatCompletionMessageParam,\n  CompletionUsage,\n} from \"@mlc-ai/web-llm\";\n\nexport default class ChatUI {\n  private engine: MLCEngineInterface;\n  private chatLoaded = false;\n  private requestInProgress = false;\n  // We use a request chain to ensure that\n  // all requests send to chat are sequentialized\n  private chatRequestChain: Promise<void> = Promise.resolve();\n  private chatHistory: ChatCompletionMessageParam[] = [];\n\n  constructor(engine: MLCEngineInterface) {\n    this.engine = engine;\n  }\n  /**\n   * Push a task to the execution queue.\n   *\n   * @param task The task to be executed;\n   */\n  private pushTask(task: () => Promise<void>) {\n    const lastEvent = this.chatRequestChain;\n    this.chatRequestChain = lastEvent.then(task);\n  }\n  // Event handlers\n  // all event handler pushes the tasks to a queue\n  // that get executed sequentially\n  // the tasks previous tasks, which causes them to early stop\n  // can be interrupted by chat.interruptGenerate\n  async onGenerate(\n    prompt: string,\n    messageUpdate: (kind: string, text: string, append: boolean) => void,\n    setRuntimeStats: (runtimeStats: string) => void,\n  ) {\n    if (this.requestInProgress) {\n      return;\n    }\n    this.pushTask(async () => {\n      await this.asyncGenerate(prompt, messageUpdate, setRuntimeStats);\n    });\n    return this.chatRequestChain;\n  }\n\n  async onReset(clearMessages: () => void) {\n    if (this.requestInProgress) {\n      // interrupt previous generation if any\n      this.engine.interruptGenerate();\n    }\n    this.chatHistory = [];\n    // try reset after previous requests finishes\n    this.pushTask(async () => {\n      await this.engine.resetChat();\n      clearMessages();\n    });\n    return this.chatRequestChain;\n  }\n\n  async asyncInitChat(\n    messageUpdate: (kind: string, text: string, append: boolean) => void,\n  ) {\n    if (this.chatLoaded) return;\n    
this.requestInProgress = true;\n    messageUpdate(\"init\", \"\", true);\n    const initProgressCallback = (report: { text: string }) => {\n      messageUpdate(\"init\", report.text, false);\n    };\n    this.engine.setInitProgressCallback(initProgressCallback);\n\n    try {\n      const selectedModel = \"Llama-3.1-8B-Instruct-q4f32_1-MLC\";\n      // const selectedModel = \"TinyLlama-1.1B-Chat-v0.4-q4f16_1-MLC-1k\";\n      await this.engine.reload(selectedModel);\n    } catch (err: unknown) {\n      messageUpdate(\"error\", \"Init error, \" + (err?.toString() ?? \"\"), true);\n      console.log(err);\n      await this.unloadChat();\n      this.requestInProgress = false;\n      return;\n    }\n    this.requestInProgress = false;\n    this.chatLoaded = true;\n  }\n\n  private async unloadChat() {\n    await this.engine.unload();\n    this.chatLoaded = false;\n  }\n\n  /**\n   * Run generate\n   */\n  private async asyncGenerate(\n    prompt: string,\n    messageUpdate: (kind: string, text: string, append: boolean) => void,\n    setRuntimeStats: (runtimeStats: string) => void,\n  ) {\n    await this.asyncInitChat(messageUpdate);\n    this.requestInProgress = true;\n    // const prompt = this.uiChatInput.value;\n    if (prompt == \"\") {\n      this.requestInProgress = false;\n      return;\n    }\n\n    messageUpdate(\"right\", prompt, true);\n    // this.uiChatInput.value = \"\";\n    // this.uiChatInput.setAttribute(\"placeholder\", \"Generating...\");\n\n    messageUpdate(\"left\", \"\", true);\n\n    try {\n      this.chatHistory.push({ role: \"user\", content: prompt });\n      let curMessage = \"\";\n      let usage: CompletionUsage | undefined = undefined;\n      const completion = await this.engine.chat.completions.create({\n        stream: true,\n        messages: this.chatHistory,\n        stream_options: { include_usage: true },\n      });\n      for await (const chunk of completion) {\n        const curDelta = chunk.choices[0]?.delta.content;\n        if 
(curDelta) {\n          curMessage += curDelta;\n        }\n        messageUpdate(\"left\", curMessage, false);\n        if (chunk.usage) {\n          usage = chunk.usage;\n        }\n      }\n      const output = await this.engine.getMessage();\n      this.chatHistory.push({ role: \"assistant\", content: output });\n      messageUpdate(\"left\", output, false);\n      if (usage) {\n        const runtimeStats =\n          `prompt_tokens: ${usage.prompt_tokens}, ` +\n          `completion_tokens: ${usage.completion_tokens}, ` +\n          `prefill: ${usage.extra.prefill_tokens_per_s.toFixed(4)} tokens/sec, ` +\n          `decoding: ${usage.extra.decode_tokens_per_s.toFixed(4)} tokens/sec`;\n        setRuntimeStats(runtimeStats);\n      }\n    } catch (err: unknown) {\n      messageUpdate(\n        \"error\",\n        \"Generate error, \" + (err?.toString() ?? \"\"),\n        true,\n      );\n      console.log(err);\n      await this.unloadChat();\n    }\n    this.requestInProgress = false;\n  }\n}\n"
  },
  {
    "path": "examples/next-simple-chat/tailwind.config.js",
    "content": "/** @type {import('tailwindcss').Config} */\nmodule.exports = {\n  content: [\n    \"./src/pages/**/*.{js,ts,jsx,tsx,mdx}\",\n    \"./src/components/**/*.{js,ts,jsx,tsx,mdx}\",\n    \"./src/app/**/*.{js,ts,jsx,tsx,mdx}\",\n  ],\n  theme: {\n    extend: {\n      backgroundImage: {\n        \"gradient-radial\": \"radial-gradient(var(--tw-gradient-stops))\",\n        \"gradient-conic\":\n          \"conic-gradient(from 180deg at 50% 50%, var(--tw-gradient-stops))\",\n      },\n    },\n  },\n  plugins: [],\n};\n"
  },
  {
    "path": "examples/next-simple-chat/tsconfig.json",
    "content": "{\n  \"compilerOptions\": {\n    \"target\": \"es5\",\n    \"lib\": [\"dom\", \"dom.iterable\", \"esnext\"],\n    \"allowJs\": true,\n    \"skipLibCheck\": true,\n    \"strict\": true,\n    \"forceConsistentCasingInFileNames\": true,\n    \"noEmit\": true,\n    \"esModuleInterop\": true,\n    \"module\": \"esnext\",\n    \"moduleResolution\": \"node\",\n    \"resolveJsonModule\": true,\n    \"isolatedModules\": true,\n    \"jsx\": \"preserve\",\n    \"incremental\": true,\n    \"paths\": {\n      \"~/*\": [\"./src/*\"]\n    }\n  },\n  \"include\": [\"next-env.d.ts\", \"**/*.ts\", \"**/*.tsx\"],\n  \"exclude\": [\"node_modules\"]\n}\n"
  },
  {
    "path": "examples/qwen3/README.md",
    "content": "### OpenAI API Demos w/ Qwen3\n\nRun `npm install` first, followed by `npm start`.\n\nNote if you would like to hack WebLLM core package,\nyou can change web-llm dependencies as `\"file:../..\"`, and follow the build from source\ninstruction in the project to build webllm locally. This option is only recommended\nif you would like to hack WebLLM core package.\n"
  },
  {
    "path": "examples/qwen3/package.json",
    "content": "{\n  \"name\": \"qwen3_example\",\n  \"version\": \"0.1.0\",\n  \"private\": true,\n  \"scripts\": {\n    \"start\": \"parcel src/qwen3_example.html  --port 8883\",\n    \"build\": \"parcel build src/qwen3_example.html --dist-dir lib\"\n  },\n  \"devDependencies\": {\n    \"buffer\": \"^5.7.1\",\n    \"parcel\": \"^2.8.3\",\n    \"process\": \"^0.11.10\",\n    \"tslib\": \"^2.3.1\",\n    \"typescript\": \"^4.9.5\",\n    \"url\": \"^0.11.3\"\n  },\n  \"dependencies\": {\n    \"@mlc-ai/web-llm\": \"^0.2.82\"\n  }\n}\n"
  },
  {
    "path": "examples/qwen3/src/qwen3_example.html",
    "content": "<!doctype html>\n<html>\n  <script>\n    webLLMGlobal = {};\n  </script>\n\n  <body>\n    <h2>WebLLM Test Page</h2>\n    Open console to see output\n    <br />\n    <br />\n    <label id=\"init-label\"> </label>\n    <h3>Response</h3>\n    <label id=\"generate-label\"> </label>\n    <script type=\"module\" src=\"./qwen3_example.ts\"></script>\n  </body>\n</html>\n"
  },
  {
    "path": "examples/qwen3/src/qwen3_example.ts",
    "content": "import * as webllm from \"@mlc-ai/web-llm\";\n\nfunction setLabel(id: string, text: string) {\n  const label = document.getElementById(id);\n  if (label == null) {\n    throw Error(\"Cannot find label \" + id);\n  }\n  label.innerText = text;\n}\n\n// Helper method to stream responses from the engine\nasync function streamResponse(\n  engine: webllm.MLCEngineInterface,\n  request: webllm.ChatCompletionRequestStreaming,\n): Promise<void> {\n  console.log(\"Requesting chat completion with request:\", request);\n  const asyncChunkGenerator = await engine.chat.completions.create(request);\n  let message = \"\";\n  for await (const chunk of asyncChunkGenerator) {\n    message += chunk.choices[0]?.delta?.content || \"\";\n    setLabel(\"generate-label\", message);\n    if (chunk.usage) {\n      console.log(chunk.usage); // only last chunk has usage\n    }\n    // engine.interruptGenerate();  // works with interrupt as well\n  }\n  console.log(\"Final message:\\n\", await engine.getMessage()); // the concatenated message\n}\n\n/**\n * We demonstrate how Qwen3's best practices can be followed in WebLLM. For more, see\n * https://huggingface.co/Qwen/Qwen3-8B#best-practices.\n */\nasync function main() {\n  const initProgressCallback = (report: webllm.InitProgressReport) => {\n    setLabel(\"init-label\", report.text);\n  };\n  const selectedModel = \"Qwen3-4B-q4f16_1-MLC\";\n  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(\n    selectedModel,\n    { initProgressCallback: initProgressCallback },\n  );\n\n  /**\n   * 1. 
Default behavior: enable thinking\n   */\n  let request: webllm.ChatCompletionRequest = {\n    stream: true,\n    stream_options: { include_usage: true },\n    messages: [\n      {\n        role: \"user\",\n        content: \"How many r's are there in the word strawberry?\",\n      },\n    ],\n    // Specifying `enable_thinking` is optional, as it defaults to think.\n    // extra_body: {\n    //   enable_thinking: true,\n    // }\n  };\n  await streamResponse(engine, request);\n\n  /**\n   * 2. Disable thinking with `enable_thinking: false`.\n   */\n  request = {\n    stream: true,\n    stream_options: { include_usage: true },\n    messages: [\n      {\n        role: \"user\",\n        content: \"How many r's are there in the word strawberry?\",\n      },\n    ],\n    extra_body: {\n      enable_thinking: false,\n    },\n  };\n  await streamResponse(engine, request);\n\n  /**\n   * 3. Disable thinking with soft switch /no_think\n   * or enable thinking with soft switch /think.\n   * Using soft switch: \"When enable_thinking=True, regardless of whether the user\n   * uses /think or /no_think, the model will always output a block wrapped in\n   * <think>...</think>. However, the content inside this block may be empty if\n   * thinking is disabled. When enable_thinking=False, the soft switches are not\n   * valid. Regardless of any /think or /no_think tags input by the user, the\n   * model will not generate think content and will not include a <think>...</think> block.\n   */\n  request = {\n    stream: true,\n    stream_options: { include_usage: true },\n    messages: [\n      {\n        role: \"user\",\n        content: \"How many r's are there in the word strawberry? /no_think\",\n        // content: \"How many r's are there in the word strawberry? /think\",\n      },\n    ],\n  };\n  await streamResponse(engine, request);\n\n  /**\n   * 4. 
For multi-turn messages, it is recommended to\n   * parse out the thinking content in the history\n   * messages as described in the Best Practices section.\n   */\n  const history: webllm.ChatCompletionMessageParam[] = [\n    {\n      role: \"user\",\n      content: \"How many r's are there in the word strawberry? /think\",\n    },\n    {\n      role: \"assistant\",\n      content:\n        \"<think>Dummy thinking content here...</think>\\n\\nThe answer is 3.\",\n    },\n  ];\n  // Preprocess history to remove thinking content\n  const preprocessedHistory = history.map((msg) => {\n    if (msg.role === \"assistant\") {\n      // Remove <think>...</think> block from assistant messages that is at the start\n      // and may contain two \\n\\n line breaks.\n      const thinkRegex = /<think>.*?<\\/think>\\n?\\n?/s; // Match <think>...</think> with optional \\n\\n\n      const contentWithoutThink = msg.content!.replace(thinkRegex, \"\").trim();\n      return { ...msg, content: contentWithoutThink };\n    }\n    return msg; // User messages remain unchanged\n  });\n  console.log(\"Preprocessed history:\", preprocessedHistory);\n\n  // Now use the preprocessed history in the request\n  const newMessage: webllm.ChatCompletionMessageParam = {\n    role: \"user\",\n    content: \"What about blueberries?\",\n  };\n\n  request = {\n    stream: true,\n    stream_options: { include_usage: true },\n    messages: [...preprocessedHistory, newMessage],\n  };\n  await streamResponse(engine, request);\n}\n\nmain();\n"
  },
  {
    "path": "examples/seed-to-reproduce/README.md",
    "content": "### OpenAI API Demos\n\nRun `npm install` first, followed by `npm start`.\n\nNote if you would like to hack WebLLM core package,\nyou can change web-llm dependencies as `\"file:../..\"`, and follow the build from source\ninstruction in the project to build webllm locally. This option is only recommended\nif you would like to hack WebLLM core package.\n"
  },
  {
    "path": "examples/seed-to-reproduce/package.json",
    "content": "{\n  \"name\": \"seed-to-reproduce\",\n  \"version\": \"0.1.0\",\n  \"private\": true,\n  \"scripts\": {\n    \"start\": \"parcel src/seed.html  --port 8888\",\n    \"build\": \"parcel build src/seed.html --dist-dir lib\"\n  },\n  \"devDependencies\": {\n    \"buffer\": \"^5.7.1\",\n    \"parcel\": \"^2.8.3\",\n    \"process\": \"^0.11.10\",\n    \"tslib\": \"^2.3.1\",\n    \"typescript\": \"^4.9.5\",\n    \"url\": \"^0.11.3\"\n  },\n  \"dependencies\": {\n    \"@mlc-ai/web-llm\": \"^0.2.82\"\n  }\n}\n"
  },
  {
    "path": "examples/seed-to-reproduce/src/seed.html",
    "content": "<!doctype html>\n<html>\n  <script>\n    webLLMGlobal = {};\n  </script>\n\n  <body>\n    <h2>WebLLM Test Page</h2>\n    Open console to see output. We make two generations with same seed, we\n    should expect them to be the same.\n    <br />\n    <br />\n    <label id=\"init-label\"> </label>\n\n    <script type=\"module\" src=\"./seed.ts\"></script>\n  </body>\n</html>\n"
  },
  {
    "path": "examples/seed-to-reproduce/src/seed.ts",
    "content": "import * as webllm from \"@mlc-ai/web-llm\";\n\nfunction setLabel(id: string, text: string) {\n  const label = document.getElementById(id);\n  if (label == null) {\n    throw Error(\"Cannot find label \" + id);\n  }\n  label.innerText = text;\n}\n\n/**\n * We domnstrate the effect of seeding. The prompt is about writing a poem and we use a high\n * `temperature`, making the sampling distribution supposedly more random. However, we demonstrate\n * that with seeding, we should see the exact same result being generated across two trials.\n * With `n > 1`, all choices should also be exactly the same.\n */\nasync function main() {\n  const initProgressCallback = (report: webllm.InitProgressReport) => {\n    setLabel(\"init-label\", report.text);\n  };\n  const selectedModel = \"Llama-3.1-8B-Instruct-q4f32_1-MLC\";\n  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(\n    selectedModel,\n    { initProgressCallback: initProgressCallback },\n  );\n\n  const request: webllm.ChatCompletionRequest = {\n    stream: false, // works with streaming as well\n    messages: [\n      { role: \"user\", content: \"Write a creative Haiku about Pittsburgh\" },\n    ],\n    n: 3,\n    temperature: 1.2, // high temperature gives much more random results\n    max_tokens: 128, // To save time; enough to demonstrate the effect\n    seed: 42,\n  };\n\n  const reply0 = await engine.chat.completions.create(request);\n  console.log(reply0);\n  console.log(\"First reply's last choice:\\n\" + (await engine.getMessage()));\n  console.log(reply0.usage);\n\n  const reply1 = await engine.chat.completions.create(request);\n  console.log(reply1);\n  console.log(\"Second reply's last choice:\\n\" + (await engine.getMessage()));\n\n  // Rigorously check the generation results of each choice for the two requests\n  for (const choice0 of reply0.choices) {\n    const id = choice0.index;\n    const choice1 = reply1.choices[id];\n    if (choice0.message.content !== 
choice1.message.content) {\n      throw Error(\n        \"Choice \" +\n          id +\n          \" of the two generations are different despite seeding\",\n      );\n    }\n  }\n\n  console.log(reply1.usage);\n}\n\n// Run one of the functions\nmain();\n"
  },
  {
    "path": "examples/service-worker/README.md",
    "content": "# WebLLM Service Worker Example\n\nThis example shows how we can create a page with Web-LLM running in service worker.\n\n```bash\nnpm install\nnpm run build\n```\n"
  },
  {
    "path": "examples/service-worker/package.json",
    "content": "{\n  \"name\": \"web-llm-service-worker\",\n  \"version\": \"0.1.0\",\n  \"private\": true,\n  \"scripts\": {\n    \"start\": \"rm -rf .parcel-cache && parcel src/index.html --port 3000\",\n    \"build\": \"rm -rf .parcel-cache && parcel build src/index.html --dist-dir lib\"\n  },\n  \"devDependencies\": {\n    \"buffer\": \"^6.0.3\",\n    \"parcel\": \"^2.8.3\",\n    \"process\": \"^0.11.10\",\n    \"tslib\": \"^2.3.1\",\n    \"typescript\": \"^4.9.5\",\n    \"url\": \"^0.11.3\"\n  },\n  \"dependencies\": {\n    \"@mlc-ai/web-llm\": \"^0.2.82\"\n  }\n}\n"
  },
  {
    "path": "examples/service-worker/src/index.html",
    "content": "<!doctype html>\n<html>\n  <script>\n    webLLMGlobal = {};\n  </script>\n  <body>\n    <h2>WebLLM Test Page</h2>\n    Open console to see output\n    <br />\n    <br />\n    <label id=\"init-label\"> </label>\n\n    <h3>Prompt</h3>\n    <label id=\"prompt-label\"> </label>\n\n    <h3>Response</h3>\n    <label id=\"generate-label\"> </label>\n    <br />\n    <label id=\"stats-label\"> </label>\n\n    <script type=\"module\" src=\"./main.ts\"></script>\n  </body>\n</html>\n"
  },
  {
    "path": "examples/service-worker/src/main.ts",
    "content": "import * as webllm from \"@mlc-ai/web-llm\";\n\nconst registerServiceWorker = async () => {\n  if (\"serviceWorker\" in navigator) {\n    try {\n      const registration = await navigator.serviceWorker.register(\n        new URL(\"sw.ts\", import.meta.url),\n        { type: \"module\" },\n      );\n      if (registration.installing) {\n        console.log(\"Service worker installing\");\n      } else if (registration.waiting) {\n        console.log(\"Service worker installed\");\n      } else if (registration.active) {\n        console.log(\"Service worker active\");\n      }\n    } catch (error) {\n      console.error(`Registration failed with ${error}`);\n    }\n  }\n};\n\nfunction setLabel(id: string, text: string) {\n  const label = document.getElementById(id);\n  if (label == null) {\n    throw Error(\"Cannot find label \" + id);\n  }\n  label.innerText = text;\n}\n\n// There are two demonstrations, pick one to run\n\n/**\n * Chat completion (OpenAI style) without streaming, where we get the entire response at once.\n */\nasync function mainNonStreaming() {\n  const initProgressCallback = (report: webllm.InitProgressReport) => {\n    setLabel(\"init-label\", report.text);\n  };\n  const selectedModel = \"Llama-3.1-8B-Instruct-q4f32_1-MLC\";\n\n  const engine: webllm.MLCEngineInterface =\n    await webllm.CreateServiceWorkerMLCEngine(selectedModel, {\n      initProgressCallback: initProgressCallback,\n    });\n\n  const request: webllm.ChatCompletionRequest = {\n    messages: [\n      {\n        role: \"system\",\n        content:\n          \"You are a helpful, respectful and honest assistant. \" +\n          \"Be as happy as you can when speaking please. 
\",\n      },\n      { role: \"user\", content: \"Provide me three US states.\" },\n      { role: \"assistant\", content: \"California, New York, Pennsylvania.\" },\n      { role: \"user\", content: \"Two more please!\" },\n    ],\n    n: 3,\n    temperature: 1.5,\n    max_tokens: 256,\n  };\n\n  const reply0 = await engine.chat.completions.create(request);\n  console.log(reply0);\n  setLabel(\"generate-label\", reply0.choices[0].message.content || \"\");\n\n  console.log(reply0.usage);\n}\n\n/**\n * Chat completion (OpenAI style) with streaming, where delta is sent while generating response.\n */\nasync function mainStreaming() {\n  const initProgressCallback = (report: webllm.InitProgressReport) => {\n    setLabel(\"init-label\", report.text);\n  };\n  const selectedModel = \"Llama-3.1-8B-Instruct-q4f32_1-MLC\";\n\n  const engine: webllm.ServiceWorkerMLCEngine =\n    await webllm.CreateServiceWorkerMLCEngine(selectedModel, {\n      initProgressCallback: initProgressCallback,\n    });\n\n  const request: webllm.ChatCompletionRequest = {\n    stream: true,\n    stream_options: { include_usage: true },\n    messages: [\n      {\n        role: \"system\",\n        content:\n          \"You are a helpful, respectful and honest assistant. \" +\n          \"Be as happy as you can when speaking please. 
\",\n      },\n      { role: \"user\", content: \"Provide me three US states.\" },\n      { role: \"assistant\", content: \"California, New York, Pennsylvania.\" },\n      { role: \"user\", content: \"Two more please!\" },\n    ],\n    temperature: 1.5,\n    max_tokens: 256,\n  };\n\n  const asyncChunkGenerator = await engine.chat.completions.create(request);\n  let message = \"\";\n  for await (const chunk of asyncChunkGenerator) {\n    console.log(chunk);\n    message += chunk.choices[0]?.delta?.content || \"\";\n    setLabel(\"generate-label\", message);\n    if (chunk.usage) {\n      console.log(chunk.usage); // only last chunk has usage\n    }\n    // engine.interruptGenerate();  // works with interrupt as well\n  }\n  console.log(\"Final message:\\n\", await engine.getMessage()); // the concatenated message\n}\n\nregisterServiceWorker();\n// Run one of the function below\n// mainNonStreaming();\nmainStreaming();\n"
  },
  {
    "path": "examples/service-worker/src/sw.ts",
    "content": "import { ServiceWorkerMLCEngineHandler } from \"@mlc-ai/web-llm\";\n\nlet handler: ServiceWorkerMLCEngineHandler;\n\nself.addEventListener(\"activate\", function (event) {\n  handler = new ServiceWorkerMLCEngineHandler();\n  console.log(\"Web-LLM Service Worker Activated\");\n});\n"
  },
  {
    "path": "examples/simple-chat-js/index.css",
    "content": "body,\nhtml {\n  font-family: Arial, sans-serif;\n  padding: 10px 20px;\n}\n\n.download-container {\n  display: flex;\n  justify-content: space-between;\n  margin-bottom: 20px;\n}\n\n#download-status {\n  border: solid 1px black;\n  box-shadow:\n    0 10px 15px -3px rgba(0, 0, 0, 0.1),\n    0 4px 6px -2px rgba(0, 0, 0, 0.05);\n  padding: 10px;\n}\n\n.chat-container {\n  height: 400px;\n  width: 100%;\n  border: 2px solid black;\n  display: flex;\n  flex-direction: column;\n}\n\n.chat-box {\n  overflow-y: scroll;\n  background-color: #c3c3c3;\n  border: 1px solid #ccc;\n  padding: 5px;\n  flex: 1 1;\n}\n\n.chat-stats {\n  background-color: #d3eceb;\n  flex: 0 0;\n  padding: 10px;\n  font-size: 0.75rem;\n}\n\n.message-container {\n  width: 100%;\n  display: flex;\n}\n\n.message {\n  padding: 10px;\n  margin: 10px 0;\n  border-radius: 10px;\n  width: fit-content;\n}\n\n.message-container.user {\n  justify-content: end;\n}\n\n.message-container.assistant {\n  justify-content: start;\n}\n\n.message-container.user .message {\n  background: #007bff;\n  color: #fff;\n}\n\n.message-container.assistant .message {\n  background: #f1f0f0;\n  color: #333;\n}\n\n.chat-input-container {\n  min-height: 40px;\n  flex: 0 0;\n  display: flex;\n}\n\n#user-input {\n  width: 70%;\n  padding: 10px;\n  border: 1px solid #ccc;\n}\n\nbutton {\n  width: 25%;\n  padding: 10px;\n  border: none;\n  background-color: #007bff;\n  color: white;\n  cursor: pointer;\n}\n\nbutton:disabled {\n  background-color: lightgray;\n  cursor: not-allowed;\n}\n\nbutton:hover:not(:disabled) {\n  background-color: #0056b3;\n}\n\n.hidden {\n  display: none;\n}\n"
  },
  {
    "path": "examples/simple-chat-js/index.html",
    "content": "<!doctype html>\n<html>\n  <head>\n    <title>Simple Chatbot</title>\n    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\" />\n    <meta charset=\"UTF-8\" />\n    <link rel=\"stylesheet\" href=\"./index.css\" />\n  </head>\n\n  <body>\n    <p>Step 1: Initialize WebLLM and Download Model</p>\n    <div class=\"download-container\">\n      <select id=\"model-selection\"></select>\n      <button id=\"download\">Download</button>\n    </div>\n    <p id=\"download-status\" class=\"hidden\"></p>\n\n    <p>Step 2: Chat</p>\n    <div class=\"chat-container\">\n      <div id=\"chat-box\" class=\"chat-box\"></div>\n      <div id=\"chat-stats\" class=\"chat-stats hidden\"></div>\n      <div class=\"chat-input-container\">\n        <input type=\"text\" id=\"user-input\" placeholder=\"Type a message...\" />\n        <button id=\"send\" disabled>Send</button>\n      </div>\n    </div>\n\n    <script src=\"./index.js\" type=\"module\"></script>\n  </body>\n</html>\n"
  },
  {
    "path": "examples/simple-chat-js/index.js",
    "content": "import * as webllm from \"https://esm.run/@mlc-ai/web-llm\";\n\n/*************** WebLLM logic ***************/\nconst messages = [\n  {\n    content: \"You are a helpful AI agent helping users.\",\n    role: \"system\",\n  },\n];\n\nconst availableModels = webllm.prebuiltAppConfig.model_list.map(\n  (m) => m.model_id,\n);\nlet selectedModel = \"Llama-3.1-8B-Instruct-q4f32_1-1k\";\n\n// Callback function for initializing progress\nfunction updateEngineInitProgressCallback(report) {\n  console.log(\"initialize\", report.progress);\n  document.getElementById(\"download-status\").textContent = report.text;\n}\n\n// Create engine instance\nconst engine = new webllm.MLCEngine();\nengine.setInitProgressCallback(updateEngineInitProgressCallback);\n\nasync function initializeWebLLMEngine() {\n  document.getElementById(\"download-status\").classList.remove(\"hidden\");\n  selectedModel = document.getElementById(\"model-selection\").value;\n  const config = {\n    temperature: 1.0,\n    top_p: 1,\n  };\n  await engine.reload(selectedModel, config);\n}\n\nasync function streamingGenerating(messages, onUpdate, onFinish, onError) {\n  try {\n    let curMessage = \"\";\n    let usage;\n    const completion = await engine.chat.completions.create({\n      stream: true,\n      messages,\n      stream_options: { include_usage: true },\n    });\n    for await (const chunk of completion) {\n      const curDelta = chunk.choices[0]?.delta.content;\n      if (curDelta) {\n        curMessage += curDelta;\n      }\n      if (chunk.usage) {\n        usage = chunk.usage;\n      }\n      onUpdate(curMessage);\n    }\n    const finalMessage = await engine.getMessage();\n    onFinish(finalMessage, usage);\n  } catch (err) {\n    onError(err);\n  }\n}\n\n/*************** UI logic ***************/\nfunction onMessageSend() {\n  const input = document.getElementById(\"user-input\").value.trim();\n  const message = {\n    content: input,\n    role: \"user\",\n  };\n  if (input.length 
=== 0) {\n    return;\n  }\n  document.getElementById(\"send\").disabled = true;\n\n  messages.push(message);\n  appendMessage(message);\n\n  document.getElementById(\"user-input\").value = \"\";\n  document\n    .getElementById(\"user-input\")\n    .setAttribute(\"placeholder\", \"Generating...\");\n\n  const aiMessage = {\n    content: \"typing...\",\n    role: \"assistant\",\n  };\n  appendMessage(aiMessage);\n\n  const onFinishGenerating = (finalMessage, usage) => {\n    updateLastMessage(finalMessage);\n    document.getElementById(\"send\").disabled = false;\n    const usageText =\n      `prompt_tokens: ${usage.prompt_tokens}, ` +\n      `completion_tokens: ${usage.completion_tokens}, ` +\n      `prefill: ${usage.extra.prefill_tokens_per_s.toFixed(4)} tokens/sec, ` +\n      `decoding: ${usage.extra.decode_tokens_per_s.toFixed(4)} tokens/sec`;\n    document.getElementById(\"chat-stats\").classList.remove(\"hidden\");\n    document.getElementById(\"chat-stats\").textContent = usageText;\n  };\n\n  streamingGenerating(\n    messages,\n    updateLastMessage,\n    onFinishGenerating,\n    console.error,\n  );\n}\n\nfunction appendMessage(message) {\n  const chatBox = document.getElementById(\"chat-box\");\n  const container = document.createElement(\"div\");\n  container.classList.add(\"message-container\");\n  const newMessage = document.createElement(\"div\");\n  newMessage.classList.add(\"message\");\n  newMessage.textContent = message.content;\n\n  if (message.role === \"user\") {\n    container.classList.add(\"user\");\n  } else {\n    container.classList.add(\"assistant\");\n  }\n\n  container.appendChild(newMessage);\n  chatBox.appendChild(container);\n  chatBox.scrollTop = chatBox.scrollHeight; // Scroll to the latest message\n}\n\nfunction updateLastMessage(content) {\n  const messageDoms = document\n    .getElementById(\"chat-box\")\n    .querySelectorAll(\".message\");\n  const lastMessageDom = messageDoms[messageDoms.length - 1];\n  
lastMessageDom.textContent = content;\n}\n\n/*************** UI binding ***************/\navailableModels.forEach((modelId) => {\n  const option = document.createElement(\"option\");\n  option.value = modelId;\n  option.textContent = modelId;\n  document.getElementById(\"model-selection\").appendChild(option);\n});\ndocument.getElementById(\"model-selection\").value = selectedModel;\ndocument.getElementById(\"download\").addEventListener(\"click\", function () {\n  initializeWebLLMEngine().then(() => {\n    document.getElementById(\"send\").disabled = false;\n  });\n});\ndocument.getElementById(\"send\").addEventListener(\"click\", function () {\n  onMessageSend();\n});\n"
  },
  {
    "path": "examples/simple-chat-ts/.gitignore",
    "content": "src/app-config.js\n"
  },
  {
    "path": "examples/simple-chat-ts/README.md",
    "content": "# SimpleChat\n\nThis folder provides a complete implementation of a simple\nchat app based on WebLLM. To try it out, you can do the following steps\nunder this folder\n\n```bash\nnpm install\nnpm start\n```\n\nNote if you would like to hack WebLLM core package.\nYou can change web-llm dependencies as `\"file:../..\"`, and follow the build from source\ninstruction in the project to build webllm locally. This option is only recommended\nif you would like to hack WebLLM core package.\n\nDue to the differences in command-line tools between Unix/Linux and Windows systems, special adaptation is necessary for Windows. Unix/Linux systems natively support commands like `cp` for file operations, which are not directly available in Windows. To ensure cross-platform compatibility, we use a Node.js script for file copying in Windows.\n\n### Steps for Windows Users\n\n1. **Create a Node.js Script File**:\n   - In the `examples\\simple-chat` directory, create a file named `copy-config.js`.\n   - Add the following code to handle file copying:\n     ```javascript\n     const fs = require(\"fs\");\n     // Copy file\n     fs.copyFileSync(\"src/gh-config.js\", \"src/app-config.js\");\n     ```\n\n2. **Modify `package.json`**:\n   - In the `scripts` section of your `package.json`, replace Unix-style `cp` commands with our new Node.js script. For example:\n     ```json\n     \"scripts\": {\n         \"start\": \"node copy-config.js && parcel src/llm_chat.html --port 8888\",\n         \"mlc-local\": \"node copy-config.js && parcel src/llm_chat.html --port 8888\",\n         \"build\": \"node copy-config.js && parcel build src/llm_chat.html --dist-dir lib --no-content-hash\"\n     },\n     ```\n\n3. **Run the Application**:\n   - Save your changes and run `npm start` in CMD or PowerShell to start the application.\n"
  },
  {
    "path": "examples/simple-chat-ts/package.json",
    "content": "{\n  \"name\": \"simple-chat\",\n  \"version\": \"0.1.0\",\n  \"private\": true,\n  \"scripts\": {\n    \"start\": \"cp src/gh-config.js src/app-config.js && parcel src/llm_chat.html  --port 8883\",\n    \"build\": \"cp src/gh-config.js src/app-config.js && parcel build src/llm_chat.html --dist-dir lib --no-content-hash\"\n  },\n  \"devDependencies\": {\n    \"buffer\": \"^5.7.1\",\n    \"parcel\": \"^2.8.3\",\n    \"process\": \"^0.11.10\",\n    \"tslib\": \"^2.3.1\",\n    \"typescript\": \"^4.9.5\",\n    \"url\": \"^0.11.3\"\n  },\n  \"dependencies\": {\n    \"@mlc-ai/web-llm\": \"^0.2.82\"\n  }\n}\n"
  },
  {
    "path": "examples/simple-chat-ts/src/gh-config.js",
    "content": "import { prebuiltAppConfig } from \"@mlc-ai/web-llm\";\n\nexport default {\n  model_list: prebuiltAppConfig.model_list,\n  use_web_worker: true,\n};\n"
  },
  {
    "path": "examples/simple-chat-ts/src/llm_chat.css",
    "content": ".chatui {\n  display: flex;\n  position: relative;\n  flex-flow: column wrap;\n  justify-content: space-between;\n  width: 100%;\n  max-width: 867px;\n  margin: 25px 10px;\n  height: 600px;\n  border: 2px solid #ddd;\n  border-radius: 5px;\n  background-color: #1f2027;\n}\n\n.chatui-select-wrapper {\n  display: flex;\n  justify-content: center;\n  background-color: #1f2027;\n  padding: 10px 0;\n}\n\n#chatui-select {\n  width: 350px;\n  background-color: #1f2027;\n  color: white;\n  border: none;\n}\n\n#chatui-select:focus {\n  outline: none;\n}\n\n#chatui-select::-webkit-scrollbar {\n  display: none;\n}\n\n#chatui-select option {\n  background-color: #1f2027;\n  color: white;\n}\n\n#chatui-select option:hover {\n  background-color: #474747;\n  color: white;\n}\n\ns .chatui-header {\n  display: flex;\n  justify-content: space-between;\n  padding: 10px;\n  border-bottom: 2px solid #ddd;\n  background: #eee;\n  color: #666;\n}\n\n/* Used to remove tiny white lines in android devices; not sure if there is a better way */\n*,\n*::before,\n*::after {\n  box-sizing: content-box;\n}\n\n.chatui-chat {\n  flex: 1;\n  overflow-y: auto;\n  padding: 10px;\n  background-color: #1f2027;\n}\n\n.chatui-chat::-webkit-scrollbar {\n  width: 6px;\n}\n\n.chatui-chat::-webkit-scrollbar-track {\n  background: #1f2027;\n}\n\n.chatui-chat::-webkit-scrollbar-thumb {\n  background: #888;\n}\n\n.chatui-chat::-webkit-scrollbar-thumb:hover {\n  background: #555;\n}\n\n.msg {\n  display: flex;\n  align-items: flex-end;\n  margin-bottom: 10px;\n}\n\n.msg:last-of-type {\n  margin: 0;\n}\n\n.msg-bubble {\n  background-color: #f0f0f0;\n  border-radius: 8px;\n  padding: 16px;\n  margin: 5px auto;\n  width: calc(100% - 20px);\n  box-sizing: border-box;\n  color: black;\n  border: none;\n  font-size: medium;\n  margin-left: auto;\n  margin-right: auto;\n}\n\n.left-msg .msg-bubble {\n  background-color: #343541;\n  color: #ececec;\n}\n\n.error-msg .msg-bubble {\n  background-color: 
#343541;\n  color: #f15959;\n}\n\n.init-msg .msg-bubble {\n  background-color: #343541;\n  color: #ececec;\n}\n\n.right-msg .msg-bubble {\n  background-color: #444654;\n  color: #ececec;\n}\n\n.chatui-inputarea {\n  display: flex;\n  padding: 10px;\n  border-top: 2px solid transparent;\n  background-color: #1f2027;\n}\n\n.chatui-inputarea * {\n  padding: 10px;\n  border: none;\n  border-radius: 3px;\n  font-size: 1em;\n  color: white;\n  background: rgba(0, 0, 0, 0.3);\n}\n\n.chatui-input {\n  flex: 1;\n  background-color: #40414f;\n  color: white;\n}\n\n.chatui-reset-btn {\n  margin-left: 10px;\n  background-color: #40414f;\n  color: #fff;\n  font-weight: bold;\n  cursor: pointer;\n  background-image: url(\"img/reset.png\");\n  background-repeat: no-repeat;\n  background-position: center;\n  width: 40px;\n  background-repeat: no-repeat;\n  background-position: center;\n  background-size: 20px 20px;\n}\n\n.chatui-reset-btn:hover {\n  background-color: #03a33e;\n}\n\n.chatui-send-btn {\n  margin-left: 10px;\n  background-color: #40414f;\n  color: #fff;\n  font-weight: bold;\n  cursor: pointer;\n  background-image: url(\"img/plane.png\");\n  background-repeat: no-repeat;\n  background-position: center;\n  width: 40px;\n  background-repeat: no-repeat;\n  background-position: center;\n  background-size: 20px 20px;\n}\n\n.chatui-send-btn:hover {\n  background-color: #03a33e;\n}\n"
  },
  {
    "path": "examples/simple-chat-ts/src/llm_chat.html",
    "content": "<link href=\"./llm_chat.css\" rel=\"stylesheet\" type=\"text/css\" />\n\n<div class=\"chatui\">\n  <div class=\"chatui-select-wrapper\">\n    <select id=\"chatui-select\"></select>\n  </div>\n  <div class=\"chatui-chat\" id=\"chatui-chat\" height=\"100\"></div>\n\n  <div class=\"chatui-inputarea\">\n    <input\n      id=\"chatui-input\"\n      type=\"text\"\n      class=\"chatui-input\"\n      placeholder=\"Enter your message...\"\n    />\n    <button id=\"chatui-send-btn\" class=\"chatui-send-btn\"></button>\n    <button id=\"chatui-reset-btn\" class=\"chatui-reset-btn\"></button>\n  </div>\n</div>\n\n<div class=\"chatui-extra-control\">\n  <label id=\"chatui-info-label\"></label>\n</div>\n<!--- Place script after ui to make sure ui loads first -->\n<script type=\"module\" src=\"./simple_chat.ts\"></script>\n"
  },
  {
    "path": "examples/simple-chat-ts/src/simple_chat.ts",
    "content": "import appConfig from \"./app-config\";\nimport * as webllm from \"@mlc-ai/web-llm\";\n\nfunction getElementAndCheck(id: string): HTMLElement {\n  const element = document.getElementById(id);\n  if (element == null) {\n    throw Error(\"Cannot find element \" + id);\n  }\n  return element;\n}\n\nclass ChatUI {\n  private uiChat: HTMLElement;\n  private uiChatInput: HTMLInputElement;\n  private uiChatInfoLabel: HTMLLabelElement;\n  private engine: webllm.MLCEngineInterface | webllm.WebWorkerMLCEngine;\n  private config: webllm.AppConfig = appConfig;\n  private selectedModel: string;\n  private chatLoaded = false;\n  private requestInProgress = false;\n  private chatHistory: webllm.ChatCompletionMessageParam[] = [];\n  // We use a request chain to ensure that\n  // all requests send to chat are sequentialized\n  private chatRequestChain: Promise<void> = Promise.resolve();\n\n  /**\n   * An asynchronous factory constructor since we need to await getMaxStorageBufferBindingSize();\n   * this is not allowed in a constructor (which cannot be asynchronous).\n   */\n  public static CreateAsync = async (engine: webllm.MLCEngineInterface) => {\n    const chatUI = new ChatUI();\n    chatUI.engine = engine;\n    // get the elements\n    chatUI.uiChat = getElementAndCheck(\"chatui-chat\");\n    chatUI.uiChatInput = getElementAndCheck(\"chatui-input\") as HTMLInputElement;\n    chatUI.uiChatInfoLabel = getElementAndCheck(\n      \"chatui-info-label\",\n    ) as HTMLLabelElement;\n    // register event handlers\n    getElementAndCheck(\"chatui-reset-btn\").onclick = () => {\n      chatUI.onReset();\n    };\n    getElementAndCheck(\"chatui-send-btn\").onclick = () => {\n      chatUI.onGenerate();\n    };\n    // TODO: find other alternative triggers\n    getElementAndCheck(\"chatui-input\").onkeypress = (event) => {\n      if (event.keyCode === 13) {\n        chatUI.onGenerate();\n      }\n    };\n\n    // When we detect low maxStorageBufferBindingSize, we assume 
that the device (e.g. an Android\n    // phone) can only handle small models and make all other models unselectable. Otherwise, the\n    // browser may crash. See https://github.com/mlc-ai/web-llm/issues/209.\n    // Also use GPU vendor to decide whether it is a mobile device (hence with limited resources).\n    const androidMaxStorageBufferBindingSize = 1 << 27; // 128MB\n    const mobileVendors = new Set<string>([\"qualcomm\", \"arm\"]);\n    let restrictModels = false;\n    let maxStorageBufferBindingSize: number;\n    let gpuVendor: string;\n    try {\n      [maxStorageBufferBindingSize, gpuVendor] = await Promise.all([\n        engine.getMaxStorageBufferBindingSize(),\n        engine.getGPUVendor(),\n      ]);\n    } catch (err) {\n      chatUI.appendMessage(\"error\", \"Init error, \" + err.toString());\n      console.log(err.stack);\n      return;\n    }\n    if (\n      (gpuVendor.length != 0 && mobileVendors.has(gpuVendor)) ||\n      maxStorageBufferBindingSize <= androidMaxStorageBufferBindingSize\n    ) {\n      chatUI.appendMessage(\n        \"init\",\n        \"Your device seems to have \" +\n          \"limited resources, so we restrict the selectable models.\",\n      );\n      restrictModels = true;\n    }\n\n    // Populate modelSelector\n    const modelSelector = getElementAndCheck(\n      \"chatui-select\",\n    ) as HTMLSelectElement;\n    for (let i = 0; i < chatUI.config.model_list.length; ++i) {\n      const item = chatUI.config.model_list[i];\n      const opt = document.createElement(\"option\");\n      opt.value = item.model_id;\n      opt.innerHTML = item.model_id;\n      opt.selected = i == 0;\n      if (\n        (restrictModels &&\n          (item.low_resource_required === undefined ||\n            !item.low_resource_required)) ||\n        (item.buffer_size_required_bytes &&\n          maxStorageBufferBindingSize < item.buffer_size_required_bytes)\n      ) {\n        // Either on a low-resource device and not a low-resource model\n      
  // Or device's maxStorageBufferBindingSize does not satisfy the model's need (if specified)\n        const params = new URLSearchParams(location.search);\n        opt.disabled = !params.has(\"bypassRestrictions\");\n        opt.selected = false;\n      }\n      if (\n        !modelSelector.lastChild?.textContent?.startsWith(\n          opt.value.split(\"-\")[0],\n        )\n      ) {\n        modelSelector.appendChild(document.createElement(\"hr\"));\n      }\n      modelSelector.appendChild(opt);\n    }\n    modelSelector.appendChild(document.createElement(\"hr\"));\n\n    chatUI.selectedModel = modelSelector.value;\n    modelSelector.onchange = () => {\n      chatUI.onSelectChange(modelSelector);\n    };\n\n    return chatUI;\n  };\n\n  /**\n   * Push a task to the execution queue.\n   *\n   * @param task The task to be executed;\n   */\n  private pushTask(task: () => Promise<void>) {\n    const lastEvent = this.chatRequestChain;\n    this.chatRequestChain = lastEvent.then(task);\n  }\n  // Event handlers\n  // all event handler pushes the tasks to a queue\n  // that get executed sequentially\n  // the tasks previous tasks, which causes them to early stop\n  // can be interrupted by engine.interruptGenerate\n  private async onGenerate() {\n    if (this.requestInProgress) {\n      return;\n    }\n    this.pushTask(async () => {\n      await this.asyncGenerate();\n    });\n  }\n\n  private async onSelectChange(modelSelector: HTMLSelectElement) {\n    if (this.requestInProgress) {\n      // interrupt previous generation if any\n      this.engine.interruptGenerate();\n    }\n    // try reset after previous requests finishes\n    this.pushTask(async () => {\n      await this.engine.resetChat();\n      this.resetChatHistory();\n      await this.unloadChat();\n      this.selectedModel = modelSelector.value;\n      await this.asyncInitChat();\n    });\n  }\n\n  private async onReset() {\n    if (this.requestInProgress) {\n      // interrupt previous generation if any\n 
     this.engine.interruptGenerate();\n    }\n    // try reset after previous requests finishes\n    this.pushTask(async () => {\n      await this.engine.resetChat();\n      this.resetChatHistory();\n    });\n  }\n\n  // Internal helper functions\n  private appendMessage(kind, text) {\n    if (kind == \"init\") {\n      text = \"[System Initalize] \" + text;\n    }\n    if (this.uiChat === undefined) {\n      throw Error(\"cannot find ui chat\");\n    }\n    const msg = `\n      <div class=\"msg ${kind}-msg\">\n        <div class=\"msg-bubble\">\n          <div class=\"msg-text\">${text}</div>\n        </div>\n      </div>\n    `;\n    this.uiChat.insertAdjacentHTML(\"beforeend\", msg);\n    this.uiChat.scrollTo(0, this.uiChat.scrollHeight);\n  }\n\n  // Special care for user input such that we treat it as pure text instead of html\n  private appendUserMessage(text: string) {\n    if (this.uiChat === undefined) {\n      throw Error(\"cannot find ui chat\");\n    }\n    const msg = `\n      <div class=\"msg right-msg\">\n        <div class=\"msg-bubble\">\n          <div class=\"msg-text\"></div>\n        </div>\n      </div>\n    `;\n    this.uiChat.insertAdjacentHTML(\"beforeend\", msg);\n    // Recurse three times to get `msg-text`\n    const msgElement = this.uiChat.lastElementChild?.lastElementChild\n      ?.lastElementChild as HTMLElement;\n    msgElement.insertAdjacentText(\"beforeend\", text);\n    this.uiChat.scrollTo(0, this.uiChat.scrollHeight);\n  }\n\n  private updateLastMessage(kind, text) {\n    if (kind == \"init\") {\n      text = \"[System Initialize] \" + text;\n    }\n    if (this.uiChat === undefined) {\n      throw Error(\"cannot find ui chat\");\n    }\n    const matches = this.uiChat.getElementsByClassName(`msg ${kind}-msg`);\n    if (matches.length == 0) throw Error(`${kind} message do not exist`);\n    const msg = matches[matches.length - 1];\n    const msgText = msg.getElementsByClassName(\"msg-text\");\n    if (msgText.length != 1) throw 
Error(\"Expect msg-text\");\n    if (msgText[0].innerHTML == text) return;\n    const list = text.split(\"\\n\").map((t) => {\n      const item = document.createElement(\"div\");\n      item.textContent = t;\n      return item;\n    });\n    msgText[0].innerHTML = \"\";\n    list.forEach((item) => msgText[0].append(item));\n    this.uiChat.scrollTo(0, this.uiChat.scrollHeight);\n  }\n\n  private resetChatHistory() {\n    this.chatHistory = [];\n    const clearTags = [\"left\", \"right\", \"init\", \"error\"];\n    for (const tag of clearTags) {\n      // need to unpack to list so the iterator don't get affected by mutation\n      const matches = [...this.uiChat.getElementsByClassName(`msg ${tag}-msg`)];\n      for (const item of matches) {\n        this.uiChat.removeChild(item);\n      }\n    }\n    if (this.uiChatInfoLabel !== undefined) {\n      this.uiChatInfoLabel.innerHTML = \"\";\n    }\n  }\n\n  private async asyncInitChat() {\n    if (this.chatLoaded) return;\n    this.requestInProgress = true;\n    this.appendMessage(\"init\", \"\");\n    const initProgressCallback = (report) => {\n      this.updateLastMessage(\"init\", report.text);\n    };\n    this.engine.setInitProgressCallback(initProgressCallback);\n\n    try {\n      await this.engine.reload(this.selectedModel);\n    } catch (err) {\n      this.appendMessage(\"error\", \"Init error, \" + err.toString());\n      console.log(err.stack);\n      this.unloadChat();\n      this.requestInProgress = false;\n      return;\n    }\n    this.requestInProgress = false;\n    this.chatLoaded = true;\n  }\n\n  private async unloadChat() {\n    await this.engine.unload();\n    this.chatLoaded = false;\n  }\n\n  /**\n   * Run generate\n   */\n  private async asyncGenerate() {\n    await this.asyncInitChat();\n    this.requestInProgress = true;\n    const prompt = this.uiChatInput.value;\n    if (prompt == \"\") {\n      this.requestInProgress = false;\n      return;\n    }\n\n    this.appendUserMessage(prompt);\n    
this.uiChatInput.value = \"\";\n    this.uiChatInput.setAttribute(\"placeholder\", \"Generating...\");\n\n    this.appendMessage(\"left\", \"\");\n    this.chatHistory.push({ role: \"user\", content: prompt });\n\n    try {\n      let curMessage = \"\";\n      let usage: webllm.CompletionUsage | undefined = undefined;\n      const completion = await this.engine.chat.completions.create({\n        stream: true,\n        messages: this.chatHistory,\n        stream_options: { include_usage: true },\n        // if model starts with \"Qwen3\", disable thinking.\n        extra_body: this.selectedModel.startsWith(\"Qwen3\")\n          ? {\n              enable_thinking: false,\n            }\n          : undefined,\n      });\n      // TODO(Charlie): Processing of � requires changes\n      for await (const chunk of completion) {\n        const curDelta = chunk.choices[0]?.delta.content;\n        if (curDelta) {\n          curMessage += curDelta;\n        }\n        this.updateLastMessage(\"left\", curMessage);\n        if (chunk.usage) {\n          usage = chunk.usage;\n        }\n      }\n      if (usage) {\n        this.uiChatInfoLabel.innerHTML =\n          `prompt_tokens: ${usage.prompt_tokens}, ` +\n          `completion_tokens: ${usage.completion_tokens}, ` +\n          `prefill: ${usage.extra.prefill_tokens_per_s.toFixed(4)} tokens/sec, ` +\n          `decoding: ${usage.extra.decode_tokens_per_s.toFixed(4)} tokens/sec`;\n      }\n      const finalMessage = await this.engine.getMessage();\n      this.updateLastMessage(\"left\", finalMessage); // TODO: Remove this after � issue is fixed\n      this.chatHistory.push({ role: \"assistant\", content: finalMessage });\n    } catch (err) {\n      this.appendMessage(\"error\", \"Generate error, \" + err.toString());\n      console.log(err.stack);\n      await this.unloadChat();\n    }\n    this.uiChatInput.setAttribute(\"placeholder\", \"Enter your message...\");\n    this.requestInProgress = false;\n  }\n}\n\nconst 
useWebWorker = appConfig.use_web_worker;\nlet engine: webllm.MLCEngineInterface;\n\n// Here we do not use `CreateMLCEngine()` but instantiate an engine that is not loaded with model\nif (useWebWorker) {\n  engine = new webllm.WebWorkerMLCEngine(\n    new Worker(new URL(\"./worker.ts\", import.meta.url), { type: \"module\" }),\n    { appConfig, logLevel: \"INFO\" },\n  );\n} else {\n  engine = new webllm.MLCEngine({ appConfig });\n}\nChatUI.CreateAsync(engine);\n"
  },
  {
    "path": "examples/simple-chat-ts/src/worker.ts",
    "content": "// Serve the engine workload through web worker\nimport { WebWorkerMLCEngineHandler } from \"@mlc-ai/web-llm\";\n\nconst handler = new WebWorkerMLCEngineHandler();\nself.onmessage = (msg: MessageEvent) => {\n  handler.onmessage(msg);\n};\n"
  },
  {
    "path": "examples/simple-chat-upload/.gitignore",
    "content": "src/app-config.js\n"
  },
  {
    "path": "examples/simple-chat-upload/README.md",
    "content": "# SimpleChat\n\nThis folder provides a complete implementation of a simple\nchat app based on WebLLM. To try it out, run the following commands\nunder this folder:\n\n```bash\nnpm install\nnpm start\n```\n\nNote: if you would like to hack on the WebLLM core package,\nyou can change the web-llm dependency to `\"file:../..\"` and follow the build-from-source\ninstructions in the project to build WebLLM locally. This option is only recommended\nif you would like to hack on the WebLLM core package.\n\nDue to the differences in command-line tools between Unix/Linux and Windows systems, special adaptation is necessary for Windows. Unix/Linux systems natively support commands like `cp` for file operations, which are not directly available on Windows. To ensure cross-platform compatibility, we use a Node.js script for file copying on Windows.\n\n### Steps for Windows Users\n\n1. **Create a Node.js Script File**:\n   - In the `examples\\simple-chat-upload` directory, create a file named `copy-config.js`.\n   - Add the following code to handle file copying:\n     ```javascript\n     const fs = require(\"fs\");\n     // Copy file\n     fs.copyFileSync(\"src/gh-config.js\", \"src/app-config.js\");\n     ```\n\n2. **Modify `package.json`**:\n   - In the `scripts` section of your `package.json`, replace Unix-style `cp` commands with our new Node.js script. For example:\n     ```json\n     \"scripts\": {\n         \"start\": \"node copy-config.js && parcel src/llm_chat.html --port 8888\",\n         \"mlc-local\": \"node copy-config.js && parcel src/llm_chat.html --port 8888\",\n         \"build\": \"node copy-config.js && parcel build src/llm_chat.html --dist-dir lib --no-content-hash\"\n     },\n     ```\n\n3. **Run the Application**:\n   - Save your changes and run `npm start` in CMD or PowerShell to start the application.\n"
  },
  {
    "path": "examples/simple-chat-upload/package.json",
    "content": "{\n  \"name\": \"simple-chat\",\n  \"version\": \"0.1.0\",\n  \"private\": true,\n  \"scripts\": {\n    \"start\": \"cp src/gh-config.js src/app-config.js && parcel src/llm_chat.html  --port 8883\",\n    \"build\": \"cp src/gh-config.js src/app-config.js && parcel build src/llm_chat.html --dist-dir lib --no-content-hash\"\n  },\n  \"devDependencies\": {\n    \"buffer\": \"^5.7.1\",\n    \"parcel\": \"^2.8.3\",\n    \"process\": \"^0.11.10\",\n    \"tslib\": \"^2.3.1\",\n    \"typescript\": \"^4.9.5\",\n    \"url\": \"^0.11.3\"\n  },\n  \"dependencies\": {\n    \"@mlc-ai/web-llm\": \"^0.2.82\"\n  }\n}\n"
  },
  {
    "path": "examples/simple-chat-upload/src/gh-config.js",
    "content": "import { prebuiltAppConfig } from \"@mlc-ai/web-llm\";\n\nexport default {\n  model_list: prebuiltAppConfig.model_list,\n  use_web_worker: true,\n};\n"
  },
  {
    "path": "examples/simple-chat-upload/src/llm_chat.css",
    "content": ".chatui {\n  display: flex;\n  position: relative;\n  flex-flow: column wrap;\n  justify-content: space-between;\n  width: 100%;\n  max-width: 867px;\n  margin: 25px 10px;\n  height: 600px;\n  border: 2px solid #ddd;\n  border-radius: 5px;\n  background-color: #1f2027;\n}\n\n.chatui-select-wrapper {\n  display: flex;\n  justify-content: center;\n  background-color: #1f2027;\n  padding: 10px 0;\n}\n\n#chatui-select {\n  width: 350px;\n  background-color: #1f2027;\n  color: white;\n  border: none;\n}\n\n#chatui-select:focus {\n  outline: none;\n}\n\n#chatui-select::-webkit-scrollbar {\n  display: none;\n}\n\n#chatui-select option {\n  background-color: #1f2027;\n  color: white;\n}\n\n#chatui-select option:hover {\n  background-color: #474747;\n  color: white;\n}\n\ns .chatui-header {\n  display: flex;\n  justify-content: space-between;\n  padding: 10px;\n  border-bottom: 2px solid #ddd;\n  background: #eee;\n  color: #666;\n}\n\n/* Used to remove tiny white lines in android devices; not sure if there is a better way */\n*,\n*::before,\n*::after {\n  box-sizing: content-box;\n}\n\n.chatui-chat {\n  flex: 1;\n  overflow-y: auto;\n  padding: 10px;\n  background-color: #1f2027;\n}\n\n.chatui-chat::-webkit-scrollbar {\n  width: 6px;\n}\n\n.chatui-chat::-webkit-scrollbar-track {\n  background: #1f2027;\n}\n\n.chatui-chat::-webkit-scrollbar-thumb {\n  background: #888;\n}\n\n.chatui-chat::-webkit-scrollbar-thumb:hover {\n  background: #555;\n}\n\n.msg {\n  display: flex;\n  align-items: flex-end;\n  margin-bottom: 10px;\n}\n\n.msg:last-of-type {\n  margin: 0;\n}\n\n.msg-bubble {\n  background-color: #f0f0f0;\n  border-radius: 8px;\n  padding: 16px;\n  margin: 5px auto;\n  width: calc(100% - 20px);\n  box-sizing: border-box;\n  color: black;\n  border: none;\n  font-size: medium;\n  margin-left: auto;\n  margin-right: auto;\n}\n\n.left-msg .msg-bubble {\n  background-color: #343541;\n  color: #ececec;\n}\n\n.error-msg .msg-bubble {\n  background-color: 
#343541;\n  color: #f15959;\n}\n\n.init-msg .msg-bubble {\n  background-color: #343541;\n  color: #ececec;\n}\n\n.right-msg .msg-bubble {\n  background-color: #444654;\n  color: #ececec;\n}\n\n.chatui-inputarea {\n  display: flex;\n  padding: 10px;\n  border-top: 2px solid transparent;\n  background-color: #1f2027;\n}\n\n.chatui-inputarea * {\n  padding: 10px;\n  border: none;\n  border-radius: 3px;\n  font-size: 1em;\n  color: white;\n  background: rgba(0, 0, 0, 0.3);\n}\n\n.chatui-input {\n  flex: 1;\n  background-color: #40414f;\n  color: white;\n}\n\n.chatui-reset-btn {\n  margin-left: 10px;\n  background-color: #40414f;\n  color: #fff;\n  font-weight: bold;\n  cursor: pointer;\n  background-image: url(\"img/reset.png\");\n  background-repeat: no-repeat;\n  background-position: center;\n  width: 40px;\n  background-repeat: no-repeat;\n  background-position: center;\n  background-size: 20px 20px;\n}\n\n.chatui-reset-btn:hover {\n  background-color: #03a33e;\n}\n\n.chatui-send-btn {\n  margin-left: 10px;\n  background-color: #40414f;\n  color: #fff;\n  font-weight: bold;\n  cursor: pointer;\n  background-image: url(\"img/plane.png\");\n  background-repeat: no-repeat;\n  background-position: center;\n  width: 40px;\n  background-repeat: no-repeat;\n  background-position: center;\n  background-size: 20px 20px;\n}\n\n.chatui-send-btn:hover {\n  background-color: #03a33e;\n}\n"
  },
  {
    "path": "examples/simple-chat-upload/src/llm_chat.html",
    "content": "<link href=\"./llm_chat.css\" rel=\"stylesheet\" type=\"text/css\" />\n\n<div class=\"chatui\">\n  <div class=\"chatui-select-wrapper\">\n    <select id=\"chatui-select\"></select>\n  </div>\n  <div class=\"chatui-chat\" id=\"chatui-chat\" height=\"100\"></div>\n\n  <body>\n    <input\n      type=\"file\"\n      id=\"file-input\"\n      style=\"position: absolute; top: 10px; right: 20px\"\n      multiple\n      onchange=\"uploadFiles()\"\n    />\n  </body>\n  <div class=\"chatui-inputarea\">\n    <input\n      id=\"chatui-input\"\n      type=\"text\"\n      class=\"chatui-input\"\n      placeholder=\"Enter your message...\"\n    />\n    <button id=\"chatui-send-btn\" class=\"chatui-send-btn\"></button>\n    <button id=\"chatui-reset-btn\" class=\"chatui-reset-btn\"></button>\n  </div>\n</div>\n\n<div class=\"chatui-extra-control\">\n  <label id=\"chatui-info-label\"></label>\n</div>\n<!--- Place script after ui to make sure ui loads first -->\n<script type=\"module\" src=\"./simple_chat.ts\"></script>\n"
  },
  {
    "path": "examples/simple-chat-upload/src/simple_chat.ts",
    "content": "import appConfig from \"./app-config\";\nimport * as webllm from \"@mlc-ai/web-llm\";\n\nfunction getElementAndCheck(id: string): HTMLElement {\n  const element = document.getElementById(id);\n  if (element == null) {\n    throw Error(\"Cannot find element \" + id);\n  }\n  return element;\n}\n\nclass ChatUI {\n  private uiChat: HTMLElement;\n  private uiChatInput: HTMLInputElement;\n  private uiChatInfoLabel: HTMLLabelElement;\n  private engine: webllm.MLCEngineInterface | webllm.WebWorkerMLCEngine;\n  private config: webllm.AppConfig = appConfig;\n  private selectedModel: string;\n  private chatLoaded = false;\n  private requestInProgress = false;\n  private chatHistory: webllm.ChatCompletionMessageParam[] = [];\n  // We use a request chain to ensure that\n  // all requests send to chat are sequentialized\n  private chatRequestChain: Promise<void> = Promise.resolve();\n\n  /**\n   * An asynchronous factory constructor since we need to await getMaxStorageBufferBindingSize();\n   * this is not allowed in a constructor (which cannot be asynchronous).\n   */\n  public static CreateAsync = async (engine: webllm.MLCEngineInterface) => {\n    const chatUI = new ChatUI();\n    chatUI.engine = engine;\n    // get the elements\n    chatUI.uiChat = getElementAndCheck(\"chatui-chat\");\n    chatUI.uiChatInput = getElementAndCheck(\"chatui-input\") as HTMLInputElement;\n    chatUI.uiChatInfoLabel = getElementAndCheck(\n      \"chatui-info-label\",\n    ) as HTMLLabelElement;\n    // register event handlers\n    getElementAndCheck(\"chatui-reset-btn\").onclick = () => {\n      chatUI.onReset();\n    };\n    getElementAndCheck(\"chatui-send-btn\").onclick = () => {\n      chatUI.onGenerate();\n    };\n    // TODO: find other alternative triggers\n    getElementAndCheck(\"chatui-input\").onkeypress = (event) => {\n      if (event.keyCode === 13) {\n        chatUI.onGenerate();\n      }\n    };\n\n    // When we detect low maxStorageBufferBindingSize, we assume 
that the device (e.g. an Android\n    // phone) can only handle small models and make all other models unselectable. Otherwise, the\n    // browser may crash. See https://github.com/mlc-ai/web-llm/issues/209.\n    // Also use GPU vendor to decide whether it is a mobile device (hence with limited resources).\n    const androidMaxStorageBufferBindingSize = 1 << 27; // 128MB\n    const mobileVendors = new Set<string>([\"qualcomm\", \"arm\"]);\n    let restrictModels = false;\n    let maxStorageBufferBindingSize: number;\n    let gpuVendor: string;\n    try {\n      [maxStorageBufferBindingSize, gpuVendor] = await Promise.all([\n        engine.getMaxStorageBufferBindingSize(),\n        engine.getGPUVendor(),\n      ]);\n    } catch (err) {\n      chatUI.appendMessage(\"error\", \"Init error, \" + err.toString());\n      console.log(err.stack);\n      return;\n    }\n    if (\n      (gpuVendor.length != 0 && mobileVendors.has(gpuVendor)) ||\n      maxStorageBufferBindingSize <= androidMaxStorageBufferBindingSize\n    ) {\n      chatUI.appendMessage(\n        \"init\",\n        \"Your device seems to have \" +\n          \"limited resources, so we restrict the selectable models.\",\n      );\n      restrictModels = true;\n    }\n\n    // Populate modelSelector\n    const modelSelector = getElementAndCheck(\n      \"chatui-select\",\n    ) as HTMLSelectElement;\n    for (let i = 0; i < chatUI.config.model_list.length; ++i) {\n      const item = chatUI.config.model_list[i];\n      const opt = document.createElement(\"option\");\n      opt.value = item.model_id;\n      opt.innerHTML = item.model_id;\n      opt.selected = i == 0;\n      if (\n        (restrictModels &&\n          (item.low_resource_required === undefined ||\n            !item.low_resource_required)) ||\n        (item.buffer_size_required_bytes &&\n          maxStorageBufferBindingSize < item.buffer_size_required_bytes)\n      ) {\n        // Either on a low-resource device and not a low-resource model\n      
  // Or device's maxStorageBufferBindingSize does not satisfy the model's need (if specified)\n        const params = new URLSearchParams(location.search);\n        opt.disabled = !params.has(\"bypassRestrictions\");\n        opt.selected = false;\n      }\n      if (\n        !modelSelector.lastChild?.textContent?.startsWith(\n          opt.value.split(\"-\")[0],\n        )\n      ) {\n        modelSelector.appendChild(document.createElement(\"hr\"));\n      }\n      modelSelector.appendChild(opt);\n    }\n    modelSelector.appendChild(document.createElement(\"hr\"));\n\n    chatUI.selectedModel = modelSelector.value;\n    modelSelector.onchange = () => {\n      chatUI.onSelectChange(modelSelector);\n    };\n\n    return chatUI;\n  };\n\n  /**\n   * Push a task to the execution queue.\n   *\n   * @param task The task to be executed;\n   */\n  private pushTask(task: () => Promise<void>) {\n    const lastEvent = this.chatRequestChain;\n    this.chatRequestChain = lastEvent.then(task);\n  }\n  // Event handlers\n  // all event handler pushes the tasks to a queue\n  // that get executed sequentially\n  // the tasks previous tasks, which causes them to early stop\n  // can be interrupted by engine.interruptGenerate\n  private async onGenerate() {\n    if (this.requestInProgress) {\n      return;\n    }\n    this.pushTask(async () => {\n      await this.asyncGenerate();\n    });\n  }\n\n  private async onSelectChange(modelSelector: HTMLSelectElement) {\n    if (this.requestInProgress) {\n      // interrupt previous generation if any\n      this.engine.interruptGenerate();\n    }\n    // try reset after previous requests finishes\n    this.pushTask(async () => {\n      await this.engine.resetChat();\n      this.resetChatHistory();\n      await this.unloadChat();\n      this.selectedModel = modelSelector.value;\n      await this.asyncInitChat();\n    });\n  }\n\n  private async onReset() {\n    if (this.requestInProgress) {\n      // interrupt previous generation if any\n 
     this.engine.interruptGenerate();\n    }\n    // try reset after previous requests finishes\n    this.pushTask(async () => {\n      await this.engine.resetChat();\n      this.resetChatHistory();\n    });\n  }\n\n  // Internal helper functions\n  private appendMessage(kind, text) {\n    if (kind == \"init\") {\n      text = \"[System Initalize] \" + text;\n    }\n    if (this.uiChat === undefined) {\n      throw Error(\"cannot find ui chat\");\n    }\n    const msg = `\n      <div class=\"msg ${kind}-msg\">\n        <div class=\"msg-bubble\">\n          <div class=\"msg-text\">${text}</div>\n        </div>\n      </div>\n    `;\n    this.uiChat.insertAdjacentHTML(\"beforeend\", msg);\n    this.uiChat.scrollTo(0, this.uiChat.scrollHeight);\n  }\n\n  private updateLastMessage(kind, text) {\n    if (kind == \"init\") {\n      text = \"[System Initalize] \" + text;\n    }\n    if (this.uiChat === undefined) {\n      throw Error(\"cannot find ui chat\");\n    }\n    const matches = this.uiChat.getElementsByClassName(`msg ${kind}-msg`);\n    if (matches.length == 0) throw Error(`${kind} message do not exist`);\n    const msg = matches[matches.length - 1];\n    const msgText = msg.getElementsByClassName(\"msg-text\");\n    if (msgText.length != 1) throw Error(\"Expect msg-text\");\n    if (msgText[0].innerHTML == text) return;\n    const list = text.split(\"\\n\").map((t) => {\n      const item = document.createElement(\"div\");\n      item.textContent = t;\n      return item;\n    });\n    msgText[0].innerHTML = \"\";\n    list.forEach((item) => msgText[0].append(item));\n    this.uiChat.scrollTo(0, this.uiChat.scrollHeight);\n  }\n\n  private resetChatHistory() {\n    this.chatHistory = [];\n    const clearTags = [\"left\", \"right\", \"init\", \"error\"];\n    for (const tag of clearTags) {\n      // need to unpack to list so the iterator don't get affected by mutation\n      const matches = [...this.uiChat.getElementsByClassName(`msg ${tag}-msg`)];\n      for 
(const item of matches) {\n        this.uiChat.removeChild(item);\n      }\n    }\n    if (this.uiChatInfoLabel !== undefined) {\n      this.uiChatInfoLabel.innerHTML = \"\";\n    }\n  }\n\n  private async asyncInitChat() {\n    if (this.chatLoaded) return;\n    this.requestInProgress = true;\n    this.appendMessage(\"init\", \"\");\n    const initProgressCallback = (report) => {\n      this.updateLastMessage(\"init\", report.text);\n    };\n    this.engine.setInitProgressCallback(initProgressCallback);\n\n    try {\n      await this.engine.reload(this.selectedModel);\n    } catch (err) {\n      this.appendMessage(\"error\", \"Init error, \" + err.toString());\n      console.log(err.stack);\n      this.unloadChat();\n      this.requestInProgress = false;\n      return;\n    }\n    this.requestInProgress = false;\n    this.chatLoaded = true;\n  }\n\n  private async unloadChat() {\n    await this.engine.unload();\n    this.chatLoaded = false;\n  }\n\n  /**\n   * Run generate\n   */\n  private async asyncGenerate() {\n    await this.asyncInitChat();\n    this.requestInProgress = true;\n    const prompt = this.uiChatInput.value;\n    if (prompt == \"\") {\n      this.requestInProgress = false;\n      return;\n    }\n\n    this.appendMessage(\"right\", prompt);\n    this.uiChatInput.value = \"\";\n    this.uiChatInput.setAttribute(\"placeholder\", \"Generating...\");\n\n    this.appendMessage(\"left\", \"\");\n    this.chatHistory.push({ role: \"user\", content: prompt });\n\n    try {\n      let curMessage = \"\";\n      let usage: webllm.CompletionUsage | undefined = undefined;\n      const completion = await this.engine.chat.completions.create({\n        stream: true,\n        messages: this.chatHistory,\n        stream_options: { include_usage: true },\n      });\n      // TODO(Charlie): Processing of � requires changes\n      for await (const chunk of completion) {\n        const curDelta = chunk.choices[0]?.delta.content;\n        if (curDelta) {\n          
curMessage += curDelta;\n        }\n        this.updateLastMessage(\"left\", curMessage);\n        if (chunk.usage) {\n          usage = chunk.usage;\n        }\n      }\n      if (usage) {\n        this.uiChatInfoLabel.innerHTML =\n          `prompt_tokens: ${usage.prompt_tokens}, ` +\n          `completion_tokens: ${usage.completion_tokens}, ` +\n          `prefill: ${usage.extra.prefill_tokens_per_s.toFixed(4)} tokens/sec, ` +\n          `decoding: ${usage.extra.decode_tokens_per_s.toFixed(4)} tokens/sec`;\n      }\n      const finalMessage = await this.engine.getMessage();\n      this.updateLastMessage(\"left\", finalMessage); // TODO: Remove this after � issue is fixed\n      this.chatHistory.push({ role: \"assistant\", content: finalMessage });\n    } catch (err) {\n      this.appendMessage(\"error\", \"Generate error, \" + err.toString());\n      console.log(err.stack);\n      await this.unloadChat();\n    }\n    this.uiChatInput.setAttribute(\"placeholder\", \"Enter your message...\");\n    this.requestInProgress = false;\n  }\n}\n\nconst useWebWorker = appConfig.use_web_worker;\nlet engine: webllm.MLCEngineInterface;\n\n// Here we do not use `CreateMLCEngine()` but instantiate an engine that is not loaded with model\nif (useWebWorker) {\n  engine = new webllm.WebWorkerMLCEngine(\n    new Worker(new URL(\"./worker.ts\", import.meta.url), { type: \"module\" }),\n    { appConfig },\n  );\n} else {\n  engine = new webllm.MLCEngine({ appConfig });\n}\nChatUI.CreateAsync(engine);\n\nfunction getFileType(file: File) {\n  if (file.name.includes(\"wasm\")) {\n    return \"webllm/wasm\";\n  } else if (\n    file.name.includes(\".bin\") ||\n    file.name.includes(\"ndarray-cache.json\")\n  ) {\n    return \"webllm/model\";\n  } else if (file.name.includes(\"mlc-chat-config.json\")) {\n    return \"webllm/config\";\n  } else {\n    console.log(\"No model file suffix found\");\n    return \"file-cache\";\n  }\n}\n\nasync function uploadToIndexedDB(file: File) {\n  let 
db;\n  const request = indexedDB.open(getFileType(file), 1);\n  request.onupgradeneeded = (event) => {\n    db = (event.target as IDBOpenDBRequest).result;\n    if (!db.objectStoreNames.contains(\"urls\")) {\n      db.createObjectStore(\"urls\", { keyPath: \"url\" });\n    }\n  };\n  request.onsuccess = (event) => {\n    db = (event.target as IDBOpenDBRequest).result;\n  };\n  request.onerror = (event) => {\n    console.error(\"Database error: \", (event.target as IDBOpenDBRequest).error);\n  };\n  const transaction = db.transaction(\"files\", \"readwrite\");\n  const store = transaction.objectStore(\"files\");\n  const reader = new FileReader();\n  reader.onload = async (e) => {\n    if (e.target === null || e.target.result === null) {\n      console.error(\"Do not read any files\");\n      return;\n    }\n    const url = file.name;\n    store.add(e.target.result, url);\n  };\n  transaction.oncomplete = function () {\n    alert(\"All files have been uploaded to IndexedDB.\");\n  };\n  transaction.onerror = function (event) {\n    console.error(\"Error uploading files:\", event);\n  };\n}\n\nasync function cacheFile(file: File, response: Response) {\n  try {\n    const cache = await caches.open(getFileType(file)); // Ensure getFileType is a synchronous function or awaited if async\n    console.log(\"Put response into cache:\", response);\n    await cache.put(file.name, response);\n  } catch (error) {\n    console.error(\"Failed to cache the file:\", error);\n  }\n}\n\nasync function uploadFiles(): Promise<void> {\n  const input = document.getElementById(\"file-input\") as HTMLInputElement;\n  if (!input.files || input.files.length === 0) {\n    alert(\"No files selected.\");\n    return;\n  }\n  if (appConfig.useIndexedDBCache) {\n    for (const file of input.files) {\n      uploadToIndexedDB(file);\n    }\n  } else {\n    for (const file of input.files) {\n      const reader = new FileReader();\n      reader.onload = async (e) => {\n        if (e.target === null 
|| e.target.result === null) {\n          console.error(\"Do not read any files\");\n          return;\n        }\n        const arrayBuffer = e.target.result as ArrayBuffer;\n        const response = new Response(arrayBuffer, {\n          status: 200,\n          statusText: \"OK\",\n          headers: {\n            \"Content-Type\": \"application/octet-stream\",\n            \"Content-Length\": arrayBuffer.byteLength.toString(),\n          },\n        });\n        await cacheFile(file, response);\n      };\n      if (\n        file.name.includes(\"mlc-chat-config.json\") ||\n        file.name.includes(\"ndarray-cache.json\")\n      ) {\n        reader.readAsText(file);\n      } else {\n        reader.readAsArrayBuffer(file);\n      }\n    }\n  }\n}\n\n(window as any).uploadFiles = uploadFiles;\n"
  },
  {
    "path": "examples/simple-chat-upload/src/worker.ts",
    "content": "// Serve the engine workload through web worker\nimport { WebWorkerMLCEngineHandler } from \"@mlc-ai/web-llm\";\n\nconst handler = new WebWorkerMLCEngineHandler();\nself.onmessage = (msg: MessageEvent) => {\n  handler.onmessage(msg);\n};\n"
  },
  {
    "path": "examples/streaming/README.md",
    "content": "### OpenAI API Demos\n\nRun `npm install` first, followed by `npm start`.\n\nNote if you would like to hack WebLLM core package,\nyou can change web-llm dependencies as `\"file:../..\"`, and follow the build from source\ninstruction in the project to build webllm locally. This option is only recommended\nif you would like to hack WebLLM core package.\n"
  },
  {
    "path": "examples/streaming/package.json",
    "content": "{\n  \"name\": \"streaming\",\n  \"version\": \"0.1.0\",\n  \"private\": true,\n  \"scripts\": {\n    \"start\": \"parcel src/streaming.html  --port 8888\",\n    \"build\": \"parcel build src/streaming.html --dist-dir lib\"\n  },\n  \"devDependencies\": {\n    \"buffer\": \"^5.7.1\",\n    \"parcel\": \"^2.8.3\",\n    \"process\": \"^0.11.10\",\n    \"tslib\": \"^2.3.1\",\n    \"typescript\": \"^4.9.5\",\n    \"url\": \"^0.11.3\"\n  },\n  \"dependencies\": {\n    \"@mlc-ai/web-llm\": \"^0.2.82\"\n  }\n}\n"
  },
  {
    "path": "examples/streaming/src/streaming.html",
    "content": "<!doctype html>\n<html>\n  <script>\n    webLLMGlobal = {};\n  </script>\n\n  <body>\n    <h2>WebLLM Test Page</h2>\n    Open console to see output\n    <br />\n    <br />\n    <label id=\"init-label\"> </label>\n    <h3>Response</h3>\n    <label id=\"generate-label\"> </label>\n    <script type=\"module\" src=\"./streaming.ts\"></script>\n  </body>\n</html>\n"
  },
  {
    "path": "examples/streaming/src/streaming.ts",
    "content": "import * as webllm from \"@mlc-ai/web-llm\";\n\nfunction setLabel(id: string, text: string) {\n  const label = document.getElementById(id);\n  if (label == null) {\n    throw Error(\"Cannot find label \" + id);\n  }\n  label.innerText = text;\n}\n\n/**\n * We demonstrate chat completion with streaming, where delta is sent while generating response.\n */\nasync function main() {\n  const initProgressCallback = (report: webllm.InitProgressReport) => {\n    setLabel(\"init-label\", report.text);\n  };\n  const selectedModel = \"Llama-3.1-8B-Instruct-q4f32_1-MLC\";\n  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(\n    selectedModel,\n    { initProgressCallback: initProgressCallback },\n  );\n\n  const request: webllm.ChatCompletionRequest = {\n    stream: true,\n    stream_options: { include_usage: true },\n    messages: [\n      {\n        role: \"system\",\n        content:\n          \"You are a pirate chatbot who always responds in pirate speak!\",\n      },\n      { role: \"user\", content: \"Who are you?\" },\n    ],\n    logprobs: true,\n    top_logprobs: 2,\n  };\n\n  const asyncChunkGenerator = await engine.chat.completions.create(request);\n  let message = \"\";\n  for await (const chunk of asyncChunkGenerator) {\n    console.log(chunk);\n    message += chunk.choices[0]?.delta?.content || \"\";\n    setLabel(\"generate-label\", message);\n    if (chunk.usage) {\n      console.log(chunk.usage); // only last chunk has usage\n    }\n    // engine.interruptGenerate();  // works with interrupt as well\n  }\n  console.log(\"Final message:\\n\", await engine.getMessage()); // the concatenated message\n}\n\nmain();\n"
  },
  {
    "path": "examples/structural-tag-tool-use/README.md",
    "content": "# Structural tag MCP-style tool calls\n\nRun `npm install`, then `npm start` to launch a minimal page that prints progress and logs to the browser console.\n\nThis example demonstrates how to:\n\n- Define a structural tag that forces an MCP-style `<tool_call>...</tool_call>` block with `{\"name\": ..., \"arguments\": ...}` payloads.\n- Ask WebLLM for a tool call with `response_format.type = \"structural_tag\"`, parse the call, and dispatch to a stubbed tool implementation.\n- Send the tool result back via a `tool` message and request a final natural-language answer.\n\nOpen the console to see the enforced tool call, the stubbed tool response, and the final assistant reply.\n"
  },
  {
    "path": "examples/structural-tag-tool-use/package.json",
    "content": "{\n  \"name\": \"structural-tag-tool-use\",\n  \"version\": \"0.1.0\",\n  \"private\": true,\n  \"scripts\": {\n    \"start\": \"parcel src/mcp_structural_tag.html --port 8887\",\n    \"build\": \"parcel build src/mcp_structural_tag.html --dist-dir lib\"\n  },\n  \"devDependencies\": {\n    \"buffer\": \"^5.7.1\",\n    \"parcel\": \"2.8.3\",\n    \"process\": \"^0.11.10\",\n    \"tslib\": \"^2.3.1\",\n    \"typescript\": \"^4.9.5\",\n    \"url\": \"^0.11.3\"\n  },\n  \"dependencies\": {\n    \"@mlc-ai/web-llm\": \"file:../..\"\n  }\n}\n"
  },
  {
    "path": "examples/structural-tag-tool-use/src/mcp_structural_tag.html",
    "content": "<!doctype html>\n<html>\n  <script>\n    webLLMGlobal = {};\n  </script>\n  <body>\n    <h2>Structural tag MCP-style tool calls</h2>\n    <p>\n      Open the console to see the enforced tool call, tool response, and final\n      reply.\n    </p>\n    <label id=\"init-label\"></label>\n    <pre id=\"log\"></pre>\n    <script type=\"module\" src=\"./mcp_structural_tag.ts\"></script>\n  </body>\n</html>\n"
  },
  {
    "path": "examples/structural-tag-tool-use/src/mcp_structural_tag.ts",
    "content": "import * as webllm from \"@mlc-ai/web-llm\";\n\ntype ToolInvocation = {\n  name: string;\n  arguments: Record<string, unknown>;\n};\n\ntype ToolDefinition = {\n  name: string;\n  description: string;\n  schema: Record<string, unknown>;\n};\n\nconst tools: ToolDefinition[] = [\n  {\n    name: \"get_weather\",\n    description: \"Fetch an approximate weather report for a city.\",\n    schema: {\n      type: \"object\",\n      properties: {\n        location: { type: \"string\", description: \"City name, e.g. Tokyo\" },\n        unit: {\n          type: \"string\",\n          enum: [\"celsius\", \"fahrenheit\"],\n          description: \"Temperature unit\",\n        },\n      },\n      required: [\"location\"],\n    },\n  },\n  {\n    name: \"get_time\",\n    description: \"Return the current time in a given IANA timezone.\",\n    schema: {\n      type: \"object\",\n      properties: {\n        timezone: {\n          type: \"string\",\n          description: \"IANA timezone name, defaults to UTC\",\n        },\n      },\n      required: [],\n    },\n  },\n];\n\nconst mcpStructuralTag = {\n  type: \"structural_tag\",\n  format: {\n    type: \"triggered_tags\",\n    triggers: [\"<tool_call>\"],\n    tags: tools.map((tool) => ({\n      begin: `<tool_call>\\n{\"name\": \"${tool.name}\", \"arguments\": `,\n      content: { type: \"json_schema\", json_schema: tool.schema },\n      end: \"}\\n</tool_call>\",\n    })),\n    at_least_one: true,\n    stop_after_first: false,\n  },\n} as const;\n\nconst initProgressCallback = (report: webllm.InitProgressReport) => {\n  setLabel(\"init-label\", report.text);\n};\n\nfunction setLabel(id: string, text: string) {\n  const label = document.getElementById(id);\n  if (label == null) {\n    throw Error(\"Cannot find label \" + id);\n  }\n  label.innerText = text;\n}\n\nfunction appendLog(text: string) {\n  const log = document.getElementById(\"log\");\n  if (log != null) {\n    log.textContent += `${text}\\n`;\n  }\n  
console.log(text);\n}\n\nfunction parseToolCallBlocks(\n  content: string | null | undefined,\n): ToolInvocation[] {\n  if (!content) {\n    throw new Error(\"Assistant reply did not contain a tool call.\");\n  }\n  const regex = /<tool_call>\\s*({[\\s\\S]*?})\\s*<\\/tool_call>/g;\n  const calls: ToolInvocation[] = [];\n  let match: RegExpExecArray | null;\n  while ((match = regex.exec(content)) !== null) {\n    const payload = JSON.parse(match[1]);\n    if (typeof payload.name !== \"string\" || payload.arguments === undefined) {\n      continue;\n    }\n    calls.push({ name: payload.name, arguments: payload.arguments });\n  }\n  if (calls.length === 0) {\n    throw new Error(\"Failed to find any <tool_call> blocks.\");\n  }\n  return calls;\n}\n\nasync function runTool(call: ToolInvocation): Promise<Record<string, unknown>> {\n  if (call.name === \"get_weather\") {\n    const location = String(call.arguments.location ?? \"\").trim() || \"unknown\";\n    const unit = (call.arguments.unit as string) ?? \"celsius\";\n    return {\n      location,\n      unit,\n      temperature: unit === \"fahrenheit\" ? 72.0 : 22.2,\n      conditions: \"Clear skies\",\n      source: \"demo-weather-kit\",\n    };\n  }\n  if (call.name === \"get_time\") {\n    const timezone = (call.arguments.timezone as string) ?? \"UTC\";\n    return {\n      timezone,\n      iso_time: new Date().toISOString(),\n      note: \"Demo tool uses local clock only.\",\n    };\n  }\n  return { error: `Tool ${call.name} is not implemented in the demo.` };\n}\n\nasync function main() {\n  try {\n    appendLog(\"Loading model...\");\n    const selectedModel = \"Llama-3.2-1B-Instruct-q4f16_1-MLC\";\n    const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(\n      selectedModel,\n      { initProgressCallback: initProgressCallback, logLevel: \"INFO\" },\n    );\n\n    const systemPrompt =\n      \"You are a MCP assistant. 
\" +\n      'Use the provided tools and emit one or more <tool_call> blocks (one per tool you need) with a JSON body {\"name\": ..., \"arguments\": ...}. ' +\n      \"Do not add extra prose when calling a tool.\" +\n      \" Available tools: \" +\n      JSON.stringify(\n        tools.map((tool) => ({\n          name: tool.name,\n          description: tool.description,\n        })),\n        null,\n        2,\n      );\n\n    const messages: webllm.ChatCompletionMessageParam[] = [\n      { role: \"system\", content: systemPrompt },\n      {\n        role: \"user\",\n        content:\n          \"Give me the weather in Paris in celsius and also tell me the current time in UTC.\",\n      },\n    ];\n\n    const responseFormat: webllm.ResponseFormat = {\n      type: \"structural_tag\",\n      structural_tag: mcpStructuralTag,\n    };\n\n    appendLog(\"Requesting constrained tool call...\");\n    const toolCallReply = await engine.chat.completions.create({\n      stream: false,\n      messages,\n      max_tokens: 1024,\n      response_format: responseFormat,\n    });\n\n    const toolCallContent = toolCallReply.choices[0].message.content ?? 
\"\";\n    appendLog(`Assistant tool call:\\n${toolCallContent}`);\n    const parsedCalls = parseToolCallBlocks(toolCallContent);\n    const toolCalls = parsedCalls.map((call, idx) => {\n      const toolCallId = `${call.name}-call-${idx + 1}`;\n      return { id: toolCallId, call };\n    });\n    messages.push({\n      role: \"assistant\",\n      content: toolCallContent,\n      tool_calls: toolCalls.map(({ id, call }) => ({\n        id,\n        type: \"function\",\n        function: {\n          name: call.name,\n          arguments: JSON.stringify(call.arguments),\n        },\n      })),\n    } as webllm.ChatCompletionMessageParam);\n\n    for (const { id, call } of toolCalls) {\n      const toolResult = await runTool(call);\n      messages.push({\n        role: \"tool\",\n        tool_call_id: id,\n        content: JSON.stringify(toolResult),\n      });\n      appendLog(\n        `Tool response for ${call.name}:\\n${JSON.stringify(toolResult, null, 2)}`,\n      );\n    }\n\n    messages.push({\n      role: \"user\",\n      content:\n        \"You have been given one or more tool responses above. Summarize ALL tool results in a single reply. Include both the weather details and the time information. Do not make up any values.\",\n    });\n\n    appendLog(\"Requesting final assistant message...\");\n    const finalReply = await engine.chat.completions.create({\n      stream: false,\n      messages,\n      max_tokens: 256,\n    });\n    const finalContent = finalReply.choices[0].message.content ?? \"\";\n    appendLog(`Final assistant message:\\n${finalContent}`);\n  } catch (err) {\n    const message = err instanceof Error ? err.message : String(err);\n    appendLog(`Error: ${message}`);\n    console.error(err);\n  }\n}\n\nvoid main();\n"
  },
  {
    "path": "examples/text-completion/README.md",
    "content": "# WebLLM Get Started App\n\nThis folder provides a minimum demo to show WebLLM API in a webapp setting.\nTo try it out, you can do the following steps under this folder\n\n```bash\nnpm install\nnpm start\n```\n\nNote if you would like to hack WebLLM core package.\nYou can change web-llm dependencies as `\"file:../..\"`, and follow the build from source\ninstruction in the project to build webllm locally. This option is only recommended\nif you would like to hack WebLLM core package.\n"
  },
  {
    "path": "examples/text-completion/package.json",
    "content": "{\n  \"name\": \"text-completion\",\n  \"version\": \"0.1.0\",\n  \"private\": true,\n  \"scripts\": {\n    \"start\": \"parcel src/text_completion.html  --port 8888\",\n    \"build\": \"parcel build src/text_completion.html --dist-dir lib\"\n  },\n  \"devDependencies\": {\n    \"buffer\": \"^5.7.1\",\n    \"parcel\": \"^2.8.3\",\n    \"process\": \"^0.11.10\",\n    \"tslib\": \"^2.3.1\",\n    \"typescript\": \"^4.9.5\",\n    \"url\": \"^0.11.3\"\n  },\n  \"dependencies\": {\n    \"@mlc-ai/web-llm\": \"^0.2.82\"\n  }\n}\n"
  },
  {
    "path": "examples/text-completion/src/text_completion.html",
    "content": "<!doctype html>\n<html>\n  <script>\n    webLLMGlobal = {};\n  </script>\n  <body>\n    <h2>WebLLM Test Page</h2>\n    Open console to see output\n    <br />\n    <br />\n    <label id=\"init-label\"> </label>\n\n    <h3>Prompt</h3>\n    <label id=\"prompt-label\"> </label>\n\n    <h3>Response</h3>\n    <label id=\"generate-label\"> </label>\n    <br />\n    <label id=\"stats-label\"> </label>\n\n    <script type=\"module\" src=\"./text_completion.ts\"></script>\n  </body>\n</html>\n"
  },
  {
    "path": "examples/text-completion/src/text_completion.ts",
    "content": "import * as webllm from \"@mlc-ai/web-llm\";\n\nfunction setLabel(id: string, text: string) {\n  const label = document.getElementById(id);\n  if (label == null) {\n    throw Error(\"Cannot find label \" + id);\n  }\n  label.innerText = text;\n}\n\nasync function main() {\n  const initProgressCallback = (report: webllm.InitProgressReport) => {\n    setLabel(\"init-label\", report.text);\n  };\n\n  // Unlike \"Llama-3.1-8B-Instruct-q4f32_1-MLC\", this is a base model\n  const selectedModel = \"Llama-3.1-8B-q4f32_1-MLC\";\n\n  const appConfig: webllm.AppConfig = {\n    model_list: [\n      {\n        model: \"https://huggingface.co/mlc-ai/Llama-3.1-8B-q4f32_1-MLC\", // a base model\n        model_id: selectedModel,\n        model_lib:\n          webllm.modelLibURLPrefix +\n          webllm.modelVersion +\n          \"/Llama-3_1-8B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n        overrides: {\n          context_window_size: 2048,\n        },\n      },\n    ],\n  };\n  const engine: webllm.MLCEngineInterface = await webllm.CreateMLCEngine(\n    selectedModel,\n    {\n      appConfig: appConfig,\n      initProgressCallback: initProgressCallback,\n      logLevel: \"INFO\",\n    },\n  );\n\n  const reply0 = await engine.completions.create({\n    prompt: \"List 3 US states: \",\n    // below configurations are all optional\n    echo: true,\n    n: 2,\n    max_tokens: 64,\n    logprobs: true,\n    top_logprobs: 2,\n  });\n  console.log(reply0);\n  console.log(reply0.usage);\n\n  // To change model, either create a new engine via `CreateMLCEngine()`, or call `engine.reload(modelId)`\n}\n\nmain();\n"
  },
  {
    "path": "examples/vision-model/README.md",
    "content": "# WebLLM Get Started App\n\nThis folder provides a minimum demo to show WebLLM API in a webapp setting.\nTo try it out, you can do the following steps under this folder\n\n```bash\nnpm install\nnpm start\n```\n\nNote if you would like to hack WebLLM core package.\nYou can change web-llm dependencies as `\"file:../..\"`, and follow the build from source\ninstruction in the project to build webllm locally. This option is only recommended\nif you would like to hack WebLLM core package.\n"
  },
  {
    "path": "examples/vision-model/package.json",
    "content": "{\n  \"name\": \"get-started\",\n  \"version\": \"0.1.0\",\n  \"private\": true,\n  \"scripts\": {\n    \"start\": \"parcel src/vision_model.html  --port 8888\",\n    \"build\": \"parcel build src/vision_model.html --dist-dir lib\"\n  },\n  \"devDependencies\": {\n    \"buffer\": \"^5.7.1\",\n    \"parcel\": \"^2.8.3\",\n    \"process\": \"^0.11.10\",\n    \"tslib\": \"^2.3.1\",\n    \"typescript\": \"^4.9.5\",\n    \"url\": \"^0.11.3\"\n  },\n  \"dependencies\": {\n    \"@mlc-ai/web-llm\": \"^0.2.82\"\n  }\n}\n"
  },
  {
    "path": "examples/vision-model/src/utils.ts",
    "content": "export function getImageDataFromURL(url: string): Promise<ImageData> {\n  return new Promise((resolve, reject) => {\n    // Converts img to any, and later `as CanvasImageSource`, otherwise build complains\n    const img: any = new Image();\n    img.crossOrigin = \"anonymous\"; // Important for CORS\n    img.onload = () => {\n      const canvas: HTMLCanvasElement = document.createElement(\"canvas\");\n      const ctx: CanvasRenderingContext2D = canvas.getContext(\"2d\")!;\n      canvas.width = img.width;\n      canvas.height = img.height;\n      ctx.drawImage(img as CanvasImageSource, 0, 0);\n\n      const imageData = ctx.getImageData(0, 0, img.width, img.height);\n      resolve(imageData);\n    };\n    img.onerror = () => reject(new Error(\"Failed to load image\"));\n    img.src = url;\n  });\n}\n\nexport async function imageURLToBase64(url: string): Promise<string> {\n  const imageData: ImageData = await getImageDataFromURL(url);\n  const canvas = document.createElement(\"canvas\");\n  const ctx = canvas.getContext(\"2d\");\n\n  canvas.width = imageData.width;\n  canvas.height = imageData.height;\n\n  ctx!.putImageData(imageData, 0, 0);\n\n  return canvas.toDataURL();\n}\n"
  },
  {
    "path": "examples/vision-model/src/vision_model.html",
    "content": "<!doctype html>\n<html>\n  <script>\n    webLLMGlobal = {};\n  </script>\n  <body>\n    <h2>WebLLM Test Page</h2>\n    Open console to see output\n    <br />\n    <br />\n    <label id=\"init-label\"> </label>\n\n    <h3>Prompt</h3>\n    <label id=\"prompt-label\"> </label>\n\n    <h3>Response</h3>\n    <label id=\"generate-label\"> </label>\n    <br />\n    <label id=\"stats-label\"> </label>\n\n    <script type=\"module\" src=\"./vision_model.ts\"></script>\n  </body>\n</html>\n"
  },
  {
    "path": "examples/vision-model/src/vision_model.ts",
    "content": "import * as webllm from \"@mlc-ai/web-llm\";\nimport { imageURLToBase64 } from \"./utils\";\n\nfunction setLabel(id: string, text: string) {\n  const label = document.getElementById(id);\n  if (label == null) {\n    throw Error(\"Cannot find label \" + id);\n  }\n  label.innerText = text;\n}\n\nconst USE_WEB_WORKER = true;\n\nconst proxyUrl = \"https://cors-anywhere.herokuapp.com/\";\nconst url_https_street = \"https://www.ilankelman.org/stopsigns/australia.jpg\";\nconst url_https_tree = \"https://www.ilankelman.org/sunset.jpg\";\nconst url_https_sea =\n  \"https://www.islandvulnerability.org/index/silhouette.jpg\";\n\nasync function main() {\n  // can feed request with either base64 or http url\n  const url_base64_street = await imageURLToBase64(proxyUrl + url_https_street);\n\n  const initProgressCallback = (report: webllm.InitProgressReport) => {\n    setLabel(\"init-label\", report.text);\n  };\n  const selectedModel = \"Phi-3.5-vision-instruct-q4f16_1-MLC\";\n\n  const engineConfig: webllm.MLCEngineConfig = {\n    initProgressCallback: initProgressCallback,\n    logLevel: \"INFO\", // specify the log level\n  };\n  const chatOpts = {\n    context_window_size: 6144,\n  };\n\n  const engine: webllm.MLCEngineInterface = USE_WEB_WORKER\n    ? await webllm.CreateWebWorkerMLCEngine(\n        new Worker(new URL(\"./worker.ts\", import.meta.url), {\n          type: \"module\",\n        }),\n        selectedModel,\n        engineConfig,\n        chatOpts,\n      )\n    : await webllm.CreateMLCEngine(selectedModel, engineConfig, chatOpts);\n\n  // 1. 
Prefill two images\n  const messages: webllm.ChatCompletionMessageParam[] = [\n    {\n      role: \"user\",\n      content: [\n        { type: \"text\", text: \"List the items in each image concisely.\" },\n        {\n          type: \"image_url\",\n          image_url: {\n            url: url_base64_street,\n          },\n        },\n        {\n          type: \"image_url\",\n          image_url: {\n            url: proxyUrl + url_https_sea,\n          },\n        },\n      ],\n    },\n  ];\n  const request0: webllm.ChatCompletionRequest = {\n    stream: false, // can be streaming, same behavior\n    messages: messages,\n  };\n  const reply0 = await engine.chat.completions.create(request0);\n  const replyMessage0 = await engine.getMessage();\n  console.log(reply0);\n  console.log(replyMessage0);\n  console.log(reply0.usage);\n\n  // 2. A follow up text-only question\n  messages.push({ role: \"assistant\", content: replyMessage0 });\n  messages.push({ role: \"user\", content: \"What is special about each image?\" });\n  const request1: webllm.ChatCompletionRequest = {\n    stream: false, // can be streaming, same behavior\n    messages: messages,\n  };\n  const reply1 = await engine.chat.completions.create(request1);\n  const replyMessage1 = await engine.getMessage();\n  console.log(reply1);\n  console.log(replyMessage1);\n  console.log(reply1.usage);\n\n  // 3. A follow up single-image question\n  messages.push({ role: \"assistant\", content: replyMessage1 });\n  messages.push({\n    role: \"user\",\n    content: [\n      { type: \"text\", text: \"What about this image? 
Answer concisely.\" },\n      {\n        type: \"image_url\",\n        image_url: { url: proxyUrl + url_https_tree },\n      },\n    ],\n  });\n  const request2: webllm.ChatCompletionRequest = {\n    stream: false, // can be streaming, same behavior\n    messages: messages,\n  };\n  const reply2 = await engine.chat.completions.create(request2);\n  const replyMessage2 = await engine.getMessage();\n  console.log(reply2);\n  console.log(replyMessage2);\n  console.log(reply2.usage);\n}\n\nmain();\n"
  },
  {
    "path": "examples/vision-model/src/worker.ts",
    "content": "import { WebWorkerMLCEngineHandler } from \"@mlc-ai/web-llm\";\n\nconst handler = new WebWorkerMLCEngineHandler();\n\nself.onmessage = (msg: MessageEvent) => {\n  handler.onmessage(msg);\n};\n"
  },
  {
    "path": "jest.config.cjs",
    "content": "module.exports = {\n    preset: \"ts-jest\",\n    testEnvironment: \"node\",\n    roots: [\"<rootDir>/tests\", \"<rootDir>/src\"],\n    modulePathIgnorePatterns: [\"<rootDir>/examples/\"],\n    collectCoverageFrom: [\"src/**/*.{ts,tsx}\", \"!src/**/*.d.ts\"],\n    coverageThreshold: {\n        global: {\n            statements: 25,\n            branches: 20,\n            functions: 20,\n            lines: 25,\n        },\n        \"./src/engine.ts\": {\n            statements: 35,\n            branches: 25,\n            functions: 40,\n            lines: 35,\n        },\n    },\n};\n"
  },
  {
    "path": "licenses/license.openai_node.txt",
    "content": "                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright 2024 OpenAI\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License."
  },
  {
    "path": "package.json",
    "content": "{\n  \"name\": \"@mlc-ai/web-llm\",\n  \"version\": \"0.2.82\",\n  \"description\": \"Hardware accelerated language model chats on browsers\",\n  \"main\": \"lib/index.js\",\n  \"types\": \"lib/index.d.ts\",\n  \"type\": \"module\",\n  \"scripts\": {\n    \"build\": \"rollup -c && ./cleanup-index-js.sh\",\n    \"lint\": \"npx eslint ./src/ ./tests/ ./examples/ && npx prettier ./src/ ./tests/ ./examples/ --check\",\n    \"test\": \"jest --coverage\",\n    \"format\": \"prettier --write \\\"./src/\\\" \\\"./examples/\\\" \\\"./tests/\\\"\",\n    \"prepare\": \"husky\"\n  },\n  \"files\": [\n    \"lib\"\n  ],\n  \"repository\": {\n    \"type\": \"git\",\n    \"url\": \"git+https://github.com/mlc-ai/web-llm\"\n  },\n  \"keywords\": [\n    \"llm\",\n    \"large language model\",\n    \"machine learning\"\n  ],\n  \"license\": \"Apache-2.0\",\n  \"homepage\": \"https://github.com/mlc-ai/web-llm\",\n  \"devDependencies\": {\n    \"@eslint/js\": \"^9.9.0\",\n    \"@eslint/eslintrc\": \"^3.3.1\",\n    \"@mlc-ai/web-runtime\": \"^0.24.0-dev1\",\n    \"@mlc-ai/web-tokenizers\": \"^0.1.6\",\n    \"@mlc-ai/web-xgrammar\": \"0.1.27\",\n    \"@next/eslint-plugin-next\": \"^16.0.0\",\n    \"@rollup/plugin-commonjs\": \"^29.0.0\",\n    \"@rollup/plugin-node-resolve\": \"^16.0.3\",\n    \"@types/chrome\": \"^0.0.266\",\n    \"@types/jest\": \"^30.0.0\",\n    \"@types/serviceworker\": \"^0.0.86\",\n    \"@webgpu/types\": \"^0.1.24\",\n    \"buffer\": \"^5.7.1\",\n    \"eslint\": \"^9.39.1\",\n    \"eslint-config-prettier\": \"^10.1.8\",\n    \"eslint-plugin-prettier\": \"^5.5.4\",\n    \"husky\": \"^9.0.11\",\n    \"jest\": \"^30.2.0\",\n    \"prettier\": \"3.6.2\",\n    \"process\": \"^0.11.10\",\n    \"rollup\": \"^4.53.3\",\n    \"rollup-plugin-ignore\": \"^1.0.10\",\n    \"rollup-plugin-typescript2\": \"^0.36.0\",\n    \"ts-jest\": \"^29.4.6\",\n    \"tslib\": \"^2.3.1\",\n    \"typescript-eslint\": \"^8.47.0\",\n    \"typescript\": \"^4.9.5\"\n  },\n  
\"dependencies\": {\n    \"loglevel\": \"^1.9.1\"\n  },\n  \"overrides\": {\n    \"test-exclude\": \"^7.0.1\",\n    \"glob\": \"^13.0.0\"\n  }\n}\n"
  },
  {
    "path": "rollup.config.js",
    "content": "import { nodeResolve } from '@rollup/plugin-node-resolve';\nimport ignore from \"rollup-plugin-ignore\";\nimport commonjs from '@rollup/plugin-commonjs';\nimport typescript from 'rollup-plugin-typescript2';\n\nexport default {\n    input: 'src/index.ts',\n    output: [\n        {\n            file: 'lib/index.js',\n            exports: 'named',\n            format: 'es',\n            sourcemap: true,\n            globals: {'ws': 'ws',\n                      'perf_hooks': 'perf_hooks'}\n        }\n    ],\n    plugins: [\n        ignore([\"fs\", \"path\", \"crypto\"]),\n        nodeResolve({ browser: true }),\n        commonjs({\n            ignoreDynamicRequires: true,\n        }),\n        typescript({\n            rollupCommonJSResolveHack: false,\n            clean: true\n        })\n    ]\n};\n"
  },
  {
    "path": "scripts/gh_deploy_site.sh",
    "content": "#!/bin/bash\nset -euxo pipefail\n\nexport PYTHONPATH=$PWD/python\ncd docs && make html && cd ..\ncd site && jekyll b && cd ..\nrm -rf site/_site/docs\ncp -r docs/_build/html site/_site/docs\n\ngit fetch\ngit checkout -B gh-pages origin/gh-pages\nrm -rf docs .gitignore\nmkdir -p docs\ncp -rf site/_site/* docs\ntouch docs/.nojekyll\necho \"webllm.mlc.ai\" >> docs/CNAME\n\nDATE=`date`\ngit add docs && git commit -am \"Build at ${DATE}\"\ngit push origin gh-pages\ngit checkout main && git submodule update\necho \"Finish deployment at ${DATE}\"\n"
  },
  {
    "path": "scripts/local_deploy_site.sh",
    "content": "#!/bin/bash\nset -euxo pipefail\n\ncd examples/simple-chat\nrm -rf lib\nnpm run build\ncd ../..\n\ncp examples/simple-chat/lib/* site\n\ncd site && jekyll serve  --host localhost --port 8888\n"
  },
  {
    "path": "scripts/prep_deps.sh",
    "content": "#!/bin/bash\n# This file prepares all the necessary dependencies for the web build.\nset -euxo pipefail\n\nemcc --version\nnpm --version\n\nTVM_SOURCE_DIR_SET=\"${TVM_SOURCE_DIR:-}\"\n\nif [[ -z ${TVM_SOURCE_DIR_SET} ]]; then\n    if [[ ! -d \"3rdparty/tvm-unity\" ]]; then\n        echo \"Do not find TVM_SOURCE_DIR env variable, cloning a version as source\".\n        git clone https://github.com/mlc-ai/relax 3rdparty/tvm-unity --recursive\n    fi\n    export TVM_SOURCE_DIR=\"${TVM_SOURCE_DIR:-3rdparty/tvm-unity}\"\nfi\n\ncd ${TVM_SOURCE_DIR}/web && make && npm install && npm run build && cd -\nrm -rf tvm_home\nln -s ${TVM_SOURCE_DIR} tvm_home\nnpm install\n"
  },
  {
    "path": "scripts/serve_mlc_llm_dist.sh",
    "content": "#!/bin/bash\n# This file prepares all the necessary dependencies for the web build.\nset -euxo pipefail\n\nnpm --version\n\nMLC_LLM_HOME_SET=\"${MLC_LLM_HOME:-}\"\n\nif [[ -z ${MLC_LLM_HOME_SET} ]]; then\n    echo \"Do not find MLC_LLM_HOME env variable, need to set this to work\".\nfi\ncd ${MLC_LLM_HOME}/dist\necho \"Serving ${MLC_LLM_HOME}/dist for local debugging purposes\"\nnpx http-server -p 8000 --cors\ncd -\n"
  },
  {
    "path": "site/.gitignore",
    "content": "dist\nllm-chat-config.json\n_includes/stable_diffusion.html\n_site\nllm_chat.*\n"
  },
  {
    "path": "site/_config.yml",
    "content": "name: \"WebLLM\"\nshort_name: \"WebLLM\"\n\nurl: https://webllm.mlc.ai\n\nexclude: [README.md, serve_local.sh]\n\nplugins:\n  - jekyll-remote-theme\n\nremote_theme: mlc-ai/jekyll-theme-mlc\n\n# Colorize code snippets with the rogue module if we want to deploy on GH.\nhighlighter: rouge\n\nmarkdown: kramdown\n\n# The path structure for blog posts.\npermalink: /blog/:year/:month/:day/:title.html\n\n# Number of news stories on the front page.\nfront_page_news: 8\n\n# Base pathname for links.\nbase: \"\"\n\n# make pages for the _projects folder\ncollections:\n  projects:\n    output: true\n\ncourse_title:\n\n# Navigation bar links.\nnavigation:\n  - title: Home\n    link: /\n  - title: GitHub\n    link: https://github.com/mlc-ai/web-llm\n"
  },
  {
    "path": "site/_includes/head.html",
    "content": "<meta name=\"description\" content=\"WebLLM: High-Performance In-Browser LLM Inference Engine\">\n<meta\n  http-equiv=\"origin-trial\"\n  content=\"Agx76XA0ITxMPF0Z8rbbcMllwuxsyp9qdtQaXlLqu1JUrdHB6FPonuyIKJ3CsBREUkeioJck4nn3KO0c0kkwqAMAAABJeyJvcmlnaW4iOiJodHRwOi8vbG9jYWxob3N0Ojg4ODgiLCJmZWF0dXJlIjoiV2ViR1BVIiwiZXhwaXJ5IjoxNjkxNzExOTk5fQ==\"\n/>\n<meta\n  http-equiv=\"origin-trial\"\n  content=\"AnmwqQ1dtYDQTYkZ5iMtHdINCaxjE94uWQBKp2yOz1wPTcjSRtOHUGQG+r2BxsEuM0qhxTVnuTjyh31HgTeA8gsAAABZeyJvcmlnaW4iOiJodHRwczovL21sYy5haTo0NDMiLCJmZWF0dXJlIjoiV2ViR1BVIiwiZXhwaXJ5IjoxNjkxNzExOTk5LCJpc1N1YmRvbWFpbiI6dHJ1ZX0=\"\n/>\n<script src=\"https://code.jquery.com/jquery-3.6.3.min.js\" integrity=\"sha256-pvPw+upLPUjgMXY0G+8O0xUf+/Im1MZjXxxgOcBQBXU=\" crossorigin=\"anonymous\"></script>\n<link rel=\"stylesheet\" href=\"{{ '/assets/css/hero.css' | relative_url }}\" />"
  },
  {
    "path": "site/_includes/hero.html",
    "content": "<section id=\"hero\">\n  <div class=\"heading-container\">\n    <h1>WebLLM: High-Performance In-Browser LLM Inference Engine</h1>\n    <div class=\"link-container\">\n      <a class=\"get-start-link\" href=\"/docs\">\n        <span class=\"get-start-link-content\">\n          <span>Get Started</span>\n          <span class=\"arrow-container\">{% include arrow.svg %}</span></span\n        >\n      </a>\n      <a class=\"chat-link moving-border\" href=\"https://chat.webllm.ai\">\n        <span class=\"border\"></span>\n        <span class=\"chat-link-content\">\n          <span>Chat with WebLLM</span>\n          <span class=\"arrow-container\">{% include arrow.svg %}</span>\n          </span>\n      </a>\n    </div>\n  </div>\n  <div class=\"video-container\">\n    <video\n      autoplay\n      playsinline\n      muted\n      poster=\"{{ site.base }}/assets/img/fig/Pittsburgh.png\"\n    >\n      <source\n        src=\"{{ site.base }}/assets/video/Pittsburgh.webm\"\n        type=\"video/webm\"\n      />\n      <source\n        src=\"{{ site.base }}/assets/video/Pittsburgh.mp4\"\n        type=\"video/mp4\"\n      />\n    </video>\n  </div>\n</section>\n\n<script>\n  (function() {\n\n  function handlerIn(e) {\n    $(this).addClass(\"expanded\");\n  }\n  function handlerOut(e) {\n    $(this).removeClass(\"expanded\");\n  }\n\n  $(\".chat-link\").hover(handlerIn, handlerOut);\n  $(\".github-link\").hover(handlerIn, handlerOut);\n\n  var video = $(\"video\")[0];\n  video.play().then((_) => {\n    let observer = new IntersectionObserver(\n      (entries) => {\n        entries.forEach((entry) => {\n          if (\n              entry.intersectionRatio !== 1 &&\n              !video.paused\n          ) {\n            video.pause();\n          } else if (video.paused) {\n            video.play();\n          }\n        });\n      },\n        { threshold: 0.2 }\n    );\n    observer.observe(video);\n  });\n})()\n</script>\n"
  },
  {
    "path": "site/assets/css/hero.scss",
    "content": "---\n---\n\n#hero {\n    background: radial-gradient(100% 50rem at center 50rem, #3352cb, #ffffff);\n    padding: 3rem;\n    width: 100vw;\n    margin-left: calc(50% - 50vw);\n    margin-top: -20px;\n    display: flex;\n    flex-direction: column;\n    align-items: center;\n\n    a {\n        color: black;\n    }\n\n    .heading-container {\n        display: flex;\n        flex-direction: column;\n        align-items: center;\n        font-family: \"Mona Sans\", \"MonaSansFallback\", -apple-system, BlinkMacSystemFont, \"Segoe UI\", Helvetica, Arial, sans-serif, \"Apple Color Emoji\", \"Segoe UI Emoji\";\n        margin: auto;\n\n        a {\n            min-width: fit-content;\n            max-width: 16rem;\n            flex-grow: 1;\n        }\n\n        h1 {\n            text-align: center;\n            font-size: 2rem;\n            font-weight: 700;\n        }\n\n        .link-container {\n            display: flex;\n            margin-top: 2rem;\n            align-items: center;\n            flex-wrap: wrap;\n            font-size: 1rem;\n            word-break: keep-all;\n            font-weight: 600;\n            gap: 1rem;\n            justify-content: center;\n\n            .get-start-link {\n                display: inline-flex;\n                gap: 1rem;\n                border-radius: 9999px;\n                vertical-align: middle;\n                align-items: center;\n                justify-content: center;\n                text-decoration: none;\n                cursor: pointer;\n                height: fit-content;\n                // padding: .25rem;\n\n                .get-start-link-content {\n                    width: 100%;\n                    height: 100%;\n                    z-index: 1;\n                    border-radius: 9999px;\n                    padding: 1rem 1.75rem;\n                    background-color: #000000;\n                    display: inline-flex;\n                    gap: .5rem;\n                    
display: inline-flex;\n                    justify-content: center;\n                    color: rgb(229 229 229);\n\n                    .icon {\n                        display: inline-flex;\n                        align-items: center;\n\n                        svg {\n                            height: 1.5rem;\n                        }\n                    }\n                }\n            }\n\n            .chat-link {\n                display: inline-flex;\n                gap: 1rem;\n                background-color: white;\n                border-radius: 9999px;\n                vertical-align: middle;\n                align-items: center;\n                justify-content: center;\n                text-decoration: none;\n                cursor: pointer;\n                height: fit-content;\n                padding: .25rem;\n\n                .chat-link-content {\n                    width: 100%;\n                    height: 100%;\n                    z-index: 1;\n                    border-radius: 9999px;\n                    padding: 1rem 1.75rem;\n                    background-color: white;\n                    display: inline-flex;\n                    justify-content: center;\n                }\n            }\n\n            .arrow-container {\n                margin-left: .25rem;\n                display: inline-flex;\n                align-items: center;\n            }\n        }\n    }\n\n    .arrow-expandable {\n        stroke-dasharray: 10;\n        stroke-dashoffset: 10;\n        transition: stroke-dashoffset 200ms;\n    }\n\n    .expanded {\n        .arrow-expandable {\n            stroke-dashoffset: 20;\n        }\n    }\n\n    .video-container {\n        position: relative;\n        border-radius: 20px;\n        overflow: hidden;\n        background-color: #13113c;\n        box-shadow: 0 0 0 1px rgba(0, 0, 0, .08);\n        margin-top: 96px;\n        width: 100%;\n        max-width: 1024px;\n        aspect-ratio: 2286 / 1684;\n\n        video 
{\n            width: 100%;\n            height: auto;\n            border-radius: inherit;\n        }\n    }\n}\n\n.moving-border {\n    overflow: hidden;\n    position: relative;\n\n    .border {\n        position: absolute;\n        inset: -1000%;\n        animation: spin 3s linear infinite;\n        border-radius: 1rem;\n        background-image: conic-gradient(from 90deg at 50% 50%, #e2cbff 0, #393bb2 50%, #e2cbff 100%);\n    }\n}\n\n@media screen and (min-width:640px) {\n    #hero {\n        padding: 6rem;\n\n        .heading-container {\n            max-width: 40rem;\n\n            h1 {\n                font-size: 3rem;\n            }\n        }\n    }\n}\n\n\n@media screen and (min-width:768px) {\n    #hero {\n        .heading-container {\n            max-width: 45rem;\n\n            h1 {\n                font-size: 3.2rem;\n            }\n\n            .link-container {\n                font-size: 1.2rem;\n            }\n        }\n    }\n}\n\n@media screen and (min-width:1024px) {\n    #hero {\n        padding: 8rem;\n\n        .heading-container {\n            max-width: 50rem;\n\n            h1 {\n\n                font-size: 3.5rem;\n            }\n        }\n    }\n\n}\n\n@media screen and (min-width:1280px) {\n    #hero {\n        .heading-container {\n            max-width: 60rem;\n\n            h1 {\n\n                font-size: 4rem;\n            }\n        }\n    }\n}\n\n\n@media screen and (min-width:1760px) {\n    #hero {\n        background: radial-gradient(1440px 100% at 75% 50%, #3352cb, #ffffff);\n        flex-direction: row;\n        gap: 4rem;\n        padding-bottom: 12rem;\n    }\n}\n\n@keyframes spin {\n    100% {\n        transform: rotate(1turn);\n    }\n}"
  },
  {
    "path": "site/index.md",
    "content": "---\nlayout: default\ntitle: Home\nnotitle: true\n---\n\n{% include hero.html %}\n\n## Overview\n\nWe have been seeing amazing progress in generative AI and LLM recently. Thanks to the open-source efforts like LLaMA, Alpaca, Vicuna and Dolly, we start to see an exciting future of building our own open source language models and personal AI assistant.\n\nThese models are usually big and compute-heavy. To build a chat service, we will need a large cluster to run an inference server, while clients send requests to servers and retrieve the inference output. We also usually have to run on a specific type of GPUs where popular deep-learning frameworks are readily available.\n\nThis project is our step to bring more diversity to the ecosystem. Specifically, can we simply bake LLMs directly into the client side and directly run them inside a browser? If that can be realized, we could offer support for client personal AI models with the benefit of cost reduction, enhancement for personalization and privacy protection. The client side is getting pretty powerful.\n\nWon’t it be even more amazing if we can simply open up a browser and directly bring AI natively to your browser tab? There is some level of readiness in the ecosystem. 
This project provides an affirmative answer to the question.\n\n## Key Features\n- **In-Browser Inference**: WebLLM is a high-performance, in-browser language model inference engine that leverages WebGPU for hardware acceleration, enabling powerful LLM operations directly within web browsers without server-side processing.\n\n- [**Full OpenAI API Compatibility**](https://github.com/mlc-ai/web-llm?tab=readme-ov-file#full-openai-compatibility): Seamlessly integrate your app with WebLLM using OpenAI API with functionalities such as JSON-mode, function-calling, streaming, and more.\n\n- [**Extensive Model Support**](https://github.com/mlc-ai/web-llm?tab=readme-ov-file#built-in-models): WebLLM natively supports a range of models including Llama, Phi, Gemma, RedPajama, Mistral, Qwen(通义千问), and many others, making it versatile for various AI tasks.\n\n- [**Custom Model Integration**](https://github.com/mlc-ai/web-llm?tab=readme-ov-file#custom-models): Easily integrate and deploy custom models in MLC format, allowing you to adapt WebLLM to specific needs and scenarios, enhancing flexibility in model deployment.\n\n- **Plug-and-Play Integration**: Easily integrate WebLLM into your projects using package managers like NPM and Yarn, or directly via CDN, complete with comprehensive [examples](https://github.com/mlc-ai/web-llm/tree/main/examples) and a modular design for connecting with UI components.\n\n- **Streaming & Real-Time Interactions**: Supports streaming chat completions, allowing real-time output generation which enhances interactive applications like chatbots and virtual assistants.\n\n- **Web Worker & Service Worker Support**: Optimize UI performance and manage the lifecycle of models efficiently by offloading computations to separate worker threads or service workers.\n\n- **Chrome Extension Support**: Extend the functionality of web browsers through custom Chrome extensions using WebLLM, with examples available for building both basic and advanced 
extensions.\n\n## Disclaimer\n\nThe [demo site](https://chat.webllm.ai) is for research purposes only, subject to the model licenses of LLaMA, Vicuna, and RedPajama. Please contact us if you find any potential violations.\n"
  },
  {
    "path": "src/cache_util.ts",
    "content": "import * as tvmjs from \"@mlc-ai/web-runtime\";\nimport {\n  AppConfig,\n  ChatConfig,\n  ModelRecord,\n  prebuiltAppConfig,\n} from \"./config\";\nimport { cleanModelUrl } from \"./support\";\nimport { ModelNotFoundError, UnsupportedTokenizerFilesError } from \"./error\";\nimport { Tokenizer } from \"@mlc-ai/web-tokenizers\";\n\nfunction findModelRecord(modelId: string, appConfig?: AppConfig): ModelRecord {\n  const matchedItem = appConfig?.model_list.find(\n    (item) => item.model_id == modelId,\n  );\n  if (matchedItem !== undefined) {\n    return matchedItem;\n  }\n  throw new ModelNotFoundError(modelId);\n}\n\nexport async function hasModelInCache(\n  modelId: string,\n  appConfig?: AppConfig,\n): Promise<boolean> {\n  if (appConfig === undefined) {\n    appConfig = prebuiltAppConfig;\n  }\n  const modelRecord = findModelRecord(modelId, appConfig);\n  const modelUrl = cleanModelUrl(modelRecord.model);\n  const cacheType = appConfig.useIndexedDBCache ? \"indexeddb\" : \"cache\";\n  return tvmjs.hasTensorInCache(modelUrl, \"webllm/model\", cacheType);\n}\n\nexport async function deleteModelAllInfoInCache(\n  modelId: string,\n  appConfig?: AppConfig,\n) {\n  // function to delete model all information in cache\n  if (appConfig === undefined) {\n    appConfig = prebuiltAppConfig;\n  }\n  // delete model and tokenizer in Cache\n  await deleteModelInCache(modelId, appConfig);\n  // delete wasm in cache\n  await deleteModelWasmInCache(modelId, appConfig);\n  // delete chat config\n  await deleteChatConfigInCache(modelId, appConfig);\n}\n\nexport async function deleteModelInCache(\n  modelId: string,\n  appConfig?: AppConfig,\n) {\n  // delete the model NDArray In Cache\n  if (appConfig === undefined) {\n    appConfig = prebuiltAppConfig;\n  }\n  const modelRecord = findModelRecord(modelId, appConfig);\n  const modelUrl = cleanModelUrl(modelRecord.model);\n  let modelCache: tvmjs.ArtifactCacheTemplate;\n  if (appConfig.useIndexedDBCache) {\n    
tvmjs.deleteTensorCache(modelUrl, \"webllm/model\", \"indexeddb\");\n    modelCache = new tvmjs.ArtifactIndexedDBCache(\"webllm/model\");\n  } else {\n    tvmjs.deleteTensorCache(modelUrl, \"webllm/model\", \"cache\");\n    modelCache = new tvmjs.ArtifactCache(\"webllm/model\");\n  }\n  await modelCache.deleteInCache(new URL(\"tokenizer.model\", modelUrl).href);\n  await modelCache.deleteInCache(new URL(\"tokenizer.json\", modelUrl).href);\n}\n\nexport async function deleteChatConfigInCache(\n  modelId: string,\n  appConfig?: AppConfig,\n) {\n  // delete the chat configuration in Cache\n  if (appConfig === undefined) {\n    appConfig = prebuiltAppConfig;\n  }\n  const modelRecord = findModelRecord(modelId, appConfig);\n  let configCache: tvmjs.ArtifactCacheTemplate;\n  if (appConfig.useIndexedDBCache) {\n    configCache = new tvmjs.ArtifactIndexedDBCache(\"webllm/config\");\n  } else {\n    configCache = new tvmjs.ArtifactCache(\"webllm/config\");\n  }\n  const modelUrl = cleanModelUrl(modelRecord.model);\n  const configUrl = new URL(\"mlc-chat-config.json\", modelUrl).href;\n  await configCache.deleteInCache(configUrl);\n}\n\nexport async function deleteModelWasmInCache(\n  modelId: string,\n  appConfig?: AppConfig,\n) {\n  // delete the wasm in Cache\n  if (appConfig === undefined) {\n    appConfig = prebuiltAppConfig;\n  }\n  const modelRecord = findModelRecord(modelId, appConfig);\n  let wasmCache: tvmjs.ArtifactCacheTemplate;\n  if (appConfig.useIndexedDBCache) {\n    wasmCache = new tvmjs.ArtifactIndexedDBCache(\"webllm/wasm\");\n  } else {\n    wasmCache = new tvmjs.ArtifactCache(\"webllm/wasm\");\n  }\n  await wasmCache.deleteInCache(modelRecord.model_lib);\n}\n\n/**\n *\n * @param baseUrl The link to which we can find tokenizer files, usually is a `ModelRecord.model`.\n * @param config A ChatConfig, usually loaded from `mlc-chat-config.json` in `baseUrl`.\n * @param appConfig An AppConfig, usually `webllm.prebuiltAppConfig` if not defined by user.\n * 
@param logger Logging function, console.log by default.\n * @returns\n */\nexport async function asyncLoadTokenizer(\n  baseUrl: string,\n  config: ChatConfig,\n  appConfig: AppConfig,\n  logger: (msg: string) => void = console.log,\n): Promise<Tokenizer> {\n  let modelCache: tvmjs.ArtifactCacheTemplate;\n  if (appConfig.useIndexedDBCache) {\n    modelCache = new tvmjs.ArtifactIndexedDBCache(\"webllm/model\");\n  } else {\n    modelCache = new tvmjs.ArtifactCache(\"webllm/model\");\n  }\n\n  if (config.tokenizer_files.includes(\"tokenizer.json\")) {\n    const url = new URL(\"tokenizer.json\", baseUrl).href;\n    const model = await modelCache.fetchWithCache(url, \"arraybuffer\");\n    return Tokenizer.fromJSON(model);\n  } else if (config.tokenizer_files.includes(\"tokenizer.model\")) {\n    logger(\n      \"Using `tokenizer.model` since we cannot locate `tokenizer.json`.\\n\" +\n        \"It is recommended to use `tokenizer.json` to ensure all token mappings are included, \" +\n        \"since currently, files like `added_tokens.json`, `tokenizer_config.json` are ignored.\\n\" +\n        \"Consider converting `tokenizer.model` to `tokenizer.json` by compiling the model \" +\n        \"with MLC again, or see if MLC's huggingface provides this file.\",\n    );\n    const url = new URL(\"tokenizer.model\", baseUrl).href;\n    const model = await modelCache.fetchWithCache(url, \"arraybuffer\");\n    return Tokenizer.fromSentencePiece(model);\n  }\n  throw new UnsupportedTokenizerFilesError(config.tokenizer_files);\n}\n"
  },
  {
    "path": "src/config.ts",
    "content": "import log from \"loglevel\";\nimport { ResponseFormat } from \"./openai_api_protocols\";\nimport { LogitProcessor, InitProgressCallback, LogLevel } from \"./types\";\nimport {\n  DependencyError,\n  InvalidNumberStringError,\n  MinValueError,\n  NonNegativeError,\n  RangeError,\n} from \"./error\";\n\n/**\n * Conversation template config\n */\nexport interface ConvTemplateConfig {\n  system_template: string;\n  system_message: string;\n  roles: Record<Role, string>;\n  role_templates?: Partial<Record<Role, string>>;\n  seps: Array<string>;\n  role_content_sep?: string;\n  role_empty_sep?: string;\n  stop_str: Array<string>;\n  system_prefix_token_ids?: Array<number>;\n  stop_token_ids: Array<number>;\n  add_role_after_system_message?: boolean;\n}\n\nexport enum Role {\n  user = \"user\",\n  assistant = \"assistant\",\n  tool = \"tool\",\n}\n\nexport const DefaultLogLevel: LogLevel = \"WARN\";\n\n/**\n * Place holders that can be used in role templates.\n * For example, a role template of\n * `<<question>> ${MessagePlaceholders.USER} <<function>> ${MessagePlaceholders.FUNCTION}`\n * will insert the user message to ${MessagePlaceholders.USER}\n * and insert the function message to ${MessagePlaceholders.FUNCTION}\n * at run time.\n */\nexport enum MessagePlaceholders {\n  system = \"{system_message}\",\n  user = \"{user_message}\",\n  assistant = \"{assistant_message}\",\n  tool = \"{tool_message}\",\n  function = \"{function_string}\",\n  hermes_tools = \"{hermes_tools}\",\n}\n\n/**\n * Information about the tokenizer. 
Currently, only `token_postproc_method` is used to\n * post process the token table when using grammar.\n */\nexport interface TokenizerInfo {\n  token_postproc_method: string;\n  prepend_space_in_encode: boolean;\n  strip_space_in_decode: boolean;\n}\n\n/**\n * Config of one chat model, a data structure representing `mlc-chat-config.json`.\n * This only corresponds to the chat-related fields and `tokenizer_files` of `mlc-chat-config.json`.\n * Only these fields affect the conversation in runtime.\n * i.e. The third part in https://llm.mlc.ai/docs/get_started/mlc_chat_config.html.\n *\n * This is initialized in `MLCEngine.reload()` with the model's `mlc-chat-config.json`.\n */\nexport interface ChatConfig {\n  // First three fields affect the entire conversation, i.e. used in `MLCEngine.reload()`\n  tokenizer_files: Array<string>;\n  tokenizer_info?: TokenizerInfo;\n  token_table_postproc_method?: string; // TODO: backward compatibility, remove soon\n  vocab_size: number;\n  conv_config?: Partial<ConvTemplateConfig>;\n  conv_template: ConvTemplateConfig;\n  // KVCache settings\n  context_window_size: number;\n  sliding_window_size: number;\n  attention_sink_size: number;\n  // Fields below can be swapped per-generation via `GenerationConfig`\n  // Fields only used in MLC\n  repetition_penalty: number;\n  // Fields shared by MLC and OpenAI APIs\n  frequency_penalty: number;\n  presence_penalty: number;\n  top_p: number;\n  temperature: number;\n  bos_token_id?: number;\n}\n\n/**\n * Custom options that can be used to override known config values.\n */\n// eslint-disable-next-line @typescript-eslint/no-empty-object-type\nexport interface ChatOptions extends Partial<ChatConfig> {}\n\n/**\n * Optional configurations for `CreateMLCEngine()` and `CreateWebWorkerMLCEngine()`.\n *\n * appConfig: Configure the app, including the list of models and whether to use IndexedDB cache.\n * initProgressCallback: A callback for showing the progress of loading the model.\n * 
logitProcessorRegistry: A register for stateful logit processors, see `webllm.LogitProcessor`.\n *\n * @note All fields are optional, and `logitProcessorRegistry` is only used for `MLCEngine` and not\n * other `MLCEngine`s.\n */\nexport interface MLCEngineConfig {\n  appConfig?: AppConfig;\n  initProgressCallback?: InitProgressCallback;\n  logitProcessorRegistry?: Map<string, LogitProcessor>;\n  logLevel?: LogLevel;\n}\n\n/**\n * Config for a single generation.\n * Essentially `ChatConfig` without `tokenizer_files`, `conv_config`, or `conv_template`.\n * We also support additional fields not present in `mlc-chat-config.json` due to OpenAI-like APIs.\n *\n * Note that all values are optional. If unspecified, we use whatever values in `ChatConfig`\n * initialized during `MLCEngine.reload()`.\n */\nexport interface GenerationConfig {\n  // Only used in MLC\n  repetition_penalty?: number | null;\n  ignore_eos?: boolean;\n  // Shared by MLC and OpenAI APIs\n  top_p?: number | null;\n  temperature?: number | null;\n  // Only in OpenAI APIs\n  max_tokens?: number | null;\n  frequency_penalty?: number | null;\n  presence_penalty?: number | null;\n  stop?: string | null | Array<string>;\n  n?: number | null;\n  logit_bias?: Record<string, number> | null;\n  logprobs?: boolean | null;\n  top_logprobs?: number | null;\n  response_format?: ResponseFormat | null;\n  // extra_body in ChatCompletionsRequest\n  enable_thinking?: boolean | null;\n  enable_latency_breakdown?: boolean | null;\n}\n\nexport function postInitAndCheckGenerationConfigValues(\n  config: GenerationConfig,\n): void {\n  function _hasValue(value: any): boolean {\n    // if we use `if value` directly, `value` being 0 evaluates to false, violating semantics\n    return value !== undefined && value !== null;\n  }\n  if (\n    config.frequency_penalty &&\n    (config.frequency_penalty < -2.0 || config.frequency_penalty > 2.0)\n  ) {\n    throw new RangeError(\"frequency_penalty\", -2.0, 2.0);\n  }\n  if (\n    
config.presence_penalty &&\n    (config.presence_penalty < -2.0 || config.presence_penalty > 2.0)\n  ) {\n    throw new RangeError(\"presence_penalty\", -2.0, 2.0);\n  }\n  if (_hasValue(config.repetition_penalty) && config.repetition_penalty! <= 0) {\n    throw new MinValueError(\"repetition_penalty\", 0);\n  }\n  if (_hasValue(config.max_tokens) && config.max_tokens! <= 0) {\n    throw new MinValueError(\"max_tokens\", 0);\n  }\n  if ((_hasValue(config.top_p) && config.top_p! <= 0) || config.top_p! > 1) {\n    throw new RangeError(\"top_p\", 0, 1);\n  }\n  if (_hasValue(config.temperature) && config.temperature! < 0) {\n    throw new NonNegativeError(\"temperature\");\n  }\n  // If only one of frequency or presence penalty is set, make the other one 0.0\n  if (\n    _hasValue(config.frequency_penalty) &&\n    !_hasValue(config.presence_penalty)\n  ) {\n    config.presence_penalty = 0.0;\n    log.warn(\"Only frequency_penalty is set; we default presence_penaty to 0.\");\n  }\n  if (\n    _hasValue(config.presence_penalty) &&\n    !_hasValue(config.frequency_penalty)\n  ) {\n    config.frequency_penalty = 0.0;\n    log.warn(\n      \"Only presence_penalty is set; we default frequency_penalty to 0.\",\n    );\n  }\n  // Check logit_bias range\n  if (_hasValue(config.logit_bias)) {\n    for (const tokenID in config.logit_bias) {\n      const bias = config.logit_bias[tokenID];\n      if (bias > 100 || bias < -100) {\n        throw new RangeError(\n          \"logit_bias\",\n          -100,\n          100,\n          \"Got \" + bias + \" for tokenID \" + tokenID,\n        );\n      }\n      if (isNaN(parseInt(tokenID))) {\n        throw new InvalidNumberStringError(\"logit_bias's keys\", tokenID);\n      }\n    }\n  }\n  // logprobs and top_logprobs\n  if (_hasValue(config.top_logprobs)) {\n    // If top_logprobs is non-null, logprobs must be true\n    if (!config.logprobs) {\n      throw new DependencyError(\"top_logprobs\", \"logprobs\", true);\n    }\n    // 
top_logprobs should be in range [0,5]\n    if (config.top_logprobs! < 0 || config.top_logprobs! > 5) {\n      throw new RangeError(\"top_logprobs\", 0, 5, \"Got \" + config.top_logprobs);\n    }\n  }\n  // If defined logprobs but not top_logprobs, simply make it 0\n  if (config.logprobs) {\n    if (!_hasValue(config.top_logprobs)) {\n      config.top_logprobs = 0;\n    }\n  }\n}\n\nexport enum ModelType {\n  \"LLM\",\n  \"embedding\",\n  \"VLM\", // vision-language model\n}\n\n/**\n * Information for a model.\n * @param model: the huggingface link to download the model weights, accepting four formats:\n *    - https://huggingface.co/{USERNAME}/{MODEL}, which we automatically use the main branch\n *    - https://huggingface.co/{USERNAME}/{MODEL}/, which we automatically use the main branch\n *    - https://huggingface.co/{USERNAME}/{MODEL}/resolve/{BRANCH}\n *    - https://huggingface.co/{USERNAME}/{MODEL}/resolve/{BRANCH}/\n * @param model_id: what we call the model.\n * @param model_lib: link to the model library (wasm file) the model uses.\n * @param overrides: partial ChatConfig to override mlc-chat-config.json; can be used to change KVCache settings.\n * @param vram_required_MB: amount of vram in MB required to run the model (can use\n *    `utils/vram_requirements` to calculate).\n * @param low_resource_required: whether the model can run on limited devices (e.g. Android phone).\n * @param buffer_size_required_bytes: required `maxStorageBufferBindingSize`, different for each device.\n * @param required_features: feature needed to run this model (e.g. 
shader-f16).\n * @param model_type: the intended usecase for the model, if unspecified, default to LLM.\n */\nexport interface ModelRecord {\n  model: string;\n  model_id: string;\n  model_lib: string;\n  overrides?: ChatOptions;\n  vram_required_MB?: number;\n  low_resource_required?: boolean;\n  buffer_size_required_bytes?: number;\n  required_features?: Array<string>;\n  model_type?: ModelType;\n}\n\n/**\n * Extra configuration that can be\n * passed to the load.\n *\n * @param model_list: models to be used.\n * @param useIndexedDBCache: if true, will use IndexedDBCache to cache models and other artifacts.\n * If false or unspecified, will use the Cache API. For more information of the two, see:\n * https://developer.mozilla.org/en-US/docs/Web/API/Storage_API/Storage_quotas_and_eviction_criteria#what_technologies_store_data_in_the_browser\n *\n * @note Note that the Cache API is more well-tested in WebLLM as of now.\n */\nexport interface AppConfig {\n  model_list: Array<ModelRecord>;\n  useIndexedDBCache?: boolean;\n}\n\n/**\n * modelVersion: the prebuilt model libraries that the current npm is compatible with, affects the\n * `model_lib`s in `prebuiltAppConfig`.\n *\n * @note The model version does not have to match the npm version, since not each npm update\n * requires an update of the model libraries.\n */\nexport const modelVersion = \"v0_2_80\";\nexport const modelLibURLPrefix =\n  \"https://raw.githubusercontent.com/mlc-ai/binary-mlc-llm-libs/main/web-llm-models/\";\n\n/**\n * Models that support function calling (i.e. usage of `ChatCompletionRequest.tools`). 
More to come.\n */\nexport const functionCallingModelIds = [\n  \"Hermes-2-Pro-Llama-3-8B-q4f16_1-MLC\",\n  \"Hermes-2-Pro-Llama-3-8B-q4f32_1-MLC\",\n  \"Hermes-2-Pro-Mistral-7B-q4f16_1-MLC\",\n  \"Hermes-3-Llama-3.1-8B-q4f32_1-MLC\",\n  \"Hermes-3-Llama-3.1-8B-q4f16_1-MLC\",\n];\n\n/**\n * Default models and model library mapping to be used if unspecified.\n *\n * @note This is the only source of truth of which prebuilt model libraries are compatible with the\n * current WebLLM npm version.\n */\nexport const prebuiltAppConfig: AppConfig = {\n  useIndexedDBCache: false,\n  model_list: [\n    // Llama-3.2\n    {\n      model: \"https://huggingface.co/mlc-ai/Llama-3.2-1B-Instruct-q4f32_1-MLC\",\n      model_id: \"Llama-3.2-1B-Instruct-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Llama-3.2-1B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 1128.82,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Llama-3.2-1B-Instruct-q4f16_1-MLC\",\n      model_id: \"Llama-3.2-1B-Instruct-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Llama-3.2-1B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 879.04,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    // TODO: temporarily commenting out q0f32 models due to correctness issues\n    // {\n    //   model: \"https://huggingface.co/mlc-ai/Llama-3.2-1B-Instruct-q0f32-MLC\",\n    //   model_id: \"Llama-3.2-1B-Instruct-q0f32-MLC\",\n    //   model_lib:\n    //     modelLibURLPrefix +\n    //     modelVersion +\n    //     \"/Llama-3.2-1B-Instruct-q0f32-ctx4k_cs1k-webgpu.wasm\",\n    //   vram_required_MB: 5106.26,\n    //   low_resource_required: true,\n    //   overrides: {\n    //     context_window_size: 4096,\n    //   
},\n    // },\n    {\n      model: \"https://huggingface.co/mlc-ai/Llama-3.2-1B-Instruct-q0f16-MLC\",\n      model_id: \"Llama-3.2-1B-Instruct-q0f16-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Llama-3.2-1B-Instruct-q0f16-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 2573.13,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Llama-3.2-3B-Instruct-q4f32_1-MLC\",\n      model_id: \"Llama-3.2-3B-Instruct-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Llama-3.2-3B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 2951.51,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Llama-3.2-3B-Instruct-q4f16_1-MLC\",\n      model_id: \"Llama-3.2-3B-Instruct-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Llama-3.2-3B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 2263.69,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    // Llama-3.1\n    {\n      model: \"https://huggingface.co/mlc-ai/Llama-3.1-8B-Instruct-q4f32_1-MLC\",\n      model_id: \"Llama-3.1-8B-Instruct-q4f32_1-MLC-1k\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Llama-3_1-8B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 5295.7,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 1024,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Llama-3.1-8B-Instruct-q4f16_1-MLC\",\n      model_id: \"Llama-3.1-8B-Instruct-q4f16_1-MLC-1k\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        
\"/Llama-3_1-8B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 4598.34,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 1024,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Llama-3.1-8B-Instruct-q4f32_1-MLC\",\n      model_id: \"Llama-3.1-8B-Instruct-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Llama-3_1-8B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 6101.01,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Llama-3.1-8B-Instruct-q4f16_1-MLC\",\n      model_id: \"Llama-3.1-8B-Instruct-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Llama-3_1-8B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 5001.0,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    // DeepSeek-R1-Distill-Qwen\n    // TODO(Charlie): Qwen2-1.5B is experiencing correctness issue, hence commented for now.\n    // {\n    //   model: \"https://huggingface.co/mlc-ai/DeepSeek-R1-Distill-Qwen-1.5B-q4f16_1-MLC\",\n    //   model_id: \"DeepSeek-R1-Distill-Qwen-1.5B-q4f16_1-MLC\",\n    //   model_lib:\n    //     modelLibURLPrefix +\n    //     modelVersion +\n    //     \"/Qwen2-1.5B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n    //   low_resource_required: true,\n    //   vram_required_MB: 1629.75,\n    //   overrides: {\n    //     context_window_size: 4096,\n    //   },\n    // },\n    // {\n    //   model: \"https://huggingface.co/mlc-ai/DeepSeek-R1-Distill-Qwen-1.5B-q4f32_1-MLC\",\n    //   model_id: \"DeepSeek-R1-Distill-Qwen-1.5B-q4f32_1-MLC\",\n    //   model_lib:\n    //     modelLibURLPrefix +\n    //     modelVersion +\n    //     \"/Qwen2-1.5B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n    //  
 low_resource_required: true,\n    //   vram_required_MB: 1888.97,\n    //   overrides: {\n    //     context_window_size: 4096,\n    //   },\n    // },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/DeepSeek-R1-Distill-Qwen-7B-q4f16_1-MLC\",\n      model_id: \"DeepSeek-R1-Distill-Qwen-7B-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2-7B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: false,\n      vram_required_MB: 5106.67,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/DeepSeek-R1-Distill-Qwen-7B-q4f32_1-MLC\",\n      model_id: \"DeepSeek-R1-Distill-Qwen-7B-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2-7B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: false,\n      vram_required_MB: 5900.09,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    // DeepSeek-R1-Distill-Llama\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/DeepSeek-R1-Distill-Llama-8B-q4f32_1-MLC\",\n      model_id: \"DeepSeek-R1-Distill-Llama-8B-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Llama-3_1-8B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 6101.01,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/DeepSeek-R1-Distill-Llama-8B-q4f16_1-MLC\",\n      model_id: \"DeepSeek-R1-Distill-Llama-8B-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Llama-3_1-8B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 5001.0,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    // 
Hermes-3 and Hermes-2\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/Hermes-2-Theta-Llama-3-8B-q4f16_1-MLC\",\n      model_id: \"Hermes-2-Theta-Llama-3-8B-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Llama-3-8B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 4976.13,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/Hermes-2-Theta-Llama-3-8B-q4f32_1-MLC\",\n      model_id: \"Hermes-2-Theta-Llama-3-8B-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Llama-3-8B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 6051.27,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/Hermes-2-Pro-Llama-3-8B-q4f16_1-MLC\",\n      model_id: \"Hermes-2-Pro-Llama-3-8B-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Llama-3-8B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 4976.13,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/Hermes-2-Pro-Llama-3-8B-q4f32_1-MLC\",\n      model_id: \"Hermes-2-Pro-Llama-3-8B-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Llama-3-8B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 6051.27,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Hermes-3-Llama-3.2-3B-q4f32_1-MLC\",\n      model_id: \"Hermes-3-Llama-3.2-3B-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        
modelVersion +\n        \"/Llama-3.2-3B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 2951.51,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Hermes-3-Llama-3.2-3B-q4f16_1-MLC\",\n      model_id: \"Hermes-3-Llama-3.2-3B-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Llama-3.2-3B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 2263.69,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Hermes-3-Llama-3.1-8B-q4f32_1-MLC\",\n      model_id: \"Hermes-3-Llama-3.1-8B-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Llama-3_1-8B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 5779.27,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Hermes-3-Llama-3.1-8B-q4f16_1-MLC\",\n      model_id: \"Hermes-3-Llama-3.1-8B-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Llama-3_1-8B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 4876.13,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/Hermes-2-Pro-Mistral-7B-q4f16_1-MLC\",\n      model_id: \"Hermes-2-Pro-Mistral-7B-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Mistral-7B-Instruct-v0.3-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 4033.28,\n      low_resource_required: false,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        context_window_size: 4096,\n        
sliding_window_size: -1,\n      },\n    },\n    // Phi3.5-mini-instruct\n    {\n      model: \"https://huggingface.co/mlc-ai/Phi-3.5-mini-instruct-q4f16_1-MLC\",\n      model_id: \"Phi-3.5-mini-instruct-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Phi-3.5-mini-instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 3672.07,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Phi-3.5-mini-instruct-q4f32_1-MLC\",\n      model_id: \"Phi-3.5-mini-instruct-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Phi-3.5-mini-instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 5483.12,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Phi-3.5-mini-instruct-q4f16_1-MLC\",\n      model_id: \"Phi-3.5-mini-instruct-q4f16_1-MLC-1k\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Phi-3.5-mini-instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 2520.07,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 1024,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Phi-3.5-mini-instruct-q4f32_1-MLC\",\n      model_id: \"Phi-3.5-mini-instruct-q4f32_1-MLC-1k\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Phi-3.5-mini-instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 3179.12,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 1024,\n      },\n    },\n    // Phi-3.5-vision-instruct\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/Phi-3.5-vision-instruct-q4f16_1-MLC\",\n      model_id: \"Phi-3.5-vision-instruct-q4f16_1-MLC\",\n      
model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Phi-3.5-vision-instruct-q4f16_1-ctx4k_cs2k-webgpu.wasm\",\n      vram_required_MB: 3952.18,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 4096,\n      },\n      model_type: ModelType.VLM,\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/Phi-3.5-vision-instruct-q4f32_1-MLC\",\n      model_id: \"Phi-3.5-vision-instruct-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Phi-3.5-vision-instruct-q4f32_1-ctx4k_cs2k-webgpu.wasm\",\n      vram_required_MB: 5879.84,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 4096,\n      },\n      model_type: ModelType.VLM,\n    },\n    // Mistral variants\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/Mistral-7B-Instruct-v0.3-q4f16_1-MLC\",\n      model_id: \"Mistral-7B-Instruct-v0.3-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Mistral-7B-Instruct-v0.3-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 4573.39,\n      low_resource_required: false,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        context_window_size: 4096,\n        sliding_window_size: -1,\n      },\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/Mistral-7B-Instruct-v0.3-q4f32_1-MLC\",\n      model_id: \"Mistral-7B-Instruct-v0.3-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Mistral-7B-Instruct-v0.3-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 5619.27,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 4096,\n        sliding_window_size: -1,\n      },\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/Mistral-7B-Instruct-v0.2-q4f16_1-MLC\",\n      model_id: \"Mistral-7B-Instruct-v0.2-q4f16_1-MLC\",\n      
model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Mistral-7B-Instruct-v0.3-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 4573.39,\n      low_resource_required: false,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        context_window_size: 4096,\n        sliding_window_size: -1,\n      },\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/OpenHermes-2.5-Mistral-7B-q4f16_1-MLC\",\n      model_id: \"OpenHermes-2.5-Mistral-7B-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Mistral-7B-Instruct-v0.3-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 4573.39,\n      low_resource_required: false,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        context_window_size: 4096,\n        sliding_window_size: -1,\n      },\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/NeuralHermes-2.5-Mistral-7B-q4f16_1-MLC\",\n      model_id: \"NeuralHermes-2.5-Mistral-7B-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Mistral-7B-Instruct-v0.3-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 4573.39,\n      low_resource_required: false,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        context_window_size: 4096,\n        sliding_window_size: -1,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/WizardMath-7B-V1.1-q4f16_1-MLC\",\n      model_id: \"WizardMath-7B-V1.1-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Mistral-7B-Instruct-v0.3-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 4573.39,\n      low_resource_required: false,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        context_window_size: 4096,\n        sliding_window_size: -1,\n      },\n    },\n    // SmolLM2\n    {\n      model: 
\"https://huggingface.co/mlc-ai/SmolLM2-1.7B-Instruct-q4f16_1-MLC\",\n      model_id: \"SmolLM2-1.7B-Instruct-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/SmolLM2-1.7B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 1774.19,\n      low_resource_required: true,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/SmolLM2-1.7B-Instruct-q4f32_1-MLC\",\n      model_id: \"SmolLM2-1.7B-Instruct-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/SmolLM2-1.7B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 2692.38,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n\n    {\n      model: \"https://huggingface.co/mlc-ai/SmolLM2-360M-Instruct-q0f16-MLC\",\n      model_id: \"SmolLM2-360M-Instruct-q0f16-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/SmolLM2-360M-Instruct-q0f16-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 871.99,\n      low_resource_required: true,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/SmolLM2-360M-Instruct-q0f32-MLC\",\n      model_id: \"SmolLM2-360M-Instruct-q0f32-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/SmolLM2-360M-Instruct-q0f32-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 1743.99,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/SmolLM2-360M-Instruct-q4f16_1-MLC\",\n      model_id: \"SmolLM2-360M-Instruct-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        
\"/SmolLM2-360M-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 376.06,\n      low_resource_required: true,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/SmolLM2-360M-Instruct-q4f32_1-MLC\",\n      model_id: \"SmolLM2-360M-Instruct-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/SmolLM2-360M-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 579.61,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/SmolLM2-135M-Instruct-q0f16-MLC\",\n      model_id: \"SmolLM2-135M-Instruct-q0f16-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/SmolLM2-135M-Instruct-q0f16-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 359.69,\n      low_resource_required: true,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/SmolLM2-135M-Instruct-q0f32-MLC\",\n      model_id: \"SmolLM2-135M-Instruct-q0f32-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/SmolLM2-135M-Instruct-q0f32-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 719.38,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    // Gemma2\n    {\n      model: \"https://huggingface.co/mlc-ai/gemma-2-2b-it-q4f16_1-MLC\",\n      model_id: \"gemma-2-2b-it-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/gemma-2-2b-it-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 1895.3,\n      low_resource_required: false,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        
context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/gemma-2-2b-it-q4f32_1-MLC\",\n      model_id: \"gemma-2-2b-it-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/gemma-2-2b-it-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 2508.75,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/gemma-2-2b-it-q4f16_1-MLC\",\n      model_id: \"gemma-2-2b-it-q4f16_1-MLC-1k\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/gemma-2-2b-it-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 1583.3,\n      low_resource_required: true,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        context_window_size: 1024,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/gemma-2-2b-it-q4f32_1-MLC\",\n      model_id: \"gemma-2-2b-it-q4f32_1-MLC-1k\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/gemma-2-2b-it-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 1884.75,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 1024,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/gemma-2-9b-it-q4f16_1-MLC\",\n      model_id: \"gemma-2-9b-it-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/gemma-2-9b-it-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 6422.01,\n      low_resource_required: false,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/gemma-2-9b-it-q4f32_1-MLC\",\n      model_id: \"gemma-2-9b-it-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        
\"/gemma-2-9b-it-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 8383.33,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    // Gemma2-2b-jpn\n    {\n      model: \"https://huggingface.co/mlc-ai/gemma-2-2b-jpn-it-q4f16_1-MLC\",\n      model_id: \"gemma-2-2b-jpn-it-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/gemma-2-2b-jpn-it-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 1895.3,\n      low_resource_required: true,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/gemma-2-2b-jpn-it-q4f32_1-MLC\",\n      model_id: \"gemma-2-2b-jpn-it-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/gemma-2-2b-jpn-it-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 2508.75,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    // Qwen-3\n    {\n      model: \"https://huggingface.co/mlc-ai/Qwen3-0.6B-q4f16_1-MLC\",\n      model_id: \"Qwen3-0.6B-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen3-0.6B-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 1403.34,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Qwen3-0.6B-q4f32_1-MLC\",\n      model_id: \"Qwen3-0.6B-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen3-0.6B-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 1924.98,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Qwen3-0.6B-q0f16-MLC\",\n      model_id: 
\"Qwen3-0.6B-q0f16-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen3-0.6B-q0f16-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 2220.38,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    // TODO: temporarily commenting out q0f32 models due to correctness issues\n    // {\n    //   model: \"https://huggingface.co/mlc-ai/Qwen3-0.6B-q0f32-MLC\",\n    //   model_id: \"Qwen3-0.6B-q0f32-MLC\",\n    //   model_lib:\n    //     modelLibURLPrefix +\n    //     modelVersion +\n    //     \"/Qwen3-0.6B-q0f32-ctx4k_cs1k-webgpu.wasm\",\n    //   vram_required_MB: 3843.25,\n    //   low_resource_required: true,\n    //   overrides: {\n    //     context_window_size: 4096,\n    //   },\n    // },\n    {\n      model: \"https://huggingface.co/mlc-ai/Qwen3-1.7B-q4f16_1-MLC\",\n      model_id: \"Qwen3-1.7B-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen3-1.7B-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 2036.66,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Qwen3-1.7B-q4f32_1-MLC\",\n      model_id: \"Qwen3-1.7B-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen3-1.7B-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 2635.44,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Qwen3-4B-q4f16_1-MLC\",\n      model_id: \"Qwen3-4B-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen3-4B-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 3431.59,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n     
 model: \"https://huggingface.co/mlc-ai/Qwen3-4B-q4f32_1-MLC\",\n      model_id: \"Qwen3-4B-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen3-4B-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 4327.71,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Qwen3-8B-q4f16_1-MLC\",\n      model_id: \"Qwen3-8B-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen3-8B-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 5695.78,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Qwen3-8B-q4f32_1-MLC\",\n      model_id: \"Qwen3-8B-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen3-8B-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 6852.55,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    // Qwen-2.5\n    {\n      model: \"https://huggingface.co/mlc-ai/Qwen2.5-0.5B-Instruct-q4f16_1-MLC\",\n      model_id: \"Qwen2.5-0.5B-Instruct-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2-0.5B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: true,\n      vram_required_MB: 944.62,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Qwen2.5-0.5B-Instruct-q4f32_1-MLC\",\n      model_id: \"Qwen2.5-0.5B-Instruct-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2-0.5B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: true,\n      vram_required_MB: 1060.2,\n      overrides: {\n        
context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Qwen2.5-0.5B-Instruct-q0f16-MLC\",\n      model_id: \"Qwen2.5-0.5B-Instruct-q0f16-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2-0.5B-Instruct-q0f16-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: true,\n      vram_required_MB: 1624.12,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    // TODO: temporarily commenting out q0f32 models due to correctness issues\n    // {\n    //   model: \"https://huggingface.co/mlc-ai/Qwen2.5-0.5B-Instruct-q0f32-MLC\",\n    //   model_id: \"Qwen2.5-0.5B-Instruct-q0f32-MLC\",\n    //   model_lib:\n    //     modelLibURLPrefix +\n    //     modelVersion +\n    //     \"/Qwen2-0.5B-Instruct-q0f32-ctx4k_cs1k-webgpu.wasm\",\n    //   low_resource_required: true,\n    //   vram_required_MB: 2654.75,\n    //   overrides: {\n    //     context_window_size: 4096,\n    //   },\n    // },\n    {\n      model: \"https://huggingface.co/mlc-ai/Qwen2.5-1.5B-Instruct-q4f16_1-MLC\",\n      model_id: \"Qwen2.5-1.5B-Instruct-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2-1.5B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: true,\n      vram_required_MB: 1629.75,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Qwen2.5-1.5B-Instruct-q4f32_1-MLC\",\n      model_id: \"Qwen2.5-1.5B-Instruct-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2-1.5B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: true,\n      vram_required_MB: 1888.97,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Qwen2.5-3B-Instruct-q4f16_1-MLC\",\n      model_id: 
\"Qwen2.5-3B-Instruct-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2.5-3B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: true,\n      vram_required_MB: 2504.76,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Qwen2.5-3B-Instruct-q4f32_1-MLC\",\n      model_id: \"Qwen2.5-3B-Instruct-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2.5-3B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: true,\n      vram_required_MB: 2893.64,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Qwen2.5-7B-Instruct-q4f16_1-MLC\",\n      model_id: \"Qwen2.5-7B-Instruct-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2-7B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: false,\n      vram_required_MB: 5106.67,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Qwen2.5-7B-Instruct-q4f32_1-MLC\",\n      model_id: \"Qwen2.5-7B-Instruct-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2-7B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: false,\n      vram_required_MB: 5900.09,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    // Qwen2.5-Coder\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/Qwen2.5-Coder-0.5B-Instruct-q4f16_1-MLC\",\n      model_id: \"Qwen2.5-Coder-0.5B-Instruct-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2-0.5B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: true,\n      vram_required_MB: 944.62,\n      overrides: 
{\n        context_window_size: 4096,\n      },\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/Qwen2.5-Coder-0.5B-Instruct-q4f32_1-MLC\",\n      model_id: \"Qwen2.5-Coder-0.5B-Instruct-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2-0.5B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: true,\n      vram_required_MB: 1060.2,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/Qwen2.5-Coder-0.5B-Instruct-q0f16-MLC\",\n      model_id: \"Qwen2.5-Coder-0.5B-Instruct-q0f16-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2-0.5B-Instruct-q0f16-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: true,\n      vram_required_MB: 1624.12,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    // TODO: temporarily commenting out q0f32 models due to correctness issues\n    // {\n    //   model:\n    //     \"https://huggingface.co/mlc-ai/Qwen2.5-Coder-0.5B-Instruct-q0f32-MLC\",\n    //   model_id: \"Qwen2.5-Coder-0.5B-Instruct-q0f32-MLC\",\n    //   model_lib:\n    //     modelLibURLPrefix +\n    //     modelVersion +\n    //     \"/Qwen2-0.5B-Instruct-q0f32-ctx4k_cs1k-webgpu.wasm\",\n    //   low_resource_required: true,\n    //   vram_required_MB: 2654.75,\n    //   overrides: {\n    //     context_window_size: 4096,\n    //   },\n    // },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/Qwen2.5-Coder-1.5B-Instruct-q4f16_1-MLC\",\n      model_id: \"Qwen2.5-Coder-1.5B-Instruct-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2-1.5B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: false,\n      vram_required_MB: 1629.75,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model:\n        
\"https://huggingface.co/mlc-ai/Qwen2.5-Coder-1.5B-Instruct-q4f32_1-MLC\",\n      model_id: \"Qwen2.5-Coder-1.5B-Instruct-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2-1.5B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: false,\n      vram_required_MB: 1888.97,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/Qwen2.5-Coder-3B-Instruct-q4f16_1-MLC\",\n      model_id: \"Qwen2.5-Coder-3B-Instruct-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2.5-3B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: true,\n      vram_required_MB: 2504.76,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/Qwen2.5-Coder-3B-Instruct-q4f32_1-MLC\",\n      model_id: \"Qwen2.5-Coder-3B-Instruct-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2.5-3B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: true,\n      vram_required_MB: 2893.64,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/Qwen2.5-Coder-7B-Instruct-q4f16_1-MLC\",\n      model_id: \"Qwen2.5-Coder-7B-Instruct-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2-7B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: false,\n      vram_required_MB: 5106.67,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/Qwen2.5-Coder-7B-Instruct-q4f32_1-MLC\",\n      model_id: \"Qwen2.5-Coder-7B-Instruct-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        
\"/Qwen2-7B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: false,\n      vram_required_MB: 5900.09,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    // Qwen2.5-Math\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/Qwen2.5-Math-1.5B-Instruct-q4f16_1-MLC\",\n      model_id: \"Qwen2.5-Math-1.5B-Instruct-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2-1.5B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: true,\n      vram_required_MB: 1629.75,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/Qwen2.5-Math-1.5B-Instruct-q4f32_1-MLC\",\n      model_id: \"Qwen2.5-Math-1.5B-Instruct-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2-1.5B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: true,\n      vram_required_MB: 1888.97,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    // StableLM-zephyr-1.6B\n    {\n      model: \"https://huggingface.co/mlc-ai/stablelm-2-zephyr-1_6b-q4f16_1-MLC\",\n      model_id: \"stablelm-2-zephyr-1_6b-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/stablelm-2-zephyr-1_6b-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 2087.66,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/stablelm-2-zephyr-1_6b-q4f32_1-MLC\",\n      model_id: \"stablelm-2-zephyr-1_6b-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/stablelm-2-zephyr-1_6b-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 2999.33,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 4096,\n      
},\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/stablelm-2-zephyr-1_6b-q4f16_1-MLC\",\n      model_id: \"stablelm-2-zephyr-1_6b-q4f16_1-MLC-1k\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/stablelm-2-zephyr-1_6b-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 1511.66,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 1024,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/stablelm-2-zephyr-1_6b-q4f32_1-MLC\",\n      model_id: \"stablelm-2-zephyr-1_6b-q4f32_1-MLC-1k\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/stablelm-2-zephyr-1_6b-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 1847.33,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 1024,\n      },\n    },\n    // RedPajama\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/RedPajama-INCITE-Chat-3B-v1-q4f16_1-MLC\",\n      model_id: \"RedPajama-INCITE-Chat-3B-v1-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/RedPajama-INCITE-Chat-3B-v1-q4f16_1-ctx2k_cs1k-webgpu.wasm\",\n      vram_required_MB: 2972.09,\n      low_resource_required: false,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        context_window_size: 2048,\n      },\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/RedPajama-INCITE-Chat-3B-v1-q4f32_1-MLC\",\n      model_id: \"RedPajama-INCITE-Chat-3B-v1-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/RedPajama-INCITE-Chat-3B-v1-q4f32_1-ctx2k_cs1k-webgpu.wasm\",\n      vram_required_MB: 3928.09,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 2048,\n      },\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/RedPajama-INCITE-Chat-3B-v1-q4f16_1-MLC\",\n      model_id: 
\"RedPajama-INCITE-Chat-3B-v1-q4f16_1-MLC-1k\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/RedPajama-INCITE-Chat-3B-v1-q4f16_1-ctx2k_cs1k-webgpu.wasm\",\n      vram_required_MB: 2041.09,\n      low_resource_required: true,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        context_window_size: 1024,\n      },\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/RedPajama-INCITE-Chat-3B-v1-q4f32_1-MLC\",\n      model_id: \"RedPajama-INCITE-Chat-3B-v1-q4f32_1-MLC-1k\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/RedPajama-INCITE-Chat-3B-v1-q4f32_1-ctx2k_cs1k-webgpu.wasm\",\n      vram_required_MB: 2558.09,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 1024,\n      },\n    },\n    // TinyLlama v1.0\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/TinyLlama-1.1B-Chat-v1.0-q4f16_1-MLC\",\n      model_id: \"TinyLlama-1.1B-Chat-v1.0-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/TinyLlama-1.1B-Chat-v1.0-q4f16_1-ctx2k_cs1k-webgpu.wasm\",\n      vram_required_MB: 697.24,\n      low_resource_required: true,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        context_window_size: 2048,\n      },\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/TinyLlama-1.1B-Chat-v1.0-q4f32_1-MLC\",\n      model_id: \"TinyLlama-1.1B-Chat-v1.0-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/TinyLlama-1.1B-Chat-v1.0-q4f32_1-ctx2k_cs1k-webgpu.wasm\",\n      vram_required_MB: 839.98,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 2048,\n      },\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/TinyLlama-1.1B-Chat-v1.0-q4f16_1-MLC\",\n      model_id: \"TinyLlama-1.1B-Chat-v1.0-q4f16_1-MLC-1k\",\n      model_lib:\n        
modelLibURLPrefix +\n        modelVersion +\n        \"/TinyLlama-1.1B-Chat-v1.0-q4f16_1-ctx2k_cs1k-webgpu.wasm\",\n      vram_required_MB: 675.24,\n      low_resource_required: true,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        context_window_size: 1024,\n      },\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/TinyLlama-1.1B-Chat-v1.0-q4f32_1-MLC\",\n      model_id: \"TinyLlama-1.1B-Chat-v1.0-q4f32_1-MLC-1k\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/TinyLlama-1.1B-Chat-v1.0-q4f32_1-ctx2k_cs1k-webgpu.wasm\",\n      vram_required_MB: 795.98,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 1024,\n      },\n    },\n    // BELOW ARE MODELS OF OLDER VERSIONS OR NOT AS PRACTICAL\n    // Llama-3.1 70B\n    {\n      model: \"https://huggingface.co/mlc-ai/Llama-3.1-70B-Instruct-q3f16_1-MLC\",\n      model_id: \"Llama-3.1-70B-Instruct-q3f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Llama-3_1-70B-Instruct-q3f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 31153.13,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    // Qwen-2\n    {\n      model: \"https://huggingface.co/mlc-ai/Qwen2-0.5B-Instruct-q4f16_1-MLC\",\n      model_id: \"Qwen2-0.5B-Instruct-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2-0.5B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: true,\n      vram_required_MB: 944.62,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Qwen2-0.5B-Instruct-q0f16-MLC\",\n      model_id: \"Qwen2-0.5B-Instruct-q0f16-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2-0.5B-Instruct-q0f16-ctx4k_cs1k-webgpu.wasm\",\n      
low_resource_required: true,\n      vram_required_MB: 1624.12,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    // TODO: temporarily commenting out q0f32 models due to correctness issues\n    // {\n    //   model: \"https://huggingface.co/mlc-ai/Qwen2-0.5B-Instruct-q0f32-MLC\",\n    //   model_id: \"Qwen2-0.5B-Instruct-q0f32-MLC\",\n    //   model_lib:\n    //     modelLibURLPrefix +\n    //     modelVersion +\n    //     \"/Qwen2-0.5B-Instruct-q0f32-ctx4k_cs1k-webgpu.wasm\",\n    //   low_resource_required: true,\n    //   vram_required_MB: 2654.75,\n    //   overrides: {\n    //     context_window_size: 4096,\n    //   },\n    // },\n    {\n      model: \"https://huggingface.co/mlc-ai/Qwen2-1.5B-Instruct-q4f16_1-MLC\",\n      model_id: \"Qwen2-1.5B-Instruct-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2-1.5B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: true,\n      vram_required_MB: 1629.75,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Qwen2-1.5B-Instruct-q4f32_1-MLC\",\n      model_id: \"Qwen2-1.5B-Instruct-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2-1.5B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: true,\n      vram_required_MB: 1888.97,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Qwen2-7B-Instruct-q4f16_1-MLC\",\n      model_id: \"Qwen2-7B-Instruct-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2-7B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: false,\n      vram_required_MB: 5106.67,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: 
\"https://huggingface.co/mlc-ai/Qwen2-7B-Instruct-q4f32_1-MLC\",\n      model_id: \"Qwen2-7B-Instruct-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2-7B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: false,\n      vram_required_MB: 5900.09,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    // Qwen2-Math\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/Qwen2-Math-1.5B-Instruct-q4f16_1-MLC\",\n      model_id: \"Qwen2-Math-1.5B-Instruct-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2-1.5B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: true,\n      vram_required_MB: 1629.75,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/Qwen2-Math-1.5B-Instruct-q4f32_1-MLC\",\n      model_id: \"Qwen2-Math-1.5B-Instruct-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2-1.5B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: true,\n      vram_required_MB: 1888.97,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Qwen2-Math-7B-Instruct-q4f16_1-MLC\",\n      model_id: \"Qwen2-Math-7B-Instruct-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Qwen2-7B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: false,\n      vram_required_MB: 5106.67,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Qwen2-Math-7B-Instruct-q4f32_1-MLC\",\n      model_id: \"Qwen2-Math-7B-Instruct-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        
\"/Qwen2-7B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      low_resource_required: false,\n      vram_required_MB: 5900.09,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    // Llama-3\n    {\n      model: \"https://huggingface.co/mlc-ai/Llama-3-8B-Instruct-q4f32_1-MLC\",\n      model_id: \"Llama-3-8B-Instruct-q4f32_1-MLC-1k\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Llama-3-8B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 5295.7,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 1024,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Llama-3-8B-Instruct-q4f16_1-MLC\",\n      model_id: \"Llama-3-8B-Instruct-q4f16_1-MLC-1k\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Llama-3-8B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 4598.34,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 1024,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Llama-3-8B-Instruct-q4f32_1-MLC\",\n      model_id: \"Llama-3-8B-Instruct-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Llama-3-8B-Instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 6101.01,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Llama-3-8B-Instruct-q4f16_1-MLC\",\n      model_id: \"Llama-3-8B-Instruct-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Llama-3-8B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 5001.0,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: 
\"https://huggingface.co/mlc-ai/Llama-3-70B-Instruct-q3f16_1-MLC\",\n      model_id: \"Llama-3-70B-Instruct-q3f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Llama-3-70B-Instruct-q3f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 31153.13,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    // Phi3-mini-instruct\n    {\n      model: \"https://huggingface.co/mlc-ai/Phi-3-mini-4k-instruct-q4f16_1-MLC\",\n      model_id: \"Phi-3-mini-4k-instruct-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Phi-3-mini-4k-instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 3672.07,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Phi-3-mini-4k-instruct-q4f32_1-MLC\",\n      model_id: \"Phi-3-mini-4k-instruct-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Phi-3-mini-4k-instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 5483.12,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Phi-3-mini-4k-instruct-q4f16_1-MLC\",\n      model_id: \"Phi-3-mini-4k-instruct-q4f16_1-MLC-1k\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Phi-3-mini-4k-instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 2520.07,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 1024,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Phi-3-mini-4k-instruct-q4f32_1-MLC\",\n      model_id: \"Phi-3-mini-4k-instruct-q4f32_1-MLC-1k\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        
\"/Phi-3-mini-4k-instruct-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 3179.12,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 1024,\n      },\n    },\n    // Llama-2\n    {\n      model: \"https://huggingface.co/mlc-ai/Llama-2-7b-chat-hf-q4f32_1-MLC\",\n      model_id: \"Llama-2-7b-chat-hf-q4f32_1-MLC-1k\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Llama-2-7b-chat-hf-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 5284.01,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 1024,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Llama-2-7b-chat-hf-q4f16_1-MLC\",\n      model_id: \"Llama-2-7b-chat-hf-q4f16_1-MLC-1k\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Llama-2-7b-chat-hf-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 4618.52,\n      low_resource_required: false,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        context_window_size: 1024,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Llama-2-7b-chat-hf-q4f32_1-MLC\",\n      model_id: \"Llama-2-7b-chat-hf-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Llama-2-7b-chat-hf-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 9109.03,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/Llama-2-7b-chat-hf-q4f16_1-MLC\",\n      model_id: \"Llama-2-7b-chat-hf-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Llama-2-7b-chat-hf-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 6749.02,\n      low_resource_required: false,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n 
   {\n      model: \"https://huggingface.co/mlc-ai/Llama-2-13b-chat-hf-q4f16_1-MLC\",\n      model_id: \"Llama-2-13b-chat-hf-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Llama-2-13b-chat-hf-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 11814.09,\n      low_resource_required: false,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    // Gemma-2B\n    {\n      model: \"https://huggingface.co/mlc-ai/gemma-2b-it-q4f16_1-MLC\",\n      model_id: \"gemma-2b-it-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/gemma-2b-it-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 1476.52,\n      low_resource_required: false,\n      buffer_size_required_bytes: 262144000,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/gemma-2b-it-q4f32_1-MLC\",\n      model_id: \"gemma-2b-it-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/gemma-2b-it-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 1750.66,\n      low_resource_required: false,\n      buffer_size_required_bytes: 262144000,\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/gemma-2b-it-q4f16_1-MLC\",\n      model_id: \"gemma-2b-it-q4f16_1-MLC-1k\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/gemma-2b-it-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 1476.52,\n      low_resource_required: true,\n      buffer_size_required_bytes: 262144000,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        context_window_size: 1024,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/gemma-2b-it-q4f32_1-MLC\",\n      
model_id: \"gemma-2b-it-q4f32_1-MLC-1k\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/gemma-2b-it-q4f32_1-ctx4k_cs1k-webgpu.wasm\",\n      vram_required_MB: 1750.66,\n      low_resource_required: true,\n      buffer_size_required_bytes: 262144000,\n      overrides: {\n        context_window_size: 1024,\n      },\n    },\n    // Phi-2\n    {\n      model: \"https://huggingface.co/mlc-ai/phi-2-q4f16_1-MLC\",\n      model_id: \"phi-2-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/phi-2-q4f16_1-ctx2k_cs1k-webgpu.wasm\",\n      vram_required_MB: 3053.97,\n      low_resource_required: false,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        context_window_size: 2048,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/phi-2-q4f32_1-MLC\",\n      model_id: \"phi-2-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/phi-2-q4f32_1-ctx2k_cs1k-webgpu.wasm\",\n      vram_required_MB: 4032.48,\n      low_resource_required: false,\n      overrides: {\n        context_window_size: 2048,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/phi-2-q4f16_1-MLC\",\n      model_id: \"phi-2-q4f16_1-MLC-1k\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/phi-2-q4f16_1-ctx2k_cs1k-webgpu.wasm\",\n      vram_required_MB: 2131.97,\n      low_resource_required: true,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        context_window_size: 1024,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/phi-2-q4f32_1-MLC\",\n      model_id: \"phi-2-q4f32_1-MLC-1k\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/phi-2-q4f32_1-ctx2k_cs1k-webgpu.wasm\",\n      vram_required_MB: 2740.48,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 1024,\n      },\n    },\n    
// Phi-1.5\n    {\n      model: \"https://huggingface.co/mlc-ai/phi-1_5-q4f16_1-MLC\",\n      model_id: \"phi-1_5-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/phi-1_5-q4f16_1-ctx2k_cs1k-webgpu.wasm\",\n      vram_required_MB: 1210.09,\n      low_resource_required: true,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        context_window_size: 2048,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/phi-1_5-q4f32_1-MLC\",\n      model_id: \"phi-1_5-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/phi-1_5-q4f32_1-ctx2k_cs1k-webgpu.wasm\",\n      vram_required_MB: 1682.09,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 2048,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/phi-1_5-q4f16_1-MLC\",\n      model_id: \"phi-1_5-q4f16_1-MLC-1k\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/phi-1_5-q4f16_1-ctx2k_cs1k-webgpu.wasm\",\n      vram_required_MB: 1210.09,\n      low_resource_required: true,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        context_window_size: 1024,\n      },\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/phi-1_5-q4f32_1-MLC\",\n      model_id: \"phi-1_5-q4f32_1-MLC-1k\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/phi-1_5-q4f32_1-ctx2k_cs1k-webgpu.wasm\",\n      vram_required_MB: 1682.09,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 1024,\n      },\n    },\n    // TinyLlama v0.4\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/TinyLlama-1.1B-Chat-v0.4-q4f16_1-MLC\",\n      model_id: \"TinyLlama-1.1B-Chat-v0.4-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/TinyLlama-1.1B-Chat-v0.4-q4f16_1-ctx2k_cs1k-webgpu.wasm\",\n      
vram_required_MB: 697.24,\n      low_resource_required: true,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        context_window_size: 2048,\n      },\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/TinyLlama-1.1B-Chat-v0.4-q4f32_1-MLC\",\n      model_id: \"TinyLlama-1.1B-Chat-v0.4-q4f32_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/TinyLlama-1.1B-Chat-v0.4-q4f32_1-ctx2k_cs1k-webgpu.wasm\",\n      vram_required_MB: 839.98,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 2048,\n      },\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/TinyLlama-1.1B-Chat-v0.4-q4f16_1-MLC\",\n      model_id: \"TinyLlama-1.1B-Chat-v0.4-q4f16_1-MLC-1k\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/TinyLlama-1.1B-Chat-v0.4-q4f16_1-ctx2k_cs1k-webgpu.wasm\",\n      vram_required_MB: 675.24,\n      low_resource_required: true,\n      required_features: [\"shader-f16\"],\n      overrides: {\n        context_window_size: 1024,\n      },\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/TinyLlama-1.1B-Chat-v0.4-q4f32_1-MLC\",\n      model_id: \"TinyLlama-1.1B-Chat-v0.4-q4f32_1-MLC-1k\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/TinyLlama-1.1B-Chat-v0.4-q4f32_1-ctx2k_cs1k-webgpu.wasm\",\n      vram_required_MB: 795.98,\n      low_resource_required: true,\n      overrides: {\n        context_window_size: 1024,\n      },\n    },\n    // Embedding models\n    // -b means max_batch_size this model allows. 
The smaller it is, the less memory the model consumes.\n    {\n      model: \"https://huggingface.co/mlc-ai/snowflake-arctic-embed-m-q0f32-MLC\",\n      model_id: \"snowflake-arctic-embed-m-q0f32-MLC-b32\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/snowflake-arctic-embed-m-q0f32-ctx512_cs512_batch32-webgpu.wasm\",\n      vram_required_MB: 1407.51,\n      model_type: ModelType.embedding,\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/snowflake-arctic-embed-m-q0f32-MLC\",\n      model_id: \"snowflake-arctic-embed-m-q0f32-MLC-b4\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/snowflake-arctic-embed-m-q0f32-ctx512_cs512_batch4-webgpu.wasm\",\n      vram_required_MB: 539.4,\n      model_type: ModelType.embedding,\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/snowflake-arctic-embed-s-q0f32-MLC\",\n      model_id: \"snowflake-arctic-embed-s-q0f32-MLC-b32\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/snowflake-arctic-embed-s-q0f32-ctx512_cs512_batch32-webgpu.wasm\",\n      vram_required_MB: 1022.82,\n      model_type: ModelType.embedding,\n    },\n    {\n      model: \"https://huggingface.co/mlc-ai/snowflake-arctic-embed-s-q0f32-MLC\",\n      model_id: \"snowflake-arctic-embed-s-q0f32-MLC-b4\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/snowflake-arctic-embed-s-q0f32-ctx512_cs512_batch4-webgpu.wasm\",\n      vram_required_MB: 238.71,\n      model_type: ModelType.embedding,\n    },\n    // Ministral 3\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/Ministral-3-3B-Base-2512-q4f16_1-MLC\",\n      model_id: \"Ministral-3-3B-Base-2512-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Ministral-3-3B-Base-2512-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      
model:\n        \"https://huggingface.co/mlc-ai/Ministral-3-3B-Reasoning-2512-q4f16_1-MLC\",\n      model_id: \"Ministral-3-3B-Reasoning-2512-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Ministral-3-3B-Reasoning-2512-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n    {\n      model:\n        \"https://huggingface.co/mlc-ai/Ministral-3-3B-Instruct-2512-BF16-q4f16_1-MLC\",\n      model_id: \"Ministral-3-3B-Instruct-2512-BF16-q4f16_1-MLC\",\n      model_lib:\n        modelLibURLPrefix +\n        modelVersion +\n        \"/Ministral-3-3B-Instruct-2512-BF16-q4f16_1-ctx4k_cs1k-webgpu.wasm\",\n      overrides: {\n        context_window_size: 4096,\n      },\n    },\n  ],\n};\n"
  },
  {
    "path": "src/conversation.ts",
    "content": "import {\n  ChatConfig,\n  ConvTemplateConfig,\n  MessagePlaceholders,\n  Role,\n} from \"./config\";\nimport {\n  ChatCompletionContentPart,\n  ChatCompletionContentPartImage,\n  ChatCompletionMessageParam,\n  ChatCompletionRequest,\n} from \"./openai_api_protocols/index\";\nimport {\n  ContentTypeError,\n  FunctionNotFoundError,\n  InvalidToolChoiceError,\n  MessageOrderError,\n  MultipleTextContentError,\n  SystemMessageOrderError,\n  TextCompletionConversationError,\n  TextCompletionConversationExpectsPrompt,\n  UnsupportedRoleError,\n  UnsupportedToolChoiceTypeError,\n  UnsupportedToolTypeError,\n} from \"./error\";\n\ntype ImageURL = ChatCompletionContentPartImage.ImageURL;\n\n/**\n * Helper to keep track of history conversations.\n */\nexport class Conversation {\n  // NOTE: Update `compareConversationObject()` whenever a new state is introduced.\n  /** Each message is a tuple of (Role, role_name_str, message), where message can be either a\n   *  string or an array of contentPart for possible image input.\n   */\n  public messages: Array<\n    [Role, string, string | Array<ChatCompletionContentPart> | undefined]\n  > = [];\n  readonly config: ConvTemplateConfig;\n\n  /** Whether the Conversation object is for text completion with no conversation-style formatting */\n  public isTextCompletion: boolean;\n  /** Used when isTextCompletion is true */\n  public prompt: string | undefined;\n\n  public function_string = \"\";\n  public use_function_calling = false;\n  public override_system_message?: string = undefined;\n\n  /**\n   * Tracks whether the last message is an empty thinking block. Should only\n   * be true when we are in the middle of a generation. 
Will be set to\n   * false when the reply is finished with `finishReply()`.\n   */\n  private isLastMessageEmptyThinkingReplyHeader = false;\n\n  // TODO(tvm-team) confirm and remove\n  // private contextWindowStart = 0;\n\n  constructor(config: ConvTemplateConfig, isTextCompletion = false) {\n    this.config = config;\n    this.isTextCompletion = isTextCompletion;\n  }\n\n  // TODO: Consider rewriting this method, a bit messy.\n  private getPromptArrayInternal(\n    addSystem: boolean,\n    startPos: number,\n  ): Array<string | Array<string | ImageURL>> {\n    if (this.config.seps.length == 0) {\n      throw Error(\"Need seps to work\");\n    }\n\n    // Prepare system message\n    // Get overridden system message if exists, else use default one in config\n    let system_message = this.config.system_message;\n    if (this.override_system_message !== undefined) {\n      system_message = this.override_system_message;\n    }\n    const system_prompt = this.config.system_template.replace(\n      MessagePlaceholders.system,\n      system_message,\n    );\n    const ret: Array<string | Array<string | ImageURL>> =\n      addSystem && system_prompt !== \"\" ? [system_prompt] : [];\n\n    // Process each message in this.messages\n    for (let i = startPos; i < this.messages.length; ++i) {\n      const item = this.messages[i];\n      const role = item[0];\n      const role_str = item[1];\n      const messageContent = item[2];\n\n      // 1. Message from `appendReplyHeader()`, message is empty; not much processing is needed.\n      if (messageContent === undefined) {\n        if (i !== this.messages.length - 1) {\n          throw new Error(\n            \"InternalError: Only expect message to be undefined for last \" +\n              \"message for a reply header.\",\n          );\n        }\n        // Add \": \" if there is no such field. 
If \"\", do not add sep\n        const empty_sep =\n          this.config.role_empty_sep || this.config.role_empty_sep == \"\"\n            ? this.config.role_empty_sep\n            : \": \";\n        ret.push(role_str + empty_sep);\n        continue;\n      }\n\n      // 2. Message from `appendEmptyThinkingReplyHeader()`, message is an empty thinking block.\n      if (\n        this.isLastMessageEmptyThinkingReplyHeader &&\n        i === this.messages.length - 1\n      ) {\n        // TODO(Charlie): content_sep or empty_sep? For Qwen3, both are \"\\n\".\n        const content_sep =\n          this.config.role_content_sep || this.config.role_content_sep == \"\"\n            ? this.config.role_content_sep\n            : \": \";\n        ret.push(role_str + content_sep + messageContent);\n        continue;\n      }\n\n      // 3. Each messageContent consists of one textPart, and >= 0 imageParts, regardless whether\n      // it is Array<ChatCompletionContentPart> or text message. So we extract out each.\n      let textContentPart = \"\"; // if no textPart, use an empty string\n      const imageContentParts: ImageURL[] = [];\n      if (Array.isArray(messageContent)) {\n        // 2.1 content is Array<ChatCompletionContentPart>\n        // Iterate through the contentParts, get the text and list of images. There should\n        // be only a single text. 
TODO: is it always the case the number of textContentPart <= 1?\n        let seenText = false;\n        for (let i = 0; i < messageContent.length; i++) {\n          const curContentPart = messageContent[i];\n          if (curContentPart.type === \"text\") {\n            if (seenText) {\n              throw new MultipleTextContentError();\n            }\n            textContentPart = curContentPart.text;\n            seenText = true;\n          } else {\n            imageContentParts.push(curContentPart.image_url);\n          }\n        }\n      } else {\n        // 2.2 content is just a string\n        textContentPart = messageContent;\n      }\n\n      // 3. Format textContentPart with role and sep to get message_str and role_prefix\n      let message_str;\n      let role_prefix;\n      if (this.config.role_templates !== undefined) {\n        message_str = this.config.role_templates[role]?.replace(\n          MessagePlaceholders[Role[role] as keyof typeof MessagePlaceholders],\n          textContentPart,\n        );\n        if (this.use_function_calling && this.function_string !== \"\") {\n          message_str = message_str?.replace(\n            MessagePlaceholders.function,\n            this.function_string,\n          );\n        }\n        message_str = message_str?.replace(MessagePlaceholders.function, \"\");\n      }\n\n      if (message_str == undefined) {\n        message_str = textContentPart;\n      }\n      if (\n        this.config.add_role_after_system_message === false &&\n        system_prompt != \"\" &&\n        i == 0\n      ) {\n        role_prefix = \"\";\n      } else {\n        // Add \": \" if there is no such field. If \"\", do not add sep\n        const content_sep =\n          this.config.role_content_sep || this.config.role_content_sep == \"\"\n            ? this.config.role_content_sep\n            : \": \";\n        role_prefix = role_str + content_sep;\n      }\n\n      // 4. 
Combine everything together\n      if (imageContentParts.length === 0) {\n        // If no image, just a single string to represent this message\n        ret.push(\n          role_prefix +\n            message_str +\n            this.config.seps[i % this.config.seps.length],\n        );\n      } else {\n        // If has image input, currently we hard code it to Phi3.5-vision's format:\n        // `<|user|>\\n<|image_1|>\\n<|image_2|>\\n{prompt}<|end|>\\n`\n        // So we will return a list for this:\n        // [`<|user|>\\n`, imageUrl1, `\\n`, imageUrl2, `\\n`, `{prompt}<|end|>\\n`]\n        const curMessageList: Array<string | ImageURL> = [role_prefix];\n        imageContentParts.forEach((curImage: ImageURL) => {\n          curMessageList.push(curImage);\n          curMessageList.push(\"\\n\");\n        });\n        curMessageList.push(\n          message_str + this.config.seps[i % this.config.seps.length],\n        );\n        ret.push(curMessageList);\n      }\n    }\n    return ret;\n  }\n\n  /**\n   * Get prompt arrays with the first one as system.\n   *\n   * It is returned as an array of `string | Array<string | ImageURL>`, where each element of\n   * the array represents the formatted message of a role/turn. If the message only contains text,\n   * it will be a string that concatenates the role string, message, and separators. If the\n   * message contains image(s), it will be an array of string and ImageURL in the order of which\n   * they will be prefilled into the model. e.g. 
it can be something like\n   * [\n   *   \"<|system|>\\nSome system prompt\\n\",\n   *   [\n   *     \"<|user|>\\n\",\n   *     imageURL1,\n   *     \"\\n\",\n   *     imageURL2,\n   *     \"\\n\",\n   *     \"Some user input<|end|>\\n\"\n   *   ],\n   * ]\n   *\n   * @returns The prompt array.\n   */\n  getPromptArray(): Array<string | Array<string | ImageURL>> {\n    if (this.isTextCompletion) {\n      throw new TextCompletionConversationError(\"getPromptArray\");\n    }\n    return this.getPromptArrayInternal(true, 0);\n  }\n\n  /**\n   * Get the last round of prompt has not been fed as input.\n   *\n   * @note This function needs to be used with the assumption that\n   *       the caller call appendMessage then appendReplyHeader.\n   *\n   * @returns The prompt array.\n   */\n  getPromptArrayLastRound() {\n    if (this.isTextCompletion) {\n      throw new TextCompletionConversationError(\"getPromptArrayLastRound\");\n    }\n    if (this.messages.length < 3) {\n      throw Error(\"needs to call getPromptArray for the first message\");\n    }\n    return this.getPromptArrayInternal(false, this.messages.length - 2);\n  }\n\n  /**\n   * Return prompt in an array for non-conversation text completion.\n   */\n  getPromptArrayTextCompletion(): Array<string> {\n    if (!this.isTextCompletion || this.prompt === undefined) {\n      throw new TextCompletionConversationExpectsPrompt();\n    }\n    return [this.prompt];\n  }\n\n  /**\n   * Resets all states for this.conversation.\n   */\n  reset() {\n    // Note: Update this whenever we introduce a new state to conversation.\n    this.messages = [];\n    this.override_system_message = undefined;\n    this.function_string = \"\";\n    this.use_function_calling = false;\n    this.isTextCompletion = false;\n    this.prompt = undefined;\n  }\n\n  getStopStr(): string[] {\n    // TODO(Charlie): Is this needed?\n    // if (this.config.stop_str.length > 0) {\n    //   return this.config.stop_str;\n    // }\n    // return 
[this.config.seps[this.config.seps.length - 1]];\n    return this.config.stop_str;\n  }\n\n  getStopTokens() {\n    return this.config.stop_token_ids;\n  }\n\n  appendMessage(\n    role: Role,\n    message: string | Array<ChatCompletionContentPart>,\n    role_name?: string,\n  ) {\n    if (this.isTextCompletion) {\n      throw new TextCompletionConversationError(\"appendMessage\");\n    }\n    if (\n      this.messages.length != 0 &&\n      this.messages[this.messages.length - 1][2] == undefined\n    ) {\n      throw Error(\"Have unfinished reply\");\n    }\n    if (!(role in this.config.roles)) {\n      throw Error(\"Role is not supported: \" + role);\n    }\n    const role_name_str = role_name ? role_name : this.config.roles[role];\n    this.messages.push([role, role_name_str, message]);\n  }\n\n  appendReplyHeader(role: Role) {\n    if (this.isTextCompletion) {\n      throw new TextCompletionConversationError(\"appendReplyHeader\");\n    }\n    if (!(role in this.config.roles)) {\n      throw Error(\"Role is not supported: \" + role);\n    }\n    this.messages.push([role, this.config.roles[role], undefined]);\n  }\n\n  appendEmptyThinkingReplyHeader(role: Role, emptyThinkingBlockStr: string) {\n    if (this.isTextCompletion) {\n      throw new TextCompletionConversationError(\n        \"appendEmptyThinkingReplyHeader\",\n      );\n    }\n    this.isLastMessageEmptyThinkingReplyHeader = true;\n    this.messages.push([role, this.config.roles[role], emptyThinkingBlockStr]);\n  }\n\n  finishReply(message: string) {\n    if (this.isTextCompletion) {\n      throw new TextCompletionConversationError(\"finishReply\");\n    }\n    if (this.messages.length == 0) {\n      throw Error(\"Message error should not be 0\");\n    }\n    if (\n      this.messages[this.messages.length - 1][2] !== undefined &&\n      // If the last message has an empty thinking block, last message is expected\n      // to be non-empty.\n      this.isLastMessageEmptyThinkingReplyHeader === false\n   
 ) {\n      throw Error(\"Already assigned\");\n    }\n    this.messages[this.messages.length - 1][2] = message;\n    this.isLastMessageEmptyThinkingReplyHeader = false;\n  }\n}\n\nexport function getConversation(\n  conv_template: ConvTemplateConfig,\n  conv_config?: Partial<ConvTemplateConfig>,\n  isTextCompletion = false,\n): Conversation {\n  // Update with conv_config\n  return new Conversation(\n    { ...conv_template, ...conv_config },\n    isTextCompletion,\n  );\n}\n\n/**\n * Compare the states of two conversation instances. Equality is defined as their getPromptArray()\n * should return the exact same things, which is determined by fields: messages, function_string,\n * use_function_calling, and override_system_message.\n *\n * @returns True if `convA` equals to `convB`\n * @note We assume convA and convB has the same `this.config`.\n */\nexport function compareConversationObject(\n  convA: Conversation,\n  convB: Conversation,\n): boolean {\n  // NOTE: Update this function whenever a new state is introduced to `Conversation`.\n  // Check the easy ones first\n  if (\n    convA.function_string !== convB.function_string ||\n    convA.use_function_calling !== convB.use_function_calling ||\n    convA.override_system_message !== convB.override_system_message ||\n    convA.messages.length !== convB.messages.length ||\n    convA.isTextCompletion !== convB.isTextCompletion\n  ) {\n    return false;\n  }\n\n  // Then check message\n  if (convA.messages.length === 0 && convB.messages.length === 0) {\n    // both are empty\n    return true;\n  }\n  if (convA.messages.length !== convB.messages.length) {\n    // different number of messages\n    return false;\n  }\n\n  const msgLen = convA.messages.length;\n  const msgEntryLen = convA.messages[0].length; // always 3 for now\n  for (let i = 0; i < msgLen; i++) {\n    for (let j = 0; j < msgEntryLen; j++) {\n      const entryA = convA.messages[i][j];\n      const entryB = convB.messages[i][j];\n      if (typeof entryA 
=== \"string\" && typeof entryB === \"string\") {\n        // Case 1: both are strings\n        if (convA.messages[i][j] !== convB.messages[i][j]) {\n          return false;\n        }\n      } else if (entryA === undefined && entryB === undefined) {\n        // Case 2: both undefined\n        continue;\n      } else if (Array.isArray(entryA) && Array.isArray(entryB)) {\n        // Case 3: both are ChatCompletionContentPart[]\n        if (entryA.length !== entryB.length) {\n          return false;\n        }\n        const numContentParts = entryA.length;\n        for (let k = 0; k < numContentParts; k++) {\n          const entryA_k = entryA[k];\n          const entryB_k = entryB[k];\n          if (entryA_k.type === \"text\" && entryB_k.type === \"text\") {\n            // Case 3.1: both are text\n            if (entryA_k.text !== entryB_k.text) {\n              return false;\n            }\n          } else if (\n            entryA_k.type === \"image_url\" &&\n            entryB_k.type === \"image_url\"\n          ) {\n            // Case 3.2: both are image_url\n            if (\n              entryA_k.image_url.url !== entryB_k.image_url.url ||\n              entryA_k.image_url.detail !== entryB_k.image_url.detail\n            ) {\n              return false;\n            }\n          } else {\n            // Case 3.3: of different type\n            return false;\n          }\n        }\n      } else {\n        // Case 4: two entries are of different types\n        return false;\n      }\n    }\n  }\n  return true;\n}\n\n/**\n * Get a new Conversation object based on the chat completion request.\n *\n * @param request The incoming ChatCompletionRequest\n * @param includeLastMsg Include last message, by default is false. 
Set to true for testing only.\n * @note By default, `request.messages[-1]` is not included as it would be treated as a normal\n * input to `prefill()`.\n */\nexport function getConversationFromChatCompletionRequest(\n  request: ChatCompletionRequest,\n  config: ChatConfig,\n  includeLastMsg = false,\n): Conversation {\n  // 0. Instantiate a new Conversation object\n  const conversation = getConversation(\n    config.conv_template,\n    config.conv_config,\n  );\n\n  // 1. Populate function-calling-related fields\n  // TODO: either remove these or support gorilla-like function calling models.\n  // This commented-out code was used to support gorilla, but we could not use grammar to\n  // guarantee its output, nor make it conform to OpenAI's function calling output. Kept for now.\n  // const functionCallUsage = this.getFunctionCallUsage(request);\n  // conversation.function_string = functionCallUsage;\n  // conversation.use_function_calling = functionCallUsage !== \"\";\n\n  // 2. Populate conversation.messages\n  const input = request.messages;\n  const lastId = input.length - 1;\n  if (input[lastId].role !== \"user\" && input[lastId].role !== \"tool\") {\n    throw new MessageOrderError(\n      \"The last message should be from the `user` or `tool`.\",\n    );\n  }\n  const iterEnd = includeLastMsg ? 
input.length : input.length - 1;\n  for (let i = 0; i < iterEnd; i++) {\n    const message: ChatCompletionMessageParam = input[i];\n    if (message.role === \"system\") {\n      if (i !== 0) {\n        throw new SystemMessageOrderError();\n      }\n      conversation.override_system_message = message.content;\n    } else if (message.role === \"user\") {\n      conversation.appendMessage(Role.user, message.content, message.name);\n    } else if (message.role === \"assistant\") {\n      if (typeof message.content !== \"string\") {\n        throw new ContentTypeError(message.role + \"'s message\");\n      }\n      conversation.appendMessage(Role.assistant, message.content, message.name);\n    } else if (message.role === \"tool\") {\n      conversation.appendMessage(Role.tool, message.content);\n    } else {\n      // Use `[\"role\"]` instead of `.role` to suppress \"Property does not exist on type 'never'\"\n      throw new UnsupportedRoleError(message[\"role\"]);\n    }\n  }\n  return conversation;\n}\n\n/**\n * Returns the function string based on the request.tools and request.tool_choice, raises errors if\n * it encounters an invalid request.\n *\n * @param request The chatCompletionRequest we are about to prefill for.\n * @returns The string used to set Conversation.function_string\n */\nexport function getFunctionCallUsage(request: ChatCompletionRequest): string {\n  if (\n    request.tools == undefined ||\n    (typeof request.tool_choice == \"string\" && request.tool_choice == \"none\")\n  ) {\n    return \"\";\n  }\n  if (\n    typeof request.tool_choice == \"string\" &&\n    request.tool_choice !== \"auto\"\n  ) {\n    throw new InvalidToolChoiceError(request.tool_choice);\n  }\n  if (\n    typeof request.tool_choice !== \"string\" &&\n    request.tool_choice?.type !== \"function\"\n  ) {\n    throw new UnsupportedToolChoiceTypeError();\n  }\n\n  const singleFunctionToCall =\n    typeof request.tool_choice !== \"string\" &&\n    request.tool_choice?.function?.name;\n  
if (singleFunctionToCall) {\n    for (const f of request.tools) {\n      if (singleFunctionToCall == f.function.name) {\n        return JSON.stringify([f.function]);\n      }\n    }\n    throw new FunctionNotFoundError(singleFunctionToCall);\n  }\n\n  const function_list = [];\n  for (const f of request.tools) {\n    if (f.type !== \"function\") {\n      throw new UnsupportedToolTypeError();\n    }\n    function_list.push(f.function);\n  }\n  return JSON.stringify(function_list);\n}\n"
  },
  {
    "path": "src/embedding.ts",
    "content": "import * as tvmjs from \"@mlc-ai/web-runtime\";\nimport log from \"loglevel\";\nimport { Tokenizer } from \"@mlc-ai/web-tokenizers\";\nimport { ChatConfig } from \"./config\";\nimport {\n  EmbeddingChunkingUnsupportedError,\n  EmbeddingExceedContextWindowSizeError,\n  EmbeddingInputEmptyError,\n  EmbeddingSlidingWindowError,\n  MinValueError,\n} from \"./error\";\n\nexport class EmbeddingPipeline {\n  private config: ChatConfig;\n  private tokenizer: Tokenizer;\n\n  // TVM functions\n  private tvm: tvmjs.Instance;\n  private device: tvmjs.DLDevice;\n  private vm: tvmjs.VirtualMachine;\n  private prefill: tvmjs.PackedFunc;\n  private params: tvmjs.TVMObject;\n\n  // metadata\n  private contextWindowSize = -1;\n  private prefillChunkSize = -1;\n  private maxBatchSize = -1;\n\n  // performance\n  private curRoundEmbedTotalTokens = 0; // excludes padded tokens for batching\n  private curRoundEmbedTotalTime = 0;\n\n  constructor(tvm: tvmjs.Instance, tokenizer: Tokenizer, config: ChatConfig) {\n    // 0. Setting attributes\n    this.tvm = tvm;\n    this.tokenizer = tokenizer;\n    this.config = config;\n    this.device = this.tvm.webgpu();\n\n    // 1. Create VM and get the core functions\n    tvm.beginScope();\n    this.vm = this.tvm.detachFromCurrentScope(\n      this.tvm.createVirtualMachine(this.device),\n    );\n    this.prefill = this.tvm.detachFromCurrentScope(\n      this.vm.getFunction(\"prefill\"),\n    );\n\n    // 2. Get json stored in the vm's metadata function\n    const fgetMetadata = this.vm.getFunction(\"_metadata\");\n    const ret_value = fgetMetadata();\n    const metadataStr = ret_value.toString();\n    const metadata = JSON.parse(metadataStr);\n\n    // 3. Load parameters by name\n    const paramNames: string[] = [];\n    metadata.params.forEach((param: any) => {\n      paramNames.push(param.name);\n    });\n    this.params = this.tvm.detachFromCurrentScope(\n      this.tvm.getParamsFromCacheByName(paramNames),\n    );\n\n    // 4. 
Read in compilation configurations from metadata\n    // We use context window size max batch size to check validity of the model\n    // We assume prefillChunkSize is the same as contextWindowSize for embedding model for now\n    this.maxBatchSize = metadata.max_batch_size;\n    this.contextWindowSize = this.config.context_window_size;\n    this.prefillChunkSize = metadata.prefill_chunk_size;\n    log.info(\"Using maxBatchSize: \", this.maxBatchSize);\n    log.info(\"Using contextWindowSize: \", this.contextWindowSize);\n    log.info(\"Using prefillChunkSize: \", this.prefillChunkSize);\n\n    if (this.config.sliding_window_size !== -1) {\n      throw new EmbeddingSlidingWindowError(this.config.sliding_window_size);\n    }\n    if (this.maxBatchSize <= 0) {\n      throw new MinValueError(\"maxBatchSize\", 0);\n    }\n    if (this.contextWindowSize <= 0) {\n      throw new MinValueError(\"contextWindowSize\", 0);\n    }\n    if (this.prefillChunkSize <= 0) {\n      throw new MinValueError(\"prefillChunkSize\", 0);\n    }\n    if (this.prefillChunkSize !== this.contextWindowSize) {\n      throw new EmbeddingChunkingUnsupportedError(\n        this.contextWindowSize,\n        this.prefillChunkSize,\n      );\n    }\n    tvm.endScope();\n  }\n\n  async embedStep(\n    input: string | Array<string> | Array<number> | Array<Array<number>>,\n  ): Promise<Array<Array<number>>> {\n    // 0. Reset performance metrics\n    this.curRoundEmbedTotalTokens = 0;\n    this.curRoundEmbedTotalTime = 0;\n    let totalNumTokens = 0;\n    const embedStart = performance.now();\n    let tokenizedInputs: Array<Array<number>> = [];\n    const tempInputs: Array<number> = [];\n    // 1. 
Convert all possible input types to Array<Array<number>>, tokenize if not already\n    // Cannot use input.every to match type, which leads to TS compilation error\n    // https://github.com/microsoft/TypeScript/issues/33591\n    if (input.length === 0) {\n      throw new EmbeddingInputEmptyError();\n    }\n    if (typeof input === \"string\") {\n      // string\n      tokenizedInputs = [Array.from(this.tokenizer.encode(input))];\n    } else {\n      for (let i = 0; i < input.length; i++) {\n        const curInput = input[i];\n        if (Array.isArray(curInput)) {\n          // Array<Array<number>>\n          tokenizedInputs.push(curInput);\n        } else if (typeof curInput === \"string\") {\n          // Array<string>\n          tokenizedInputs.push(Array.from(this.tokenizer.encode(curInput)));\n        } else {\n          // Array<number>\n          tempInputs.push(curInput);\n        }\n      }\n    }\n    if (tempInputs.length > 0) {\n      tokenizedInputs.push(tempInputs);\n    }\n\n    // 2. Check each input is not larger than the context window size\n    // TODO: tokenizer.encode seems to implicitly truncate to contextWindowSize, confirm behavior\n    // and decide whether to warn user\n    for (let i = 0; i < tokenizedInputs.length; i++) {\n      const curInputSize = tokenizedInputs[i].length;\n      totalNumTokens += curInputSize;\n      if (curInputSize > this.contextWindowSize) {\n        throw new EmbeddingExceedContextWindowSizeError(\n          this.contextWindowSize,\n          curInputSize,\n        );\n      }\n    }\n    if (tokenizedInputs.length === 0) {\n      throw new Error(\"InternalError: batch size is zero.\");\n    }\n\n    // 3. 
Forward each batch\n    const batchSize = tokenizedInputs.length;\n    const result: Array<Array<number>> = [];\n    for (let begin = 0; begin < batchSize; begin += this.maxBatchSize) {\n      this.tvm.beginScope();\n      // 3.1 Get current batch\n      const end = Math.min(batchSize, begin + this.maxBatchSize);\n      const curBatch: Array<Array<number>> = tokenizedInputs.slice(begin, end);\n      const curBatchSize = curBatch.length;\n      // 3.2 Max input size of current batch\n      let maxInputSize = 0;\n      for (let i = 0; i < curBatchSize; i++) {\n        const curInputSize = curBatch[i].length;\n        if (curInputSize > maxInputSize) {\n          maxInputSize = curInputSize;\n        }\n      }\n      // 3.3 Create inputs and attention mask\n      // Padded with zeros and flattened, of size curBatchSize * maxInputSize\n      const curBatchPaddedFlatten: Array<number> = [];\n      // 1 for non-pad, 0 otherwise, also of size curBatchSize * maxInputSize\n      const curAttnMask: Array<number> = [];\n      const flattenedInputSize = curBatchSize * maxInputSize;\n      for (let i = 0; i < curBatchSize; i++) {\n        const padding = Array(maxInputSize - curBatch[i].length).fill(0);\n        const ones = Array(curBatch[i].length).fill(1);\n        curBatchPaddedFlatten.push(...curBatch[i]);\n        curAttnMask.push(...ones);\n        curBatchPaddedFlatten.push(...padding);\n        curAttnMask.push(...padding);\n      }\n      if (\n        curBatchPaddedFlatten.length !== flattenedInputSize ||\n        curAttnMask.length !== flattenedInputSize\n      ) {\n        throw new Error(\n          `InternalError: Expect input array to be ${flattenedInputSize}, ` +\n            `but got ${curBatchPaddedFlatten.length}`,\n        );\n      }\n      // 3.4 Convert inputs and attention mask to tvm ndarray on GPU, of shape (curBatchSize, maxInputSize)\n      let inputNDArray = this.tvm.empty(\n        [flattenedInputSize],\n        \"int32\",\n        this.device,\n 
     );\n      inputNDArray.copyFrom(curBatchPaddedFlatten);\n      inputNDArray = inputNDArray.view([curBatchSize, maxInputSize]);\n      let maskNDArray = this.tvm.empty(\n        [flattenedInputSize],\n        \"int32\",\n        this.device,\n      );\n      maskNDArray.copyFrom(curAttnMask);\n      maskNDArray = maskNDArray.view([curBatchSize, maxInputSize]);\n\n      // 3.5 Actual forwarding on GPU, logits of shape (curBatchSize, maxInputSize, hidden_size)\n      const logitsCurBatchOnGPU: tvmjs.Tensor = this.prefill(\n        inputNDArray,\n        maskNDArray,\n        this.params,\n      );\n      await this.device.sync();\n\n      // 3.6 Copy logits to CPU, flatten to curBatchSize * maxInputSize * hidden_size\n      const hidden_size = logitsCurBatchOnGPU.shape[2];\n      let logitsCurBatchOnCPU: tvmjs.Tensor = this.tvm.empty(\n        logitsCurBatchOnGPU.shape,\n        logitsCurBatchOnGPU.dtype,\n        this.tvm.cpu(),\n      );\n      logitsCurBatchOnCPU.copyFrom(logitsCurBatchOnGPU);\n      logitsCurBatchOnCPU = logitsCurBatchOnCPU.view([\n        curBatchSize * maxInputSize * hidden_size,\n      ]);\n      await this.device.sync();\n      const logitsCurBatchOnCPUArray: Float32Array = <Float32Array>(\n        logitsCurBatchOnCPU.toArray()\n      );\n\n      // 3.7 Update final result. For each sentence, get [0,:], i.e. only the first token's output\n      // That is, we are doing result.push(logits[:,0,:]) here.\n      // TODO: check if all models only use [0,:]. 
If it is snowflake-specific, need to specify\n      // this in mlc-chat-config.json\n      for (let i = 0; i < curBatchSize; i++) {\n        const b = i * maxInputSize * hidden_size;\n        const e = b + hidden_size;\n        result.push(Array.from(logitsCurBatchOnCPUArray.slice(b, e)));\n      }\n      this.tvm.endScope();\n    }\n    if (result.length !== batchSize) {\n      throw new Error(`\n        InternalError: expect result.length to be ${batchSize}, but got ${result.length}`);\n    }\n    const embedEnd = performance.now();\n    this.curRoundEmbedTotalTokens = totalNumTokens;\n    this.curRoundEmbedTotalTime = (embedEnd - embedStart) / 1e3;\n\n    return result;\n  }\n\n  dispose() {\n    this.params.dispose();\n    this.prefill.dispose();\n    this.vm.dispose();\n    this.tvm.dispose();\n    this.tokenizer.dispose();\n  }\n\n  /**\n   * Synchronize the device.\n   */\n  async sync(): Promise<void> {\n    // Is it equivalent to this.tvm.sync()?\n    await this.device.sync();\n  }\n\n  async asyncLoadWebGPUPipelines() {\n    await this.tvm.asyncLoadWebGPUPipelines(this.vm.getInternalModule());\n  }\n\n  // Performance APIs below\n\n  /**\n   * Get the time it took the last `embedStep()` in seconds.\n   */\n  getCurRoundEmbedTotalTime(): number {\n    return this.curRoundEmbedTotalTime;\n  }\n\n  /**\n   * Get the number of tokens embedded in the last `embedStep()`. This excludes the padded tokens.\n   */\n  getCurRoundEmbedTotalTokens(): number {\n    return this.curRoundEmbedTotalTokens;\n  }\n\n  /**\n   * @returns Prefill tokens per second, starting from the last prefill performed.\n   */\n  getCurRoundEmbedTokensPerSec(): number {\n    return this.curRoundEmbedTotalTokens / this.curRoundEmbedTotalTime;\n  }\n}\n"
  },
  {
    "path": "src/engine.ts",
    "content": "import * as tvmjs from \"@mlc-ai/web-runtime\";\nimport log from \"loglevel\";\nimport {\n  ChatConfig,\n  ChatOptions,\n  AppConfig,\n  prebuiltAppConfig,\n  GenerationConfig,\n  postInitAndCheckGenerationConfigValues,\n  Role,\n  MLCEngineConfig,\n  DefaultLogLevel,\n  ModelType,\n} from \"./config\";\nimport { LLMChatPipeline } from \"./llm_chat\";\nimport {\n  // ChatCompletion\n  ChatCompletionRequest,\n  ChatCompletion,\n  ChatCompletionChunk,\n  ChatCompletionMessageParam,\n  ChatCompletionRequestNonStreaming,\n  ChatCompletionRequestStreaming,\n  ChatCompletionRequestBase,\n  CompletionUsage,\n  ChatCompletionMessageToolCall,\n  // Completion\n  CompletionCreateParamsNonStreaming,\n  CompletionCreateParamsStreaming,\n  CompletionCreateParamsBase,\n  CompletionCreateParams,\n  Completion,\n  CompletionChoice,\n  EmbeddingCreateParams,\n  CreateEmbeddingResponse,\n  Embedding,\n} from \"./openai_api_protocols/index\";\nimport * as API from \"./openai_api_protocols/index\";\nimport {\n  InitProgressCallback,\n  MLCEngineInterface,\n  LogitProcessor,\n  LogLevel,\n  LatencyBreakdown,\n} from \"./types\";\nimport {\n  compareConversationObject,\n  getConversation,\n  getConversationFromChatCompletionRequest,\n} from \"./conversation\";\nimport {\n  cleanModelUrl,\n  CustomLock,\n  findModelRecord,\n  getModelIdToUse,\n  getToolCallFromOutputMessage,\n} from \"./support\";\nimport {\n  ConfigurationNotInitializedError,\n  DeviceLostError,\n  EmbeddingUnsupportedModelError,\n  FeatureSupportError,\n  MissingModelWasmError,\n  ShaderF16SupportError,\n  WebGPUNotAvailableError,\n  ReloadArgumentSizeUnmatchedError,\n  IncorrectPipelineLoadedError,\n  ReloadModelIdNotUniqueError,\n  SpecifiedModelNotFoundError,\n  ModelNotLoadedError,\n} from \"./error\";\nimport { asyncLoadTokenizer } from \"./cache_util\";\nimport { EmbeddingPipeline } from \"./embedding\";\n\n/**\n * Creates `MLCEngine`, and loads `modelId` onto WebGPU.\n *\n * Equivalent to `new 
webllm.MLCEngine().reload(...)`.\n *\n * @param modelId model_id of the model to load, either string or string[]. When multiple models\n *   are provided, we load all models sequentially. Each modelId needs to either be in\n *   `webllm.prebuiltAppConfig`, or in `engineConfig.appConfig`.\n * @param engineConfig Optionally configures the engine, see `webllm.MLCEngineConfig`.\n * @param chatOpts Extra options to optionally override the `mlc-chat-config.json` of `modelId`.\n *   The size of which needs to match that of `modelId`; chatOpts[i] will be used for modelId[i].\n * @returns An initialized `WebLLM.MLCEngine` with `modelId` loaded.\n * @throws Throws error when device lost (mostly due to OOM); users should re-call `CreateMLCEngine()`,\n *   potentially with a smaller model or smaller context window size.\n */\nexport async function CreateMLCEngine(\n  modelId: string | string[],\n  engineConfig?: MLCEngineConfig,\n  chatOpts?: ChatOptions | ChatOptions[],\n): Promise<MLCEngine> {\n  const engine = new MLCEngine(engineConfig);\n  await engine.reload(modelId, chatOpts);\n  return engine;\n}\n\n/**\n * The main interface of MLCEngine, which loads a model and performs tasks.\n *\n * You can either initialize one with `webllm.CreateMLCEngine(modelId)`, or\n * `webllm.MLCEngine().reload(modelId)`.\n */\nexport class MLCEngine implements MLCEngineInterface {\n  // APIs\n  /** For chat.completions.create() */\n  public chat: API.Chat;\n  /** For completions.create() */\n  public completions: API.Completions;\n  /** For embeddings.create() */\n  public embeddings: API.Embeddings;\n\n  // Maps to maintain states of loaded model(s)\n  /** Maps each loaded model's modelId to its pipeline */\n  private loadedModelIdToPipeline: Map<\n    string,\n    LLMChatPipeline | EmbeddingPipeline\n  >;\n  /** Maps each loaded model's modelId to its chatConfig */\n  private loadedModelIdToChatConfig: Map<string, ChatConfig>;\n  /** Maps each loaded model's modelId to its modelType */\n  
private loadedModelIdToModelType: Map<string, ModelType>;\n  /** Maps each loaded model's modelId to a lock. Ensures\n   * each model only processes one request at a time.\n   */\n  private loadedModelIdToLock: Map<string, CustomLock>;\n\n  // Others\n  private logger: (msg: string) => void = log.info;\n  private logitProcessorRegistry?: Map<string, LogitProcessor>;\n  private initProgressCallback?: InitProgressCallback;\n  private appConfig: AppConfig;\n\n  // Signals and flags\n  private interruptSignal = false;\n  private deviceLostIsError = true; // whether device.lost is due to actual error or model reload\n  private reloadController: AbortController | undefined;\n\n  constructor(engineConfig?: MLCEngineConfig) {\n    this.loadedModelIdToPipeline = new Map<\n      string,\n      LLMChatPipeline | EmbeddingPipeline\n    >();\n    this.loadedModelIdToChatConfig = new Map<string, ChatConfig>();\n    this.loadedModelIdToModelType = new Map<string, ModelType>();\n    this.loadedModelIdToLock = new Map<string, CustomLock>();\n    this.appConfig = engineConfig?.appConfig || prebuiltAppConfig;\n    this.setLogLevel(engineConfig?.logLevel || DefaultLogLevel);\n    this.setInitProgressCallback(engineConfig?.initProgressCallback);\n    this.setLogitProcessorRegistry(engineConfig?.logitProcessorRegistry);\n\n    this.chat = new API.Chat(this);\n    this.completions = new API.Completions(this);\n    this.embeddings = new API.Embeddings(this);\n  }\n\n  //-----------------------\n  // 0. 
Setters and getters\n  //-----------------------\n\n  setAppConfig(appConfig: AppConfig) {\n    this.appConfig = appConfig;\n  }\n\n  setInitProgressCallback(initProgressCallback?: InitProgressCallback) {\n    this.initProgressCallback = initProgressCallback;\n  }\n\n  getInitProgressCallback() {\n    return this.initProgressCallback;\n  }\n\n  setLogitProcessorRegistry(\n    logitProcessorRegistry?: Map<string, LogitProcessor>,\n  ) {\n    this.logitProcessorRegistry = logitProcessorRegistry;\n  }\n\n  /**\n   * Set MLCEngine logging output level\n   *\n   * @param logLevel The new log level\n   */\n  setLogLevel(logLevel: LogLevel) {\n    log.setLevel(logLevel);\n  }\n\n  //----------------------------------------\n  // 1. Model/pipeline loading and unloading\n  //----------------------------------------\n\n  async reload(\n    modelId: string | string[],\n    chatOpts?: ChatOptions | ChatOptions[],\n  ): Promise<void> {\n    // 0. Unload all loaded models\n    await this.unload();\n    // 1. Convert inputs to arrays\n    if (!Array.isArray(modelId)) {\n      modelId = [modelId];\n    }\n    if (chatOpts !== undefined && !Array.isArray(chatOpts)) {\n      chatOpts = [chatOpts];\n    }\n    // 2. Check whether size matches\n    if (chatOpts !== undefined && modelId.length !== chatOpts.length) {\n      throw new ReloadArgumentSizeUnmatchedError(\n        modelId.length,\n        chatOpts.length,\n      );\n    }\n    // 3. Make sure each model in modelId is unique\n    if (new Set(modelId).size < modelId.length) {\n      throw new ReloadModelIdNotUniqueError(modelId);\n    }\n    // 4. Sequentially load each model\n    // Single abort should stop all to-be-loaded models\n    this.reloadController = new AbortController();\n    try {\n      for (let i = 0; i < modelId.length; i++) {\n        await this.reloadInternal(\n          modelId[i],\n          chatOpts ? 
chatOpts[i] : undefined,\n        );\n      }\n    } catch (error) {\n      if (error instanceof DOMException && error.name === \"AbortError\") {\n        log.warn(\"Reload() is aborted.\", error.message);\n        return;\n      }\n      throw error;\n    } finally {\n      this.reloadController = undefined;\n    }\n  }\n\n  private async reloadInternal(\n    modelId: string,\n    chatOpts?: ChatOptions,\n  ): Promise<void> {\n    const logitProcessor = this.logitProcessorRegistry?.get(modelId);\n    const tstart = performance.now();\n\n    // look up and parse model record, record model type\n    const modelRecord = findModelRecord(modelId, this.appConfig);\n    const baseUrl =\n      typeof document !== \"undefined\"\n        ? document.URL\n        : globalThis.location.origin;\n    let modelUrl = cleanModelUrl(modelRecord.model);\n    if (!modelUrl.startsWith(\"http\")) {\n      modelUrl = new URL(modelUrl, baseUrl).href;\n    }\n    const modelType =\n      modelRecord.model_type === undefined || modelRecord.model_type === null\n        ? 
ModelType.LLM\n        : modelRecord.model_type;\n    this.loadedModelIdToModelType.set(modelId, modelType);\n\n    // instantiate cache\n    let configCache: tvmjs.ArtifactCacheTemplate;\n    if (this.appConfig.useIndexedDBCache) {\n      configCache = new tvmjs.ArtifactIndexedDBCache(\"webllm/config\");\n    } else {\n      configCache = new tvmjs.ArtifactCache(\"webllm/config\");\n    }\n\n    // load config\n    const configUrl = new URL(\"mlc-chat-config.json\", modelUrl).href;\n    const curModelConfig = {\n      ...(await configCache.fetchWithCache(\n        configUrl,\n        \"json\",\n        this.reloadController?.signal,\n      )),\n      ...modelRecord.overrides,\n      ...chatOpts,\n    } as ChatConfig;\n    this.loadedModelIdToChatConfig.set(modelId, curModelConfig);\n\n    // load tvm wasm\n    let wasmCache: tvmjs.ArtifactCacheTemplate;\n    if (this.appConfig.useIndexedDBCache) {\n      wasmCache = new tvmjs.ArtifactIndexedDBCache(\"webllm/wasm\");\n    } else {\n      wasmCache = new tvmjs.ArtifactCache(\"webllm/wasm\");\n    }\n\n    const wasmUrl = modelRecord.model_lib;\n    if (wasmUrl === undefined) {\n      throw new MissingModelWasmError(modelRecord.model_id);\n    }\n    const fetchWasmSource = async () => {\n      if (wasmUrl.includes(\"localhost\")) {\n        // do not cache wasm on local host as we might update code frequently\n        return (await fetch(wasmUrl)).arrayBuffer();\n      } else if (!wasmUrl.startsWith(\"http\")) {\n        // do not cache wasm on the same server as it can also refresh\n        // rely on the normal caching strategy\n        return (await fetch(new URL(wasmUrl, baseUrl).href)).arrayBuffer();\n      } else {\n        return await wasmCache.fetchWithCache(\n          wasmUrl,\n          \"arraybuffer\",\n          this.reloadController?.signal,\n        );\n      }\n    };\n    const wasmSource = await fetchWasmSource();\n\n    const wasm = new Uint8Array(wasmSource);\n    const tvm = await 
tvmjs.instantiate(\n      wasm.buffer,\n      tvmjs.createPolyfillWASI(),\n      this.logger,\n    );\n\n    if (this.initProgressCallback !== undefined) {\n      tvm.registerInitProgressCallback(this.initProgressCallback);\n    }\n\n    // detect GPU\n    const gpuDetectOutput = await tvmjs.detectGPUDevice();\n    if (gpuDetectOutput == undefined) {\n      throw new WebGPUNotAvailableError();\n    }\n    let gpuLabel = \"WebGPU\";\n    if (gpuDetectOutput.adapterInfo.description.length != 0) {\n      gpuLabel += \" - \" + gpuDetectOutput.adapterInfo.description;\n    } else {\n      gpuLabel += \" - \" + gpuDetectOutput.adapterInfo.vendor;\n    }\n    if (modelRecord.required_features !== undefined) {\n      for (const feature of modelRecord.required_features) {\n        if (!gpuDetectOutput.device.features.has(feature)) {\n          if (feature == \"shader-f16\") {\n            throw new ShaderF16SupportError();\n          }\n          throw new FeatureSupportError(feature);\n        }\n      }\n    }\n\n    // Most device lost happens in `reload()` since we allocate memory ahead of time. So we can\n    // use this flag at the end of `reload()` to make the error handling synchronous.\n    // This `.then()` exists throughout the lifetime of the device. Though we have not\n    // experienced device error outside of `reload()`, it is still possible this `.then()` is\n    // triggered outside of `reload()`. TODO: does this cause unexpected behavior?\n    let deviceLostInReload = false;\n    gpuDetectOutput.device.lost.then((info: any) => {\n      if (this.deviceLostIsError) {\n        log.error(\n          `Device was lost. This can happen due to insufficient memory or other GPU constraints. ` +\n            `Detailed error: ${info}. 
Please try to reload WebLLM with a less resource-intensive model.`,\n        );\n        this.unload();\n        deviceLostInReload = true;\n      }\n    });\n    tvm.initWebGPU(gpuDetectOutput.device);\n\n    const tokenizer = await asyncLoadTokenizer(\n      modelUrl,\n      curModelConfig,\n      this.appConfig,\n      this.logger,\n    );\n    const cacheType = this.appConfig.useIndexedDBCache ? \"indexeddb\" : \"cache\";\n    await tvm.fetchTensorCache(\n      modelUrl,\n      tvm.webgpu(),\n      \"webllm/model\",\n      cacheType,\n      this.reloadController?.signal,\n    );\n\n    // Instantiate pipeline\n    // TODO: would be good to somehow check for error when LLMChatPipeline is loaded for an\n    // embedding model, and prompt user to use ModelRecord.model_type\n    let newPipeline: LLMChatPipeline | EmbeddingPipeline;\n    if (modelRecord.model_type === ModelType.embedding) {\n      newPipeline = new EmbeddingPipeline(tvm, tokenizer, curModelConfig);\n    } else {\n      newPipeline = new LLMChatPipeline(\n        tvm,\n        tokenizer,\n        curModelConfig,\n        logitProcessor,\n      );\n    }\n    await newPipeline.asyncLoadWebGPUPipelines();\n    this.loadedModelIdToPipeline.set(modelId, newPipeline);\n    this.loadedModelIdToLock.set(modelId, new CustomLock());\n\n    // Clean up\n    const tend = performance.now();\n    if (this.initProgressCallback !== undefined) {\n      const text = \"Finish loading on \" + gpuLabel;\n      this.initProgressCallback({\n        progress: 1,\n        timeElapsed: (tend - tstart) / 1e3,\n        text: text,\n      });\n    }\n    if (deviceLostInReload) {\n      throw new DeviceLostError();\n    }\n  }\n\n  async unload() {\n    this.deviceLostIsError = false; // so that unload() does not trigger device.lost error\n    // TODO: can optimize by calling dispose() to all pipelines in parallel. However, need to wait\n    // for all sync() to finish before proceeding (e.g. 
naive forEach does not work)\n    for (const entry of Array.from(this.loadedModelIdToPipeline.entries())) {\n      const pipeline = entry[1];\n      pipeline.dispose();\n      // Wait until device is actually destroyed so we can safely set deviceLostIsError back to true\n      await pipeline.sync();\n    }\n    this.loadedModelIdToPipeline.clear();\n    this.loadedModelIdToChatConfig.clear();\n    this.loadedModelIdToModelType.clear();\n    this.loadedModelIdToLock.clear();\n    this.deviceLostIsError = true;\n    if (this.reloadController) {\n      this.reloadController.abort(\"Engine.unload() is called.\");\n      this.reloadController = undefined;\n    }\n  }\n\n  //---------------------------------------------------\n  // 2. Underlying auto-regressive generation functions\n  //---------------------------------------------------\n\n  private async _generate(\n    input:\n      | ChatCompletionRequestNonStreaming\n      | CompletionCreateParamsNonStreaming,\n    pipeline: LLMChatPipeline,\n    chatConfig: ChatConfig,\n    genConfig: GenerationConfig,\n  ): Promise<string> {\n    this.interruptSignal = false;\n    if (genConfig !== undefined) {\n      postInitAndCheckGenerationConfigValues(genConfig);\n    }\n    await this.prefill(input, pipeline, chatConfig, genConfig);\n\n    while (!pipeline.stopped()) {\n      if (this.interruptSignal) {\n        pipeline.triggerStop();\n        break;\n      }\n      await this.decode(pipeline, genConfig);\n    }\n    return pipeline.getMessage();\n  }\n\n  /**\n   * Similar to `_generate()`; but instead of using callback, we use an async iterable.\n   */\n  asyncGenerate(\n    request: ChatCompletionRequestStreaming,\n    model: string,\n    pipeline: LLMChatPipeline,\n    chatConfig: ChatConfig,\n    genConfig: GenerationConfig,\n    timeReceived: number,\n  ): AsyncGenerator<ChatCompletionChunk, void, void>;\n  asyncGenerate(\n    request: CompletionCreateParamsStreaming,\n    model: string,\n    pipeline: 
LLMChatPipeline,\n    chatConfig: ChatConfig,\n    genConfig: GenerationConfig,\n    timeReceived: number,\n  ): AsyncGenerator<Completion, void, void>;\n  async *asyncGenerate(\n    request: ChatCompletionRequestStreaming | CompletionCreateParamsStreaming,\n    model: string,\n    pipeline: LLMChatPipeline,\n    chatConfig: ChatConfig,\n    genConfig: GenerationConfig,\n    timeReceived: number,\n  ): AsyncGenerator<ChatCompletionChunk | Completion, void, void> {\n    // Since it is an async generator, we need to do fine-grained try-catch to ensure lock is\n    // released only when errors occur. Then release at the very end when no error occurs.\n    // TODO: This makes code less readable, is there a better way to do this?\n    const lock = this.loadedModelIdToLock.get(model)!;\n\n    // 0. Pre-processing\n    const isChatCompletion = \"messages\" in request;\n    const isFunctionCalling =\n      \"tools\" in request &&\n      request.tools !== undefined &&\n      request.tools !== null;\n    try {\n      if (isFunctionCalling && !isChatCompletion) {\n        throw new Error(\n          \"Expect `chat.completions` with tools, not `completions`.\",\n        );\n      }\n      postInitAndCheckGenerationConfigValues(genConfig);\n      if (request.seed !== null && request.seed !== undefined) {\n        pipeline.setSeed(request.seed);\n      }\n    } catch (err) {\n      await lock.release();\n      throw err;\n    }\n\n    // 1. 
Helper function that generates the chunk\n    const created = Date.now();\n    const id = crypto.randomUUID();\n    this.interruptSignal = false;\n    let prevMessageLength = 0; // to know where to start slicing the delta; does not count �\n\n    function _countTrailingReplacementChar(curMessage: string): number {\n      let cntr = 0;\n      for (let i = curMessage.length - 1; i >= 0; i--) {\n        if (curMessage.charAt(i) === \"�\") {\n          cntr += 1;\n        } else {\n          return cntr;\n        }\n      }\n      return cntr;\n    }\n\n    async function _getChunk(\n      selectedPipeline: LLMChatPipeline,\n    ): Promise<ChatCompletionChunk | Completion | undefined> {\n      // Remove the replacement character (U+FFFD) from the response to handle emojis.\n      // Each emoji is made up of multiples of 4 tokens; when truncated, it is displayed as �, so\n      // we skip this delta until a full emoji is rendered\n      // TODO(Charlie): This does not consider cases of � not being emoji, need to fix with Streamer\n      const curMessage = selectedPipeline.getMessage();\n      const numTrailingReplacementChar =\n        _countTrailingReplacementChar(curMessage);\n      if (numTrailingReplacementChar % 4 !== 0) {\n        return undefined;\n      }\n\n      const deltaMessage = curMessage.slice(prevMessageLength);\n      prevMessageLength = curMessage.length;\n      const logprobs = request.logprobs\n        ? 
({\n            content: selectedPipeline.getTokenLogprobArray().slice(-1), // always the last entry\n          } as ChatCompletionChunk.Choice.Logprobs)\n        : null;\n      if (isChatCompletion) {\n        const chunk: ChatCompletionChunk = {\n          id: id,\n          choices: [\n            {\n              delta: { content: deltaMessage, role: \"assistant\" },\n              finish_reason: null, // not finished yet\n              index: 0,\n              logprobs: logprobs,\n            },\n          ],\n          model: model,\n          object: \"chat.completion.chunk\",\n          created: created,\n        };\n        return chunk;\n      } else {\n        const chunk: Completion = {\n          id: id,\n          choices: [\n            {\n              text: deltaMessage,\n              finish_reason: null, // not finished yet\n              index: 0,\n              logprobs: logprobs,\n            },\n          ],\n          model: model,\n          object: \"text_completion\",\n          created: created,\n        };\n        return chunk;\n      }\n    }\n\n    // 2. Auto-regressive loop\n    let curChunk;\n    try {\n      await this.prefill(request, pipeline, chatConfig, genConfig);\n      curChunk = await _getChunk(pipeline); // prefill produces a chunk\n    } catch (err) {\n      await lock.release();\n      throw err;\n    }\n    if (curChunk) {\n      yield curChunk;\n    }\n\n    while (!pipeline.stopped()) {\n      if (this.interruptSignal) {\n        // TODO: should we directly release lock here and return the async\n        // generator? 
Though no issue observed as of now with interruptGenerate()\n        pipeline.triggerStop();\n        break;\n      }\n      try {\n        await this.decode(pipeline, genConfig);\n        curChunk = await _getChunk(pipeline);\n      } catch (err) {\n        await lock.release();\n        throw err;\n      }\n      if (curChunk) {\n        yield curChunk;\n      }\n    }\n\n    // Reset seed -- we do not want this seed to affect future requests\n    if (request.seed !== null && request.seed !== undefined) {\n      pipeline.setSeed(Date.now());\n    }\n\n    // 3. Last chunk empty marking the end\n    // If function calling, use the last chunk to return tool_calls\n    let finish_reason = pipeline.getFinishReason()!;\n    let tool_calls:\n      | Array<ChatCompletionChunk.Choice.Delta.ToolCall>\n      | undefined;\n    try {\n      if (pipeline.getFinishReason() === \"stop\" && isFunctionCalling) {\n        // If stopped due to length or abort, cannot output return tool_calls field\n        finish_reason = \"tool_calls\";\n        const outputMessage = pipeline.getMessage();\n        tool_calls = getToolCallFromOutputMessage(\n          outputMessage,\n          /*isStreaming=*/ true,\n        ) as Array<ChatCompletionChunk.Choice.Delta.ToolCall>;\n      }\n    } catch (err) {\n      await lock.release();\n      throw err;\n    }\n\n    if (isChatCompletion) {\n      const lastChunk: ChatCompletionChunk = {\n        id: id,\n        choices: [\n          {\n            delta: isFunctionCalling\n              ? 
{\n                  role: \"assistant\",\n                  tool_calls: tool_calls,\n                }\n              : {},\n            finish_reason: finish_reason,\n            index: 0,\n          },\n        ],\n        model: model,\n        object: \"chat.completion.chunk\",\n        created: created,\n      };\n      yield lastChunk;\n    } else {\n      const lastChunk: Completion = {\n        id: id,\n        choices: [\n          {\n            text: \"\",\n            finish_reason: finish_reason,\n            index: 0,\n          },\n        ],\n        model: model,\n        object: \"text_completion\",\n        created: created,\n      };\n      yield lastChunk;\n    }\n\n    // 4. Usage chunk\n    if (request.stream_options?.include_usage) {\n      const usedGrammar =\n        \"response_format\" in request &&\n        (request.response_format?.type === \"grammar\" ||\n          request.response_format?.type === \"json_object\");\n      const completion_tokens = pipeline.getCurRoundDecodingTotalTokens();\n      const prompt_tokens = pipeline.getCurRoundPrefillTotalTokens();\n      const prefill_tokens_per_s = pipeline.getCurRoundPrefillTokensPerSec();\n      const decode_tokens_per_s = pipeline.getCurRoundDecodingTokensPerSec();\n      const grammar_init_s = pipeline.getCurRoundGrammarInitTotalTime();\n      const prefill_time = pipeline.getCurRoundPrefillTotalTime();\n      const decode_time = pipeline.getCurRoundDecodingTotalTime();\n      const grammar_per_token_s =\n        pipeline.getCurRoundGrammarPerTokenTotalTime();\n      const latencyBreakdown: LatencyBreakdown =\n        pipeline.getCurRoundLatencyBreakdown();\n\n      const defaultExtra = {\n        e2e_latency_s: (Date.now() - timeReceived) / 1000,\n        prefill_tokens_per_s: prefill_tokens_per_s,\n        decode_tokens_per_s: decode_tokens_per_s,\n        time_to_first_token_s: prefill_time,\n        time_per_output_token_s: decode_time / completion_tokens,\n        
latencyBreakdown: request.extra_body?.enable_latency_breakdown\n          ? latencyBreakdown\n          : undefined,\n      };\n      const usage: CompletionUsage = {\n        completion_tokens: completion_tokens,\n        prompt_tokens: prompt_tokens,\n        total_tokens: completion_tokens + prompt_tokens,\n        extra: usedGrammar\n          ? {\n              ...defaultExtra,\n              ...{\n                grammar_init_s: grammar_init_s,\n                grammar_per_token_s: grammar_per_token_s / completion_tokens,\n              },\n            }\n          : defaultExtra,\n      };\n      if (isChatCompletion) {\n        const usageChunk: ChatCompletionChunk = {\n          id: id,\n          choices: [],\n          usage: usage,\n          model: model,\n          object: \"chat.completion.chunk\",\n          created: created,\n        };\n        yield usageChunk;\n      } else {\n        const usageChunk: Completion = {\n          id: id,\n          choices: [],\n          usage: usage,\n          model: model,\n          object: \"text_completion\",\n          created: created,\n        };\n        yield usageChunk;\n      }\n    }\n\n    await lock.release();\n  }\n\n  async interruptGenerate() {\n    this.interruptSignal = true;\n  }\n\n  //------------------------------\n  // 3. High-level generation APIs\n  //------------------------------\n\n  /**\n   * Completes a single ChatCompletionRequest.\n   *\n   * @param request A OpenAI-style ChatCompletion request.\n   *\n   * @note For each choice (i.e. `n`), a request is defined by a single `prefill()` and multiple\n   * `decode()`. 
This is important as it determines the behavior of various fields including `seed`.\n   */\n  async chatCompletion(\n    request: ChatCompletionRequestNonStreaming,\n  ): Promise<ChatCompletion>;\n  async chatCompletion(\n    request: ChatCompletionRequestStreaming,\n  ): Promise<AsyncIterable<ChatCompletionChunk>>;\n  async chatCompletion(\n    request: ChatCompletionRequestBase,\n  ): Promise<AsyncIterable<ChatCompletionChunk> | ChatCompletion>;\n  async chatCompletion(\n    request: ChatCompletionRequest,\n  ): Promise<AsyncIterable<ChatCompletionChunk> | ChatCompletion> {\n    const timeReceived = Date.now();\n    // 0. Check model loaded and preprocess inputs\n    const [selectedModelId, selectedPipeline, selectedChatConfig] =\n      this.getLLMStates(\"ChatCompletionRequest\", request.model);\n    const selectedModelType =\n      this.loadedModelIdToModelType.get(selectedModelId);\n    API.postInitAndCheckFieldsChatCompletion(\n      request,\n      selectedModelId,\n      selectedModelType!,\n    );\n    const genConfig: GenerationConfig = {\n      frequency_penalty: request.frequency_penalty,\n      presence_penalty: request.presence_penalty,\n      repetition_penalty: request.repetition_penalty,\n      max_tokens: request.max_tokens,\n      stop: request.stop,\n      top_p: request.top_p,\n      temperature: request.temperature,\n      logit_bias: request.logit_bias,\n      logprobs: request.logprobs,\n      top_logprobs: request.top_logprobs,\n      response_format: request.response_format,\n      ignore_eos: request.ignore_eos,\n      enable_thinking: request.extra_body?.enable_thinking,\n      enable_latency_breakdown: request.extra_body?.enable_latency_breakdown,\n    };\n\n    // 0.5 Block wait until this pipeline finishes all previous requests\n    const lock = this.loadedModelIdToLock.get(selectedModelId)!;\n    await lock.acquire();\n\n    // 1. 
If request is streaming, return an AsyncIterable (an iterable version of `_generate()`)\n    if (request.stream) {\n      return this.asyncGenerate(\n        request,\n        selectedModelId,\n        selectedPipeline,\n        selectedChatConfig,\n        genConfig,\n        timeReceived,\n      );\n    }\n\n    // Big try-finally to release lock in case of errors\n    try {\n      if (request.seed !== null && request.seed !== undefined) {\n        selectedPipeline.setSeed(request.seed);\n      }\n\n      // 2. If request is non-streaming, directly reuse `_generate()`\n      const n = request.n ? request.n : 1;\n      const choices: Array<ChatCompletion.Choice> = [];\n      let completion_tokens = 0;\n      let prompt_tokens = 0;\n      let prefill_time = 0;\n      let decode_time = 0;\n      let grammar_init_s = 0;\n      let grammar_per_token_s = 0;\n      for (let i = 0; i < n; i++) {\n        let outputMessage: string;\n        if (this.interruptSignal) {\n          // A single interrupt signal should stop all choices' generations\n          selectedPipeline.triggerStop();\n          outputMessage = \"\";\n        } else {\n          outputMessage = await this._generate(\n            request,\n            selectedPipeline,\n            selectedChatConfig,\n            genConfig,\n          );\n        }\n        let finish_reason = selectedPipeline.getFinishReason()!;\n\n        // 3. 
Post processing for function calling\n        const isFunctionCalling =\n          request.tools !== undefined && request.tools !== null;\n        let tool_calls: Array<ChatCompletionMessageToolCall> | undefined;\n        if (\n          selectedPipeline.getFinishReason() === \"stop\" &&\n          isFunctionCalling\n        ) {\n          // If stopped due to length or abort, cannot output return tool_calls field\n          finish_reason = \"tool_calls\";\n          tool_calls = getToolCallFromOutputMessage(\n            outputMessage,\n            /*isStreaming=*/ false,\n          );\n        }\n\n        choices.push({\n          finish_reason: finish_reason,\n          index: i,\n          logprobs: request.logprobs\n            ? ({\n                content: selectedPipeline.getTokenLogprobArray(),\n              } as ChatCompletion.Choice.Logprobs)\n            : null,\n          message: isFunctionCalling\n            ? {\n                content: null,\n                tool_calls: tool_calls,\n                role: \"assistant\",\n              }\n            : {\n                content: outputMessage,\n                role: \"assistant\",\n              },\n        });\n        completion_tokens += selectedPipeline.getCurRoundDecodingTotalTokens();\n        prompt_tokens += selectedPipeline.getCurRoundPrefillTotalTokens();\n        prefill_time += selectedPipeline.getCurRoundPrefillTotalTime();\n        decode_time += selectedPipeline.getCurRoundDecodingTotalTime();\n        grammar_init_s += selectedPipeline.getCurRoundGrammarInitTotalTime();\n        grammar_per_token_s +=\n          selectedPipeline.getCurRoundGrammarPerTokenTotalTime();\n      }\n      const usedGrammar =\n        \"response_format\" in request &&\n        (request.response_format?.type === \"grammar\" ||\n          request.response_format?.type === \"json_object\");\n\n      const latencyBreakdown: LatencyBreakdown =\n        selectedPipeline.getCurRoundLatencyBreakdown();\n\n      
const defaultExtra = {\n        e2e_latency_s: (Date.now() - timeReceived) / 1000,\n        prefill_tokens_per_s: prompt_tokens / prefill_time,\n        decode_tokens_per_s: completion_tokens / decode_time,\n        time_to_first_token_s: prefill_time,\n        time_per_output_token_s: decode_time / completion_tokens,\n        latencyBreakdown: request.extra_body?.enable_latency_breakdown\n          ? latencyBreakdown\n          : undefined,\n      };\n      const response: ChatCompletion = {\n        id: crypto.randomUUID(),\n        choices: choices,\n        model: selectedModelId,\n        object: \"chat.completion\",\n        created: Date.now(),\n        usage: {\n          completion_tokens: completion_tokens,\n          prompt_tokens: prompt_tokens,\n          total_tokens: completion_tokens + prompt_tokens,\n          extra: usedGrammar\n            ? {\n                ...defaultExtra,\n                ...{\n                  grammar_init_s: grammar_init_s,\n                  grammar_per_token_s: grammar_per_token_s / completion_tokens,\n                },\n              }\n            : defaultExtra,\n        } as CompletionUsage,\n      };\n\n      // Reset seed -- we do not want this seed to affect future requests\n      if (request.seed !== null && request.seed !== undefined) {\n        selectedPipeline.setSeed(Date.now());\n      }\n      return response;\n    } finally {\n      await lock.release();\n    }\n  }\n\n  /**\n   * Completes a single CompletionCreateParams, a text completion with no chat template.\n   *\n   * @param request A OpenAI-style Completion request.\n   *\n   * @note For each choice (i.e. `n`), a request is defined by a single `prefill()` and multiple\n   * `decode()`. 
This is important as it determines the behavior of various fields including `seed`.\n   */\n  async completion(\n    request: CompletionCreateParamsNonStreaming,\n  ): Promise<Completion>;\n  async completion(\n    request: CompletionCreateParamsStreaming,\n  ): Promise<AsyncIterable<Completion>>;\n  async completion(\n    request: CompletionCreateParamsBase,\n  ): Promise<AsyncIterable<Completion> | Completion>;\n  async completion(\n    request: CompletionCreateParams,\n  ): Promise<AsyncIterable<Completion> | Completion> {\n    const timeReceived = Date.now();\n\n    // 0. Check model loaded and preprocess inputs\n    const [selectedModelId, selectedPipeline, selectedChatConfig] =\n      this.getLLMStates(\"CompletionCreateParams\", request.model);\n    API.postInitAndCheckFieldsCompletion(request, selectedModelId);\n    const genConfig: GenerationConfig = {\n      frequency_penalty: request.frequency_penalty,\n      presence_penalty: request.presence_penalty,\n      repetition_penalty: request.repetition_penalty,\n      max_tokens: request.max_tokens,\n      stop: request.stop,\n      top_p: request.top_p,\n      temperature: request.temperature,\n      logit_bias: request.logit_bias,\n      logprobs: request.logprobs,\n      top_logprobs: request.top_logprobs,\n      ignore_eos: request.ignore_eos,\n    };\n\n    // 0.5 Block wait until this pipeline finishes all previous requests\n    const lock = this.loadedModelIdToLock.get(selectedModelId)!;\n    await lock.acquire();\n\n    // 1. 
If request is streaming, return an AsyncIterable (an iterable version of `_generate()`)\n    if (request.stream) {\n      return this.asyncGenerate(\n        request,\n        selectedModelId,\n        selectedPipeline,\n        selectedChatConfig,\n        genConfig,\n        timeReceived,\n      );\n    }\n\n    // Big try-finally to release lock in case of errors\n    try {\n      if (request.seed !== null && request.seed !== undefined) {\n        selectedPipeline.setSeed(request.seed);\n      }\n\n      // 2. If request is non-streaming, directly reuse `_generate()`\n      const n = request.n ? request.n : 1;\n      const choices: Array<CompletionChoice> = [];\n      let completion_tokens = 0;\n      let prompt_tokens = 0;\n      let prefill_time = 0;\n      let decode_time = 0;\n      for (let i = 0; i < n; i++) {\n        let outputMessage: string;\n        if (this.interruptSignal) {\n          // A single interrupt signal should stop all choices' generations\n          selectedPipeline.triggerStop();\n          outputMessage = \"\";\n        } else {\n          outputMessage = await this._generate(\n            request,\n            selectedPipeline,\n            selectedChatConfig,\n            genConfig,\n          );\n        }\n        const finish_reason = selectedPipeline.getFinishReason()!;\n\n        choices.push({\n          finish_reason: finish_reason,\n          index: i,\n          logprobs: request.logprobs\n            ? ({\n                content: selectedPipeline.getTokenLogprobArray(),\n              } as ChatCompletion.Choice.Logprobs)\n            : null,\n          text: request.echo ? 
request.prompt + outputMessage : outputMessage,\n        });\n        completion_tokens += selectedPipeline.getCurRoundDecodingTotalTokens();\n        prompt_tokens += selectedPipeline.getCurRoundPrefillTotalTokens();\n        prefill_time += selectedPipeline.getCurRoundPrefillTotalTime();\n        decode_time += selectedPipeline.getCurRoundDecodingTotalTime();\n      }\n\n      const latencyBreakdown: LatencyBreakdown =\n        selectedPipeline.getCurRoundLatencyBreakdown();\n\n      const response: Completion = {\n        id: crypto.randomUUID(),\n        choices: choices,\n        model: selectedModelId,\n        object: \"text_completion\",\n        created: Date.now(),\n        usage: {\n          completion_tokens: completion_tokens,\n          prompt_tokens: prompt_tokens,\n          total_tokens: completion_tokens + prompt_tokens,\n          extra: {\n            e2e_latency_s: (Date.now() - timeReceived) / 1000,\n            prefill_tokens_per_s: prompt_tokens / prefill_time,\n            decode_tokens_per_s: completion_tokens / decode_time,\n            time_to_first_token_s: prefill_time,\n            time_per_output_token_s: decode_time / completion_tokens,\n            latencyBreakdown: request.extra_body?.enable_latency_breakdown\n              ? latencyBreakdown\n              : undefined,\n          },\n        } as CompletionUsage,\n      };\n\n      // Reset seed -- we do not want this seed to affect future requests\n      if (request.seed !== null && request.seed !== undefined) {\n        selectedPipeline.setSeed(Date.now());\n      }\n      return response;\n    } finally {\n      await lock.release();\n    }\n  }\n\n  async embedding(\n    request: EmbeddingCreateParams,\n  ): Promise<CreateEmbeddingResponse> {\n    // 0. 
Preprocess inputs\n    const [selectedModelId, selectedPipeline] = this.getEmbeddingStates(\n      \"EmbeddingCreateParams\",\n      request.model,\n    );\n    API.postInitAndCheckFieldsEmbedding(request, selectedModelId);\n\n    // 0.5 Block wait until this pipeline finishes all previous requests\n    const lock = this.loadedModelIdToLock.get(selectedModelId)!;\n    await lock.acquire();\n\n    try {\n      // 1. Call EmbeddingPipeline to get embeddings\n      const embedResult: Array<Array<number>> =\n        await selectedPipeline.embedStep(request.input);\n\n      // 2. Prepare response\n      const batchSize = embedResult.length;\n      const data: Array<Embedding> = [];\n      for (let i = 0; i < batchSize; i++) {\n        const curEmbedding: Embedding = {\n          embedding: embedResult[i],\n          index: i,\n          object: \"embedding\",\n        };\n        data.push(curEmbedding);\n      }\n      return {\n        data: data,\n        model: selectedModelId,\n        object: \"list\",\n        usage: {\n          prompt_tokens: selectedPipeline.getCurRoundEmbedTotalTokens(),\n          total_tokens: selectedPipeline.getCurRoundEmbedTotalTokens(),\n          extra: {\n            prefill_tokens_per_s:\n              selectedPipeline.getCurRoundEmbedTokensPerSec(),\n          },\n        },\n      };\n    } finally {\n      await lock.release();\n    }\n  }\n\n  //-----------------------------\n  // 4. 
WebGPU info-querying helpers\n  //-----------------------------\n\n  async getMaxStorageBufferBindingSize(): Promise<number> {\n    // First detect GPU\n    const gpuDetectOutput = await tvmjs.detectGPUDevice();\n    if (gpuDetectOutput == undefined) {\n      throw new WebGPUNotAvailableError();\n    }\n\n    const computeMB = (value: number) => {\n      return Math.ceil(value / (1 << 20)) + \"MB\";\n    };\n    const maxStorageBufferBindingSize =\n      gpuDetectOutput.device.limits.maxStorageBufferBindingSize;\n    const defaultMaxStorageBufferBindingSize = 1 << 30; // 1GB\n    if (maxStorageBufferBindingSize < defaultMaxStorageBufferBindingSize) {\n      log.warn(\n        `WARNING: the current maxStorageBufferBindingSize ` +\n          `(${computeMB(maxStorageBufferBindingSize)}) ` +\n          `may only work for a limited number of models, e.g.: \\n` +\n          `- Llama-3.1-8B-Instruct-q4f16_1-MLC-1k \\n` +\n          `- Llama-2-7b-chat-hf-q4f16_1-MLC-1k \\n` +\n          `- RedPajama-INCITE-Chat-3B-v1-q4f16_1-MLC-1k \\n` +\n          `- RedPajama-INCITE-Chat-3B-v1-q4f32_1-MLC-1k \\n` +\n          `- TinyLlama-1.1B-Chat-v0.4-q4f16_1-MLC-1k \\n` +\n          `- TinyLlama-1.1B-Chat-v0.4-q4f32_1-MLC-1k`,\n      );\n    }\n    return maxStorageBufferBindingSize;\n  }\n\n  async getGPUVendor(): Promise<string> {\n    // First detect GPU\n    const gpuDetectOutput = await tvmjs.detectGPUDevice();\n    if (gpuDetectOutput == undefined) {\n      throw new WebGPUNotAvailableError();\n    }\n    return gpuDetectOutput.adapterInfo.vendor;\n  }\n\n  //---------------------------------------------------------------\n  // 5. 
Helper for querying currently loaded model/pipeline/config.\n  // Needed due to possibly multiple loaded models.\n  //---------------------------------------------------------------\n\n  private getLLMStates(\n    requestName: string,\n    modelId?: string | null,\n  ): [string, LLMChatPipeline, ChatConfig] {\n    return this.getModelStates(requestName, ModelType.LLM, modelId) as [\n      string,\n      LLMChatPipeline,\n      ChatConfig,\n    ];\n  }\n\n  private getEmbeddingStates(\n    requestName: string,\n    modelId?: string | null,\n  ): [string, EmbeddingPipeline, ChatConfig] {\n    return this.getModelStates(requestName, ModelType.embedding, modelId) as [\n      string,\n      EmbeddingPipeline,\n      ChatConfig,\n    ];\n  }\n\n  /**\n   * Return the model, its LLMChatPipeline, and ChatConfig to use. Throws error when unclear which\n   * model to load. Ensure all loadedModelIdToXXX maps contain entry for the selected modelId.\n   * @param requestName The type of request or API to load the model for. Needed for error throwing.\n   * @param modelType The type of model, determining what type of pipeline to expect.\n   * @param modelId Model the user specified to load via the request. Required when multiple\n   *   models are loaded\n   */\n  private getModelStates(\n    requestName: string,\n    modelType: ModelType,\n    modelId?: string | null,\n  ): [string, LLMChatPipeline | EmbeddingPipeline, ChatConfig] {\n    // 0. Select model based on request.model and loadedModelIds\n    const loadedModelIds: string[] = Array.from(\n      this.loadedModelIdToPipeline.keys(),\n    );\n    const selectedModelId: string = getModelIdToUse(\n      loadedModelIds,\n      modelId,\n      requestName,\n    );\n\n    // 1. 
Retrieve pipeline\n    const selectedPipeline = this.loadedModelIdToPipeline.get(selectedModelId);\n    if (modelType === ModelType.LLM) {\n      if (!(selectedPipeline instanceof LLMChatPipeline)) {\n        throw new IncorrectPipelineLoadedError(\n          selectedModelId,\n          \"LLMChatPipeline\",\n          requestName,\n        );\n      }\n    } else {\n      // ModelType.Embedding\n      if (!(selectedPipeline instanceof EmbeddingPipeline)) {\n        throw new IncorrectPipelineLoadedError(\n          selectedModelId,\n          \"EmbeddingPipeline\",\n          requestName,\n        );\n      }\n      if (\n        findModelRecord(selectedModelId, this.appConfig).model_type !==\n        ModelType.embedding\n      ) {\n        throw new EmbeddingUnsupportedModelError(selectedModelId);\n      }\n    }\n\n    // 2. Retrieve chat config\n    const selectedChatConfig =\n      this.loadedModelIdToChatConfig.get(selectedModelId);\n    if (selectedChatConfig === undefined) {\n      throw new Error(\n        `InternalError: chat config not registered for ${selectedModelId}.`,\n      );\n    }\n\n    // 3. Make sure lock is initialized\n    if (!this.loadedModelIdToLock.has(selectedModelId)) {\n      throw new Error(\n        `InternalError: loadedModelIdToLock does not contain ${selectedModelId}`,\n      );\n    }\n    return [selectedModelId, selectedPipeline, selectedChatConfig];\n  }\n\n  //--------------------------------------------------------------------\n  // 6. 
External low-level APIs that directly interacts with a pipeline.\n  //--------------------------------------------------------------------\n\n  async forwardTokensAndSample(\n    inputIds: Array<number>,\n    isPrefill: boolean,\n    modelId?: string,\n  ): Promise<number> {\n    const [, selectedPipeline] = this.getLLMStates(\n      \"forwardTokensAndSample\",\n      modelId,\n    );\n    return selectedPipeline.forwardTokensAndSample(inputIds, isPrefill);\n  }\n\n  /**\n   * Get the current generated response.\n   *\n   * @returns The current output message.\n   */\n  async getMessage(modelId?: string): Promise<string> {\n    const [, selectedPipeline] = this.getLLMStates(\"getMessage\", modelId);\n    return selectedPipeline.getMessage();\n  }\n\n  async runtimeStatsText(modelId?: string): Promise<string> {\n    log.warn(\n      \"WARNING: `runtimeStatsText()` will soon be deprecated. \" +\n        \"Please use `ChatCompletion.usage` for non-streaming requests, or \" +\n        \"`ChatCompletionChunk.usage` for streaming requests, enabled by `stream_options`. \" +\n        \"The only flow that expects to use `runtimeStatsText()` as of now is `forwardTokensAndSample()`.\",\n    );\n    const [, selectedPipeline] = this.getLLMStates(\"runtimeStatsText\", modelId);\n    return selectedPipeline.runtimeStatsText();\n  }\n\n  async resetChat(keepStats = false, modelId?: string) {\n    try {\n      const [, selectedPipeline] = this.getLLMStates(\"resetChat\", modelId);\n      selectedPipeline.resetChat(keepStats);\n    } catch (error) {\n      if (\n        error instanceof ModelNotLoadedError ||\n        error instanceof SpecifiedModelNotFoundError\n      ) {\n        // Only allow calling resetChat before pipeline instantiated.\n        log.debug(\n          \"Caught an expected error in resetChat, treating it as no-op. 
Error: \",\n          error,\n        );\n      } else {\n        throw error;\n      }\n    }\n  }\n\n  //-----------------------------------------------\n  // 7. Prefill and decode given an LLMChatPipeline\n  //-----------------------------------------------\n\n  /**\n   * Run a prefill step with a given input.\n   *\n   * If `input` is a chatCompletionRequest, we treat `input.messages[-1]` as the usual user input.\n   * We then convert `input.messages[:-1]` to a `Conversation` object, representing a conversation\n   * history.\n   *\n   * If the new `Conversation` object matches the current one loaded, it means we are\n   * performing multi-round chatting, so we do not reset, hence reusing KV cache. Otherwise, we\n   * reset everything, treating the request as something completely new.\n   *\n   * @param input The OpenAI-style prompt to prefill.\n   * @param pipeline The loaded pipeline, hence model, to carry out this prefill.\n   * @param chatConfig The chat config to use for this model.\n   * @param genConfig Generation config.\n   */\n  async prefill(\n    input: ChatCompletionRequest | CompletionCreateParams,\n    pipeline: LLMChatPipeline,\n    chatConfig: ChatConfig,\n    genConfig: GenerationConfig,\n  ) {\n    // TODO: SPECIFY MODEL TO PERFORM PREFILL, HENCE RETRIEVE CONFIG\n    if (chatConfig === undefined) {\n      throw new ConfigurationNotInitializedError();\n    }\n    let input_str: string;\n    let input_role_str: string | undefined;\n    let lastMsgRole = Role.user;\n    if (\"messages\" in input) {\n      // For ChatCompletionRequest, we prepare input using `messages`\n      // 1. 
Get new conversation based on request, determine if we are in multiround chatting\n      const oldConv = pipeline.getConversationObject();\n      const newConv = getConversationFromChatCompletionRequest(\n        input,\n        chatConfig,\n      );\n      if (!compareConversationObject(oldConv, newConv)) {\n        // Not the same conversation, so not multiround chatting, reset everything (KV cache, etc.)\n        pipeline.resetChat();\n        pipeline.setConversation(newConv);\n      } else if (newConv.messages.length === 0) {\n        // Empty oldConv, and no chat history in newConv, so reset and setConversation\n        pipeline.resetChat();\n        pipeline.setConversation(newConv);\n      } else {\n        log.info(\"Multiround chatting, reuse KVCache.\");\n      }\n\n      // 2. Treat the last message as the usual input\n      const last_msg = input.messages[\n        input.messages.length - 1\n      ] as ChatCompletionMessageParam;\n      input_str = last_msg.content as string;\n      input_role_str =\n        last_msg.role === \"user\" && last_msg.name ? last_msg.name : undefined;\n      lastMsgRole = last_msg.role === \"tool\" ? Role.tool : Role.user;\n    } else {\n      // For CompletionCreateParams, the input is just the prompt\n      input_str = input.prompt;\n      pipeline.resetChat();\n      const newConv = getConversation(\n        chatConfig.conv_template,\n        chatConfig.conv_config,\n        true,\n      );\n      pipeline.setConversation(newConv);\n    }\n    return pipeline.prefillStep(\n      input_str,\n      lastMsgRole,\n      input_role_str,\n      genConfig,\n    );\n  }\n\n  /**\n   * Run a decode step to decode the next token.\n   */\n  async decode(pipeline: LLMChatPipeline, genConfig?: GenerationConfig) {\n    return pipeline.decodeStep(genConfig);\n  }\n}\n"
  },
  {
    "path": "src/error.ts",
    "content": "export class ModelNotFoundError extends Error {\n  constructor(modelId: string) {\n    super(\n      `Cannot find model record in appConfig for ${modelId}. Please check if the model ID is correct and included in the model_list configuration.`,\n    );\n    this.name = \"ModelNotFoundError\";\n  }\n}\n\nexport class ConfigValueError extends Error {\n  constructor(message: string) {\n    super(message);\n    this.name = \"ConfigValueError\";\n  }\n}\n\nexport class MinValueError extends ConfigValueError {\n  constructor(paramName: string, minValue: number) {\n    super(`Make sure \\`${paramName}\\` > ${minValue}.`);\n    this.name = \"MinValueError\";\n  }\n}\n\nexport class RangeError extends ConfigValueError {\n  constructor(\n    paramName: string,\n    minValue: number,\n    maxValue: number,\n    additionalMessage?: string,\n  ) {\n    super(\n      `Make sure ${minValue} < ${paramName} <= ${maxValue}.${additionalMessage ? \" \" + additionalMessage : \"\"}`,\n    );\n    this.name = \"RangeError\";\n  }\n}\n\nexport class NonNegativeError extends ConfigValueError {\n  constructor(paramName: string) {\n    super(`Make sure ${paramName} >= 0.`);\n    this.name = \"NonNegativeError\";\n  }\n}\n\nexport class InvalidNumberStringError extends ConfigValueError {\n  constructor(paramName: string, actualValue?: string) {\n    super(\n      `Make sure ${paramName} to be number represented in string.${actualValue ? 
\" Got \" + actualValue : \"\"}`,\n    );\n    this.name = \"InvalidNumberStringError\";\n  }\n}\n\nexport class DependencyError extends ConfigValueError {\n  constructor(\n    dependentParam: string,\n    requiredParam: string,\n    requiredValue: any,\n  ) {\n    super(\n      `${dependentParam} requires ${requiredParam} to be ${requiredValue}.`,\n    );\n    this.name = \"DependencyError\";\n  }\n}\n\nexport class WebGPUNotAvailableError extends Error {\n  constructor() {\n    super(\n      \"WebGPU is not supported in your current environment, but it is necessary to run the WebLLM engine. \" +\n        \"Please make sure that your browser supports WebGPU and that it is enabled in your browser settings. \" +\n        \"You can also consult your browser's compatibility chart to see if it supports WebGPU. \" +\n        \"For more information about WebGPU support in your browser, visit https://webgpureport.org/\",\n    );\n    this.name = \"WebGPUNotAvailableError\";\n  }\n}\n\nexport class WebGPUNotFoundError extends Error {\n  constructor() {\n    super(\"Cannot find WebGPU in the environment\");\n    this.name = \"WebGPUNotFoundError\";\n  }\n}\n\nexport class ModelNotLoadedError extends Error {\n  constructor(requestName: string) {\n    super(\n      `Model not loaded before trying to complete ${requestName}. Please ensure you have called ` +\n        `MLCEngine.reload(model) to load the model before initiating APIs, ` +\n        `or initialize your engine using CreateMLCEngine() with a valid model configuration.`,\n    );\n    this.name = \"ModelNotLoadedError\";\n  }\n}\n\nexport class WorkerEngineModelNotLoadedError extends Error {\n  constructor(engineName: string) {\n    super(\n      `${engineName} is not loaded with a model. 
Did you call \\`engine.reload()\\`?`,\n    );\n    this.name = \"WorkerEngineModelNotLoadedError\";\n  }\n}\n\nexport class MessageOrderError extends Error {\n  constructor(message: string) {\n    super(message);\n    this.name = \"MessageOrderError\";\n  }\n}\n\nexport class SystemMessageOrderError extends Error {\n  constructor() {\n    super(\"System prompt should always be the first message in `messages`.\");\n    this.name = \"SystemMessageOrderError\";\n  }\n}\n\nexport class ContentTypeError extends Error {\n  constructor(name: string) {\n    super(`${name} should have string content.`);\n    this.name = \"ContentTypeError\";\n  }\n}\n\nexport class UnsupportedRoleError extends Error {\n  constructor(role: string) {\n    super(`Unsupported role of message: ${role}`);\n    this.name = \"UnsupportedRoleError\";\n  }\n}\n\nexport class UserMessageContentErrorForNonVLM extends Error {\n  constructor(modelId: string, modelType: string, content: any) {\n    super(\n      `The model loaded is not of type ModelType.VLM (vision-language model). ` +\n        `Therefore, user message only supports string content, but received: ${content}\\n` +\n        `Loaded modelId: ${modelId}, modelType: ${modelType}`,\n    );\n    this.name = \"UserMessageContentErrorForNonVLM\";\n  }\n}\n\nexport class PrefillChunkSizeSmallerThanImageError extends Error {\n  constructor(prefillChunkSize: number, imageEmbedSize: number) {\n    super(\n      `prefillChunkSize needs to be greater than imageEmbedSize because a single image's ` +\n        `prefill cannot be chunked. Got prefillChunkSize: ` +\n        `${prefillChunkSize}, imageEmbedSize: ${imageEmbedSize}`,\n    );\n    this.name = \"PrefillChunkSizeSmallerThanImageError\";\n  }\n}\n\nexport class CannotFindImageEmbedError extends Error {\n  constructor() {\n    super(\n      `Received image input but model does not have kernel image_embed. 
` +\n        `Make sure to only pass in image to a vision model.`,\n    );\n    this.name = \"CannotFindImageEmbedError\";\n  }\n}\n\nexport class UnsupportedDetailError extends Error {\n  constructor(detail: string) {\n    super(\n      `Currently do not support field image_url.detail, but received: ${detail}`,\n    );\n    this.name = \"UnsupportedDetailError\";\n  }\n}\n\nexport class UnsupportedImageURLError extends Error {\n  constructor(url: string) {\n    super(\n      `image_url.url should start with \"data:image\" for base64, or with \"http\", but got: ${url}`,\n    );\n    this.name = \"UnsupportedImageURLError\";\n  }\n}\n\nexport class MultipleTextContentError extends Error {\n  constructor() {\n    super(\n      `Each message can have at most one text contentPart, but received more than 1.`,\n    );\n    this.name = \"MultipleTextContentError\";\n  }\n}\n\nexport class ToolCallOutputParseError extends Error {\n  constructor(outputMessage: string, error: Error) {\n    super(\n      `Internal error: error encountered when parsing outputMessage for function ` +\n        `calling. Got outputMessage: ${outputMessage}\\nGot error: ${error}`,\n    );\n    this.name = \"ToolCallOutputParseError\";\n  }\n}\n\nexport class ToolCallOutputInvalidTypeError extends Error {\n  constructor(expectedType: string) {\n    super(\n      `Internal error: expect output of function calling to be an ${expectedType}`,\n    );\n    this.name = \"ToolCallOutputInvalidTypeError\";\n  }\n}\n\nexport class ToolCallOutputMissingFieldsError extends Error {\n  constructor(missingFields: string[], object: any) {\n    super(\n      `Expect generated tool call to have fields ${missingFields.map((field) => `\"\\`${field}\\`\"`).join(\", \")}, but got object: ${JSON.stringify(object)}`,\n    );\n    this.name = \"JSONFieldError\";\n  }\n}\n\nexport class ConfigurationNotInitializedError extends Error {\n  constructor() {\n    super(\n      \"Configuration not initialized. 
Ensure you have called `reload()` function first.\",\n    );\n    this.name = \"ConfigurationNotInitializedError\";\n  }\n}\n\nexport class MissingModelWasmError extends Error {\n  constructor(modelId: string) {\n    super(\n      `Missing \\`model_lib\\` for the model with ID \"${modelId}\". Please ensure that \\`model_lib\\` is provided in \\`model_list\\` for each model. This URL is essential for downloading the WASM library necessary to run the model.`,\n    );\n    this.name = \"MissingModelError\";\n  }\n}\n\nexport class FeatureSupportError extends Error {\n  constructor(feature: string) {\n    super(\n      `This model requires feature ${feature}, which is not yet supported by this browser.`,\n    );\n    this.name = \"FeatureSupportError\";\n  }\n}\n\nexport class UnsupportedFieldsError extends Error {\n  constructor(unsupportedFields: string[], targetClass: string) {\n    super(\n      `The following fields in ${targetClass} are not yet supported: \\n` +\n        unsupportedFields.join(\", \"),\n    );\n    this.name = \"UnsupportedFieldsError\";\n  }\n}\n\nexport class ShaderF16SupportError extends FeatureSupportError {\n  constructor() {\n    super(\n      \"This model requires WebGPU extension shader-f16, which is not enabled in this browser. \" +\n        'You can try to launch Chrome Canary in command line with flag \"--enable-dawn-features=allow_unsafe_apis\".',\n    );\n    this.name = \"ShaderF16SupportError\";\n  }\n}\nexport class DeviceLostError extends Error {\n  constructor() {\n    super(\n      \"The WebGPU device was lost while loading the model. This issue often occurs due to running out of memory (OOM). To resolve this, try reloading with a model that has fewer parameters or uses a smaller context length.\",\n    );\n    this.name = \"DeviceLostError\";\n  }\n}\n\nexport class InvalidToolChoiceError extends Error {\n  constructor(toolChoice: string) {\n    super(\n      `Invalid tool choice value: '${toolChoice}'. 
Please check your input and try again.`,\n    );\n    this.name = \"InvalidToolChoiceError\";\n  }\n}\n\nexport class UnsupportedToolChoiceTypeError extends Error {\n  constructor() {\n    super(\n      \"Unsupported tool choice type. Only tool choices of type 'function' are supported.\",\n    );\n    this.name = \"UnsupportedToolChoiceTypeError\";\n  }\n}\n\nexport class FunctionNotFoundError extends Error {\n  constructor(functionName: string) {\n    super(\n      `The tool choice function ${functionName} is not found in the tools list`,\n    );\n    this.name = \"FunctionNotFoundError\";\n  }\n}\n\nexport class UnsupportedToolTypeError extends Error {\n  constructor() {\n    super(\"Only 'function' tool type is supported\");\n    this.name = \"UnsupportedToolTypeError\";\n  }\n}\nexport class EngineNotLoadedError extends Error {\n  constructor() {\n    super(\n      \"Engine not yet loaded with model. Ensure you initialize the chat module by calling `engine.reload()` first.\",\n    );\n    this.name = \"EngineNotLoadedError\";\n  }\n}\nexport class UnsupportedTokenizerFilesError extends Error {\n  constructor(files: string[]) {\n    super(`Cannot handle tokenizer files ${files}`);\n    this.name = \"UnsupportedTokenizerFilesError\";\n  }\n}\n\nexport class WindowSizeConfigurationError extends Error {\n  constructor(contextWindowSize: number, slidingWindowSize: number) {\n    super(\n      `Only one of context_window_size and sliding_window_size can be positive. Got: ` +\n        `context_window_size: ${contextWindowSize}, sliding_window_size: ${slidingWindowSize}\\n` +\n        `Consider modifying ModelRecord.overrides to set one of them to -1.`,\n    );\n    this.name = \"WindowSizeConfigurationError\";\n  }\n}\n\nexport class AttentionSinkSizeError extends Error {\n  constructor() {\n    super(\n      \"Need to specify non-negative attention_sink_size if using sliding window. \" +\n        \"Consider modifying ModelRecord.overrides. 
\" +\n        \"Use `attention_sink_size=0` for default sliding window.\",\n    );\n    this.name = \"AttentionSinkSizeError\";\n  }\n}\n\nexport class WindowSizeSpecificationError extends Error {\n  constructor() {\n    super(\n      \"Need to specify either sliding_window_size or max_window_size.\\n\" +\n        \"Consider modifying ModelRecord.overrides to set one of them to positive.\",\n    );\n    this.name = \"WindowSizeSpecificationError\";\n  }\n}\n\nexport class ContextWindowSizeExceededError extends Error {\n  constructor(numPromptTokens: number, contextWindowSize: number) {\n    super(\n      `Prompt tokens exceed context window size: number of prompt tokens: ${numPromptTokens}; ` +\n        `context window size: ${contextWindowSize}\\nConsider shortening the prompt, or increase ` +\n        \"`context_window_size`, or using sliding window via `sliding_window_size`.\",\n    );\n    this.name = \"ContextWindowSizeExceededError\";\n  }\n}\n\nexport class NonWorkerEnvironmentError extends Error {\n  constructor(className: string) {\n    super(`${className} must be created in the service worker script.`);\n    this.name = \"NonWorkerEnvironmentError\";\n  }\n}\n\nexport class NoServiceWorkerAPIError extends Error {\n  constructor() {\n    super(\n      \"Service worker API is not available in your browser. Please ensure that your browser supports service workers and that you are using a secure context (HTTPS). \" +\n        \"Check the browser compatibility and ensure that service workers are not disabled in your browser settings.\",\n    );\n    this.name = \"NoServiceWorkerAPIError\";\n  }\n}\n\nexport class ServiceWorkerInitializationError extends Error {\n  constructor() {\n    super(\n      \"Service worker failed to initialize. This could be due to a failure in the service worker registration process or because the service worker is not active. 
\" +\n        \"Please refresh the page to retry initializing the service worker.\",\n    );\n    this.name = \"ServiceWorkerInitializationError\";\n  }\n}\n\nexport class StreamingCountError extends Error {\n  constructor() {\n    super(\"When streaming, `n` cannot be > 1.\");\n    this.name = \"StreamingCountError\";\n  }\n}\n\nexport class SeedTypeError extends Error {\n  constructor(seed: any) {\n    super(\"`seed` should be an integer, but got \" + seed);\n    this.name = \"SeedTypeError\";\n  }\n}\nexport class InvalidResponseFormatError extends Error {\n  constructor() {\n    super(\"JSON schema is only supported with `json_object` response format.\");\n    this.name = \"InvalidResponseFormatError\";\n  }\n}\n\nexport class InvalidResponseFormatGrammarError extends Error {\n  constructor() {\n    super(\n      \"When ResponseFormat.type is `grammar`, ResponseFormat.grammar needs to be specified.\\n\" +\n        \"When ResponseFormat.grammar is specified, ResponseFormat.type needs to be grammar.\",\n    );\n    this.name = \"InvalidResponseFormatGrammarError\";\n  }\n}\n\nexport class InvalidResponseFormatStructuralTagError extends Error {\n  constructor() {\n    super(\n      \"When ResponseFormat.type is `structural_tag`, ResponseFormat.structural_tag needs to be specified.\\n\" +\n        \"When ResponseFormat.structural_tag is specified, ResponseFormat.type needs to be structural_tag.\",\n    );\n    this.name = \"InvalidResponseFormatStructuralTagError\";\n  }\n}\n\nexport class CustomResponseFormatError extends Error {\n  constructor(currentFormat: any) {\n    super(\n      \"When using Hermes-2-Pro function calling via ChatCompletionRequest.tools, \" +\n        \"cannot specify customized response_format. We will set it for you internally. 
Currently \" +\n        \"set to: \" +\n        JSON.stringify(currentFormat),\n    );\n    this.name = \"CustomResponseFormatError\";\n  }\n}\nexport class UnsupportedModelIdError extends Error {\n  constructor(currentModelId: string, supportedModelIds: string[]) {\n    super(\n      `${currentModelId} is not supported for ChatCompletionRequest.tools. Currently, models ` +\n        `that support function calling are: ${supportedModelIds.join(\", \")}`,\n    );\n    this.name = \"UnsupportedModelIdError\";\n  }\n}\nexport class CustomSystemPromptError extends Error {\n  constructor() {\n    super(\n      \"When using Hermes-2-Pro function calling via ChatCompletionRequest.tools, cannot specify customized system prompt.\",\n    );\n    this.name = \"CustomSystemPromptError\";\n  }\n}\n\nexport class InvalidStreamOptionsError extends Error {\n  constructor() {\n    super(\"Only specify stream_options when stream=True.\");\n    this.name = \"InvalidStreamOptionsError\";\n  }\n}\nexport class UnknownMessageKindError extends Error {\n  constructor(msgKind: string, msgContent: any) {\n    super(`Unknown message kind, msg: [${msgKind}] ${msgContent}`);\n    this.name = \"UnknownMessageKindError\";\n  }\n}\n\nexport class TextCompletionExpectsKVEmptyError extends Error {\n  constructor() {\n    super(\"Non-chat text completion API expects KVCache to be empty.\");\n    this.name = \"TextCompletionExpectsKVEmptyError\";\n  }\n}\n\nexport class TextCompletionConversationExpectsPrompt extends Error {\n  constructor() {\n    super(\n      \"Non-chat text completion API expects isTextCompletion is true, and prompt is defined.\",\n    );\n    this.name = \"TextCompletionConversationExpectsPrompt\";\n  }\n}\n\nexport class TextCompletionConversationError extends Error {\n  constructor(funcName: string) {\n    super(`Non-chat text completion API cannot call ${funcName}.`);\n    this.name = \"TextCompletionConversationError\";\n  }\n}\n\nexport class 
EmbeddingUnsupportedEncodingFormatError extends Error {\n  constructor() {\n    super(\"Embedding in base64 format is currently not supported.\");\n    this.name = \"EmbeddingUnsupportedEncodingFormatError\";\n  }\n}\n\nexport class EmbeddingUnsupportedModelError extends Error {\n  constructor(currentModel: string) {\n    super(\n      `Trying to run embeddings.create() with ${currentModel}, which does not have ` +\n        `ModelRecord.model_type === ModelType.embedding in the model record. ` +\n        `Either make sure an embedding model is loaded, or specify the model type in ModelRecord.`,\n    );\n    this.name = \"EmbeddingUnsupportedModelError\";\n  }\n}\n\nexport class EmbeddingSlidingWindowError extends Error {\n  constructor(sliding_window_size: number) {\n    super(\n      `Embedding should not use sliding window. However, ` +\n        `sliding_window_size=${sliding_window_size} is specified in the chat config.`,\n    );\n    this.name = \"EmbeddingSlidingWindowError\";\n  }\n}\n\nexport class EmbeddingChunkingUnsupportedError extends Error {\n  constructor(contextWindowSize: number, prefillChunkSize: number) {\n    super(\n      `Embedding currently does not support chunking. Make sure ` +\n        `contextWindowSize === prefillChunkSize. 
Got contextWindowSize=${contextWindowSize}, ` +\n        `prefillChunkSize=${prefillChunkSize} instead.`,\n    );\n    this.name = \"EmbeddingChunkingUnsupportedError\";\n  }\n}\n\nexport class EmbeddingExceedContextWindowSizeError extends Error {\n  constructor(contextWindowSize: number, receivedSize: number) {\n    super(\n      `The embedding model you are using only supports up to ${contextWindowSize} context size. ` +\n        `However, an input in the batch has size ${receivedSize}.`,\n    );\n    this.name = \"EmbeddingExceedContextWindowSizeError\";\n  }\n}\n\nexport class EmbeddingInputEmptyError extends Error {\n  constructor() {\n    super(\"Embedding input cannot be empty string or empty token array.\");\n    this.name = \"EmbeddingInputEmptyError\";\n  }\n}\n\nexport class ReloadArgumentSizeUnmatchedError extends Error {\n  constructor(numModelId: number, numChatOpts: number) {\n    super(\n      `Expect chatOpts, if specified, to match the size of modelId. However, got ` +\n        `${numModelId} modelId, but ${numChatOpts} chatOpts.`,\n    );\n    this.name = \"ReloadArgumentSizeUnmatchedError\";\n  }\n}\n\nexport class UnclearModelToUseError extends Error {\n  constructor(loadedModels: string[], requestName: string) {\n    super(\n      `Multiple models are loaded in engine. Please specify the model in ${requestName}.\\n` +\n        `Currently loaded models are:\\n${loadedModels}`,\n    );\n    this.name = \"UnclearModelToUseError\";\n  }\n}\n\nexport class SpecifiedModelNotFoundError extends Error {\n  constructor(\n    loadedModels: string[],\n    requestedModelId: string,\n    requestName: string,\n  ) {\n    super(\n      `Specified model ${requestedModelId} for ${requestName} is not found in loaded models. ` +\n        `Please check if the correct model is loaded/specified. 
` +\n        `Currently loaded models are:\\n${loadedModels}`,\n    );\n    this.name = \"SpecifiedModelNotFoundError\";\n  }\n}\n\nexport class IncorrectPipelineLoadedError extends Error {\n  constructor(\n    selectedModelId: string,\n    expectedPipeline: string,\n    requestName: string,\n  ) {\n    super(\n      `${requestName} expects model to be loaded with ${expectedPipeline}. However, ` +\n        `${selectedModelId} is not loaded with this pipeline.`,\n    );\n    this.name = \"IncorrectPipelineLoadedError\";\n  }\n}\n\nexport class ReloadModelIdNotUniqueError extends Error {\n  constructor(modelId: string[]) {\n    super(\n      `Models in modelId passed to reload() need to be unique. If you want to ` +\n        `load copies of the same model, consider making copies of the ModelRecord with ` +\n        `different model_id. Received modelId: ${modelId}`,\n    );\n    this.name = \"ReloadModelIdNotUniqueError\";\n  }\n}\n"
  },
  {
    "path": "src/extension_service_worker.ts",
    "content": "import * as tvmjs from \"@mlc-ai/web-runtime\";\nimport log from \"loglevel\";\nimport { ChatOptions, MLCEngineConfig } from \"./config\";\nimport { ReloadParams, WorkerRequest } from \"./message\";\nimport {\n  ChatWorker,\n  WebWorkerMLCEngineHandler,\n  WebWorkerMLCEngine,\n} from \"./web_worker\";\nimport { areArraysEqual, areChatOptionsListEqual } from \"./utils\";\nimport { WebGPUNotFoundError } from \"./error\";\n\nexport interface ExtensionMLCEngineConfig extends MLCEngineConfig {\n  extensionId?: string;\n  onDisconnect?: () => void;\n}\n\n/**\n * Worker handler that can be used in a ServiceWorker.\n *\n * @example\n *\n * const engine = new MLCEngine();\n * let handler;\n * chrome.runtime.onConnect.addListener(function (port) {\n *   if (handler === undefined) {\n *     handler = new ServiceWorkerMLCEngineHandler(engine, port);\n *   } else {\n *     handler.setPort(port);\n *   }\n *   port.onMessage.addListener(handler.onmessage.bind(handler));\n * });\n */\nexport class ServiceWorkerMLCEngineHandler extends WebWorkerMLCEngineHandler {\n  port: chrome.runtime.Port | null;\n\n  constructor(port: chrome.runtime.Port) {\n    super();\n    this.port = port;\n    port.onDisconnect.addListener(() => this.onPortDisconnect(port));\n  }\n\n  postMessage(msg: any) {\n    this.port?.postMessage(msg);\n  }\n\n  setPort(port: chrome.runtime.Port) {\n    this.port = port;\n    port.onDisconnect.addListener(() => this.onPortDisconnect(port));\n  }\n\n  onPortDisconnect(port: chrome.runtime.Port) {\n    if (port === this.port) {\n      this.port = null;\n    }\n  }\n\n  onmessage(event: any): void {\n    if (event.type === \"keepAlive\") {\n      return;\n    }\n\n    const msg = event as WorkerRequest;\n    if (msg.kind === \"reload\") {\n      this.handleTask(msg.uuid, async () => {\n        const params = msg.content as ReloadParams;\n        // If the modelId, chatOpts, and appConfig are the same, immediately return\n        if (\n          
areArraysEqual(this.modelId, params.modelId) &&\n          areChatOptionsListEqual(this.chatOpts, params.chatOpts)\n        ) {\n          log.info(\"Already loaded the model. Skip loading\");\n          const gpuDetectOutput = await tvmjs.detectGPUDevice();\n          if (gpuDetectOutput == undefined) {\n            throw new WebGPUNotFoundError();\n          }\n          let gpuLabel = \"WebGPU\";\n          if (gpuDetectOutput.adapterInfo.description.length != 0) {\n            gpuLabel += \" - \" + gpuDetectOutput.adapterInfo.description;\n          } else {\n            gpuLabel += \" - \" + gpuDetectOutput.adapterInfo.vendor;\n          }\n          this.engine.getInitProgressCallback()?.({\n            progress: 1,\n            timeElapsed: 0,\n            text: \"Finish loading on \" + gpuLabel,\n          });\n          return null;\n        }\n\n        await this.engine.reload(params.modelId, params.chatOpts);\n        this.modelId = params.modelId;\n        this.chatOpts = params.chatOpts;\n        return null;\n      });\n      return;\n    }\n\n    // All rest of message handling are the same as WebWorkerMLCEngineHandler\n    super.onmessage(event);\n  }\n}\n\n/**\n * Create a ServiceWorkerMLCEngine.\n *\n * @param modelId model_id of the model to load, either string or string[]. When multiple models\n *   are provided, we load all models sequentially. 
Each modelId needs to either be in\n *   `webllm.prebuiltAppConfig`, or in `engineConfig.appConfig`.\n * @param engineConfig Optionally configures the engine, see `webllm.MLCEngineConfig` for more.\n * @param chatOpts Extra options to optionally override the `mlc-chat-config.json` of `modelId`.\n *   The size of which needs to match that of `modelId`; chatOpts[i] will be used for modelId[i].\n * @param keepAliveMs The interval to send keep alive messages to the service worker.\n * See [Service worker lifecycle](https://developer.chrome.com/docs/extensions/develop/concepts/service-workers/lifecycle#idle-shutdown)\n * The default is 10s.\n * @returns An initialized `WebLLM.ServiceWorkerMLCEngine` with `modelId` loaded.\n */\nexport async function CreateServiceWorkerMLCEngine(\n  modelId: string | string[],\n  engineConfig?: ExtensionMLCEngineConfig,\n  chatOpts?: ChatOptions | ChatOptions[],\n  keepAliveMs = 10000,\n): Promise<ServiceWorkerMLCEngine> {\n  const serviceWorkerMLCEngine = new ServiceWorkerMLCEngine(\n    engineConfig,\n    keepAliveMs,\n  );\n  await serviceWorkerMLCEngine.reload(modelId, chatOpts);\n  return serviceWorkerMLCEngine;\n}\n\nclass PortAdapter implements ChatWorker {\n  port: chrome.runtime.Port;\n  private _onmessage!: (message: any) => void;\n\n  constructor(port: chrome.runtime.Port) {\n    this.port = port;\n    this.port.onMessage.addListener(this.handleMessage.bind(this));\n  }\n\n  // Wrapper to handle incoming messages and delegate to onmessage if available\n  private handleMessage(message: any) {\n    if (this._onmessage) {\n      this._onmessage(message);\n    }\n  }\n\n  // Getter and setter for onmessage to manage adding/removing listeners\n  get onmessage(): (message: any) => void {\n    return this._onmessage;\n  }\n\n  set onmessage(listener: (message: any) => void) {\n    this._onmessage = listener;\n  }\n\n  // Wrap port.postMessage to maintain 'this' context\n  postMessage = (message: any): void => {\n    
this.port.postMessage(message);\n  };\n}\n\n/**\n * A client of MLCEngine that exposes the same interface\n */\nexport class ServiceWorkerMLCEngine extends WebWorkerMLCEngine {\n  port: chrome.runtime.Port;\n  extensionId?: string;\n\n  constructor(engineConfig?: ExtensionMLCEngineConfig, keepAliveMs = 10000) {\n    const extensionId = engineConfig?.extensionId;\n    const onDisconnect = engineConfig?.onDisconnect;\n    const port = extensionId\n      ? chrome.runtime.connect(extensionId, {\n          name: \"web_llm_service_worker\",\n        })\n      : chrome.runtime.connect({ name: \"web_llm_service_worker\" });\n    const chatWorker = new PortAdapter(port);\n    super(chatWorker, engineConfig);\n    this.port = port;\n    this.extensionId = extensionId;\n\n    // Keep alive through periodical heartbeat signals\n    const keepAliveTimer = setInterval(() => {\n      this.worker.postMessage({ kind: \"keepAlive\" });\n    }, keepAliveMs);\n\n    port.onDisconnect.addListener(() => {\n      clearInterval(keepAliveTimer);\n      if (onDisconnect) {\n        onDisconnect();\n      }\n    });\n  }\n}\n"
  },
  {
    "path": "src/index.ts",
    "content": "export {\n  ModelRecord,\n  AppConfig,\n  ChatOptions,\n  MLCEngineConfig,\n  GenerationConfig,\n  ModelType,\n  prebuiltAppConfig,\n  modelVersion,\n  modelLibURLPrefix,\n  functionCallingModelIds,\n} from \"./config\";\n\nexport {\n  InitProgressCallback,\n  InitProgressReport,\n  MLCEngineInterface,\n  LogitProcessor,\n  LogLevel,\n} from \"./types\";\n\nexport { MLCEngine, CreateMLCEngine } from \"./engine\";\n\nexport {\n  hasModelInCache,\n  deleteChatConfigInCache,\n  deleteModelAllInfoInCache,\n  deleteModelWasmInCache,\n  deleteModelInCache,\n} from \"./cache_util\";\n\nexport {\n  WebWorkerMLCEngineHandler,\n  WebWorkerMLCEngine,\n  CreateWebWorkerMLCEngine,\n} from \"./web_worker\";\n\nexport { WorkerRequest, WorkerResponse, CustomRequestParams } from \"./message\";\n\nexport {\n  ServiceWorkerMLCEngineHandler,\n  ServiceWorkerMLCEngine,\n  CreateServiceWorkerMLCEngine,\n} from \"./service_worker\";\n\nexport {\n  ServiceWorkerMLCEngineHandler as ExtensionServiceWorkerMLCEngineHandler,\n  ServiceWorkerMLCEngine as ExtensionServiceWorkerMLCEngine,\n  CreateServiceWorkerMLCEngine as CreateExtensionServiceWorkerMLCEngine,\n} from \"./extension_service_worker\";\n\nexport * from \"./openai_api_protocols/index\";\n"
  },
  {
    "path": "src/llm_chat.ts",
    "content": "import * as tvmjs from \"@mlc-ai/web-runtime\";\nimport * as xgr from \"@mlc-ai/web-xgrammar\";\nimport log from \"loglevel\";\nimport { Tokenizer } from \"@mlc-ai/web-tokenizers\";\nimport { ChatConfig, GenerationConfig, Role } from \"./config\";\nimport { getConversation, Conversation } from \"./conversation\";\nimport { LogitProcessor, LatencyBreakdown } from \"./types\";\nimport {\n  getChunkedPrefillInputData,\n  getImageDataFromURL,\n  getRGBArrayFromImageData,\n  getTokenTableFromTokenizer,\n  getTopProbs,\n  IMAGE_EMBED_SIZE,\n} from \"./support\";\nimport {\n  ChatCompletionFinishReason,\n  ChatCompletionTokenLogprob,\n  TopLogprob,\n  ResponseFormat,\n  ChatCompletionContentPartImage,\n} from \"./openai_api_protocols/index\";\nimport {\n  AttentionSinkSizeError,\n  ContextWindowSizeExceededError,\n  MinValueError,\n  RangeError,\n  WindowSizeConfigurationError,\n  WindowSizeSpecificationError,\n  MessageOrderError,\n  TextCompletionExpectsKVEmptyError,\n  PrefillChunkSizeSmallerThanImageError,\n  CannotFindImageEmbedError,\n} from \"./error\";\n\ntype ImageURL = ChatCompletionContentPartImage.ImageURL;\n\nexport class LLMChatPipeline {\n  private config: ChatConfig;\n  private tokenizer: Tokenizer;\n\n  // TVM functions\n  private tvm: tvmjs.Instance;\n  private device: tvmjs.DLDevice;\n  private vm: tvmjs.VirtualMachine;\n  private prefill: tvmjs.PackedFunc;\n  private decoding: tvmjs.PackedFunc;\n  private image_embed: tvmjs.PackedFunc | undefined;\n  private embed: tvmjs.PackedFunc;\n  private fapplyBitmask: tvmjs.PackedFunc;\n  private fapplyPenalty: tvmjs.PackedFunc;\n  private fapplyLogitBias: tvmjs.PackedFunc;\n  private fsoftmaxWithTemperature: tvmjs.PackedFunc;\n  private fsampleWithTopP: tvmjs.PackedFunc;\n  private fargsortProbs: tvmjs.PackedFunc;\n\n  // Functions related to PagedKVCache\n  private fclearKVCaches: tvmjs.PackedFunc;\n  private fKVCacheAddSequence: tvmjs.PackedFunc;\n  private fKVCacheRemoveSequence: 
tvmjs.PackedFunc;\n  private fKVCacheBeginForward: tvmjs.PackedFunc;\n  private fKVCacheEndForward: tvmjs.PackedFunc;\n  private fKVCacheEnableSlidingWindowForSeq: tvmjs.PackedFunc;\n\n  // parameter states\n  private params: tvmjs.TVMObject;\n  private kvCache: tvmjs.TVMObject;\n  private logitsOnCPU?: tvmjs.Tensor = undefined;\n  private filledKVCacheLength = 0;\n\n  // meta data\n  private bosTokenId = 1;\n  private contextWindowSize = -1;\n  private slidingWindowSize = -1;\n  private attentionSinkSize = -1;\n  private prefillChunkSize = -1;\n  private resetStatsPerPrefill = true;\n  private stopStr: string[];\n  private stopTokens: Array<number>;\n\n  // states\n  private outputMessage = \"\";\n  private outputIds: Array<number> = [];\n  private stopTriggered = false;\n  private finishReason: ChatCompletionFinishReason | undefined = undefined;\n  // frequency of appeared token ids till now (refresh after PrefillStep); token_id mapped to freq\n  private appearedTokensFreq = new Map<number, number>();\n  private conversation: Conversation;\n  // The logprob information of all tokens for this current round (cleared upon each prefillStep)\n  // Cleared & updated at the exact same spots as `outputMessage`. Only updated when\n  // `genConfig.logprobs` is true. 
Each entry corresponds to a single autoregressive step.\n  private tokenLogprobArray: Array<ChatCompletionTokenLogprob> = [];\n\n  // stats, reset at every `resetChat(keepstats=false)`\n  private decodingTotalTime = 0;\n  private decodingTotalTokens = 0;\n  private prefillTotalTime = 0;\n  private prefillTotalTokens = 0;\n  // same stats as above, but reset at every `prefillStep()`\n  private curRoundDecodingTotalTokens = 0;\n  private curRoundPrefillTotalTokens = 0;\n  private curRoundDecodingTotalTime = 0;\n  private curRoundPrefillTotalTime = 0;\n\n  // additional stats, reset at every prefillStep()\n  public curRoundLatencyBreakdown: LatencyBreakdown = {\n    logitProcessorTime: [],\n    logitBiasTime: [],\n    penaltyTime: [],\n    sampleTime: [],\n    totalTime: [],\n    grammarBitmaskTime: [],\n  };\n\n  // LogitProcessor\n  private logitProcessor?: LogitProcessor = undefined;\n\n  // Grammar-related\n  // A grammar matcher for this current round if response_format is set. Reinitialized upon\n  // each step regardless of whether the chat is multi-round or not.\n  private grammarMatcher?: xgr.GrammarMatcher = undefined;\n  // Cache key of the current response_format (schema / grammar / structural tag). If undefined,\n  // grammarMatcher is simply using JSON mode. We use this field to determine whether we re-initiate\n  // a GrammarMatcher or simply reset during each round (i.e. during prefillStep).\n  private responseFormatCacheKey?: string = undefined;\n  // A string list of tokens ordered by their token id, post-processed. Once initialized, will not\n  // be reinitialized since `this.tokenizer` does not change throughout the lifetime of LLMChatPipeline.\n  private xgTokenizerInfo?: xgr.TokenizerInfo = undefined;\n  // Compiler for grammar. 
It is persistent since it specializes on xgTokenizerInfo.\n  private grammarCompiler?: xgr.GrammarCompiler = undefined;\n  // Size of the bitmask for grammar, determined by fullVocabSize\n  private bitmaskSize: number;\n  // `vocab_size` read from `config.json`. Can be different from the size of the tokenTable for some\n  // models due to dummy padded tokens.\n  private fullVocabSize: number;\n  // Method to post process the token for grammar; either \"byte_level\" or default \"byte_fallback\".\n  private token_postproc_method: string;\n  // Whether to prepend space for grammar\n  private prepend_space_in_encode: boolean;\n  // stats for grammar-related overhead\n  // Time to initialize grammar matcher in seconds\n  private curRoundGrammarInitTotalTime = 0;\n  // Total time of getting next bitmask and accepting token in seconds\n  private curRoundGrammarPerTokenTotalTime = 0;\n  // Instance variables for supporting sampling on WebGPU\n  private sampleIndices: Int32Array;\n  private sampleIndicesDevice: tvmjs.Tensor;\n  private topPDevice: tvmjs.Tensor;\n\n  constructor(\n    tvm: tvmjs.Instance,\n    tokenizer: Tokenizer,\n    config: ChatConfig,\n    logitProcessor?: LogitProcessor,\n  ) {\n    // 0. 
Setting attributes\n    this.tvm = tvm;\n    this.tokenizer = tokenizer;\n    this.config = config;\n    this.logitProcessor = logitProcessor;\n    this.fullVocabSize = this.config.vocab_size;\n    this.bitmaskSize = Math.ceil(this.fullVocabSize / 32);\n\n    this.conversation = getConversation(\n      config.conv_template,\n      config.conv_config,\n    );\n    this.stopStr = this.conversation.getStopStr();\n    this.stopTokens = this.conversation.getStopTokens();\n    if (config.bos_token_id !== undefined) {\n      this.bosTokenId = config.bos_token_id;\n    }\n    // Set token_post_proc_method, currently mlc-chat-config.json are unstable, hence various\n    // fallback mechanisms\n    if (config.tokenizer_info !== undefined) {\n      this.token_postproc_method = config.tokenizer_info.token_postproc_method;\n      this.prepend_space_in_encode =\n        config.tokenizer_info.prepend_space_in_encode;\n    } else if (config.token_table_postproc_method !== undefined) {\n      this.token_postproc_method = config.token_table_postproc_method;\n      this.prepend_space_in_encode = false;\n    } else {\n      log.warn(\n        \"Cannot find `tokenizer_info` or `token_table_postproc_method` in `mlc-chat-config.json`, \" +\n          \"using default token_postproc_method `raw`.\\n\" +\n          \"This field is only used for json mode.\",\n      );\n      this.token_postproc_method = \"raw\";\n      this.prepend_space_in_encode = false;\n    }\n    log.info(\"token_postproc_method: \", this.token_postproc_method);\n    log.info(\"prepend_space_in_encode: \", this.prepend_space_in_encode);\n\n    this.device = this.tvm.webgpu();\n\n    // 1. 
Create VM and get the core functions\n    tvm.beginScope();\n    this.vm = this.tvm.detachFromCurrentScope(\n      this.tvm.createVirtualMachine(this.device),\n    );\n    this.prefill = this.tvm.detachFromCurrentScope(\n      this.vm.getFunction(\"prefill\"),\n    );\n    this.embed = this.tvm.detachFromCurrentScope(this.vm.getFunction(\"embed\"));\n    this.decoding = this.tvm.detachFromCurrentScope(\n      this.vm.getFunction(\"decode\"),\n    );\n    this.fapplyBitmask = this.tvm.detachFromCurrentScope(\n      this.vm.getFunction(\"apply_bitmask_inplace\"),\n    );\n    this.fapplyPenalty = this.tvm.detachFromCurrentScope(\n      this.vm.getFunction(\"apply_penalty_inplace\"),\n    );\n    this.fapplyLogitBias = this.tvm.detachFromCurrentScope(\n      this.vm.getFunction(\"apply_logit_bias_inplace\"),\n    );\n    this.fsoftmaxWithTemperature = this.tvm.detachFromCurrentScope(\n      this.vm.getFunction(\"softmax_with_temperature\"),\n    );\n    this.fsampleWithTopP = this.tvm.detachFromCurrentScope(\n      this.vm.getFunction(\"sample_with_top_p\"),\n    );\n    this.fargsortProbs = this.tvm.detachFromCurrentScope(\n      this.vm.getFunction(\"argsort_probs\"),\n    );\n    try {\n      this.image_embed = this.tvm.detachFromCurrentScope(\n        this.vm.getFunction(\"image_embed\"),\n      );\n    } catch {\n      log.info(\"Cannot find function image_embed.\");\n    }\n\n    // 2. Get json stored in the vm's metadata function\n    const fgetMetadata = this.vm.getFunction(\"_metadata\");\n    const ret_value = fgetMetadata();\n    const metadataStr = ret_value.toString();\n    const metadata = JSON.parse(metadataStr);\n\n    // 3. Load parameters by name\n    const paramNames: string[] = [];\n    metadata.params.forEach((param: any) => {\n      paramNames.push(param.name);\n    });\n    this.params = this.tvm.detachFromCurrentScope(\n      this.tvm.getParamsFromCacheByName(paramNames),\n    );\n\n    // 4. 
Read in compilation configurations from metadata\n    this.prefillChunkSize = metadata.prefill_chunk_size;\n    log.info(\"Using prefillChunkSize: \", this.prefillChunkSize);\n    if (this.prefillChunkSize <= 0) {\n      throw new MinValueError(\"prefill_chunk_size\", 0);\n    }\n\n    // 5. Consolidate KVCache settings: context window, sliding window, attention sink\n    this.slidingWindowSize = config.sliding_window_size;\n    this.contextWindowSize = config.context_window_size;\n    this.attentionSinkSize = config.attention_sink_size;\n    if (this.contextWindowSize !== -1 && this.slidingWindowSize !== -1) {\n      throw new WindowSizeConfigurationError(\n        this.contextWindowSize,\n        this.slidingWindowSize,\n      );\n    } else if (this.slidingWindowSize != -1) {\n      // Use sliding window and attention sink\n      log.info(\"Using slidingWindowSize: \", this.slidingWindowSize);\n      if (this.attentionSinkSize >= 0) {\n        log.info(\"Using attentionSinkSize: \", this.attentionSinkSize);\n      } else {\n        throw new AttentionSinkSizeError();\n      }\n    } else if (this.contextWindowSize != -1) {\n      // Use default kv cache without sliding window\n      log.info(\"Using contextWindowSize: \", this.contextWindowSize);\n    } else {\n      throw new WindowSizeSpecificationError();\n    }\n\n    // 5. 
Create cache\n    // Load cache functions and instantiate KVCache\n    this.fclearKVCaches = this.tvm.detachFromCurrentScope(\n      this.tvm.getGlobalFunc(\"vm.builtin.kv_state_clear\"),\n    );\n    this.fKVCacheAddSequence = this.tvm.detachFromCurrentScope(\n      this.tvm.getGlobalFunc(\"vm.builtin.kv_state_add_sequence\"),\n    );\n    this.fKVCacheRemoveSequence = this.tvm.detachFromCurrentScope(\n      this.tvm.getGlobalFunc(\"vm.builtin.kv_state_remove_sequence\"),\n    );\n    this.fKVCacheBeginForward = this.tvm.detachFromCurrentScope(\n      this.tvm.getGlobalFunc(\"vm.builtin.kv_state_begin_forward\"),\n    );\n    this.fKVCacheEndForward = this.tvm.detachFromCurrentScope(\n      this.tvm.getGlobalFunc(\"vm.builtin.kv_state_end_forward\"),\n    );\n    this.fKVCacheEnableSlidingWindowForSeq = this.tvm.detachFromCurrentScope(\n      this.tvm.getGlobalFunc(\n        \"vm.builtin.attention_kv_cache_enable_sliding_window_for_seq\",\n      ),\n    );\n\n    // Create PagedKVCache; we do not expose KVCache config for now\n    const fcreateCache = this.vm.getFunction(\"create_tir_paged_kv_cache\");\n    const defaultPageSize = 16;\n    const defaultMaxNumSequence = 1;\n    const maxTotalSeqLen =\n      this.slidingWindowSize != -1\n        ? this.slidingWindowSize\n        : this.contextWindowSize;\n    this.kvCache = this.tvm.detachFromCurrentScope(\n      fcreateCache(\n        this.tvm.makeShapeTuple([defaultMaxNumSequence]), // max_num_sequence\n        this.tvm.makeShapeTuple([maxTotalSeqLen]), // max_total_sequence_length\n        this.tvm.makeShapeTuple([this.prefillChunkSize]), // prefill_chunk_size\n        this.tvm.makeShapeTuple([defaultPageSize]), // page_size, hard coded for now\n        this.tvm.makeShapeTuple([this.slidingWindowSize != -1 ? 
1 : 0]),\n      ),\n    );\n\n    this.filledKVCacheLength = 0;\n    this.resetChat(); // especially needed for PagedKVCache as we need to call fKVCacheAddSequence\n\n    // Initialize WebGPU sampling related device tensors\n    const numSamples = 1;\n    const numProbs = 1;\n\n    this.sampleIndices = new Int32Array(numSamples);\n    for (let i = 0; i < numSamples; i++) {\n      this.sampleIndices[i] = i;\n    }\n    this.sampleIndicesDevice = this.tvm.detachFromCurrentScope(\n      this.tvm\n        .empty([numSamples], \"int32\", this.device)\n        .copyFrom(this.sampleIndices),\n    );\n\n    this.topPDevice = this.tvm.detachFromCurrentScope(\n      this.tvm.empty([numProbs], \"float32\", this.device),\n    );\n\n    tvm.endScope();\n  }\n\n  dispose() {\n    // TODO: Do we need to dispose all PackedFuncs here?\n    this.grammarMatcher?.dispose();\n    this.params.dispose();\n    this.decoding.dispose();\n    this.prefill.dispose();\n    this.embed.dispose();\n    this.image_embed?.dispose();\n    this.vm.dispose();\n    this.kvCache.dispose();\n    this.fclearKVCaches.dispose();\n    this.logitsOnCPU?.dispose();\n    this.tvm.dispose();\n    this.tokenizer.dispose();\n    this.xgTokenizerInfo?.dispose();\n    this.grammarCompiler?.dispose();\n  }\n\n  /**\n   * Get the current message.\n   */\n  getMessage() {\n    return this.outputMessage;\n  }\n\n  /**\n   * Reset the runtime statistics\n   */\n  resetRuntimeStats() {\n    this.prefillTotalTime = 0;\n    this.prefillTotalTokens = 0;\n    this.decodingTotalTime = 0;\n    this.decodingTotalTokens = 0;\n  }\n\n  /**\n   * Reset the chat history\n   */\n  resetChat(keepStats = false) {\n    this.tvm.beginScope();\n    this.conversation.reset();\n    if (!keepStats) {\n      this.resetRuntimeStats();\n    }\n    this.resetKVCache();\n    this.filledKVCacheLength = 0;\n    this.logitProcessor?.resetState();\n    this.tvm.endScope();\n  }\n\n  /**\n   * Reset KV Cache\n   */\n  resetKVCache() {\n    
this.fclearKVCaches(this.kvCache);\n    this.fKVCacheAddSequence!(this.kvCache, new tvmjs.Scalar(0, \"int64\"));\n    if (this.slidingWindowSize != -1) {\n      this.fKVCacheEnableSlidingWindowForSeq(\n        this.kvCache,\n        new tvmjs.Scalar(0, \"int64\"),\n        new tvmjs.Scalar(this.slidingWindowSize, \"int32\"),\n        new tvmjs.Scalar(this.attentionSinkSize, \"int32\"),\n      );\n    }\n  }\n\n  /**\n   * @returns Whether stop is triggered.\n   */\n  stopped(): boolean {\n    return this.stopTriggered;\n  }\n\n  /**\n   * @returns Finish reason; undefined if generation not started/stopped yet.\n   */\n  getFinishReason(): ChatCompletionFinishReason | undefined {\n    return this.finishReason;\n  }\n\n  /**\n   * @returns tokenLogprobArray for this current round of autoregressive generation.\n   * Updated upon each sampled token, cleared upon each prefillStep().\n   */\n  getTokenLogprobArray(): Array<ChatCompletionTokenLogprob> {\n    return this.tokenLogprobArray;\n  }\n\n  /**\n   * @returns the number of tokens decoded for a single request or a single choice in the request.\n   */\n  getCurRoundDecodingTotalTokens(): number {\n    return this.curRoundDecodingTotalTokens;\n  }\n\n  /**\n   * @returns the number of tokens prefilled for a single request or a single choice in the request.\n   */\n  getCurRoundPrefillTotalTokens(): number {\n    return this.curRoundPrefillTotalTokens;\n  }\n\n  /**\n   * @returns the time spent on decode for a single request or a single choice in the request.\n   */\n  getCurRoundDecodingTotalTime(): number {\n    return this.curRoundDecodingTotalTime;\n  }\n\n  /**\n   * @returns the time spent on prefill for a single request or a single choice in the request.\n   */\n  getCurRoundPrefillTotalTime(): number {\n    return this.curRoundPrefillTotalTime;\n  }\n\n  /**\n   * @returns the time (seconds) spent on initializing grammar matcher for a single request.\n   */\n  getCurRoundGrammarInitTotalTime(): number {\n    
return this.curRoundGrammarInitTotalTime;\n  }\n\n  /**\n   * @returns the total time (seconds) spent on creating bitmask and accepting token grammar matcher\n   * for all the generated tokens in a single request.\n   */\n  getCurRoundGrammarPerTokenTotalTime(): number {\n    return this.curRoundGrammarPerTokenTotalTime;\n  }\n\n  /**\n   * @returns the breakdown of latencies for sampling each token for a single request.\n   */\n  getCurRoundLatencyBreakdown(): LatencyBreakdown {\n    return this.curRoundLatencyBreakdown;\n  }\n\n  /**\n   * @returns Runtime stats information.\n   */\n  runtimeStatsText(): string {\n    return (\n      `prefill: ${(this.prefillTotalTokens / this.prefillTotalTime).toFixed(4)} tokens/sec, ` +\n      `decoding: ${(this.decodingTotalTokens / this.decodingTotalTime).toFixed(4)} tokens/sec`\n    );\n  }\n\n  /**\n   * @returns Runtime stats information, starting from the last prefill performed.\n   */\n  curRoundRuntimeStatsText(): string {\n    return (\n      `prefill: ${this.getCurRoundPrefillTokensPerSec().toFixed(4)} tokens/sec, ` +\n      `decoding: ${this.getCurRoundDecodingTokensPerSec().toFixed(4)} tokens/sec`\n    );\n  }\n\n  /**\n   * @returns Prefill tokens per second, starting from the last prefill performed.\n   */\n  getCurRoundPrefillTokensPerSec(): number {\n    return this.curRoundPrefillTotalTokens / this.curRoundPrefillTotalTime;\n  }\n\n  /**\n   * @returns Decoding tokens per second, starting from the last prefill performed.\n   */\n  getCurRoundDecodingTokensPerSec(): number {\n    return this.curRoundDecodingTotalTokens / this.curRoundDecodingTotalTime;\n  }\n\n  /**\n   * Set the seed for the RNG `this.tvm.rng`.\n   */\n  setSeed(seed: number): void {\n    this.tvm.setSeed(seed);\n  }\n\n  private getResponseFormatKey(\n    responseFormat?: ResponseFormat | null,\n  ): string | undefined {\n    if (!responseFormat) {\n      return undefined;\n    }\n    if (responseFormat.type === \"json_object\") {\n      return 
responseFormat.schema ?? undefined;\n    }\n    if (responseFormat.type === \"grammar\") {\n      return responseFormat.grammar ?? undefined;\n    }\n    if (responseFormat.type === \"structural_tag\") {\n      const structuralTag = responseFormat.structural_tag;\n      if (structuralTag === undefined || structuralTag === null) {\n        return undefined;\n      }\n      return typeof structuralTag === \"string\"\n        ? structuralTag\n        : JSON.stringify(structuralTag);\n    }\n    return undefined;\n  }\n\n  // Getters and setters for this.conversation.\n  /**\n   * @returns The conversation object (not a deep copy).\n   */\n  getConversationObject(): Conversation {\n    return this.conversation;\n  }\n\n  /**\n   * Set this.conversation to a new conversation object.\n   */\n  setConversation(newConv: Conversation) {\n    this.conversation = newConv;\n    this.stopStr = this.conversation.getStopStr();\n    this.stopTokens = this.conversation.getStopTokens();\n  }\n\n  async asyncLoadWebGPUPipelines() {\n    await this.tvm.asyncLoadWebGPUPipelines(this.vm.getInternalModule());\n  }\n\n  /**\n   * Generate the first token given input prompt\n   */\n  async prefillStep(\n    inp: string,\n    msgRole: Role, // either user or tool\n    inp_role_str?: string,\n    genConfig?: GenerationConfig,\n  ): Promise<void> {\n    if (msgRole !== Role.user && msgRole !== Role.tool) {\n      throw new MessageOrderError(\n        \"The last message should be from `user` or `tool`.\",\n      );\n    }\n    if (this.resetStatsPerPrefill) {\n      this.resetRuntimeStats();\n    }\n\n    const tstart = performance.now();\n\n    // cleanup the per convo states\n    this.outputIds = [];\n    this.appearedTokensFreq.clear();\n    this.outputMessage = \"\";\n    this.tokenLogprobArray = [];\n    this.curRoundDecodingTotalTokens = 0;\n    this.curRoundPrefillTotalTokens = 0;\n    this.curRoundPrefillTotalTime = 0;\n    this.curRoundDecodingTotalTime = 0;\n    
this.curRoundGrammarInitTotalTime = 0;\n    this.curRoundGrammarPerTokenTotalTime = 0;\n\n    this.curRoundLatencyBreakdown = {\n      logitProcessorTime: [],\n      logitBiasTime: [],\n      penaltyTime: [],\n      sampleTime: [],\n      totalTime: [],\n      grammarBitmaskTime: [],\n    };\n\n    this.stopTriggered = false;\n    const conversation = this.conversation;\n\n    // -1. Instantiate grammar matcher according to generation config. This step is overlapped\n    // with prefilling the prompt to hide overhead by using this promise.\n    let grammarMatcherInitPromise: Promise<void> | undefined = undefined;\n    const responseFormat = genConfig?.response_format;\n    if (\n      responseFormat?.type === \"json_object\" ||\n      responseFormat?.type === \"grammar\" ||\n      responseFormat?.type === \"structural_tag\"\n    ) {\n      const curResponseFormatKey = this.getResponseFormatKey(responseFormat);\n      if (\n        curResponseFormatKey === this.responseFormatCacheKey &&\n        this.grammarMatcher\n      ) {\n        // If we did not change the schema and have instantiated a GrammarMatcher, we reuse it.\n        const tGrammarInitStart = performance.now();\n        log.info(\"Reuse grammar matcher.\");\n        this.grammarMatcher.reset();\n        this.curRoundGrammarInitTotalTime =\n          (performance.now() - tGrammarInitStart) / 1e3;\n      } else {\n        // Else dispose current grammarMatcher, reinitialize, and update this.schema.\n        /* eslint-disable no-async-promise-executor */\n        grammarMatcherInitPromise = new Promise(async (resolve) => {\n          const tGrammarInitStart = performance.now();\n          log.info(\"Initialize new grammar matcher.\");\n          if (this.grammarMatcher) {\n            this.grammarMatcher.dispose();\n          }\n          if (this.xgTokenizerInfo === undefined) {\n            log.info(\"Initialize token table.\");\n            // Post process entire table\n            const rawTokenTable = 
getTokenTableFromTokenizer(this.tokenizer);\n            this.xgTokenizerInfo = await xgr.TokenizerInfo.createTokenizerInfo(\n              rawTokenTable,\n              this.token_postproc_method,\n              this.prepend_space_in_encode,\n              this.fullVocabSize,\n              this.stopTokens,\n            );\n            this.grammarCompiler =\n              await xgr.GrammarCompiler.createGrammarCompiler(\n                this.xgTokenizerInfo,\n              );\n          }\n          const grammar: xgr.CompiledGrammar =\n            responseFormat.type === undefined\n              ? await this.grammarCompiler!.compileBuiltinJSONGrammar()\n              : responseFormat.type === \"json_object\"\n                ? await this.grammarCompiler!.compileJSONSchema(\n                    responseFormat.schema!,\n                  )\n                : responseFormat.type === \"grammar\"\n                  ? await this.grammarCompiler!.compileGrammar(\n                      responseFormat.grammar!,\n                    )\n                  : await this.grammarCompiler!.compileStructuralTag(\n                      responseFormat.structural_tag!,\n                    );\n          this.grammarMatcher =\n            await xgr.GrammarMatcher.createGrammarMatcher(grammar);\n          grammar.dispose();\n          this.responseFormatCacheKey = curResponseFormatKey;\n          this.curRoundGrammarInitTotalTime =\n            (performance.now() - tGrammarInitStart) / 1e3;\n          resolve();\n        });\n      }\n    }\n\n    // 0. 
Get inputData from conversation\n    if (conversation.isTextCompletion) {\n      conversation.prompt = inp;\n    } else {\n      conversation.appendMessage(msgRole, inp, inp_role_str);\n      if (genConfig?.enable_thinking === false) {\n        // TODO(Charlie): In future we should make emptyThinkingBlockStr configurable.\n        const emptyThinkingBlockStr = \"<think>\\n\\n</think>\\n\\n\";\n        const encoded = this.tokenizer.encode(emptyThinkingBlockStr);\n        this.outputIds.push(...encoded);\n        conversation.appendEmptyThinkingReplyHeader(\n          Role.assistant,\n          emptyThinkingBlockStr,\n        );\n      } else {\n        conversation.appendReplyHeader(Role.assistant);\n      }\n    }\n    const retGetInputData = this.getInputData();\n    const inputData: Array<Array<number> | ImageURL> = retGetInputData[0];\n    const promptLen: number = retGetInputData[1];\n\n    // Check if LLMChatPipeline fits for forwarding image input\n    let hasImageInput = false;\n    inputData.forEach((data) => {\n      if (!Array.isArray(data)) {\n        hasImageInput = true;\n      }\n    });\n    if (hasImageInput && this.prefillChunkSize < IMAGE_EMBED_SIZE) {\n      throw new PrefillChunkSizeSmallerThanImageError(\n        this.prefillChunkSize,\n        IMAGE_EMBED_SIZE,\n      );\n    }\n    if (hasImageInput && this.image_embed === undefined) {\n      throw new CannotFindImageEmbedError();\n    }\n\n    // 1. Chunk inputData to embed and forward in one shot for each, minimize intermediate data\n    const retGetChunks = getChunkedPrefillInputData(\n      inputData,\n      this.prefillChunkSize,\n    );\n    const chunks: Array<Array<number> | ImageURL>[] = retGetChunks[0];\n    const chunkLens: Array<number> = retGetChunks[1];\n\n    // 2. 
Prefill each chunk\n    this.tvm.beginScope();\n    let logits: tvmjs.Tensor;\n    for (let i = 0; i < chunks.length; i++) {\n      const chunk = chunks[i];\n      const chunkLen = chunkLens[i];\n      const prevFilledLen = this.filledKVCacheLength;\n      logits = this.tvm.detachFromCurrentScope(\n        await this.embedAndForward(chunk, chunkLen),\n      );\n      if (this.filledKVCacheLength !== prevFilledLen + chunkLen) {\n        throw new Error(\n          \"Internal Error: filledKVCacheLength does not match expected value.\",\n        );\n      }\n    }\n    this.tvm.endScope();\n\n    // 4. Sample, stats, post process token sampled.\n    // We wait for prefill and grammar matcher init to finish\n    await Promise.all([this.device.sync(), grammarMatcherInitPromise]);\n    const nextToken = await this.sampleTokenFromLogits(logits!, genConfig);\n    logits!.dispose();\n    const tend = performance.now();\n\n    this.prefillTotalTime += (tend - tstart) / 1e3;\n    this.prefillTotalTokens += promptLen;\n    this.curRoundPrefillTotalTokens += promptLen;\n    this.curRoundPrefillTotalTime += (tend - tstart) / 1e3;\n\n    this.processNextToken(nextToken, genConfig);\n  }\n\n  async decodeStep(genConfig?: GenerationConfig): Promise<void> {\n    if (this.stopTriggered) {\n      throw Error(\"Cannot run decode when stopped\");\n    }\n\n    const tstart = performance.now();\n\n    this.tvm.beginScope();\n    const chunk: Array<Array<number>> = [\n      this.outputIds.slice(this.outputIds.length - 1),\n    ];\n    const chunkLen = chunk.length;\n    const prevFilledLen = this.filledKVCacheLength;\n    const logits = this.tvm.detachFromCurrentScope(\n      await this.embedAndForward(chunk, chunkLen),\n    );\n    if (this.filledKVCacheLength !== prevFilledLen + chunkLen) {\n      throw new Error(\n        \"Internal Error: filledKVCacheLength does not match expected value.\",\n      );\n    }\n    this.tvm.endScope();\n\n    // sample from logits\n    const nextToken = 
await this.sampleTokenFromLogits(logits, genConfig);\n    logits.dispose();\n    const tend = performance.now();\n\n    this.decodingTotalTime += (tend - tstart) / 1e3;\n    this.decodingTotalTokens += 1;\n    this.curRoundDecodingTotalTokens += 1;\n    this.curRoundDecodingTotalTime += (tend - tstart) / 1e3;\n\n    this.processNextToken(nextToken, genConfig);\n  }\n\n  /**\n   * Manually trigger stop if it is not stopped.\n   */\n  triggerStop() {\n    if (this.stopTriggered) {\n      return;\n    }\n    this.stopTriggered = true;\n    this.finishReason = \"abort\";\n    if (!this.conversation.isTextCompletion) {\n      this.conversation.finishReply(this.outputMessage);\n    }\n  }\n\n  /**\n   * Add a generated token and check for stop.\n   *\n   * @param nextToken The next token.\n   * @param genConfig Configs that override `this.config` for this round of generation.\n   */\n  private processNextToken(\n    nextToken: number,\n    genConfig?: GenerationConfig,\n  ): void {\n    if (this.stopTriggered) {\n      throw Error(\"Cannot call process when it is stoppped\");\n    }\n\n    // Get max_tokens from generationConfig (specified by user in completion request)\n    // If not specified, do not set a limit\n    let max_tokens = Infinity;\n    if (genConfig !== undefined && genConfig.max_tokens) {\n      max_tokens = genConfig.max_tokens;\n    }\n    if (max_tokens <= 0) {\n      throw new MinValueError(\"max_tokens\", 0);\n    }\n\n    // Get ignore_eos from generationConfig (specified by user in completion request)\n    let ignore_eos = false;\n    if (\n      genConfig !== undefined &&\n      genConfig.ignore_eos !== undefined &&\n      genConfig.ignore_eos !== null\n    ) {\n      ignore_eos = genConfig.ignore_eos;\n    }\n\n    // Get stopStrs, possibly overridden by genConfig for this round\n    let stopStrs = this.stopStr;\n    if (genConfig !== undefined && genConfig.stop) {\n      stopStrs = stopStrs.concat(genConfig.stop);\n    }\n\n    let stopTokens = 
this.stopTokens;\n    if (ignore_eos) {\n      stopTokens = [];\n      stopStrs = [];\n    }\n\n    // Stop condition 1: stop token; otherwise, append to `this.outputIds`\n    if (stopTokens.includes(nextToken)) {\n      this.stopTriggered = true;\n      this.finishReason = \"stop\";\n    }\n    if (!this.stopTriggered) {\n      this.outputIds.push(nextToken);\n      // Update token appearance frequency\n      const curFreq = this.appearedTokensFreq.get(nextToken);\n      if (curFreq !== undefined) {\n        this.appearedTokensFreq.set(nextToken, curFreq + 1);\n      } else {\n        this.appearedTokensFreq.set(nextToken, 1);\n      }\n    }\n\n    // Stop condition 2: stop string; update `this.outputMessage` subsequently\n    let outputMessage = this.tokenizer.decode(new Int32Array(this.outputIds));\n    let stopPos = -1;\n    for (const stopStr of stopStrs) {\n      // Stop at the first stopStr we find\n      stopPos = outputMessage.lastIndexOf(stopStr);\n      if (stopPos != -1) {\n        outputMessage = outputMessage.substring(0, stopPos);\n        this.stopTriggered = true;\n        this.finishReason = \"stop\";\n        break;\n      }\n    }\n    this.outputMessage = outputMessage;\n\n    // Stop condition 3: exceed max_tokens\n    if (this.outputIds.length >= max_tokens) {\n      this.stopTriggered = true;\n      this.finishReason = \"length\";\n      log.info(\"Generation stopped due to exceeding max_tokens.\");\n    }\n\n    // Stop condition 4: exceed KVCache's context window size\n    if (\n      this.slidingWindowSize == -1 &&\n      this.filledKVCacheLength == this.contextWindowSize\n    ) {\n      this.stopTriggered = true;\n      this.finishReason = \"length\";\n      log.info(\"Generation stopped due to exceeding context_window_size.\");\n    }\n\n    // Finally, modify conversation history if stopped\n    if (this.stopTriggered) {\n      if (!this.conversation.isTextCompletion) {\n        this.conversation.finishReply(this.outputMessage);\n     
 }\n    }\n  }\n\n  /**\n   * Given input tokens, return embeddings of them by calling embed kernel.\n   *\n   * @note precondition: inputTokens.length <= prefillChunkSize, since we take care of\n   * chunking in `getChunkedPrefillInputData()`.\n   */\n  private getTokensEmbeddings(inputTokens: number[]): tvmjs.Tensor {\n    this.tvm.beginScope();\n    if (inputTokens.length > this.prefillChunkSize) {\n      throw new Error(\n        \"Internal Error: getTokensEmbeddings input should be <= prefillChunkSize.\",\n      );\n    }\n    const inputData = this.tvm.empty(\n      [inputTokens.length],\n      \"int32\",\n      this.device,\n    );\n    inputData.copyFrom(inputTokens);\n    const embed: tvmjs.Tensor = this.tvm.detachFromCurrentScope(\n      this.embed!(inputData, this.params),\n    );\n    this.tvm.endScope();\n    this.tvm.attachToCurrentScope(embed); // tracked by scope of embedAndForward\n    return embed;\n  }\n\n  /**\n   * Calculate resize dimensions for Phi3-V model.\n   * Based on vlm_utils.cc CalculateResizeShape\n   */\n  private calculateResizeShape(\n    imageHeight: number,\n    imageWidth: number,\n  ): [number, number] {\n    const hdNum = 16;\n    const ratio = imageWidth / imageHeight;\n    let scale = 1;\n    while (scale * Math.ceil(scale / ratio) <= hdNum) {\n      scale += 1;\n    }\n    scale -= 1;\n    const newW = scale * 336;\n    const newH = Math.floor(newW / ratio);\n    return [newH, newW];\n  }\n\n  /**\n   * Calculate crop dimensions for Phi3-V model.\n   * Based on vlm_utils.cc CalculateCropShape / CalculatePadShape\n   */\n  private calculateCropShape(\n    imageHeight: number,\n    imageWidth: number,\n  ): [number, number] {\n    const [resizedHeight, resizedWidth] = this.calculateResizeShape(\n      imageHeight,\n      imageWidth,\n    );\n    const padH = Math.ceil(resizedHeight / 336) * 336;\n    const padW = resizedWidth;\n    return [Math.floor(padH / 336), Math.floor(padW / 336)];\n  }\n\n  /**\n   * Embed an image 
input.\n   */\n  private async getImageEmbeddings(\n    inputImage: ImageURL,\n  ): Promise<tvmjs.Tensor> {\n    this.tvm.beginScope();\n    // 1. Transform ImageURL into image input in TVMArray\n    const url = inputImage.url;\n    // url starting with `data:image` and `http` share the same loading method\n    const imgData: ImageData = await getImageDataFromURL(url);\n    const pixelValues: Uint8ClampedArray = getRGBArrayFromImageData(imgData);\n    const pixelArray = this.tvm\n      .empty([imgData.height, imgData.width, 3], \"uint32\", this.device)\n      .copyFrom(pixelValues)\n      .view([1, imgData.height, imgData.width, 3]); // NHWC\n\n    // 2. Calculate resize and crop dimensions\n    const [resizeH, resizeW] = this.calculateResizeShape(\n      imgData.height,\n      imgData.width,\n    );\n    const [cropH, cropW] = this.calculateCropShape(\n      imgData.height,\n      imgData.width,\n    );\n    const resizeHeightShape = this.tvm.makeShapeTuple([resizeH]);\n    const resizeWidthShape = this.tvm.makeShapeTuple([resizeW]);\n    const cropHeightShape = this.tvm.makeShapeTuple([cropH]);\n    const cropWidthShape = this.tvm.makeShapeTuple([cropW]);\n\n    // 3. 
embed kernel\n    const embed: tvmjs.Tensor = this.tvm.detachFromCurrentScope(\n      this.image_embed!(\n        pixelArray,\n        resizeHeightShape,\n        resizeWidthShape,\n        cropHeightShape,\n        cropWidthShape,\n        this.params,\n      ),\n    );\n    if (embed.shape[0] !== IMAGE_EMBED_SIZE) {\n      throw new Error(\n        `InternalError: expect embed.shape[0] to be ${IMAGE_EMBED_SIZE}, ` +\n          `but got ${embed.shape[0]}`,\n      );\n    }\n    this.tvm.endScope();\n    this.tvm.attachToCurrentScope(embed); // tracked by scope of embedAndForward\n    return embed;\n  }\n\n  /**\n   * Embed and forward input data, that can be either array of tokens, or an image.\n   * This will increment `this.filledKVCacheLength`.\n   *\n   * @param inputData data to embed and forward\n   * @param inputDataLen length of this inputData, should be no larger than prefill chunk size.\n   * @returns The logits returned by this forward as tvmjs.Tensor on GPU.\n   *\n   * @note Precondition: inputData's data length is no larger than prefill chunk size\n   */\n  private async embedAndForward(\n    inputData: Array<Array<number> | ImageURL>,\n    inputDataLen: number,\n  ): Promise<tvmjs.Tensor> {\n    if (inputDataLen > this.prefillChunkSize) {\n      throw new Error(\n        \"InternalError: expect inputDataLen <= this.prefillChunkSize.\",\n      );\n    }\n    // TODO: we should combine string data to embed once, then rearrange the embeddings; currently\n    // [\"hi\", imageUrl, \"hi\"] would call embed kernels 3 times, while 2 would suffice.\n\n    // 1. Embed all inputData\n    this.tvm.beginScope();\n    const embeddings: tvmjs.Tensor[] = [];\n    for (let i = 0; i < inputData.length; i++) {\n      const data = inputData[i];\n      if (Array.isArray(data)) {\n        embeddings.push(this.getTokensEmbeddings(data));\n      } else {\n        embeddings.push(await this.getImageEmbeddings(data));\n      }\n    }\n\n    // 2. 
Concatenate embeddings\n    let allEmbeddings: tvmjs.Tensor;\n    if (embeddings.length === 1) {\n      allEmbeddings = embeddings[0];\n    } else {\n      allEmbeddings = this.tvm.concatEmbeddings(embeddings);\n    }\n    if (inputDataLen !== allEmbeddings.shape[0]) {\n      throw new Error(\"InternalError: expect seqLen == allEmbeddings.shape[0]\");\n    }\n    allEmbeddings = allEmbeddings.view([1].concat(allEmbeddings.shape));\n    // TODO: Should we end this scope here and begin another scope? Will this dispose embeddings to\n    // save RAM? We will detach allEmbeddings from this scope and attach to the next scope.\n\n    // 3. Forward the concatenated embeddings\n    const inputLenShape = this.tvm.makeShapeTuple([inputDataLen]);\n    const seqIdsTuple = this.tvm.makeShapeTuple([0]);\n    this.fKVCacheBeginForward!(this.kvCache, seqIdsTuple, inputLenShape);\n    let retValue;\n    if (inputDataLen > 1) {\n      retValue = this.prefill(allEmbeddings, this.kvCache, this.params);\n    } else {\n      retValue = this.decoding(allEmbeddings, this.kvCache, this.params);\n    }\n\n    // Epilogue\n    this.fKVCacheEndForward!(this.kvCache);\n    this.filledKVCacheLength += inputDataLen;\n    const logits = this.tvm.detachFromCurrentScope(retValue.get(0));\n    this.tvm.endScope();\n    this.tvm.attachToCurrentScope(logits);\n    return logits;\n  }\n\n  // NOTE: caller must call device.sync()\n  private updateLogitsOnCPU(logits: tvmjs.Tensor): tvmjs.Tensor {\n    if (this.logitsOnCPU == undefined) {\n      this.logitsOnCPU = this.tvm.detachFromCurrentScope(\n        this.tvm.empty(logits.shape, logits.dtype, this.tvm.cpu()),\n      );\n    } else {\n      if (logits.shape[0] != this.logitsOnCPU.shape[0]) {\n        throw Error(\"We expect the size of logits to remain unchanged\");\n      }\n    }\n    this.logitsOnCPU.copyFrom(logits);\n    return this.logitsOnCPU;\n  }\n\n  private async sampleTokenFromLogits(\n    logitsOnGPU: tvmjs.Tensor,\n    genConfig?: 
GenerationConfig,\n  ) {\n    // 0. Get value of temperature, top_p, and various penalties, possibly overridden by genConfig\n    // Also load other genConfig items like logit_bias. Consume all fields of `genConfig` here.\n    function _hasValue(value: any): boolean {\n      // if we use `if value` directly, `value` being 0 evaluates to false, violating semantics\n      return value !== undefined && value !== null;\n    }\n    let temperature: number = this.config.temperature;\n    let top_p: number = this.config.top_p;\n    let repetition_penalty: number = this.config.repetition_penalty;\n    let frequency_penalty: number = this.config.frequency_penalty;\n    let presence_penalty: number = this.config.presence_penalty;\n    let logit_bias: Record<string, number> | undefined = undefined;\n    let logprobs: boolean | undefined = undefined;\n    let top_logprobs: number | undefined = undefined;\n    let response_format: ResponseFormat | undefined = undefined;\n\n    if (genConfig !== undefined) {\n      if (_hasValue(genConfig.temperature)) {\n        temperature = genConfig.temperature!;\n      }\n      if (_hasValue(genConfig.top_p)) {\n        top_p = genConfig.top_p!;\n      }\n      // TODO: setting top_p to 1.0 by default might run into issues since\n      // top_p masking in relax uses < instead of <=\n      // Set default top_p to 1.0 if not set\n      if (!_hasValue(top_p)) {\n        top_p = 1.0;\n      }\n      if (_hasValue(genConfig.repetition_penalty)) {\n        repetition_penalty = genConfig.repetition_penalty!;\n      }\n      if (_hasValue(genConfig.frequency_penalty)) {\n        frequency_penalty = genConfig.frequency_penalty!;\n      }\n      if (_hasValue(genConfig.presence_penalty)) {\n        presence_penalty = genConfig.presence_penalty!;\n      }\n      // If only one of frequency or presence penalty is set, make the other one 0.0\n      if (_hasValue(frequency_penalty) && !_hasValue(presence_penalty)) {\n        presence_penalty = 0.0;\n     
 }\n      if (_hasValue(presence_penalty) && !_hasValue(frequency_penalty)) {\n        frequency_penalty = 0.0;\n      }\n      if (!_hasValue(frequency_penalty)) {\n        frequency_penalty = 0.0;\n      }\n      if (!_hasValue(presence_penalty)) {\n        presence_penalty = 0.0;\n      }\n      if (_hasValue(genConfig.logit_bias)) {\n        logit_bias = genConfig.logit_bias!;\n      }\n      if (_hasValue(genConfig.logprobs)) {\n        logprobs = genConfig.logprobs!;\n      }\n      if (_hasValue(genConfig.top_logprobs)) {\n        top_logprobs = genConfig.top_logprobs!;\n      }\n      if (_hasValue(genConfig.response_format)) {\n        response_format = genConfig.response_format!;\n      }\n    }\n    // Check range validity\n    if (top_p <= 0 || top_p > 1) {\n      throw new RangeError(\"top_p\", 0, 1);\n    }\n    if (temperature < 0) {\n      throw new MinValueError(\"temperature\", 0);\n    }\n    if (repetition_penalty <= 0) {\n      throw new MinValueError(\"repetition_penalty\", 0);\n    }\n    if (\n      frequency_penalty &&\n      (frequency_penalty < -2.0 || frequency_penalty > 2.0)\n    ) {\n      throw new RangeError(\"frequency_penalty\", -2.0, 2.0);\n    }\n    if (\n      presence_penalty &&\n      (presence_penalty < -2.0 || presence_penalty > 2.0)\n    ) {\n      throw new RangeError(\"presence_penalty\", -2.0, 2.0);\n    }\n\n    const outputTokenBegin = performance.now();\n    const grammarConstrained =\n      response_format?.type === \"json_object\" ||\n      response_format?.type === \"grammar\" ||\n      response_format?.type === \"structural_tag\";\n\n    // 0. 
Update logitsOnGPU with on-GPU grammar bitmasking\n    if (grammarConstrained) {\n      const grammarBitmaskBegin = performance.now();\n\n      this.tvm.beginScope();\n      if (this.grammarMatcher === undefined) {\n        throw Error(\"Expect grammar matcher to be initialized.\");\n      }\n\n      const tBitmaskStart = performance.now();\n      const bitMaskOnCPU: Int32Array =\n        await this.grammarMatcher.getNextTokenBitmask();\n      this.curRoundGrammarPerTokenTotalTime +=\n        (performance.now() - tBitmaskStart) / 1e3;\n\n      if (bitMaskOnCPU.length !== this.bitmaskSize) {\n        throw new Error(\n          `InternalError: Expect grammar bitmask to be ` +\n            `size ${this.bitmaskSize}, but got ${bitMaskOnCPU.length}.`,\n        );\n      }\n      const bitMaskOnGPU = this.tvm\n        .empty([1, this.bitmaskSize], \"int32\", this.device)\n        .copyFrom(bitMaskOnCPU);\n      const seqIdsArray = this.tvm\n        .empty([1], \"int32\", this.device)\n        .copyFrom([0]);\n      this.fapplyBitmask(\n        logitsOnGPU.view([1, this.fullVocabSize]),\n        seqIdsArray,\n        bitMaskOnGPU,\n      );\n      this.tvm.endScope();\n\n      if (genConfig?.enable_latency_breakdown) {\n        const grammarBitmaskEnd = performance.now();\n        const grammarBitmaskTimeSpent =\n          (grammarBitmaskEnd - grammarBitmaskBegin) / 1e3;\n        this.curRoundLatencyBreakdown.grammarBitmaskTime.push(\n          grammarBitmaskTimeSpent,\n        );\n      }\n    }\n\n    // 1. 
Apply logitProcessor on CPU\n    if (this.logitProcessor !== undefined) {\n      // Move logits to CPU\n      this.tvm.beginScope();\n      this.updateLogitsOnCPU(logitsOnGPU);\n      this.tvm.endScope();\n      await this.device.sync();\n\n      const logitProcessorBegin = performance.now();\n\n      if (this.logitsOnCPU == undefined) {\n        throw Error(\"logits should be assigned\");\n      }\n      let logitsOnCPUArray: Float32Array = <Float32Array>(\n        this.logitsOnCPU.toArray()\n      );\n      logitsOnCPUArray = this.logitProcessor.processLogits(logitsOnCPUArray);\n      logitsOnGPU.copyFrom(logitsOnCPUArray);\n      this.logitsOnCPU.copyFrom(logitsOnCPUArray);\n\n      if (genConfig?.enable_latency_breakdown) {\n        const logitProcessorEnd = performance.now();\n        const logitProcessorTimeSpent =\n          (logitProcessorEnd - logitProcessorBegin) / 1e3;\n        this.curRoundLatencyBreakdown.logitProcessorTime.push(\n          logitProcessorTimeSpent,\n        );\n      }\n    }\n\n    // 2. Apply logit_bias on GPU\n    if (_hasValue(logit_bias)) {\n      const logitBiasBegin = performance.now();\n\n      const numTokens = Object.keys(logit_bias ?? {}).length;\n      const pos2seqIds = new Int32Array(numTokens).fill(0);\n      const tokenIds = new Int32Array(numTokens);\n      const tokenLogitBias = new Float32Array(numTokens);\n\n      const logitBiasKeys = Object.keys(logit_bias ?? 
{});\n      for (let index = 0; index < numTokens; index++) {\n        const tokenId = parseInt(logitBiasKeys[index]);\n        tokenIds[index] = tokenId;\n        tokenLogitBias[index] = logit_bias![tokenId];\n      }\n\n      this.tvm.beginScope();\n\n      const pos2seqIdsDevice = this.tvm\n        .empty([numTokens], \"int32\", this.device)\n        .copyFrom(pos2seqIds);\n\n      const tokenIdsDevice = this.tvm\n        .empty([numTokens], \"int32\", this.device)\n        .copyFrom(tokenIds);\n\n      const tokenLogitBiasDevice = this.tvm\n        .empty([numTokens], \"float32\", this.device)\n        .copyFrom(tokenLogitBias);\n\n      this.fapplyLogitBias(\n        logitsOnGPU.view([1, this.fullVocabSize]),\n        pos2seqIdsDevice,\n        tokenIdsDevice,\n        tokenLogitBiasDevice,\n      );\n\n      this.tvm.endScope();\n\n      if (genConfig?.enable_latency_breakdown) {\n        const logitBiasEnd = performance.now();\n        const logitBiasTimeSpent = (logitBiasEnd - logitBiasBegin) / 1e3;\n        this.curRoundLatencyBreakdown.logitBiasTime.push(logitBiasTimeSpent);\n      }\n    }\n\n    // 3. 
Apply penalties to logits on GPU\n    if (\n      frequency_penalty != 0.0 ||\n      presence_penalty != 0.0 ||\n      repetition_penalty != 1.0\n    ) {\n      const appearedTokens = [...this.appearedTokensFreq.keys()];\n      const appearedTokensFreqs = [...this.appearedTokensFreq.values()];\n\n      const numTokens = appearedTokens.length;\n\n      if (numTokens > 0) {\n        const penaltyBegin = performance.now();\n\n        const pos2seqIds = new Int32Array(numTokens).fill(0);\n        const tokenIds = new Int32Array(numTokens).fill(0);\n        const tokenCnt = new Int32Array(numTokens).fill(0);\n        const penalties = new Float32Array([\n          presence_penalty,\n          frequency_penalty,\n          repetition_penalty,\n        ]);\n\n        tokenIds.set(appearedTokens);\n        tokenCnt.set(appearedTokensFreqs);\n\n        this.tvm.beginScope();\n        const seqIdsArray = this.tvm\n          .empty([1], \"int32\", this.device)\n          .copyFrom([0]);\n\n        const pos2seqIdsDevice = this.tvm\n          .empty([numTokens], \"int32\", this.device)\n          .copyFrom(pos2seqIds);\n\n        const tokenIdsDevice = this.tvm\n          .empty([numTokens], \"int32\", this.device)\n          .copyFrom(tokenIds);\n\n        const tokenCntDevice = this.tvm\n          .empty([numTokens], \"int32\", this.device)\n          .copyFrom(tokenCnt);\n\n        const penaltiesDevice = this.tvm\n          .empty([1, 3], \"float32\", this.device)\n          .copyFrom(penalties);\n\n        this.fapplyPenalty(\n          logitsOnGPU.view([1, this.fullVocabSize]),\n          seqIdsArray,\n          pos2seqIdsDevice,\n          tokenIdsDevice,\n          tokenCntDevice,\n          penaltiesDevice,\n        );\n\n        this.tvm.endScope();\n\n        if (genConfig?.enable_latency_breakdown) {\n          const penaltyEnd = performance.now();\n          const penaltyTimeSpent = (penaltyEnd - penaltyBegin) / 1e3;\n          
this.curRoundLatencyBreakdown.penaltyTime.push(penaltyTimeSpent);\n        }\n      }\n    }\n\n    // TODO: Explore usage of multinomial sampling kernel (currently blocked due to usage\n    // of i8) for cases where top_p is not set\n    // 4. Sample token from logits\n    const sampleBegin = performance.now();\n\n    // Inplace transform logitsOnCPU to a distribution\n    temperature = Math.max(1e-6, temperature); // to prevent division by zero\n\n    const numSeqs = 1;\n    const numProbs = 1;\n\n    const temperatures = new Float32Array([temperature]);\n\n    this.tvm.beginScope();\n    const temperaturesDevice = this.tvm\n      .empty([numSeqs], \"float32\", this.device)\n      .copyFrom(temperatures);\n\n    let probs = this.fsoftmaxWithTemperature(\n      logitsOnGPU.view([numSeqs, numProbs, this.fullVocabSize]),\n      temperaturesDevice,\n    );\n    probs = probs.view([numProbs, this.fullVocabSize]);\n\n    const argsortResults = this.fargsortProbs(probs);\n    const sortedProbsDevice = argsortResults.get(0);\n    const sortedIndicesDevice = argsortResults.get(1);\n\n    const uniformSamplesDevice = this.tvm.uniform([1], 0.0, 1.0, this.device);\n\n    const topPHost = new Float32Array(numProbs).fill(-1);\n    const topPValue = Math.max(top_p, 1e-5);\n    this.sampleIndices.forEach((row) => {\n      topPHost[row] = topPValue;\n    });\n    this.topPDevice.copyFrom(topPHost);\n\n    const sampledTokensDevice = this.tvm.detachFromCurrentScope(\n      this.fsampleWithTopP(\n        sortedProbsDevice,\n        sortedIndicesDevice,\n        uniformSamplesDevice,\n        this.sampleIndicesDevice,\n        this.topPDevice,\n      ),\n    );\n    const sampledTokensHost = this.tvm.detachFromCurrentScope(\n      this.tvm\n        .empty([numSeqs], \"int32\", this.tvm.cpu())\n        .copyFrom(sampledTokensDevice),\n    );\n    if (logprobs && top_logprobs! 
> 0) {\n      this.updateLogitsOnCPU(probs);\n    }\n    this.tvm.endScope();\n    await this.device.sync();\n\n    const sampledToken = sampledTokensHost.toArray()[0];\n\n    if (logprobs && top_logprobs! > 0) {\n      this.tokenLogprobArray.push(\n        this.getTokenLogprob(sampledToken, top_logprobs!),\n      );\n    }\n\n    if (genConfig?.enable_latency_breakdown) {\n      const sampleEnd = performance.now();\n      const sampleTimeSpent = (sampleEnd - sampleBegin) / 1e3;\n      this.curRoundLatencyBreakdown.sampleTime.push(sampleTimeSpent);\n    }\n\n    // 5. Update logit processor\n    this.logitProcessor?.processSampledToken(sampledToken);\n\n    // 6. Update grammar matcher with new token\n    if (grammarConstrained) {\n      if (this.grammarMatcher === undefined) {\n        throw Error(\"Expect grammar matcher to be initialized.\");\n      }\n      const tAcceptStart = performance.now();\n      const accepted = this.grammarMatcher.acceptToken(sampledToken);\n      this.curRoundGrammarPerTokenTotalTime +=\n        (performance.now() - tAcceptStart) / 1e3;\n      if (!accepted) {\n        throw Error(\"Grammar matcher rejected the newly sampled token.\");\n      }\n    }\n\n    if (genConfig?.enable_latency_breakdown) {\n      const outputTokenEnd = performance.now();\n      const outputTokenTimeSpent = (outputTokenEnd - outputTokenBegin) / 1e3;\n      this.curRoundLatencyBreakdown.totalTime.push(outputTokenTimeSpent);\n    }\n\n    return sampledToken;\n  }\n\n  /**\n   * Return the an array of a mixture of token arrays and imageURLs (which cannot be represented\n   * as tokens). Also return the number of tokens this represents.\n   *\n   * We first convert the Conversation into a prompt array to be prefilled. 
Then we encode the\n   * text parts, leaving the imageURLs as they are.\n   * Example prompts:\n   * [\n   *   \"<|system|>\\nSome system prompt\\n\",\n   *   [\n   *     \"<|user|>\\n\",\n   *     imageURL1,\n   *     \"\\n\",\n   *     imageURL2,\n   *     \"\\n\",\n   *     \"Some user input<|end|>\\n\"\n   *   ],\n   * ]\n   *\n   * Expected output:\n   * [\n   *   token array for \"<|system|>\\nSome system prompt\\n<|user|>\\n\",\n   *   imageUrl1,\n   *   token array for \"\\n\",\n   *   imageUrl2,\n   *   token array for \"\\nSome user input<|end|>\\n\"\n   */\n  private getInputData(): [Array<Array<number> | ImageURL>, number] {\n    const ret: Array<Array<number> | ImageURL> = [];\n    let curTokens: Array<number> = [];\n    let prompts: Array<string | Array<string | ImageURL>>;\n\n    // 1. Get prompts\n    if (this.conversation.isTextCompletion) {\n      // 1.1. Non-conversation style\n      if (this.filledKVCacheLength !== 0) {\n        throw new TextCompletionExpectsKVEmptyError();\n      }\n      prompts = this.conversation.getPromptArrayTextCompletion();\n    } else {\n      // 1.2. Conversation style\n      if (this.filledKVCacheLength === 0) {\n        if (\n          this.conversation.config.system_prefix_token_ids !== undefined &&\n          this.conversation.config.system_prefix_token_ids !== null\n        ) {\n          curTokens = [...this.conversation.config.system_prefix_token_ids];\n        }\n        prompts = this.conversation.getPromptArray();\n      } else {\n        prompts = this.conversation.getPromptArrayLastRound();\n      }\n    }\n\n    // 2. Encode all prompts. 
Iterate through each message in the prompt array, where each\n    // prompt can either be a string, or an array of a mixture of string and ImageURLs.\n    let numPromptTokens = 0;\n    for (let i = 0; i < prompts.length; i++) {\n      const curPrompt = prompts[i];\n      if (typeof curPrompt === \"string\") {\n        const encoded = this.tokenizer.encode(curPrompt);\n        numPromptTokens += encoded.length;\n        curTokens.push(...encoded);\n      } else {\n        for (let j = 0; j < curPrompt.length; j++) {\n          const curPromptContent: string | ImageURL = curPrompt[j];\n          if (typeof curPromptContent === \"string\") {\n            const encoded = this.tokenizer.encode(curPromptContent);\n            numPromptTokens += encoded.length;\n            curTokens.push(...encoded);\n          } else {\n            // push curTokens to ret, push imageUrl, create a new curTokens\n            ret.push([...curTokens]);\n            ret.push(curPromptContent);\n            numPromptTokens += IMAGE_EMBED_SIZE;\n            curTokens = [];\n          }\n        }\n      }\n    }\n    // Deal with last curTokens\n    if (curTokens.length !== 0) {\n      ret.push([...curTokens]);\n    }\n\n    // Check if input tokens exceed context window size\n    if (\n      this.slidingWindowSize == -1 && // There is no limit on contextWindowSize for sliding window\n      numPromptTokens + this.filledKVCacheLength > this.contextWindowSize\n    ) {\n      throw new ContextWindowSizeExceededError(\n        numPromptTokens,\n        this.contextWindowSize,\n      );\n    }\n    return [ret, numPromptTokens];\n  }\n\n  async forwardTokensAndSample(\n    inputIds: Array<number>,\n    isPrefill: boolean,\n  ): Promise<number> {\n    const tstart = performance.now();\n    this.tvm.beginScope();\n    // 1. 
Chunk inputData if needed\n    const inputData: Array<Array<number>> = [inputIds];\n    const retGetChunks = getChunkedPrefillInputData(\n      inputData,\n      this.prefillChunkSize,\n    );\n    const chunks: Array<Array<number> | ImageURL>[] = retGetChunks[0];\n    const chunkLens: Array<number> = retGetChunks[1];\n\n    // 2. Prefill each chunk\n    let logitsOnGPU: tvmjs.Tensor;\n    for (let i = 0; i < chunks.length; i++) {\n      const chunk = chunks[i];\n      const chunkLen = chunkLens[i];\n      const prevFilledLen = this.filledKVCacheLength;\n      logitsOnGPU = await this.embedAndForward(chunk, chunkLen);\n      if (this.filledKVCacheLength !== prevFilledLen + chunkLen) {\n        throw new Error(\n          \"Internal Error: filledKVCacheLength does not match expected value.\",\n        );\n      }\n    }\n\n    // 3. Sample next token\n    const nextToken = await this.sampleTokenFromLogits(logitsOnGPU!);\n    this.tvm.endScope();\n\n    // 4. Stats\n    const tend = performance.now();\n    if (isPrefill) {\n      // We assume that if the input has more than 1 token\n      this.prefillTotalTime += (tend - tstart) / 1e3;\n      this.prefillTotalTokens += inputIds.length;\n      this.curRoundPrefillTotalTokens += inputIds.length;\n      this.curRoundPrefillTotalTime += (tend - tstart) / 1e3;\n    } else {\n      this.decodingTotalTime += (tend - tstart) / 1e3;\n      this.decodingTotalTokens += 1;\n      this.curRoundDecodingTotalTokens += 1;\n      this.curRoundDecodingTotalTime += (tend - tstart) / 1e3;\n    }\n    return nextToken;\n  }\n\n  /**\n   * Based on `sampledToken` and `this.logitsOnCPU`, which becomes a distribution after\n   * calling `this.tvm.applySoftmaxWithTemperature()`, generate `ChatCompletionTokenLogprob` and\n   * update `this.tokenLogprobArray`.\n   *\n   * @param sampledToken The token ID sampled.\n   * @param top_logprobs Number of top tokens to include; `top_logprobs` in `ChatCompletionRequest`.\n   *\n   * @return The 
`ChatCompletionTokenLogprob` for this single autoregressive step.\n   */\n  private getTokenLogprob(\n    sampledToken: number,\n    top_logprobs: number,\n  ): ChatCompletionTokenLogprob {\n    if (this.logitsOnCPU == undefined) {\n      throw Error(\"logits should be assigned\");\n    }\n    // Array of [token, prob] pairs, sorted with highest prob first.\n    const logitsOnCPUArray = <Float32Array>this.logitsOnCPU.toArray();\n    const topLogprobs = getTopProbs(top_logprobs!, logitsOnCPUArray);\n\n    // Get entry for sampled token first\n    const textEncoder = new TextEncoder();\n    const tokenStr = this.tokenizer.decode(new Int32Array([sampledToken]));\n    const bytes: Array<number> = Array.from(textEncoder.encode(tokenStr));\n    const logprob = Math.log(logitsOnCPUArray[sampledToken]);\n\n    // Populate `top_logprobs`\n    const topLogprobArray: Array<TopLogprob> = [];\n    for (let i = 0; i < top_logprobs; i++) {\n      const tokenID_i = topLogprobs[i][0];\n      const prob_i = topLogprobs[i][1];\n      const tokenStr_i = this.tokenizer.decode(new Int32Array([tokenID_i]));\n      topLogprobArray.push({\n        token: tokenStr_i,\n        bytes: Array.from(textEncoder.encode(tokenStr_i)) as Array<number>,\n        logprob: Math.log(prob_i),\n      } as TopLogprob);\n    }\n\n    return {\n      token: tokenStr,\n      bytes: bytes,\n      logprob: logprob,\n      top_logprobs: topLogprobArray,\n    } as ChatCompletionTokenLogprob;\n  }\n\n  /**\n   * Synchronize the device.\n   */\n  async sync(): Promise<void> {\n    // Is it equivalent to this.tvm.sync()?\n    await this.device.sync();\n  }\n\n  async evaluate() {\n    // run a canonical evaluation of the flow\n    this.resetKVCache();\n    this.filledKVCacheLength = 0;\n\n    const testPrompt = \"The capital of Canada is\";\n    const ids = await this.tokenizer.encode(testPrompt);\n    const tokens = Array.from(ids);\n    tokens.unshift(this.bosTokenId);\n    if (tokens.length == 0) {\n      throw 
Error(\"empty token\");\n    }\n\n    this.tvm.beginScope();\n    const prefillChunk: Array<Array<number>> = [tokens] as Array<Array<number>>;\n    const prefillChunkLen = tokens.length;\n    const prefillStart = performance.now();\n    await this.embedAndForward(prefillChunk, prefillChunkLen);\n    this.tvm.endScope();\n    await this.device.sync();\n\n    const decodingStart = performance.now();\n\n    this.tvm.beginScope();\n    const decodeChunk: Array<Array<number>> = [[6234]];\n    const decodeChunkLen = 1;\n    const logitsOnCPU = this.updateLogitsOnCPU(\n      await this.embedAndForward(decodeChunk, decodeChunkLen),\n    );\n    await this.device.sync();\n    this.tvm.endScope();\n\n    const decodingEnd = performance.now();\n    const msg =\n      `prefill-time=${((decodingStart - prefillStart) / 1000).toFixed(4)} sec` +\n      `decoding-time=${((decodingEnd - decodingStart) / 1000).toFixed(4)} sec`;\n\n    // simply log tokens for eyeballing.\n    log.info(\"Logits:\");\n    log.info(logitsOnCPU.toArray());\n    log.info(msg);\n  }\n}\n"
  },
  {
    "path": "src/message.ts",
    "content": "import { AppConfig, ChatOptions } from \"./config\";\nimport { InitProgressReport, LogLevel } from \"./types\";\nimport {\n  ChatCompletionRequestStreaming,\n  ChatCompletionRequestNonStreaming,\n  ChatCompletion,\n  ChatCompletionChunk,\n  CompletionCreateParamsNonStreaming,\n  CompletionCreateParamsStreaming,\n  Completion,\n  EmbeddingCreateParams,\n  CreateEmbeddingResponse,\n} from \"./openai_api_protocols/index\";\n\n/**\n * Message kind used by worker\n */\ntype RequestKind =\n  | \"reload\"\n  | \"runtimeStatsText\"\n  | \"interruptGenerate\"\n  | \"unload\"\n  | \"resetChat\"\n  | \"getMaxStorageBufferBindingSize\"\n  | \"getGPUVendor\"\n  | \"forwardTokensAndSample\"\n  | \"chatCompletionNonStreaming\"\n  | \"completionNonStreaming\"\n  | \"embedding\"\n  | \"getMessage\"\n  | \"chatCompletionStreamInit\"\n  | \"completionStreamInit\"\n  | \"completionStreamNextChunk\"\n  | \"customRequest\"\n  | \"keepAlive\"\n  | \"setLogLevel\"\n  | \"setAppConfig\";\n\n// eslint-disable-next-line @typescript-eslint/no-unused-vars\ntype ResponseKind = \"return\" | \"throw\" | \"initProgressCallback\";\n\nexport interface ReloadParams {\n  modelId: string[];\n  chatOpts?: ChatOptions[];\n}\nexport interface ResetChatParams {\n  keepStats: boolean;\n  modelId?: string;\n}\nexport interface GetMessageParams {\n  modelId?: string;\n}\nexport interface RuntimeStatsTextParams {\n  modelId?: string;\n}\nexport interface ForwardTokensAndSampleParams {\n  inputIds: Array<number>;\n  isPrefill: boolean;\n  modelId?: string;\n}\n\n// Notes on the following Params with modelId and chatOpts:\n// These fields are the model and chatOpts that the frontend engine expects the backend\n// to be loaded with. If not loaded due to web/service worker unexpectedly killed,\n// handler will call reload(). 
An engine can load multiple models, hence both are lists.\n// TODO(webllm-team): should add appConfig here as well if rigorous.\n// For more, see https://github.com/mlc-ai/web-llm/pull/471\n\n// Note on the messages with selectedModelId:\n// This is the modelId this request uses. It is needed to identify which async generator\n// to instantiate / use, since an engine can load multiple models, thus the handler\n// needs to maintain multiple generators.\nexport interface ChatCompletionNonStreamingParams {\n  request: ChatCompletionRequestNonStreaming;\n  modelId: string[];\n  chatOpts?: ChatOptions[];\n}\nexport interface ChatCompletionStreamInitParams {\n  request: ChatCompletionRequestStreaming;\n  selectedModelId: string;\n  modelId: string[];\n  chatOpts?: ChatOptions[];\n}\nexport interface CompletionNonStreamingParams {\n  request: CompletionCreateParamsNonStreaming;\n  modelId: string[];\n  chatOpts?: ChatOptions[];\n}\nexport interface CompletionStreamInitParams {\n  request: CompletionCreateParamsStreaming;\n  selectedModelId: string;\n  modelId: string[];\n  chatOpts?: ChatOptions[];\n}\nexport interface EmbeddingParams {\n  request: EmbeddingCreateParams;\n  modelId: string[];\n  chatOpts?: ChatOptions[];\n}\nexport interface CompletionStreamNextChunkParams {\n  selectedModelId: string;\n}\n\nexport interface CustomRequestParams {\n  requestName: string;\n  requestMessage: string;\n}\nexport type MessageContent =\n  | ReloadParams\n  | ResetChatParams\n  | GetMessageParams\n  | RuntimeStatsTextParams\n  | ForwardTokensAndSampleParams\n  | ChatCompletionNonStreamingParams\n  | ChatCompletionStreamInitParams\n  | CompletionNonStreamingParams\n  | CompletionStreamInitParams\n  | EmbeddingParams\n  | CompletionStreamNextChunkParams\n  | CustomRequestParams\n  | InitProgressReport\n  | LogLevel\n  | string\n  | null\n  | number\n  | ChatCompletion\n  | ChatCompletionChunk\n  | CreateEmbeddingResponse\n  | Completion\n  | AppConfig\n  | void;\n/**\n * The 
message used in exchange between worker\n * and the main thread.\n */\n\nexport type WorkerRequest = {\n  kind: RequestKind;\n  uuid: string;\n  content: MessageContent;\n};\n\ntype HeartbeatWorkerResponse = {\n  kind: \"heartbeat\";\n  uuid: string;\n};\n\ntype OneTimeWorkerResponse = {\n  kind: \"return\" | \"throw\";\n  uuid: string;\n  content: MessageContent;\n};\n\ntype InitProgressWorkerResponse = {\n  kind: \"initProgressCallback\";\n  uuid: string;\n  content: InitProgressReport;\n};\n\nexport type WorkerResponse =\n  | OneTimeWorkerResponse\n  | InitProgressWorkerResponse\n  | HeartbeatWorkerResponse;\n"
  },
  {
    "path": "src/openai_api_protocols/chat_completion.ts",
    "content": "/**\n * The input to OpenAI API, directly adopted from openai-node with small tweaks:\n * https://github.com/openai/openai-node/blob/master/src/resources/chat/completions.ts\n *\n * Copyright 2024 OpenAI\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *      http://www.apache.org/licenses/LICENSE-2.0\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\nimport { MLCEngineInterface, LatencyBreakdown } from \"../types\";\nimport {\n  functionCallingModelIds,\n  MessagePlaceholders,\n  ModelType,\n} from \"../config\";\nimport {\n  officialHermes2FunctionCallSchemaArray,\n  hermes2FunctionCallingSystemPrompt,\n} from \"../support\";\nimport {\n  CustomResponseFormatError,\n  CustomSystemPromptError,\n  InvalidResponseFormatError,\n  InvalidResponseFormatGrammarError,\n  InvalidResponseFormatStructuralTagError,\n  InvalidStreamOptionsError,\n  MessageOrderError,\n  MultipleTextContentError,\n  SeedTypeError,\n  StreamingCountError,\n  SystemMessageOrderError,\n  UnsupportedDetailError,\n  UnsupportedFieldsError,\n  UnsupportedImageURLError,\n  UnsupportedModelIdError,\n  UserMessageContentErrorForNonVLM,\n} from \"../error\";\nimport type { StructuralTagLike } from \"@mlc-ai/web-xgrammar\";\n\n/* eslint-disable @typescript-eslint/no-namespace */\n\nexport class Chat {\n  private engine: MLCEngineInterface;\n  completions: Completions;\n\n  constructor(engine: MLCEngineInterface) {\n    this.engine = engine;\n    this.completions = new Completions(this.engine);\n  }\n}\n\nexport class Completions {\n  private engine: 
MLCEngineInterface;\n\n  constructor(engine: MLCEngineInterface) {\n    this.engine = engine;\n  }\n\n  create(request: ChatCompletionRequestNonStreaming): Promise<ChatCompletion>;\n  create(\n    request: ChatCompletionRequestStreaming,\n  ): Promise<AsyncIterable<ChatCompletionChunk>>;\n  create(\n    request: ChatCompletionRequestBase,\n  ): Promise<AsyncIterable<ChatCompletionChunk> | ChatCompletion>;\n  create(\n    request: ChatCompletionRequest,\n  ): Promise<AsyncIterable<ChatCompletionChunk> | ChatCompletion> {\n    return this.engine.chatCompletion(request);\n  }\n}\n\n//////////////////////////////// 0. HIGH-LEVEL INTERFACES ////////////////////////////////\n\n/**\n * OpenAI chat completion request protocol.\n *\n * API reference: https://platform.openai.com/docs/api-reference/chat/create\n * Followed: https://github.com/openai/openai-node/blob/master/src/resources/chat/completions.ts\n *\n * @note `model` is excluded. Instead, call `CreateMLCEngine(model)` or `engine.reload(model)` explicitly before calling this API.\n */\nexport interface ChatCompletionRequestBase {\n  /**\n   * A list of messages comprising the conversation so far.\n   */\n  messages: Array<ChatCompletionMessageParam>;\n\n  /**\n   * If set, partial message deltas will be sent. It will be terminated by an empty chunk.\n   */\n  stream?: boolean | null;\n\n  /**\n   * Options for streaming response. Only set this when you set `stream: true`.\n   */\n  stream_options?: ChatCompletionStreamOptions | null;\n\n  /**\n   * How many chat completion choices to generate for each input message.\n   */\n  n?: number | null;\n\n  /**\n   * Number between -2.0 and 2.0. 
Positive values penalize new tokens based on their\n   * existing frequency in the text so far, decreasing the model's likelihood to\n   * repeat the same line verbatim.\n   *\n   * [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/text-generation/parameter-details)\n   */\n  frequency_penalty?: number | null;\n\n  /**\n   * Number between -2.0 and 2.0. Positive values penalize new tokens based on\n   * whether they appear in the text so far, increasing the model's likelihood to\n   * talk about new topics.\n   *\n   * [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/text-generation/parameter-details)\n   */\n  presence_penalty?: number | null;\n\n  /**\n   * Penalizes new tokens based on whether they appear in the prompt and the\n   * generated text so far. Values greater than 1.0 encourage the model to use new\n   * tokens, while values less than 1.0 encourage the model to repeat tokens.\n   */\n  repetition_penalty?: number | null;\n\n  /**\n   * The maximum number of [tokens](/tokenizer) that can be generated in the chat\n   * completion.\n   *\n   * The total length of input tokens and generated tokens is limited by the model's\n   * context length.\n   */\n  max_tokens?: number | null;\n\n  /**\n   * Sequences where the API will stop generating further tokens.\n   */\n  stop?: string | null | Array<string>;\n\n  /**\n   * What sampling temperature to use, between 0 and 2. Higher values like 0.8 will\n   * make the output more random, while lower values like 0.2 will make it more\n   * focused and deterministic.\n   */\n  temperature?: number | null;\n\n  /**\n   * An alternative to sampling with temperature, called nucleus sampling, where the\n   * model considers the results of the tokens with top_p probability mass. 
So 0.1\n   * means only the tokens comprising the top 10% probability mass are considered.\n   */\n  top_p?: number | null;\n\n  /**\n   * Modify the likelihood of specified tokens appearing in the completion.\n   *\n   * Accepts a JSON object that maps tokens (specified by their token ID, which varies per model)\n   * to an associated bias value from -100 to 100. Typically, you can see `tokenizer.json` of the\n   * model to see which token ID maps to what string. Mathematically, the bias is added to the\n   * logits generated by the model prior to sampling. The exact effect will vary per model, but\n   * values between -1 and 1 should decrease or increase likelihood of selection; values like -100\n   * or 100 should result in a ban or exclusive selection of the relevant token.\n   *\n   * As an example, you can pass `{\"16230\": -100}` to prevent the `Hello` token from being\n   * generated in Mistral-7B-Instruct-v0.2, according to the mapping in\n   * https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/raw/main/tokenizer.json.\n   *\n   * @note For stateful and customizable / flexible logit processing, see `webllm.LogitProcessor`.\n   * @note If used in combination with `webllm.LogitProcessor`, `logit_bias` is applied after\n   * `LogitProcessor.processLogits()` is called.\n   */\n  logit_bias?: Record<string, number> | null;\n\n  /**\n   * Whether to return log probabilities of the output tokens or not.\n   *\n   * If true, returns the log probabilities of each output token returned in the `content` of\n   * `message`.\n   */\n  logprobs?: boolean | null;\n\n  /**\n   * An integer between 0 and 5 specifying the number of most likely tokens to return\n   * at each token position, each with an associated log probability. 
`logprobs` must\n   * be set to `true` if this parameter is used.\n   */\n  top_logprobs?: number | null;\n\n  /**\n   * If specified, our system will make a best effort to sample deterministically, such that\n   * repeated requests with the same `seed` and parameters should return the same result.\n   *\n   * @note Seeding is done on a request-level rather than choice-level. That is, if `n > 1`, you\n   * would still get different content for each `Choice`. But if two requests with `n = 2` are\n   * processed with the same seed, the two results should be the same (two choices are different).\n   */\n  seed?: number | null;\n\n  /**\n   * Controls which (if any) function is called by the model. `none` means the model\n   * will not call a function and instead generates a message. `auto` means the model\n   * can pick between generating a message or calling a function. Specifying a\n   * particular function via\n   * `{\"type\": \"function\", \"function\": {\"name\": \"my_function\"}}` forces the model to\n   * call that function.\n   *\n   * `none` is the default when no functions are present. `auto` is the default if\n   * functions are present.\n   */\n  tool_choice?: ChatCompletionToolChoiceOption;\n\n  /**\n   * A list of tools the model may call. Currently, only functions are supported as a\n   * tool. Use this to provide a list of functions the model may generate JSON inputs\n   * for.\n   *\n   * The corresponding reply would populate the `tool_calls` field. If used with streaming,\n   * the last chunk would contain the `tool_calls` field, while the intermediate chunks would\n   * contain the raw string.\n   *\n   * If the generation terminates due to FinishReason other than \"stop\" (i.e. \"length\" or \"abort\"),\n   * then no `tool_calls` will be returned. 
User can still get the raw string output.\n   */\n  tools?: Array<ChatCompletionTool>;\n\n  /**\n   * An object specifying the format that the model must output.\n   *\n   * Setting to `{ \"type\": \"json_object\" }` enables JSON mode, which guarantees the\n   * message the model generates is valid JSON.\n   *\n   * **Important:** when using JSON mode, you **must** also instruct the model to\n   * produce JSON yourself via a system or user message. Without this, the model may\n   * generate an unending stream of whitespace until the generation reaches the token\n   * limit, resulting in a long-running and seemingly \"stuck\" request. Also note that\n   * the message content may be partially cut off if `finish_reason=\"length\"`, which\n   * indicates the generation exceeded `max_tokens` or the conversation exceeded the\n   * max context length.\n   */\n  response_format?: ResponseFormat;\n\n  /**\n   * If true, will ignore stop string and stop token and generate until max_tokens hit.\n   * If unset, will treat as false.\n   */\n  ignore_eos?: boolean;\n\n  /**\n   * ID of the model to use. This equals to `ModelRecord.model_id`, which needs to either be in\n   * `webllm.prebuiltAppConfig` or in `engineConfig.appConfig`.\n   *\n   * @note Call `CreateMLCEngine(model)` or `engine.reload(model)` ahead of time.\n   * @note If only one model is loaded in the engine, this field is optional. If multiple models\n   *   are loaded, this is required.\n   */\n  model?: string | null;\n\n  /**\n   * Fields specific to WebLLM, not present in OpenAI.\n   */\n  extra_body?: {\n    /**\n     * If set to false, prepend a \"<think>\\n\\n</think>\\n\\n\" to the response, preventing the\n     * model from generating thinking tokens. 
If set to true or undefined, does nothing.\n     *\n     * @note Currently only allowed to be used for Qwen3 models, though not explicitly checked.\n     */\n    enable_thinking?: boolean | null;\n\n    /**\n     * If set to true, the response will include a breakdown of the time spent in various\n     * stages of token sampling.\n     */\n    enable_latency_breakdown?: boolean | null;\n  };\n}\n\nexport interface ChatCompletionRequestNonStreaming\n  extends ChatCompletionRequestBase {\n  /**\n   * If set, partial message deltas will be sent. It will be terminated by an empty chunk.\n   */\n  stream?: false | null;\n}\n\nexport interface ChatCompletionRequestStreaming\n  extends ChatCompletionRequestBase {\n  /**\n   * If set, partial message deltas will be sent. It will be terminated by an empty chunk.\n   */\n  stream: true;\n}\n\nexport type ChatCompletionRequest =\n  | ChatCompletionRequestNonStreaming\n  | ChatCompletionRequestStreaming;\n\n/**\n * Represents a chat completion response returned by model, based on the provided input.\n */\nexport interface ChatCompletion {\n  /**\n   * A unique identifier for the chat completion.\n   */\n  id: string;\n\n  /**\n   * A list of chat completion choices. Can be more than one if `n` is greater than 1.\n   */\n  choices: Array<ChatCompletion.Choice>;\n\n  /**\n   * The model used for the chat completion.\n   */\n  model: string;\n\n  /**\n   * The object type, which is always `chat.completion`.\n   */\n  object: \"chat.completion\";\n\n  /**\n   * The Unix timestamp (in seconds) of when the chat completion was created.\n   *\n   */\n  created: number;\n\n  /**\n   * Usage statistics for the completion request.\n   *\n   * @note If we detect user is performing multi-round chatting, only the new portion of the\n   * prompt is counted for prompt_tokens. 
If `n > 1`, all choices' generation usages combined.\n   */\n  usage?: CompletionUsage;\n\n  /**\n   * This fingerprint represents the backend configuration that the model runs with.\n   *\n   * Can be used in conjunction with the `seed` request parameter to understand when\n   * backend changes have been made that might impact determinism.\n   *\n   * @note Not supported yet.\n   */\n  system_fingerprint?: string;\n}\n\n/**\n * Represents a streamed chunk of a chat completion response returned by model,\n * based on the provided input.\n */\nexport interface ChatCompletionChunk {\n  /**\n   * A unique identifier for the chat completion. Each chunk has the same ID.\n   */\n  id: string;\n\n  /**\n   * A list of chat completion choices. Can contain more than one elements if `n` is\n   * greater than 1. Can also be empty for the last chunk if you set\n   * `stream_options: {\"include_usage\": true}`.\n   */\n  choices: Array<ChatCompletionChunk.Choice>;\n\n  /**\n   * The Unix timestamp (in seconds) of when the chat completion was created. Each\n   * chunk has the same timestamp.\n   */\n  created: number;\n\n  /**\n   * The model to generate the completion.\n   */\n  model: string;\n\n  /**\n   * The object type, which is always `chat.completion.chunk`.\n   */\n  object: \"chat.completion.chunk\";\n\n  /**\n   * This fingerprint represents the backend configuration that the model runs with.\n   * Can be used in conjunction with the `seed` request parameter to understand when\n   * backend changes have been made that might impact determinism.\n   *\n   * @note Not supported yet.\n   */\n  system_fingerprint?: string;\n\n  /**\n   * An optional field that will only be present when you set\n   * `stream_options: {\"include_usage\": true}` in your request. 
When present, it\n   * contains a null value except for the last chunk which contains the token usage\n   * statistics for the entire request.\n   */\n  usage?: CompletionUsage;\n}\n\nexport const ChatCompletionRequestUnsupportedFields: Array<string> = []; // all supported as of now\n\n/**\n * Post init and verify whether the input of the request is valid. Thus, this function can throw\n * error or in-place update request.\n * @param request User's input request.\n * @param currentModelId The current model loaded that will perform this request.\n * @param currentModelType The type of the model loaded, decide what requests can be handled.\n */\nexport function postInitAndCheckFields(\n  request: ChatCompletionRequest,\n  currentModelId: string,\n  currentModelType: ModelType,\n): void {\n  // Generation-related checks and post inits are in `postInitAndCheckGenerationConfigValues()`\n  // 1. Check unsupported fields in request\n  const unsupported: Array<string> = [];\n  ChatCompletionRequestUnsupportedFields.forEach((field) => {\n    if (field in request) {\n      unsupported.push(field);\n    }\n  });\n  if (unsupported.length > 0) {\n    throw new UnsupportedFieldsError(unsupported, \"ChatCompletionRequest\");\n  }\n\n  // 2. Check unsupported messages\n  request.messages.forEach(\n    (message: ChatCompletionMessageParam, index: number) => {\n      // Check content array messages (that are not simple string)\n      if (message.role === \"user\" && typeof message.content !== \"string\") {\n        if (currentModelType !== ModelType.VLM) {\n          // Only VLM can handle non-string content (i.e. 
message with image)\n          throw new UserMessageContentErrorForNonVLM(\n            currentModelId,\n            ModelType[currentModelType],\n            message.content,\n          );\n        }\n        let numTextContent = 0;\n        for (let i = 0; i < message.content.length; i++) {\n          const curContent = message.content[i];\n          if (curContent.type === \"image_url\") {\n            // Do not support image_url.detail\n            const detail = curContent.image_url.detail;\n            if (detail !== undefined && detail !== null) {\n              throw new UnsupportedDetailError(detail);\n            }\n            // Either start with http or data:image for base64\n            const url = curContent.image_url.url;\n            if (!url.startsWith(\"data:image\") && !url.startsWith(\"http\")) {\n              throw new UnsupportedImageURLError(url);\n            }\n          } else {\n            numTextContent += 1;\n          }\n        }\n        if (numTextContent > 1) {\n          // Only one text contentPart per message\n          // TODO(Charlie): is it always the case that an input can only have one\n          // textPart? Or it is only for phi3vision?\n          throw new MultipleTextContentError();\n        }\n      }\n      if (message.role === \"system\" && index !== 0) {\n        throw new SystemMessageOrderError();\n      }\n    },\n  );\n\n  // 3. Last message has to be from user or tool\n  const lastId = request.messages.length - 1;\n  if (\n    request.messages[lastId].role !== \"user\" &&\n    request.messages[lastId].role !== \"tool\"\n  ) {\n    throw new MessageOrderError(\n      \"Last message should be from either `user` or `tool`.\",\n    );\n  }\n\n  // 4. If streaming, n cannot be > 1, since we cannot manage multiple sequences at once\n  if (request.stream && request.n && request.n > 1) {\n    throw new StreamingCountError();\n  }\n\n  // 5. 
Seed should be an integer\n  if (request.seed !== undefined && request.seed !== null) {\n    if (!Number.isInteger(request.seed)) {\n      throw new SeedTypeError(request.seed);\n    }\n  }\n\n  // 6. Schema can only be specified when type is `json_object`.\n  if (\n    request.response_format?.schema !== undefined &&\n    request.response_format?.schema !== null\n  ) {\n    if (request.response_format?.type !== \"json_object\") {\n      throw new InvalidResponseFormatError();\n    }\n  }\n\n  // 6.1 When grammar is specified, the type needs to be grammar\n  if (\n    request.response_format?.grammar !== undefined &&\n    request.response_format?.grammar !== null\n  ) {\n    if (request.response_format?.type !== \"grammar\") {\n      throw new InvalidResponseFormatGrammarError();\n    }\n  }\n\n  // 6.2 When type is grammar, the grammar field needs to be specified.\n  if (request.response_format?.type === \"grammar\") {\n    if (\n      request.response_format?.grammar === undefined ||\n      request.response_format?.grammar === null\n    ) {\n      throw new InvalidResponseFormatGrammarError();\n    }\n  }\n\n  if (\n    request.response_format?.structural_tag !== undefined &&\n    request.response_format?.structural_tag !== null\n  ) {\n    if (request.response_format?.type !== \"structural_tag\") {\n      throw new InvalidResponseFormatStructuralTagError();\n    }\n  }\n\n  if (request.response_format?.type === \"structural_tag\") {\n    if (\n      request.response_format?.structural_tag === undefined ||\n      request.response_format?.structural_tag === null\n    ) {\n      throw new InvalidResponseFormatStructuralTagError();\n    }\n  }\n\n  // 7. 
Function calling hardcoded handlings\n  if (request.tools !== undefined && request.tools !== null) {\n    // 7.1 Check if model supports function calling\n    if (!functionCallingModelIds.includes(currentModelId)) {\n      throw new UnsupportedModelIdError(\n        currentModelId,\n        functionCallingModelIds,\n      );\n    }\n\n    // 7.2 Hard coded support for Hermes2Pro following\n    // https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B#prompt-format-for-function-calling\n    if (currentModelId.startsWith(\"Hermes-2-Pro-\")) {\n      // 7.2.1 Update response format for Hermes2Pro function calling to use json schema\n      if (\n        request.response_format !== undefined &&\n        request.response_format !== null\n      ) {\n        throw new CustomResponseFormatError(request.response_format);\n      }\n      request.response_format = {\n        type: \"json_object\",\n        schema: officialHermes2FunctionCallSchemaArray,\n      } as ResponseFormat;\n\n      // 7.2.2 Modify system prompt to provide tools usage\n      const hermes2SystemMessage = hermes2FunctionCallingSystemPrompt.replace(\n        MessagePlaceholders.hermes_tools,\n        JSON.stringify(request.tools),\n      );\n      // Make sure user did not provide system message already\n      for (let i = 0; i < request.messages.length; i++) {\n        const message: ChatCompletionMessageParam = request.messages[i];\n        if (message.role === \"system\") {\n          throw new CustomSystemPromptError();\n        }\n      }\n      // Prepend a message for hardcoded system prompt\n      request.messages.unshift({\n        role: \"system\",\n        content: hermes2SystemMessage,\n      } as ChatCompletionSystemMessageParam);\n    }\n  }\n\n  // 8. 
Only set stream_options when streaming\n  if (request.stream_options !== undefined && request.stream_options !== null) {\n    if (!request.stream) {\n      throw new InvalidStreamOptionsError();\n    }\n  }\n}\n\n//////////////// BELOW ARE INTERFACES THAT SUPPORT THE ONES ABOVE ////////////////\n\n//////////////////////////////// 1. MESSAGES ////////////////////////////////\n\n//////////////////////////////// 1.1. CHAT COMPLETION CONTENT ////////////////////////////////\n\nexport type ChatCompletionContentPart =\n  | ChatCompletionContentPartText\n  | ChatCompletionContentPartImage;\n\nexport interface ChatCompletionContentPartText {\n  /**\n   * The text content.\n   */\n  text: string;\n\n  /**\n   * The type of the content part.\n   */\n  type: \"text\";\n}\n\nexport namespace ChatCompletionContentPartImage {\n  export interface ImageURL {\n    /**\n     * Either a URL of the image or the base64 encoded image data.\n     */\n    url: string;\n\n    /**\n     * Specifies the detail level of the image.\n     */\n    detail?: \"auto\" | \"low\" | \"high\";\n  }\n}\n\nexport interface ChatCompletionContentPartImage {\n  image_url: ChatCompletionContentPartImage.ImageURL;\n  /**\n   * The type of the content part.\n   */\n  type: \"image_url\";\n}\n\n//////////////////////////////// 1.2. MESSAGE TOOL CALL ////////////////////////////////\n\nexport interface ChatCompletionMessageToolCall {\n  /**\n   * The ID of the tool call. In WebLLM, it is used as the index of the tool call among all\n   * the tools calls in this request generation.\n   */\n  id: string;\n\n  /**\n   * The function that the model called.\n   */\n  function: ChatCompletionMessageToolCall.Function;\n\n  /**\n   * The type of the tool. 
Currently, only `function` is supported.\n   */\n  type: \"function\";\n}\n\nexport namespace ChatCompletionMessageToolCall {\n  /**\n   * The function that the model called.\n   */\n  export interface Function {\n    /**\n     * The arguments to call the function with, as generated by the model in JSON\n     * format.\n     */\n    arguments: string;\n\n    /**\n     * The name of the function to call.\n     */\n    name: string;\n  }\n}\n\n//////////////////////////////// 1.3. MESSAGE PARAM ////////////////////////////////\n\n/**\n * The role of the author of a message\n */\nexport type ChatCompletionRole =\n  | \"system\"\n  | \"user\"\n  | \"assistant\"\n  | \"tool\"\n  | \"function\";\n\n/**\n * Options for streaming response. Only set this when you set `stream: true`.\n */\nexport interface ChatCompletionStreamOptions {\n  /**\n   * If set, an additional chunk will be streamed after the last empty chunk.\n   * The `usage` field on this chunk shows the token usage statistics for the entire\n   * request, and the `choices` field will always be an empty array. All other chunks\n   * will also include a `usage` field, but with a null value.\n   */\n  include_usage?: boolean;\n}\n\nexport interface ChatCompletionSystemMessageParam {\n  /**\n   * The contents of the system message.\n   */\n  content: string;\n\n  /**\n   * The role of the messages author, in this case `system`.\n   */\n  role: \"system\";\n}\n\nexport interface ChatCompletionUserMessageParam {\n  /**\n   * The contents of the user message.\n   */\n  content: string | Array<ChatCompletionContentPart>;\n\n  /**\n   * The role of the messages author, in this case `user`.\n   */\n  role: \"user\";\n\n  /**\n   * An optional name for the participant. 
Provides the model information to\n   * differentiate between participants of the same role.\n   *\n   * @note This is experimental, as models typically have predefined names for the user.\n   */\n  name?: string;\n}\n\nexport interface ChatCompletionAssistantMessageParam {\n  /**\n   * The role of the messages author, in this case `assistant`.\n   */\n  role: \"assistant\";\n\n  /**\n   * The contents of the assistant message. Required unless `tool_calls` is specified.\n   */\n  content?: string | null;\n\n  /**\n   * An optional name for the participant. Provides the model information to\n   * differentiate between participants of the same role.\n   *\n   * @note This is experimental, as models typically have predefined names for the user.\n   */\n  name?: string;\n\n  /**\n   * The tool calls generated by the model, such as function calls.\n   */\n  tool_calls?: Array<ChatCompletionMessageToolCall>;\n}\n\nexport interface ChatCompletionToolMessageParam {\n  /**\n   * The contents of the tool message.\n   */\n  content: string;\n\n  /**\n   * The role of the messages author, in this case `tool`.\n   */\n  role: \"tool\";\n\n  /**\n   * Tool call that this message is responding to.\n   */\n  tool_call_id: string;\n}\n\nexport type ChatCompletionMessageParam =\n  | ChatCompletionSystemMessageParam\n  | ChatCompletionUserMessageParam\n  | ChatCompletionAssistantMessageParam\n  | ChatCompletionToolMessageParam;\n\n//////////////////////////////// 2. TOOL USING ////////////////////////////////\n\n/**\n * The parameters the functions accepts, described as a JSON Schema object. 
See the\n * [guide](https://platform.openai.com/docs/guides/text-generation/function-calling)\n * for examples, and the\n * [JSON Schema reference](https://json-schema.org/understanding-json-schema/) for\n * documentation about the format.\n *\n * Omitting `parameters` defines a function with an empty parameter list.\n */\nexport type FunctionParameters = Record<string, unknown>;\n\nexport interface FunctionDefinition {\n  /**\n   * The name of the function to be called. Must be a-z, A-Z, 0-9, or contain\n   * underscores and dashes, with a maximum length of 64.\n   */\n  name: string;\n\n  /**\n   * A description of what the function does, used by the model to choose when and\n   * how to call the function.\n   */\n  description?: string;\n\n  /**\n   * The parameters the functions accepts, described as a JSON Schema object. See the\n   * [guide](https://platform.openai.com/docs/guides/text-generation/function-calling)\n   * for examples, and the\n   * [JSON Schema reference](https://json-schema.org/understanding-json-schema/) for\n   * documentation about the format.\n   *\n   * Omitting `parameters` defines a function with an empty parameter list.\n   */\n  parameters?: FunctionParameters;\n}\n\nexport interface ChatCompletionTool {\n  function: FunctionDefinition;\n\n  /**\n   * The type of the tool. Currently, only `function` is supported.\n   */\n  type: \"function\";\n}\n\n/**\n * Specifies a tool the model should use. Use to force the model to call a specific\n * function.\n */\nexport interface ChatCompletionNamedToolChoice {\n  function: ChatCompletionNamedToolChoice.Function;\n\n  /**\n   * The type of the tool. Currently, only `function` is supported.\n   */\n  type: \"function\";\n}\n\nexport namespace ChatCompletionNamedToolChoice {\n  export interface Function {\n    /**\n     * The name of the function to call.\n     */\n    name: string;\n  }\n}\n\n/**\n * Controls which (if any) function is called by the model. 
`none` means the model\n * will not call a function and instead generates a message. `auto` means the model\n * can pick between generating a message or calling a function. Specifying a\n * particular function via\n * `{\"type\": \"function\", \"function\": {\"name\": \"my_function\"}}` forces the model to\n * call that function.\n *\n * `none` is the default when no functions are present. `auto` is the default if\n * functions are present.\n */\nexport type ChatCompletionToolChoiceOption =\n  | \"none\"\n  | \"auto\"\n  | ChatCompletionNamedToolChoice;\n\n//////////////////////////////// 3. OTHERS ////////////////////////////////\n\n//////////////////////////////// 3.1. LOG PROBS ////////////////////////////////\nexport interface TopLogprob {\n  /**\n   * The token.\n   */\n  token: string;\n\n  /**\n   * A list of integers representing the UTF-8 bytes representation of the token.\n   * Useful in instances where characters are represented by multiple tokens and\n   * their byte representations must be combined to generate the correct text\n   * representation. Can be `null` if there is no bytes representation for the token.\n   *\n   * @note Encoded with `TextEncoder.encode()` and can be decoded with `TextDecoder.decode()`.\n   * For details, see https://developer.mozilla.org/en-US/docs/Web/API/TextEncoder/encode.\n   */\n  bytes: Array<number> | null;\n\n  /**\n   * The log probability of this token.\n   */\n  logprob: number;\n}\n\nexport interface ChatCompletionTokenLogprob {\n  /**\n   * The token.\n   */\n  token: string;\n\n  /**\n   * A list of integers representing the UTF-8 bytes representation of the token.\n   * Useful in instances where characters are represented by multiple tokens and\n   * their byte representations must be combined to generate the correct text\n   * representation. 
Can be `null` if there is no bytes representation for the token.\n   *\n   * @note Encoded with `TextEncoder.encode()` and can be decoded with `TextDecoder.decode()`.\n   * For details, see https://developer.mozilla.org/en-US/docs/Web/API/TextEncoder/encode.\n   */\n  bytes: Array<number> | null;\n\n  /**\n   * The log probability of this token.\n   */\n  logprob: number;\n\n  /**\n   * List of the most likely tokens and their log probability, at this token\n   * position. In rare cases, there may be fewer than the number of requested\n   * `top_logprobs` returned.\n   */\n  top_logprobs: Array<TopLogprob>;\n}\n\n//////////////////////////////// 3.2. OTHERS ////////////////////////////////\n/**\n * A chat completion message generated by the model.\n */\nexport interface ChatCompletionMessage {\n  /**\n   * The contents of the message.\n   */\n  content: string | null;\n\n  /**\n   * The role of the author of this message.\n   */\n  role: \"assistant\";\n\n  /**\n   * The tool calls generated by the model, such as function calls.\n   */\n  tool_calls?: Array<ChatCompletionMessageToolCall>;\n}\n\n/**\n * Usage statistics for the completion request.\n */\nexport interface CompletionUsage {\n  /**\n   * Number of tokens in the generated completion.\n   */\n  completion_tokens: number;\n\n  /**\n   * Number of tokens in the prompt.\n   *\n   * @note If we detect user is performing multi-round chatting, only the new portion of the\n   * prompt is counted for prompt_tokens.\n   */\n  prompt_tokens: number;\n\n  /**\n   * Total number of tokens used in the request (prompt + completion).\n   */\n  total_tokens: number;\n\n  /**\n   * Fields specific to WebLLM, not present in OpenAI.\n   */\n  extra: {\n    /**\n     * Total seconds spent on this request, from receiving the request, to generating the response.\n     */\n    e2e_latency_s: number;\n\n    /**\n     * Number of tokens per second for prefilling.\n     */\n    prefill_tokens_per_s: number;\n\n    /**\n     * 
Number of tokens per second for autoregressive decoding.\n     */\n    decode_tokens_per_s: number;\n\n    /**\n     * Seconds spent to generate the first token since receiving the request. Mainly contains\n     * prefilling overhead. If n > 1, it is the sum over all choices.\n     */\n    time_to_first_token_s: number;\n\n    /**\n     * Seconds in between generated tokens. Mainly contains decoding overhead. If n > 1, it\n     * is the average over all choices.\n     */\n    time_per_output_token_s: number;\n\n    /**\n     * Seconds spent on initializing grammar matcher for structured output. If n > 1, it\n     * is the sum over all choices.\n     */\n    grammar_init_s?: number;\n\n    /**\n     * Seconds per-token that grammar matcher spent on creating bitmask and accepting token for\n     * structured output. If n > 1, it is the average over all choices.\n     */\n    grammar_per_token_s?: number;\n\n    /**\n     * If `enable_latency_breakdown` is set to true in the request, this field will be\n     * present and contain a breakdown of the time spent in various stages of token sampling.\n     */\n    latencyBreakdown?: LatencyBreakdown;\n  };\n}\n\n/**\n * The reason the model stopped generating tokens. This will be `stop` if the model\n * hit a natural stop point or a provided stop sequence, `length` if the maximum\n * number of tokens specified in the request was reached or the context_window_size will\n * be exceeded, `tool_calls` if the model called a tool, or `abort` if user manually stops the\n * generation.\n */\nexport type ChatCompletionFinishReason =\n  | \"stop\"\n  | \"length\"\n  | \"tool_calls\"\n  | \"abort\";\n\nexport namespace ChatCompletion {\n  export interface Choice {\n    /**\n     * The reason the model stopped generating tokens. 
This will be `stop` if the model\n     * hit a natural stop point or a provided stop sequence, `length` if the maximum\n     * number of tokens specified in the request was reached, `tool_calls` if the\n     * model called a tool, or `abort` if user manually stops the generation.\n     */\n    finish_reason: ChatCompletionFinishReason;\n\n    /**\n     * The index of the choice in the list of choices.\n     */\n    index: number;\n\n    /**\n     * Log probability information for the choice.\n     */\n    logprobs: Choice.Logprobs | null;\n\n    /**\n     * A chat completion message generated by the model.\n     */\n    message: ChatCompletionMessage;\n  }\n\n  export namespace Choice {\n    /**\n     * Log probability information for the choice.\n     */\n    export interface Logprobs {\n      /**\n       * A list of message content tokens with log probability information.\n       */\n      content: Array<ChatCompletionTokenLogprob> | null;\n    }\n  }\n}\n\nexport namespace ChatCompletionChunk {\n  export interface Choice {\n    /**\n     * A chat completion delta generated by streamed model responses.\n     */\n    delta: Choice.Delta;\n\n    /**\n     * The reason the model stopped generating tokens. 
This will be `stop` if the model\n     * hit a natural stop point or a provided stop sequence, `length` if the maximum\n     * number of tokens specified in the request was reached, `tool_calls` if the\n     * model called a tool, or `abort` if user manually stops the generation.\n     */\n    finish_reason: ChatCompletionFinishReason | null;\n\n    /**\n     * The index of the choice in the list of choices.\n     */\n    index: number;\n\n    /**\n     * Log probability information for the choice.\n     */\n    logprobs?: Choice.Logprobs | null;\n  }\n\n  export namespace Choice {\n    /**\n     * A chat completion delta generated by streamed model responses.\n     */\n    export interface Delta {\n      /**\n       * The contents of the chunk message.\n       */\n      content?: string | null;\n\n      /**\n       * The role of the author of this message.\n       */\n      role?: \"system\" | \"user\" | \"assistant\" | \"tool\";\n\n      tool_calls?: Array<Delta.ToolCall>;\n    }\n\n    export namespace Delta {\n      export interface ToolCall {\n        /**\n         * The index of the tool call among all the tools calls in this request generation.\n         */\n        index: number;\n\n        /**\n         * The ID of the tool call. Not used in WebLLM.\n         */\n        id?: string;\n\n        function?: ToolCall.Function;\n\n        /**\n         * The type of the tool. Currently, only `function` is supported.\n         */\n        type?: \"function\";\n      }\n\n      export namespace ToolCall {\n        export interface Function {\n          /**\n           * The arguments to call the function with, as generated by the model in JSON\n           * format. Note that the model does not always generate valid JSON, and may\n           * hallucinate parameters not defined by your function schema. 
Validate the\n           * arguments in your code before calling your function.\n           */\n          arguments?: string;\n\n          /**\n           * The name of the function to call.\n           */\n          name?: string;\n        }\n      }\n    }\n\n    /**\n     * Log probability information for the choice.\n     */\n    export interface Logprobs {\n      /**\n       * A list of message content tokens with log probability information.\n       */\n      content: Array<ChatCompletionTokenLogprob> | null;\n    }\n  }\n}\n\n/**\n * An object specifying the format that the model must output.\n *\n * Setting to `{ \"type\": \"json_object\" }` enables JSON mode, which guarantees the\n * message the model generates is valid JSON.\n *\n * Setting to `{ \"type\": \"grammar\" }` requires you to also specify the `grammar` field, which\n * is a BNFGrammar string.\n *\n * Setting to `{ \"type\": \"structural_tag\" }` requires a `structural_tag` definition that\n * applies trigger-based constraints (e.g. tag-delimited blocks) while allowing free-form text\n * outside the triggered spans.\n *\n * Setting `schema` specifies the output format of the json object such as properties to include.\n *\n * **Important:** when using JSON mode, you **must** also instruct the model to produce JSON\n * following the schema (if specified) yourself via a system or user message. Without this,\n * the model may generate an unending stream of whitespace until the generation reaches the token\n * limit, resulting in a long-running and seemingly \"stuck\" request. 
Also note that\n * the message content may be partially cut off if `finish_reason=\"length\"`, which\n * indicates the generation exceeded `max_tokens` or the conversation exceeded the\n * max context length.\n */\nexport interface ResponseFormat {\n  /**\n   * Must be one of `text`, `json_object`, `grammar`, or `structural_tag`.\n   */\n  type?: \"text\" | \"json_object\" | \"grammar\" | \"structural_tag\";\n  /**\n   * A schema string in the format of the schema of a JSON file. `type` needs to be `json_object`.\n   */\n  schema?: string;\n  /**\n   * An EBNF-formatted string. Needs to be specified when, and only specified when,\n   * `type` is `grammar`. The grammar will be normalized (simplified) by default.\n   * EBNF grammar: see https://www.w3.org/TR/xml/#sec-notation. Note:\n      1. Use # as the comment mark\n      2. Use C-style unicode escape sequence \\u01AB, \\U000001AB, \\xAB\n      3. A-B (match A and not match B) is not supported yet\n      4. Lookahead assertion can be added at the end of a rule to speed up matching. E.g.\n      ```\n      main ::= \"ab\" a [a-z]\n      a ::= \"cd\" (=[a-z])\n      ```\n      The assertion (=[a-z]) means a must be followed by [a-z].\n   */\n  grammar?: string;\n  /**\n   * A structural tag definition. Needs to be specified when, and only when,\n   * `type` is `structural_tag`.\n   */\n  structural_tag?: StructuralTagLike | string;\n}\n"
  },
  {
    "path": "src/openai_api_protocols/completion.ts",
    "content": "/**\n * The input to OpenAI API, directly adopted from openai-node with small tweaks:\n * https://github.com/openai/openai-node/blob/master/src/resources/completions.ts\n *\n * Copyright 2024 OpenAI\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *      http://www.apache.org/licenses/LICENSE-2.0\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\nimport { MLCEngineInterface } from \"../types\";\nimport {\n  InvalidStreamOptionsError,\n  SeedTypeError,\n  StreamingCountError,\n  UnsupportedFieldsError,\n} from \"../error\";\nimport {\n  ChatCompletion,\n  ChatCompletionStreamOptions,\n  CompletionUsage,\n  ChatCompletionFinishReason,\n} from \"./chat_completion\";\n\nexport class Completions {\n  private engine: MLCEngineInterface;\n\n  constructor(engine: MLCEngineInterface) {\n    this.engine = engine;\n  }\n\n  create(request: CompletionCreateParamsNonStreaming): Promise<Completion>;\n  create(\n    request: CompletionCreateParamsStreaming,\n  ): Promise<AsyncIterable<Completion>>;\n  create(\n    request: CompletionCreateParamsBase,\n  ): Promise<AsyncIterable<Completion> | Completion>;\n  create(\n    request: CompletionCreateParams,\n  ): Promise<AsyncIterable<Completion> | Completion> {\n    return this.engine.completion(request);\n  }\n}\n\n//////////////////////////////// 1. 
CREATE PARAMS ////////////////////////////////\n/**\n * OpenAI completion request protocol.\n *\n * API reference: https://platform.openai.com/docs/api-reference/completions/create\n * Followed: https://github.com/openai/openai-node/blob/master/src/resources/completions.ts\n *\n * @note `model` is excluded. Instead, call `CreateMLCEngine(model)` or `engine.reload(model)` explicitly before calling this API.\n */\nexport interface CompletionCreateParamsBase {\n  /**\n   * The prompt(s) to generate completions for, encoded as a string.\n   */\n  prompt: string;\n\n  /**\n   * Echo back the prompt in addition to the completion\n   */\n  echo?: boolean | null;\n\n  /**\n   * Number between -2.0 and 2.0. Positive values penalize new tokens based on their\n   * existing frequency in the text so far, decreasing the model's likelihood to\n   * repeat the same line verbatim.\n   *\n   * [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/text-generation/parameter-details)\n   */\n  frequency_penalty?: number | null;\n\n  /**\n   * Modify the likelihood of specified tokens appearing in the completion.\n   *\n   * Accepts a JSON object that maps tokens (specified by their token ID, which varies per model)\n   * to an associated bias value from -100 to 100. Typically, you can see `tokenizer.json` of the\n   * model to see which token ID maps to what string. Mathematically, the bias is added to the\n   * logits generated by the model prior to sampling. 
The exact effect will vary per model, but\n   * values between -1 and 1 should decrease or increase likelihood of selection; values like -100\n   * or 100 should result in a ban or exclusive selection of the relevant token.\n   *\n   * As an example, you can pass `{\"16230\": -100}` to prevent the `Hello` token from being\n   * generated in Mistral-7B-Instruct-v0.2, according to the mapping in\n   * https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2/raw/main/tokenizer.json.\n   *\n   * @note For stateful and customizable / flexible logit processing, see `webllm.LogitProcessor`.\n   * @note If used in combination with `webllm.LogitProcessor`, `logit_bias` is applied after\n   * `LogitProcessor.processLogits()` is called.\n   */\n  logit_bias?: Record<string, number> | null;\n\n  /**\n   * Whether to return log probabilities of the output tokens or not.\n   *\n   * If true, returns the log probabilities of each output token returned in the `content` of\n   * `message`.\n   */\n  logprobs?: boolean | null;\n\n  /**\n   * An integer between 0 and 5 specifying the number of most likely tokens to return\n   * at each token position, each with an associated log probability. `logprobs` must\n   * be set to `true` if this parameter is used.\n   */\n  top_logprobs?: number | null;\n\n  /**\n   * The maximum number of [tokens](/tokenizer) that can be generated in the\n   * completion.\n   *\n   * The total length of input tokens and generated tokens is limited by the model's\n   * context length.\n   */\n  max_tokens?: number | null;\n\n  /**\n   * How many completions to generate for each prompt.\n   */\n  n?: number | null;\n\n  /**\n   * Number between -2.0 and 2.0. 
Positive values penalize new tokens based on\n   * whether they appear in the text so far, increasing the model's likelihood to\n   * talk about new topics.\n   *\n   * [See more information about frequency and presence penalties.](https://platform.openai.com/docs/guides/text-generation/parameter-details)\n   */\n  presence_penalty?: number | null;\n\n  /**\n   * Penalizes new tokens based on whether they appear in the prompt and the\n   * generated text so far. Values greater than 1.0 encourage the model to use new\n   * tokens, while values less than 1.0 encourage the model to repeat tokens.\n   */\n  repetition_penalty?: number | null;\n\n  /**\n   * If specified, our system will make a best effort to sample deterministically,\n   * such that repeated requests with the same `seed` and parameters should return\n   * the same result.\n   *\n   * @note Seeding is done on a request-level rather than choice-level. That is, if `n > 1`, you\n   * would still get different content for each `Choice`. But if two requests with `n = 2` are\n   * processed with the same seed, the two results should be the same (two choices are different).\n   */\n  seed?: number | null;\n\n  /**\n   * Up to 4 sequences where the API will stop generating further tokens. The\n   * returned text will not contain the stop sequence.\n   */\n  stop?: string | null | Array<string>;\n\n  /**\n   * If set, partial deltas will be sent. It will be terminated by an empty chunk.\n   */\n  stream?: boolean | null;\n\n  /**\n   * Options for streaming response. Only set this when you set `stream: true`.\n   */\n  stream_options?: ChatCompletionStreamOptions | null;\n\n  /**\n   * What sampling temperature to use, between 0 and 2. 
Higher values like 0.8 will\n   * make the output more random, while lower values like 0.2 will make it more\n   * focused and deterministic.\n   *\n   * We generally recommend altering this or `top_p` but not both.\n   */\n  temperature?: number | null;\n\n  /**\n   * An alternative to sampling with temperature, called nucleus sampling, where the\n   * model considers the results of the tokens with top_p probability mass. So 0.1\n   * means only the tokens comprising the top 10% probability mass are considered.\n   *\n   * We generally recommend altering this or `temperature` but not both.\n   */\n  top_p?: number | null;\n\n  /**\n   * If true, will ignore stop string and stop token and generate until max_tokens hit.\n   * If unset, will treat as false.\n   */\n  ignore_eos?: boolean;\n\n  /**\n   * ID of the model to use. This equals to `ModelRecord.model_id`, which needs to either be in\n   * `webllm.prebuiltAppConfig` or in `engineConfig.appConfig`.\n   *\n   * @note Call `CreateMLCEngine(model)` or `engine.reload(model)` ahead of time.\n   * @note If only one model is loaded in the engine, this field is optional. If multiple models\n   *   are loaded, this is required.\n   */\n  model?: string | null;\n\n  //////////////// BELOW FIELDS NOT SUPPORTED YET ////////////////\n\n  /**\n   * The suffix that comes after a completion of inserted text.\n   *\n   * @note This field is not supported.\n   */\n  suffix?: string | null;\n\n  /**\n   * A unique identifier representing your end-user, which can help OpenAI to monitor\n   * and detect abuse.\n   *\n   * @note This field is not supported.\n   */\n  user?: string;\n\n  /**\n   * Generates `best_of` completions server-side and returns the \"best\" (the one with\n   * the highest log probability per token). 
Results cannot be streamed.\n   *\n   * When used with `n`, `best_of` controls the number of candidate completions and\n   * `n` specifies how many to return – `best_of` must be greater than `n`.\n   *\n   * @note This field is not supported.\n   */\n  best_of?: number | null;\n\n  /**\n   * Fields specific to WebLLM, not present in OpenAI.\n   */\n  extra_body?: {\n    /**\n     * If set to true, the response will include a breakdown of the time spent in various\n     * stages of token sampling.\n     */\n    enable_latency_breakdown?: boolean | null;\n  };\n}\n\nexport type CompletionCreateParams =\n  | CompletionCreateParamsNonStreaming\n  | CompletionCreateParamsStreaming;\n\nexport interface CompletionCreateParamsNonStreaming\n  extends CompletionCreateParamsBase {\n  /**\n   * If set, partial deltas will be sent. It will be terminated by an empty chunk.\n   */\n  stream?: false | null;\n}\n\nexport interface CompletionCreateParamsStreaming\n  extends CompletionCreateParamsBase {\n  /**\n   * If set, partial deltas will be sent. It will be terminated by an empty chunk.\n   */\n  stream: true;\n}\n\n//////////////////////////////// 2. 
RESPONSE ////////////////////////////////\n/**\n * Represents a completion response returned by model, based on the provided input.\n */\nexport interface Completion {\n  /**\n   * A unique identifier for the completion.\n   */\n  id: string;\n\n  /**\n   * The list of completion choices the model generated for the input prompt.\n   */\n  choices: Array<CompletionChoice>;\n\n  /**\n   * The Unix timestamp (in seconds) of when the completion was created.\n   */\n  created: number;\n\n  /**\n   * The model used for completion.\n   */\n  model: string;\n\n  /**\n   * The object type, which is always \"text_completion\"\n   */\n  object: \"text_completion\";\n\n  /**\n   * This fingerprint represents the backend configuration that the model runs with.\n   *\n   * Can be used in conjunction with the `seed` request parameter to understand when\n   * backend changes have been made that might impact determinism.\n   *\n   * @note Not supported yet.\n   */\n  system_fingerprint?: string;\n\n  /**\n   * Usage statistics for the completion request.\n   */\n  usage?: CompletionUsage;\n}\n\nexport interface CompletionChoice {\n  /**\n   * The reason the model stopped generating tokens. This will be `stop` if the model\n   * hit a natural stop point or a provided stop sequence, or `length` if the maximum\n   * number of tokens specified in the request was reached.\n   */\n  finish_reason: ChatCompletionFinishReason | null;\n\n  index: number;\n\n  /**\n   * A list of message content tokens with log probability information.\n   * @note Different from openai-node, we reuse ChatCompletion's Logprobs.\n   */\n  logprobs?: ChatCompletion.Choice.Logprobs | null;\n\n  text: string;\n}\n\n//////////////////////////////// 3. POST INIT ////////////////////////////////\n\nexport const CompletionCreateParamsUnsupportedFields: Array<string> = [\n  \"suffix\",\n  \"user\",\n  \"best_of\",\n];\n\n/**\n * Post init and verify whether the input of the request is valid. 
Thus, this function can throw\n * error or in-place update request.\n * @param request User's input request.\n * @param currentModelId The current model loaded that will perform this request.\n */\nexport function postInitAndCheckFields(\n  request: CompletionCreateParams,\n  // eslint-disable-next-line @typescript-eslint/no-unused-vars\n  currentModelId: string,\n): void {\n  // 1. Check unsupported fields in request\n  const unsupported: Array<string> = [];\n  CompletionCreateParamsUnsupportedFields.forEach((field) => {\n    if (field in request) {\n      unsupported.push(field);\n    }\n  });\n  if (unsupported.length > 0) {\n    throw new UnsupportedFieldsError(unsupported, \"CompletionCreateParams\");\n  }\n\n  // 2. If streaming, n cannot be > 1, since we cannot manage multiple sequences at once\n  if (request.stream && request.n && request.n > 1) {\n    throw new StreamingCountError();\n  }\n\n  // 3. Seed should be an integer\n  if (request.seed !== undefined && request.seed !== null) {\n    if (!Number.isInteger(request.seed)) {\n      throw new SeedTypeError(request.seed);\n    }\n  }\n\n  // 4. Only set stream_options when streaming\n  if (request.stream_options !== undefined && request.stream_options !== null) {\n    if (!request.stream) {\n      throw new InvalidStreamOptionsError();\n    }\n  }\n}\n"
  },
  {
    "path": "src/openai_api_protocols/embedding.ts",
    "content": "/**\n * The input to OpenAI API, directly adopted from openai-node with small tweaks:\n * https://github.com/openai/openai-node/blob/master/src/resources/embeddings.ts\n *\n * Copyright 2024 OpenAI\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *      http://www.apache.org/licenses/LICENSE-2.0\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\nimport {\n  EmbeddingInputEmptyError,\n  EmbeddingUnsupportedEncodingFormatError,\n  UnsupportedFieldsError,\n} from \"../error\";\nimport { MLCEngineInterface } from \"../types\";\n\nexport class Embeddings {\n  private engine: MLCEngineInterface;\n\n  constructor(engine: MLCEngineInterface) {\n    this.engine = engine;\n  }\n\n  /**\n   * Creates an embedding vector representing the input text.\n   */\n  create(request: EmbeddingCreateParams): Promise<CreateEmbeddingResponse> {\n    return this.engine.embedding(request);\n  }\n}\n\nexport interface CreateEmbeddingResponse {\n  /**\n   * The list of embeddings generated by the model.\n   */\n  data: Array<Embedding>;\n\n  /**\n   * The name of the model used to generate the embedding.\n   */\n  model: string;\n\n  /**\n   * The object type, which is always \"list\".\n   */\n  object: \"list\";\n\n  /**\n   * The usage information for the request.\n   */\n  usage: CreateEmbeddingResponse.Usage;\n}\n\n/* eslint-disable-next-line @typescript-eslint/no-namespace */\nexport namespace CreateEmbeddingResponse {\n  /**\n   * The usage information for the request.\n   */\n  export interface Usage {\n    /**\n     * The number of tokens used by the 
prompt.\n     */\n    prompt_tokens: number;\n\n    /**\n     * The total number of tokens used by the request.\n     */\n    total_tokens: number;\n\n    /**\n     * Fields specific to WebLLM, not present in OpenAI.\n     */\n    extra: {\n      /**\n       * Number of tokens per second for prefilling.\n       */\n      prefill_tokens_per_s: number;\n    };\n  }\n}\n\n/**\n * Represents an embedding vector returned by embedding endpoint.\n */\nexport interface Embedding {\n  /**\n   * The embedding vector, which is a list of floats. The length of vector depends on\n   * the model.\n   */\n  embedding: Array<number>;\n\n  /**\n   * The index of the embedding in the list of embeddings.\n   */\n  index: number;\n\n  /**\n   * The object type, which is always \"embedding\".\n   */\n  object: \"embedding\";\n}\n\nexport interface EmbeddingCreateParams {\n  /**\n   * Input text to embed, encoded as a string or array of tokens. To embed multiple\n   * inputs in a single request, pass an array of strings or array of token arrays.\n   * The input must not exceed the max input tokens for the model, and cannot be an empty string.\n   * If the batch size is too large, multiple forward passes of the model will take place.\n   */\n  input: string | Array<string> | Array<number> | Array<Array<number>>;\n\n  /**\n   * ID of the model to use. This equals to `ModelRecord.model_id`, which needs to either be in\n   * `webllm.prebuiltAppConfig` or in `engineConfig.appConfig`.\n   *\n   * @note Call `CreateMLCEngine(model)` or `engine.reload(model)` ahead of time.\n   * @note If only one model is loaded in the engine, this field is optional. 
If multiple models\n   *   are loaded, this is required.\n   */\n  model?: string | null;\n\n  /**\n   * The format to return the embeddings in.\n   *\n   * @note Currently only support `float`.\n   */\n  encoding_format?: \"float\" | \"base64\";\n\n  // TODO: can support matryoshka embedding models in future, hence allow `dimensions` for those.\n  /**\n   * The number of dimensions the resulting output embeddings should have.\n   *\n   * @note Not supported.\n   */\n  dimensions?: number;\n\n  /**\n   * A unique identifier representing your end-user, which can help OpenAI to monitor\n   * and detect abuse.\n   *\n   * @note Not supported.\n   */\n  user?: string;\n}\n\nexport const EmbeddingCreateParamsUnsupportedFields: Array<string> = [\n  \"dimensions\",\n  \"user\",\n];\n\nexport function postInitAndCheckFields(\n  request: EmbeddingCreateParams,\n  // eslint-disable-next-line @typescript-eslint/no-unused-vars\n  currentModelId: string,\n): void {\n  // 1. Check unsupported fields in request\n  const unsupported: Array<string> = [];\n  EmbeddingCreateParamsUnsupportedFields.forEach((field) => {\n    if (field in request) {\n      unsupported.push(field);\n    }\n  });\n  if (unsupported.length > 0) {\n    throw new UnsupportedFieldsError(unsupported, \"EmbeddingCreateParams\");\n  }\n\n  // 2. Unsupported format\n  if (request.encoding_format == \"base64\") {\n    throw new EmbeddingUnsupportedEncodingFormatError();\n  }\n\n  // 3. 
Invalid input\n  const input = request.input;\n  if (typeof input === \"string\") {\n    if (input === \"\") throw new EmbeddingInputEmptyError();\n  } else {\n    // input instanceof Array\n    if (input.length === 0) {\n      // Array<number>\n      throw new EmbeddingInputEmptyError();\n    }\n    for (let i = 0; i < input.length; i++) {\n      const curInput = input[i];\n      if (typeof curInput !== \"number\") {\n        // Array<string>, Array<Array<number>>\n        if (curInput.length === 0) throw new EmbeddingInputEmptyError();\n      }\n    }\n  }\n}\n"
  },
  {
    "path": "src/openai_api_protocols/index.ts",
    "content": "/**\n * The input to OpenAI API, directly adopted from openai-node with small tweaks:\n * https://github.com/openai/openai-node/blob/master/src/resources/chat/completions.ts\n *\n * Copyright 2024 OpenAI\n *\n * Licensed under the Apache License, Version 2.0 (the \"License\");\n * you may not use this file except in compliance with the License.\n * You may obtain a copy of the License at\n *      http://www.apache.org/licenses/LICENSE-2.0\n * Unless required by applicable law or agreed to in writing, software\n * distributed under the License is distributed on an \"AS IS\" BASIS,\n * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n * See the License for the specific language governing permissions and\n * limitations under the License.\n */\n\nexport {\n  Chat,\n  ChatCompletionRequestBase,\n  ChatCompletionRequestNonStreaming,\n  ChatCompletionRequestStreaming,\n  ChatCompletionRequest,\n  ChatCompletion,\n  ChatCompletionChunk,\n  ChatCompletionRequestUnsupportedFields,\n  postInitAndCheckFields as postInitAndCheckFieldsChatCompletion,\n  ChatCompletionContentPart,\n  ChatCompletionContentPartText,\n  ChatCompletionContentPartImage,\n  ChatCompletionMessageToolCall,\n  ChatCompletionRole,\n  ChatCompletionSystemMessageParam,\n  ChatCompletionUserMessageParam,\n  ChatCompletionAssistantMessageParam,\n  ChatCompletionToolMessageParam,\n  ChatCompletionMessageParam,\n  FunctionParameters,\n  FunctionDefinition,\n  ChatCompletionTool,\n  ChatCompletionNamedToolChoice,\n  ChatCompletionToolChoiceOption,\n  TopLogprob,\n  ChatCompletionTokenLogprob,\n  ChatCompletionMessage,\n  CompletionUsage,\n  ResponseFormat,\n  ChatCompletionFinishReason,\n} from \"./chat_completion\";\n\nexport {\n  Completions,\n  CompletionCreateParamsNonStreaming,\n  CompletionCreateParamsStreaming,\n  CompletionCreateParamsBase,\n  CompletionCreateParams,\n  Completion,\n  CompletionChoice,\n  postInitAndCheckFields as 
postInitAndCheckFieldsCompletion,\n} from \"./completion\";\n\nexport {\n  Embeddings,\n  Embedding,\n  EmbeddingCreateParams,\n  CreateEmbeddingResponse,\n  postInitAndCheckFields as postInitAndCheckFieldsEmbedding,\n} from \"./embedding\";\n"
  },
  {
    "path": "src/service_worker.ts",
    "content": "import * as tvmjs from \"@mlc-ai/web-runtime\";\nimport log from \"loglevel\";\nimport { ChatOptions, MLCEngineConfig } from \"./config\";\nimport { ReloadParams, WorkerRequest, WorkerResponse } from \"./message\";\nimport { InitProgressReport } from \"./types\";\nimport {\n  WebWorkerMLCEngineHandler,\n  WebWorkerMLCEngine,\n  ChatWorker,\n} from \"./web_worker\";\nimport { areArraysEqual, areChatOptionsListEqual } from \"./utils\";\nimport {\n  NoServiceWorkerAPIError,\n  NonWorkerEnvironmentError,\n  ServiceWorkerInitializationError,\n} from \"./error\";\n\n/* Service Worker Script */\n\ntype IServiceWorker = globalThis.ServiceWorker;\n\n/**\n * Worker handler that can be used in a ServiceWorker.\n *\n * @example\n *\n * const engine = new MLCEngine();\n * let handler;\n * chrome.runtime.onConnect.addListener(function (port) {\n *   if (handler === undefined) {\n *     handler = new ServiceWorkerMLCEngineHandler(engine, port);\n *   } else {\n *     handler.setPort(port);\n *   }\n *   port.onMessage.addListener(handler.onmessage.bind(handler));\n * });\n */\nexport class ServiceWorkerMLCEngineHandler extends WebWorkerMLCEngineHandler {\n  private clientRegistry = new Map<\n    string,\n    IServiceWorker | Client | MessagePort\n  >();\n  private initRequestUuid?: string;\n\n  constructor() {\n    if (!self || !(\"addEventListener\" in self)) {\n      throw new NonWorkerEnvironmentError(\"ServiceWorkerMLCEngineHandler\");\n    }\n    super();\n    const onmessage = this.onmessage.bind(this);\n\n    this.engine.setInitProgressCallback((report: InitProgressReport) => {\n      const msg: WorkerResponse = {\n        kind: \"initProgressCallback\",\n        uuid: this.initRequestUuid || \"\",\n        content: report,\n      };\n      this.postMessage(msg);\n    });\n\n    self.addEventListener(\"message\", (event) => {\n      const message = event as unknown as ExtendableMessageEvent;\n      if (message.source) {\n        
this.clientRegistry.set(message.data.uuid, message.source);\n      }\n      message.waitUntil(\n        new Promise((resolve, reject) => {\n          onmessage(message, resolve, reject);\n        }),\n      );\n    });\n  }\n\n  postMessage(message: WorkerResponse) {\n    if (this.clientRegistry.has(message.uuid)) {\n      const client = this.clientRegistry.get(message.uuid);\n      client?.postMessage(message);\n\n      if (message.kind === \"return\" || message.kind === \"throw\") {\n        this.clientRegistry.delete(message.uuid);\n      } else {\n        // TODO(nestor): Delete clientRegistry after complete to avoid memory leak?\n      }\n    }\n  }\n\n  onmessage(\n    event: ExtendableMessageEvent,\n    onComplete?: (value: any) => void,\n    onError?: () => void,\n  ): void {\n    const msg = event.data as WorkerRequest;\n    log.trace(\n      `ServiceWorker message: [${msg.kind}] ${JSON.stringify(msg.content)}`,\n    );\n\n    // Special case message handling different from WebWorkerMLCEngineHandler\n    if (msg.kind === \"keepAlive\") {\n      const reply: WorkerResponse = {\n        kind: \"heartbeat\",\n        uuid: msg.uuid,\n      };\n      this.postMessage(reply);\n      onComplete?.(reply);\n      return;\n    }\n\n    if (msg.kind === \"reload\") {\n      this.handleTask(msg.uuid, async () => {\n        const params = msg.content as ReloadParams;\n        // If the modelId, chatOpts, and appConfig are the same, immediately return\n        if (\n          areArraysEqual(this.modelId, params.modelId) &&\n          areChatOptionsListEqual(this.chatOpts, params.chatOpts)\n        ) {\n          log.info(\"Already loaded the model. 
Skip loading\");\n          const gpuDetectOutput = await tvmjs.detectGPUDevice();\n          if (gpuDetectOutput == undefined) {\n            throw Error(\"Cannot find WebGPU in the environment\");\n          }\n          let gpuLabel = \"WebGPU\";\n          if (gpuDetectOutput.adapterInfo.description.length != 0) {\n            gpuLabel += \" - \" + gpuDetectOutput.adapterInfo.description;\n          } else {\n            gpuLabel += \" - \" + gpuDetectOutput.adapterInfo.vendor;\n          }\n          this.engine.getInitProgressCallback()?.({\n            progress: 1,\n            timeElapsed: 0,\n            text: \"Finish loading on \" + gpuLabel,\n          });\n          onComplete?.(null);\n          return null;\n        }\n\n        this.initRequestUuid = msg.uuid;\n        await this.engine.reload(params.modelId, params.chatOpts);\n        this.modelId = params.modelId;\n        this.chatOpts = params.chatOpts;\n        onComplete?.(null);\n        return null;\n      });\n      return;\n    }\n\n    // All rest of message handling are the same as WebWorkerMLCEngineHandler\n    super.onmessage(msg, onComplete, onError);\n  }\n}\n\n/* Webapp Client */\nexport class ServiceWorker implements ChatWorker {\n  _onmessage: (event: MessageEvent) => void = () => {};\n\n  get onmessage() {\n    return this._onmessage;\n  }\n\n  set onmessage(handler: (event: any) => void) {\n    this._onmessage = handler;\n\n    if (!(\"serviceWorker\" in navigator)) {\n      throw new NoServiceWorkerAPIError();\n    }\n    (navigator.serviceWorker as ServiceWorkerContainer).onmessage = handler;\n  }\n\n  postMessage(message: WorkerRequest) {\n    if (!(\"serviceWorker\" in navigator)) {\n      throw new NoServiceWorkerAPIError();\n    }\n    const serviceWorker = (navigator.serviceWorker as ServiceWorkerContainer)\n      .controller;\n    if (!serviceWorker) {\n      throw new Error(\"There is no active service worker\");\n    }\n    serviceWorker.postMessage(message);\n  
}\n}\n\n/**\n * Create a ServiceWorkerMLCEngine.\n *\n * @param modelId model_id of the model to load, either string or string[]. When multiple models\n *   are provided, we load all models sequentially. Each modelId needs to either be in\n *   `webllm.prebuiltAppConfig`, or in `engineConfig.appConfig`.\n * @param engineConfig Optionally configures the engine, see `webllm.MLCEngineConfig` for more.\n * @param chatOpts Extra options to optionally override the `mlc-chat-config.json` of `modelId`.\n *   The size of which needs to match that of `modelId`; chatOpts[i] will be used for modelId[i].\n * @returns An initialized `WebLLM.ServiceWorkerMLCEngine` with `modelId` loaded.\n */\nexport async function CreateServiceWorkerMLCEngine(\n  modelId: string | string[],\n  engineConfig?: MLCEngineConfig,\n  chatOpts?: ChatOptions | ChatOptions[],\n  keepAliveMs = 10000,\n): Promise<ServiceWorkerMLCEngine> {\n  if (!(\"serviceWorker\" in navigator)) {\n    throw new NoServiceWorkerAPIError();\n  }\n  const serviceWorkerAPI = navigator.serviceWorker as ServiceWorkerContainer;\n  const registration = await serviceWorkerAPI.ready;\n  const serviceWorker = registration.active || serviceWorkerAPI.controller;\n  if (!serviceWorker) {\n    throw new ServiceWorkerInitializationError();\n  }\n  const serviceWorkerMLCEngine = new ServiceWorkerMLCEngine(\n    engineConfig,\n    keepAliveMs,\n  );\n  await serviceWorkerMLCEngine.reload(modelId, chatOpts);\n  return serviceWorkerMLCEngine;\n}\n\n/**\n * A client of MLCEngine that exposes the same interface\n */\nexport class ServiceWorkerMLCEngine extends WebWorkerMLCEngine {\n  missedHeartbeat = 0;\n\n  constructor(engineConfig?: MLCEngineConfig, keepAliveMs = 10000) {\n    if (!(\"serviceWorker\" in navigator)) {\n      throw new NoServiceWorkerAPIError();\n    }\n    super(new ServiceWorker(), engineConfig);\n\n    // Keep alive through periodical heartbeat signals\n    setInterval(() => {\n      this.worker.postMessage({ kind: 
\"keepAlive\", uuid: crypto.randomUUID() });\n      this.missedHeartbeat += 1;\n      log.trace(\"missedHeartbeat\", this.missedHeartbeat);\n    }, keepAliveMs);\n  }\n\n  onmessage(event: any): void {\n    const msg = event.data;\n    log.trace(\n      `MLC client message: [${msg.kind}] ${JSON.stringify(msg.content)}`,\n    );\n    try {\n      if (msg.kind === \"heartbeat\") {\n        this.missedHeartbeat = 0;\n        return;\n      }\n      super.onmessage(msg);\n    } catch (err: any) {\n      // This is expected to throw if user has multiple windows open\n      if (!err.message.startsWith(\"return from a unknown uuid\")) {\n        log.error(\"CreateWebServiceWorkerMLCEngine.onmessage\", err);\n      }\n    }\n  }\n}\n"
  },
  {
    "path": "src/support.ts",
    "content": "/** Util methods. */\nimport { Tokenizer } from \"@mlc-ai/web-tokenizers\";\nimport { AppConfig, MessagePlaceholders, ModelRecord } from \"./config\";\nimport {\n  ChatCompletionChunk,\n  ChatCompletionContentPartImage,\n  ChatCompletionMessageToolCall,\n} from \"./openai_api_protocols/index\";\nimport {\n  ModelNotFoundError,\n  ModelNotLoadedError,\n  SpecifiedModelNotFoundError,\n  ToolCallOutputInvalidTypeError,\n  ToolCallOutputMissingFieldsError,\n  ToolCallOutputParseError,\n  UnclearModelToUseError,\n} from \"./error\";\n\n/**\n * Based on `p_prob` of size (vocabSize,) which becomes a distribution after calling\n * `applySoftmaxWithTemperature()`, sample `top_logprobs` top-probable tokens.\n *\n * @param num_top_probs: `top_logprobs` from ChatCompletionRequest\n * @param p_prob: `logitsOnCPUArray`, being a distribution after `applySoftmaxWithTemperature()`.\n *\n * Followed implementation of `ComputeTopProbsImpl()` from [https://github.com/mlc-ai/mlc-llm/blob/\n * 5b8c529e9704abd09b0432da6dcb4b013fdf43b1/cpp/serve/sampler/cpu_sampler.cc].\n *\n * @returns Arrays of (tokenID, prob) pairs, ranked from highest prob to least.\n */\nexport function getTopProbs(\n  num_top_probs: number,\n  p_prob: Float32Array,\n): Array<[number, number]> {\n  if (num_top_probs == 0) return [];\n  // Initialize to dummy values\n  const top_probs: Array<[number, number]> = [];\n  const ndata = p_prob.length;\n  for (let i = 0; i < num_top_probs; i++) {\n    top_probs.push([-1, -1.0]);\n  }\n\n  let sum_prob = 0.0;\n  // Selection argsort.\n  for (let p = 0; p < ndata; p++) {\n    let i = num_top_probs - 1;\n    for (; i >= 0; --i) {\n      if (p_prob[p] > top_probs[i][1]) {\n        if (i !== num_top_probs - 1) {\n          top_probs[i + 1] = top_probs[i];\n        }\n      } else {\n        break;\n      }\n    }\n    if (i !== num_top_probs - 1) {\n      top_probs[i + 1] = [p, p_prob[p]];\n    }\n\n    // Early exit\n    sum_prob += p_prob[p];\n    if (1 - 
sum_prob <= top_probs[num_top_probs - 1][1]) {\n      break;\n    }\n  }\n  return top_probs;\n}\n\n/**\n * Get the token table in the form of a string list of tokens, ordered by their token id.\n * @param tokenizer A loaded tokenizer.\n * @note The size of the table (i.e. tokenizer.getVocabSize()) may be smaller than the `vocab_size`\n * in config.json (length of logits), see https://github.com/QwenLM/Qwen2/issues/147 and\n * https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/discussions/47.\n */\nexport function getTokenTableFromTokenizer(tokenizer: Tokenizer): string[] {\n  const tokenTable: string[] = [];\n  const vocabSize = tokenizer.getVocabSize();\n  for (let tokenId = 0; tokenId < vocabSize; tokenId++) {\n    tokenTable.push(tokenizer.idToToken(tokenId));\n  }\n  return tokenTable;\n}\n\n/**\n * Postprocess the suffix of ModelRecord.model to be \"/resolve/main/\" if it is not specified otherwise.\n * e.g. https://huggingface.co/mlc-ai/OpenHermes-2.5-Mistral-7B-q4f16_1-MLC/resolve/main/\n * @return the href of the final URL.\n */\nexport function cleanModelUrl(modelUrl: string): string {\n  // https://huggingface.co/USER/MODEL -> https://huggingface.co/USER/MODEL/\n  modelUrl += modelUrl.endsWith(\"/\") ? 
\"\" : \"/\";\n  if (!modelUrl.match(/.+\\/resolve\\/.+\\//)) modelUrl += \"resolve/main/\";\n  // https://huggingface.co/USER/MODEL/ -> https://huggingface.co/USER/MODEL/resolve/main/\n  return new URL(modelUrl).href;\n}\n\n// Constants for Hermes-2-Pro models function calling\n// Follows https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B#prompt-format-for-function-calling\n\n/**\n * Json schema used to prompt the model for function calling; directly copied from the official guide.\n * This represents to a single function call.\n */\nexport const officialHermes2FunctionCallSchema = `{\"properties\": {\"arguments\": {\"title\": \"Arguments\", \"type\": \"object\"}, \"name\": {\"title\": \"Name\", \"type\": \"string\"}}, \"required\": [\"arguments\", \"name\"], \"title\": \"FunctionCall\", \"type\": \"object\"}`;\n\n/**\n * A list of such function calls. Used to specify response format, since the output is expected to\n * be a list of such function calls.\n */\nexport const officialHermes2FunctionCallSchemaArray = `{\"type\":\"array\",\"items\":${officialHermes2FunctionCallSchema}}`;\n\n/**\n * Full system prompt for Hermes-2-Pro function calling.\n */\nexport const hermes2FunctionCallingSystemPrompt = `You are a function calling AI model. You are \nprovided with function signatures within <tools></tools> XML tags. You may call one or more functions \nto assist with the user query. Don't make assumptions about what values to plug into functions. Here \nare the available tools: <tools> ${MessagePlaceholders.hermes_tools}  </tools>. 
\nUse the following pydantic model json schema for each tool call you will make: \n${officialHermes2FunctionCallSchema} For each function call return a json object.`;\n\n/**\n * Given a string outputMessage, parse it as a JSON object and return an array of tool calls.\n *\n * Expect outputMessage to be a valid JSON string, and expect it to be an array of Function with\n * fields `arguments` and `name`.\n */\nexport function getToolCallFromOutputMessage(\n  outputMessage: string,\n  isStreaming: false,\n): Array<ChatCompletionMessageToolCall>;\nexport function getToolCallFromOutputMessage(\n  outputMessage: string,\n  isStreaming: true,\n): Array<ChatCompletionChunk.Choice.Delta.ToolCall>;\nexport function getToolCallFromOutputMessage(\n  outputMessage: string,\n  isStreaming: boolean,\n):\n  | Array<ChatCompletionMessageToolCall>\n  | Array<ChatCompletionChunk.Choice.Delta.ToolCall> {\n  // 1. Parse outputMessage to JSON object\n  let toolCallsObject;\n  try {\n    toolCallsObject = JSON.parse(outputMessage);\n  } catch (err) {\n    throw new ToolCallOutputParseError(outputMessage, err as Error);\n  }\n\n  // 2. Expect to be an array\n  if (!(toolCallsObject instanceof Array)) {\n    throw new ToolCallOutputInvalidTypeError(\"array\");\n  }\n\n  // 3. Parse each tool call and populate tool_calls\n  const numToolCalls = toolCallsObject.length;\n  const tool_calls = [];\n  for (let id = 0; id < numToolCalls; id++) {\n    const curToolCall = toolCallsObject[id];\n    if (curToolCall.name === undefined || curToolCall.arguments === undefined) {\n      throw new ToolCallOutputMissingFieldsError(\n        [\"name\", \"arguments\"],\n        curToolCall,\n      );\n    }\n    tool_calls.push({\n      name: curToolCall.name,\n      arguments: JSON.stringify(curToolCall.arguments),\n    });\n  }\n\n  // 4. 
Return based on whether it is streaming or not\n  if (isStreaming) {\n    const tool_calls_result: Array<ChatCompletionChunk.Choice.Delta.ToolCall> =\n      [];\n    for (let id = 0; id < numToolCalls; id++) {\n      const curToolCall = tool_calls[id];\n      tool_calls_result.push({\n        index: id,\n        function: {\n          name: curToolCall.name,\n          arguments: curToolCall.arguments,\n        },\n        type: \"function\",\n      });\n    }\n    return tool_calls_result;\n  } else {\n    const tool_calls_result: Array<ChatCompletionMessageToolCall> = [];\n    for (let id = 0; id < numToolCalls; id++) {\n      const curToolCall = tool_calls[id];\n      tool_calls_result.push({\n        id: id.toString(),\n        function: {\n          name: curToolCall.name,\n          arguments: curToolCall.arguments,\n        },\n        type: \"function\",\n      });\n    }\n    return tool_calls_result;\n  }\n}\n\nexport function findModelRecord(\n  modelId: string,\n  appConfig: AppConfig,\n): ModelRecord {\n  const matchedItem = appConfig.model_list.find(\n    (item) => item.model_id == modelId,\n  );\n  if (matchedItem !== undefined) return matchedItem;\n  throw new ModelNotFoundError(modelId);\n}\n\n/**\n * Return the model to use given the loaded modelIds and requestModel. Throws error when unclear\n * which model to load.\n * @param loadedModelIds Models currently loaded in the engine.\n * @param requestModel Model the user specified to load via the request. Required when multiple\n *   models are loaded\n * @param requestName The type of request or API to load the model for. 
Needed for error throwing.\n */\nexport function getModelIdToUse(\n  loadedModelIds: string[],\n  requestModel: string | undefined | null,\n  requestName: string,\n): string {\n  let selectedModelId: string;\n  if (loadedModelIds.length === 0) {\n    throw new ModelNotLoadedError(requestName);\n  }\n  if (requestModel) {\n    // If specified model\n    if (loadedModelIds.indexOf(requestModel) === -1) {\n      throw new SpecifiedModelNotFoundError(\n        loadedModelIds,\n        requestModel,\n        requestName,\n      );\n    } else {\n      selectedModelId = requestModel;\n    }\n  } else {\n    // If not specified\n    if (loadedModelIds.length > 1) {\n      throw new UnclearModelToUseError(loadedModelIds, requestName);\n    } else {\n      selectedModelId = loadedModelIds[0];\n    }\n  }\n  return selectedModelId;\n}\n\n/**\n * TODO: Consider if this is the best strategy (though aligned with mlc-llm). We currently greedily\n * try to fill up prefillChunkSize. Consider the example with 2048 prefill chunk size:\n * const inputData = [\n    image1,  // 1921\n    rangeArr(0, 2048),\n    image2,\n  ];\n * Current approach results in chunks: \n   [image1, rangeArr(0, 127)],\n   [rangeArr(127, 2048)],\n   [image2],\n * This means 4 embedding kernels and 3 prefill kernels.\n * While the optimal chunking may be:\n   [image1],\n   [rangeArr(0, 2048)],\n   [image2],\n * This results in 3 embedding kernels and 3 prefill kernels.\n * However, greedy strategy is more intuitive and probably more generalizable.\n */\n\n/**\n * Chunk the inputData such that each chunk's total input length is smaller than prefill\n * chunk size.\n * @returns [the data chunks, the input length of each chunk]\n * @note precondition: if inputData has image in it, then prefillChunkSize >= IMAGE_EMBED_SIZE.\n */\nexport function getChunkedPrefillInputData(\n  inputData: Array<Array<number> | ImageURL>,\n  prefillChunkSize: number,\n): [Array<Array<number> | ImageURL>[], Array<number>] {\n  const 
chunks: Array<Array<number> | ImageURL>[] = [];\n  const chunkLens: Array<number> = [];\n  let curChunk: Array<Array<number> | ImageURL> = [];\n  let curChunkLen = 0;\n  for (let i = 0; i < inputData.length; i++) {\n    let curData: Array<number> | ImageURL = inputData[i];\n    const curDataLen = Array.isArray(curData)\n      ? curData.length\n      : IMAGE_EMBED_SIZE;\n    // 1. curData can fit into this chunk\n    if (curChunkLen + curDataLen <= prefillChunkSize) {\n      curChunk.push(curData);\n      curChunkLen += curDataLen;\n      if (curChunkLen === prefillChunkSize) {\n        chunks.push([...curChunk]);\n        chunkLens.push(curChunkLen);\n        curChunk = [];\n        curChunkLen = 0;\n      }\n      continue;\n    }\n\n    // 2. Otherwise, depends on whether it is token data or image data\n    if (Array.isArray(curData)) {\n      // 2.1. Token data, which itself needs to be chunked. Keep\n      // chunking and finalizing until finished\n      while (curData.length > 0) {\n        const curDataToChunkLen = Math.min(\n          curData.length,\n          prefillChunkSize - curChunkLen,\n        );\n        curChunk.push(curData.slice(0, curDataToChunkLen));\n        curChunkLen += curDataToChunkLen;\n        curData = curData.slice(curDataToChunkLen);\n        if (curChunkLen === prefillChunkSize) {\n          // curChunk is now full, so finalize to chunks\n          chunks.push([...curChunk]);\n          chunkLens.push(curChunkLen);\n          curChunk = [];\n          curChunkLen = 0;\n        }\n      }\n    } else {\n      // 2.2. Image data, which itself cannot be chunked, so cannot fit in current chunk.\n      // 2.2.1. Finalize curChunk\n      if (curChunk.length === 0) {\n        throw new Error(\n          \"InternalError: do not expect curChunk to be empty when an image does not fit.\",\n        );\n      }\n      chunks.push([...curChunk]);\n      chunkLens.push(curChunkLen);\n      // 2.2.2. 
Then push image to the new chunk\n      curChunk = [curData];\n      curChunkLen = IMAGE_EMBED_SIZE;\n      if (curChunkLen === prefillChunkSize) {\n        chunks.push([...curChunk]);\n        chunkLens.push(curChunkLen);\n        curChunk = [];\n        curChunkLen = 0;\n      }\n    }\n  }\n  // Last chunk\n  if (curChunk.length > 0) {\n    chunks.push([...curChunk]);\n    chunkLens.push(curChunkLen);\n  }\n\n  return [chunks, chunkLens];\n}\n\ntype Cont = () => void;\n\n/**\n * A lock implemented using Promise.\n *\n * Referred to:\n * - https://jackpordi.com/posts/locks-in-js-because-why-not\n * - https://www.linkedin.com/pulse/asynchronous-locking-using-promises-javascript-abdul-ahad-o7smf/\n */\nexport class CustomLock {\n  private acquired = false;\n  private readonly queue: Cont[] = [];\n\n  public async acquire(): Promise<void> {\n    if (!this.acquired) {\n      // If lock is free, directly return\n      this.acquired = true;\n    } else {\n      // Otherwise, push the request to the queue, and\n      // a future release() will resolve it\n      return new Promise<void>((resolve) => {\n        this.queue.push(resolve);\n      });\n    }\n  }\n\n  public async release(): Promise<void> {\n    if (!this.acquired) {\n      throw Error(\"InternalError: expect lock is acquired upon release()\");\n    }\n    if (this.queue.length === 0) {\n      // No one is waiting for the lock, so we free it\n      this.acquired = false;\n      return;\n    }\n\n    // Otherwise, hand the execution to the next in queue, and\n    // the lock is still acquired\n    const cont = this.queue.shift();\n    return new Promise((res: Cont) => {\n      cont!();\n      res();\n    });\n  }\n}\n\n// Image related\ntype ImageURL = ChatCompletionContentPartImage.ImageURL;\n\n// TODO(Charlie): currently hardcoded for phi3.5-vision num_crops 16\nexport const IMAGE_EMBED_SIZE = 1921;\n\n/**\n * Given a url, get the image data. 
The url can either start with `http` or `data:image`.\n */\nexport async function getImageDataFromURL(url: string): Promise<ImageData> {\n  const response = await fetch(url, { mode: \"cors\" });\n  const img = await createImageBitmap(await response.blob());\n  const canvas = new OffscreenCanvas(img.width, img.height);\n  const ctx = canvas.getContext(\"2d\");\n  if (!ctx) {\n    throw new Error(\"Could not get 2d context\");\n  }\n  ctx.drawImage(img, 0, 0);\n\n  const imageData = ctx.getImageData(0, 0, img.width, img.height);\n  return imageData;\n}\n\n/**\n * Given an ImageData, return the RGB array in Uint8ClampedArray. Note the ImageData.data\n * is RGBA, so we skip every fourth element of the data. The order goes by rows from the\n * top-left pixel to the bottom-right, in RGB order.\n */\nexport function getRGBArrayFromImageData(\n  imageData: ImageData,\n): Uint8ClampedArray {\n  const newData = new Uint8ClampedArray(imageData.width * imageData.height * 3);\n  for (let i = 0, offset = 0; i < imageData.data.length; i += 4) {\n    newData[offset++] = imageData.data[i];\n    newData[offset++] = imageData.data[i + 1];\n    newData[offset++] = imageData.data[i + 2];\n  }\n  return newData;\n}\n"
  },
  {
    "path": "src/types.ts",
    "content": "import { AppConfig, ChatOptions } from \"./config\";\nimport {\n  ChatCompletionRequest,\n  ChatCompletionRequestBase,\n  ChatCompletionRequestStreaming,\n  ChatCompletionRequestNonStreaming,\n  ChatCompletion,\n  ChatCompletionChunk,\n  CompletionCreateParams,\n  Completion,\n  CompletionCreateParamsBase,\n  CompletionCreateParamsStreaming,\n  CompletionCreateParamsNonStreaming,\n  EmbeddingCreateParams,\n  CreateEmbeddingResponse,\n} from \"./openai_api_protocols/index\";\nimport * as API from \"./openai_api_protocols/index\";\n\n/**\n * Report during intialization.\n */\nexport interface InitProgressReport {\n  progress: number;\n  timeElapsed: number;\n  text: string;\n}\n\n/**\n * Callbacks used to report initialization process.\n */\nexport type InitProgressCallback = (report: InitProgressReport) => void;\n\n/**\n * A stateful logitProcessor used to post-process logits after forwarding the input and before\n * sampling the next token. If used with `GenerationConfig.logit_bias`, logit_bias is applied after\n * `processLogits()` is called.\n */\nexport interface LogitProcessor {\n  /**\n   * Process logits after forward() and before sampling implicitly, happens on the CPU.\n   * @param logits The logits right after forward().\n   * Returns the processed logits.\n   */\n  processLogits: (logits: Float32Array) => Float32Array;\n\n  /**\n   * Use the sampled token to update the LogitProcessor's internal state. Called implicitly\n   * right after the next token is sampled/committed.\n   * @param token Token sampled from the processed logits.\n   */\n  processSampledToken: (token: number) => void;\n\n  /**\n   * Called when in `MLCEngine.resetChat()`. 
Can clear internal states.\n   */\n  resetState: () => void;\n}\n\n/**\n * Common interface of MLCEngine that UI can interact with\n */\nexport interface MLCEngineInterface {\n  /**\n   * An object that exposes chat-related APIs.\n   */\n  chat: API.Chat;\n\n  /**\n   * An object that exposes text completion APIs.\n   */\n  completions: API.Completions;\n\n  /**\n   * An object that exposes embeddings APIs.\n   */\n  embeddings: API.Embeddings;\n\n  /**\n   * Set an initialization progress callback function\n   * which reports the progress of model loading.\n   *\n   * This function can be useful to implement a UI that\n   * updates as we load the model.\n   *\n   * @param initProgressCallback The callback function\n   */\n  setInitProgressCallback: (initProgressCallback: InitProgressCallback) => void;\n\n  /**\n   * @returns The current initialization progress callback function.\n   */\n  getInitProgressCallback: () => InitProgressCallback | undefined;\n\n  /**\n   * Setter for the engine's appConfig.\n   */\n  setAppConfig: (appConfig: AppConfig) => void;\n\n  /**\n   * Reload the chat with a new model.\n   *\n   * @param modelId model_id of the model to load, either string or string[]. When multiple models\n   *   are provided, we load all models sequentially. Each modelId needs to either be in\n   *   `webllm.prebuiltAppConfig`, or in `engineConfig.appConfig`.\n   * @param chatOpts Extra options to optionally override the `mlc-chat-config.json` of `modelId`.\n   *   The size of which needs to match that of `modelId`; chatOpts[i] will be used for modelId[i].\n   * @returns A promise when reload finishes.\n   * @throws Throws error when device lost (mostly due to OOM); users should re-call reload(),\n   *   potentially with a smaller model or smaller context window size.\n   * @note This is an async function.\n   */\n  reload: (\n    modelId: string | string[],\n    chatOpts?: ChatOptions | ChatOptions[],\n  ) => Promise<void>;\n\n  /**\n   * OpenAI-style API. 
Generate a chat completion response for the given conversation and\n   * configuration. Use `engine.chat.completions.create()` to invoke this API.\n   *\n   * @param request A OpenAI-style ChatCompletion request.\n   *\n   * @note The API is completely functional in behavior. That is, a previous request would not\n   * affect the current request's result. Thus, for multi-round chatting, users are responsible for\n   * maintaining the chat history. With that being said, as an implicit internal optimization, if we\n   * detect that the user is performing multi-round chatting, we will preserve the KV cache and only\n   * prefill the new tokens.\n   * @note For requests sent to the same modelId, will block until all previous requests finish.\n   * @note For more, see https://platform.openai.com/docs/api-reference/chat\n   */\n  chatCompletion(\n    request: ChatCompletionRequestNonStreaming,\n  ): Promise<ChatCompletion>;\n  chatCompletion(\n    request: ChatCompletionRequestStreaming,\n  ): Promise<AsyncIterable<ChatCompletionChunk>>;\n  chatCompletion(\n    request: ChatCompletionRequestBase,\n  ): Promise<AsyncIterable<ChatCompletionChunk> | ChatCompletion>;\n  chatCompletion(\n    request: ChatCompletionRequest,\n  ): Promise<AsyncIterable<ChatCompletionChunk> | ChatCompletion>;\n\n  /**\n   * OpenAI-style API. 
Completes a CompletionCreateParams, a text completion with no chat template.\n   * Use `engine.completions.create()` to invoke this API.\n   *\n   * @param request An OpenAI-style Completion request.\n   *\n   * @note For requests sent to the same modelId, will block until all previous requests finish.\n   * @note For more, see https://platform.openai.com/docs/api-reference/completions\n   */\n  completion(request: CompletionCreateParamsNonStreaming): Promise<Completion>;\n  completion(\n    request: CompletionCreateParamsStreaming,\n  ): Promise<AsyncIterable<Completion>>;\n  completion(\n    request: CompletionCreateParamsBase,\n  ): Promise<AsyncIterable<Completion> | Completion>;\n  completion(\n    request: CompletionCreateParams,\n  ): Promise<AsyncIterable<Completion> | Completion>;\n\n  /**\n   * OpenAI-style API. Creates an embedding vector representing the input text.\n   * Use `engine.embeddings.create()` to invoke this API.\n   *\n   * @param request An OpenAI-style Embeddings request.\n   *\n   * @note For requests sent to the same modelId, will block until all previous requests finish.\n   * @note For more, see https://platform.openai.com/docs/api-reference/embeddings/create\n   */\n  embedding(request: EmbeddingCreateParams): Promise<CreateEmbeddingResponse>;\n\n  /**\n   * @returns A text summarizing the runtime stats.\n   * @param modelId Only required when multiple models are loaded.\n   * @note This is an async function\n   */\n  runtimeStatsText: (modelId?: string) => Promise<string>;\n\n  /**\n   * Interrupt the generate process if it is already running.\n   */\n  interruptGenerate: () => void;\n\n  /**\n   * Explicitly unload the currently loaded model(s) and release the related resources. 
Waits until\n   * the webgpu device finishes all submitted work and destroys itself.\n   * @note This is an asynchronous function.\n   */\n  unload: () => Promise<void>;\n\n  /**\n   * Reset the current chat session by clearing all memories.\n   * @param keepStats: If True, do not reset the statistics.\n   * @param modelId Only required when multiple models are loaded.\n   */\n  resetChat: (keepStats?: boolean, modelId?: string) => Promise<void>;\n\n  /**\n   * Get the current generated response.\n   * @param modelId Only required when multiple models are loaded.\n   * @returns The current output message.\n   */\n  getMessage: (modelId?: string) => Promise<string>;\n\n  /**\n   * Returns the device's maxStorageBufferBindingSize, can be used to guess whether the device\n   * has limited resources like an Android phone.\n   */\n  getMaxStorageBufferBindingSize(): Promise<number>;\n\n  /**\n   * Returns the device's gpu vendor (e.g. arm, qualcomm, apple) if available. Otherwise return\n   * an empty string.\n   */\n  getGPUVendor(): Promise<string>;\n\n  /**\n   * Forward the given input tokens to the model, then sample the next token.\n   *\n   * This function has side effects as the model will update its KV cache.\n   *\n   * @param inputIds The input tokens.\n   * @param isPrefill True if prefill, false if decode; only used for statistics.\n   * @param modelId Only required when multiple models are loaded.\n   * @returns Next token sampled.\n   * @note This is an async function.\n   */\n  forwardTokensAndSample(\n    inputIds: Array<number>,\n    isPrefill: boolean,\n    modelId?: string,\n  ): Promise<number>;\n\n  /**\n   * Set MLCEngine logging output level\n   *\n   * @param logLevel The new log level\n   */\n  setLogLevel(logLevel: LogLevel): void;\n}\n\nexport const LOG_LEVELS = {\n  TRACE: 0,\n  DEBUG: 1,\n  INFO: 2,\n  WARN: 3,\n  ERROR: 4,\n  SILENT: 5,\n};\nexport type LogLevel = keyof typeof LOG_LEVELS;\n\nexport type LatencyBreakdown = {\n  
logitProcessorTime: number[];\n  logitBiasTime: number[];\n  penaltyTime: number[];\n  sampleTime: number[];\n  totalTime: number[];\n  grammarBitmaskTime: number[];\n};\n"
  },
  {
    "path": "src/utils.ts",
    "content": "import { AppConfig, ChatOptions, ModelRecord } from \"./config\";\n\n// Helper function to compare two arrays\nexport function areArraysEqual(arr1?: Array<any>, arr2?: Array<any>): boolean {\n  if (!arr1 && !arr2) return true;\n  if (!arr1 || !arr2) return false;\n  if (arr1.length !== arr2.length) return false;\n  for (let i = 0; i < arr1.length; i++) {\n    if (arr1[i] !== arr2[i]) return false;\n  }\n  return true;\n}\n\n// Helper function to compare two objects deeply\nfunction areObjectsEqual(obj1: any, obj2: any): boolean {\n  if (obj1 === obj2) return true;\n  if (typeof obj1 !== typeof obj2) return false;\n  if (typeof obj1 !== \"object\" || obj1 === null || obj2 === null) return false;\n\n  const keys1 = Object.keys(obj1);\n  const keys2 = Object.keys(obj2);\n  if (keys1.length !== keys2.length) return false;\n\n  for (const key of keys1) {\n    if (!keys2.includes(key) || !areObjectsEqual(obj1[key], obj2[key]))\n      return false;\n  }\n  return true;\n}\n\n// Function to compare two ModelRecord instances\nexport function areModelRecordsEqual(\n  record1: ModelRecord,\n  record2: ModelRecord,\n): boolean {\n  // Compare primitive fields\n  if (\n    record1.model !== record2.model ||\n    record1.model_id !== record2.model_id ||\n    record1.model_lib !== record2.model_lib ||\n    record1.vram_required_MB !== record2.vram_required_MB ||\n    record1.low_resource_required !== record2.low_resource_required ||\n    record1.buffer_size_required_bytes !== record2.buffer_size_required_bytes\n  ) {\n    return false;\n  }\n\n  // Compare required_features arrays\n  if (\n    (record1.required_features && !record2.required_features) ||\n    (!record1.required_features && record2.required_features)\n  ) {\n    return false;\n  }\n\n  if (record1.required_features && record2.required_features) {\n    if (record1.required_features.length !== record2.required_features.length) {\n      return false;\n    }\n\n    for (let i = 0; i < 
record1.required_features.length; i++) {\n      if (record1.required_features[i] !== record2.required_features[i]) {\n        return false;\n      }\n    }\n  }\n\n  return true;\n}\n\nexport function areAppConfigsEqual(\n  config1?: AppConfig,\n  config2?: AppConfig,\n): boolean {\n  if (config1 === undefined || config2 === undefined) {\n    return config1 === config2;\n  }\n\n  // Check if both configurations have the same IndexedDB cache usage\n  if (config1.useIndexedDBCache !== config2.useIndexedDBCache) {\n    return false;\n  }\n\n  // Check if both configurations have the same number of model records\n  if (config1.model_list.length !== config2.model_list.length) {\n    return false;\n  }\n\n  // Compare each ModelRecord in the model_list\n  for (let i = 0; i < config1.model_list.length; i++) {\n    if (!areModelRecordsEqual(config1.model_list[i], config2.model_list[i])) {\n      return false;\n    }\n  }\n\n  // If all checks passed, the configurations are equal\n  return true;\n}\n\nexport function areChatOptionsEqual(\n  options1?: ChatOptions,\n  options2?: ChatOptions,\n): boolean {\n  if (options1 === undefined || options2 === undefined) {\n    return options1 === options2;\n  }\n  // Compare each property of ChatOptions (which are Partial<ChatConfig>)\n  if (!areArraysEqual(options1.tokenizer_files, options2.tokenizer_files))\n    return false;\n  if (!areObjectsEqual(options1.conv_config, options2.conv_config))\n    return false;\n  if (options1.conv_template !== options2.conv_template) return false;\n  if (options1.repetition_penalty !== options2.repetition_penalty) return false;\n  if (options1.frequency_penalty !== options2.frequency_penalty) return false;\n  if (options1.presence_penalty !== options2.presence_penalty) return false;\n  if (options1.top_p !== options2.top_p) return false;\n  if (options1.temperature !== options2.temperature) return false;\n  if (options1.bos_token_id !== options2.bos_token_id) return false;\n\n  // If all checks 
passed, the options are equal\n  return true;\n}\n\nexport function areChatOptionsListEqual(\n  options1?: ChatOptions[],\n  options2?: ChatOptions[],\n): boolean {\n  if (options1 && options2) {\n    // Both defined, need to compare\n    if (options1.length !== options2.length) {\n      return false;\n    } else {\n      for (let i = 0; i < options1.length; i++) {\n        if (!areChatOptionsEqual(options1[i], options2[i])) {\n          return false;\n        }\n      }\n      return true;\n    }\n  } else if (!options1 && !options2) {\n    // Both undefined, equal\n    return true;\n  } else {\n    // One defined, other not\n    return false;\n  }\n}\n"
  },
  {
    "path": "src/web_worker.ts",
    "content": "import { AppConfig, ChatOptions, MLCEngineConfig } from \"./config\";\nimport {\n  MLCEngineInterface,\n  InitProgressCallback,\n  InitProgressReport,\n  LogLevel,\n  LogitProcessor,\n} from \"./types\";\nimport {\n  ChatCompletionRequest,\n  ChatCompletionRequestBase,\n  ChatCompletionRequestStreaming,\n  ChatCompletionRequestNonStreaming,\n  ChatCompletion,\n  ChatCompletionChunk,\n  Completion,\n  CompletionCreateParamsNonStreaming,\n  CompletionCreateParamsStreaming,\n  CompletionCreateParamsBase,\n  CompletionCreateParams,\n  CreateEmbeddingResponse,\n  EmbeddingCreateParams,\n} from \"./openai_api_protocols/index\";\nimport * as API from \"./openai_api_protocols/index\";\nimport {\n  MessageContent,\n  ReloadParams,\n  ForwardTokensAndSampleParams,\n  ChatCompletionNonStreamingParams,\n  ChatCompletionStreamInitParams,\n  ResetChatParams,\n  WorkerResponse,\n  WorkerRequest,\n  CompletionNonStreamingParams,\n  EmbeddingParams,\n  CompletionStreamInitParams,\n  GetMessageParams,\n  RuntimeStatsTextParams,\n  CompletionStreamNextChunkParams,\n} from \"./message\";\nimport log from \"loglevel\";\nimport { MLCEngine } from \"./engine\";\nimport {\n  UnknownMessageKindError,\n  WorkerEngineModelNotLoadedError,\n} from \"./error\";\nimport { areArraysEqual } from \"./utils\";\nimport { getModelIdToUse } from \"./support\";\n\n/**\n * Worker handler that can be used in a WebWorker\n *\n * @example\n *\n * // setup a chat worker handler that routes\n * // requests to the chat\n * const engine = new MLCEngine();\n * cont handler = new WebWorkerMLCEngineHandler(engine);\n * onmessage = handler.onmessage;\n */\nexport class WebWorkerMLCEngineHandler {\n  /**\n   * The modelId and chatOpts that the underlying engine (backend) is currently loaded with.\n   * An engine can be loaded with multiple models, so modelId and chatOpts are lists.\n   *\n   * TODO(webllm-team): This is always in-sync with `this.engine` unless device is lost due to\n   * unexpected 
reason. Therefore, we should get it from `this.engine` directly and make handler\n   * stateless. Besides, consider if we should add appConfig, or use engine's API to find the\n   * corresponding model record rather than relying on just the modelId.\n   */\n  modelId?: string[];\n  chatOpts?: ChatOptions[];\n\n  public engine: MLCEngine;\n  /** ChatCompletion and Completion share the same chunk generator. Each loaded model has its own. */\n  protected loadedModelIdToAsyncGenerator: Map<\n    string,\n    AsyncGenerator<ChatCompletionChunk | Completion, void, void>\n  >;\n\n  /**\n   * @param engine A concrete implementation of MLCEngineInterface\n   */\n  constructor() {\n    this.engine = new MLCEngine();\n    this.loadedModelIdToAsyncGenerator = new Map<\n      string,\n      AsyncGenerator<ChatCompletionChunk | Completion, void, void>\n    >();\n    this.engine.setInitProgressCallback((report: InitProgressReport) => {\n      const msg: WorkerResponse = {\n        kind: \"initProgressCallback\",\n        uuid: \"\",\n        content: report,\n      };\n      this.postMessage(msg);\n    });\n  }\n\n  postMessage(msg: any) {\n    // Use Web Worker DOM Message API\n    postMessage(msg);\n  }\n\n  setLogitProcessorRegistry(\n    logitProcessorRegistry?: Map<string, LogitProcessor>,\n  ) {\n    this.engine.setLogitProcessorRegistry(logitProcessorRegistry);\n  }\n\n  async handleTask<T extends MessageContent>(\n    uuid: string,\n    task: () => Promise<T>,\n  ) {\n    try {\n      const res = await task();\n      const msg: WorkerResponse = {\n        kind: \"return\",\n        uuid: uuid,\n        content: res,\n      };\n      this.postMessage(msg);\n    } catch (err) {\n      const errStr = (err as object).toString();\n      const msg: WorkerResponse = {\n        kind: \"throw\",\n        uuid: uuid,\n        content: errStr,\n      };\n      this.postMessage(msg);\n    }\n  }\n\n  onmessage(\n    event: any,\n    onComplete?: (value: any) => void,\n    onError?: 
() => void,\n  ) {\n    let msg: WorkerRequest;\n    if (event instanceof MessageEvent) {\n      msg = event.data as WorkerRequest;\n    } else {\n      msg = event as WorkerRequest;\n    }\n    switch (msg.kind) {\n      case \"reload\": {\n        this.handleTask(msg.uuid, async () => {\n          const params = msg.content as ReloadParams;\n          await this.engine.reload(params.modelId, params.chatOpts);\n          this.modelId = params.modelId;\n          this.chatOpts = params.chatOpts;\n          onComplete?.(null);\n          return null;\n        });\n        return;\n      }\n      case \"forwardTokensAndSample\": {\n        this.handleTask(msg.uuid, async () => {\n          const params = msg.content as ForwardTokensAndSampleParams;\n          const res = await this.engine.forwardTokensAndSample(\n            params.inputIds,\n            params.isPrefill,\n            params.modelId,\n          );\n          onComplete?.(res);\n          return res;\n        });\n        return;\n      }\n      // For engine.chat.completions.create()\n      case \"chatCompletionNonStreaming\": {\n        // Directly return the ChatCompletion response\n        this.handleTask(msg.uuid, async () => {\n          const params = msg.content as ChatCompletionNonStreamingParams;\n          await this.reloadIfUnmatched(params.modelId, params.chatOpts);\n          const res = await this.engine.chatCompletion(params.request);\n          onComplete?.(res);\n          return res;\n        });\n        return;\n      }\n      case \"chatCompletionStreamInit\": {\n        // One-time set up that instantiates the chunk generator in worker\n        this.handleTask(msg.uuid, async () => {\n          const params = msg.content as ChatCompletionStreamInitParams;\n          // Also ensures params.selectedModelId will match what this.engine selects\n          await this.reloadIfUnmatched(params.modelId, params.chatOpts);\n          // Register new async generator for this new request of 
the model\n          const curGenerator = (await this.engine.chatCompletion(\n            params.request,\n          )) as AsyncGenerator<ChatCompletionChunk, void, void>;\n          this.loadedModelIdToAsyncGenerator.set(\n            params.selectedModelId,\n            curGenerator,\n          );\n          onComplete?.(null);\n          return null;\n        });\n        return;\n      }\n      // For engine.completions.create()\n      case \"completionNonStreaming\": {\n        // Directly return the ChatCompletion response\n        this.handleTask(msg.uuid, async () => {\n          const params = msg.content as CompletionNonStreamingParams;\n          await this.reloadIfUnmatched(params.modelId, params.chatOpts);\n          const res = await this.engine.completion(params.request);\n          onComplete?.(res);\n          return res;\n        });\n        return;\n      }\n      case \"completionStreamInit\": {\n        // One-time set up that instantiates the chunk generator in worker\n        this.handleTask(msg.uuid, async () => {\n          const params = msg.content as CompletionStreamInitParams;\n          // Also ensures params.selectedModelId will match what this.engine selects\n          await this.reloadIfUnmatched(params.modelId, params.chatOpts);\n          // Register new async generator for this new request of the model\n          const curGenerator = (await this.engine.completion(\n            params.request,\n          )) as AsyncGenerator<Completion, void, void>;\n          this.loadedModelIdToAsyncGenerator.set(\n            params.selectedModelId,\n            curGenerator,\n          );\n          onComplete?.(null);\n          return null;\n        });\n        return;\n      }\n      // Shared by engine.chat.completions.create() and engine.completions.create()\n      case \"completionStreamNextChunk\": {\n        // Note: ChatCompletion and Completion share the same chunk generator.\n        // For any subsequent request, we return 
whatever `next()` yields\n        this.handleTask(msg.uuid, async () => {\n          const params = msg.content as CompletionStreamNextChunkParams;\n          const curGenerator = this.loadedModelIdToAsyncGenerator.get(\n            params.selectedModelId,\n          );\n          if (curGenerator === undefined) {\n            throw Error(\n              \"InternalError: Chunk generator in worker should be instantiated by now.\",\n            );\n          }\n          // Yield the next chunk\n          const { value } = await curGenerator.next();\n          onComplete?.(value);\n          return value;\n        });\n        return;\n      }\n      // For engine.embeddings.create()\n      case \"embedding\": {\n        // Directly return the Embeddings response\n        this.handleTask(msg.uuid, async () => {\n          const params = msg.content as EmbeddingParams;\n          await this.reloadIfUnmatched(params.modelId, params.chatOpts);\n          const res = await this.engine.embedding(params.request);\n          onComplete?.(res);\n          return res;\n        });\n        return;\n      }\n      case \"runtimeStatsText\": {\n        this.handleTask(msg.uuid, async () => {\n          const params = msg.content as RuntimeStatsTextParams;\n          const res = await this.engine.runtimeStatsText(params.modelId);\n          onComplete?.(res);\n          return res;\n        });\n        return;\n      }\n      case \"interruptGenerate\": {\n        this.handleTask(msg.uuid, async () => {\n          this.engine.interruptGenerate();\n          onComplete?.(null);\n          return null;\n        });\n        return;\n      }\n      case \"unload\": {\n        // Unset modelId and chatOpts since backend unloads the model\n        this.handleTask(msg.uuid, async () => {\n          await this.engine.unload();\n          this.modelId = undefined;\n          this.chatOpts = undefined;\n          // This may not be cleaned properly when one asyncGenerator finishes.\n    
      // We only clear at unload(), which may not be called upon reload().\n          // However, service_worker may skip reload(). Will leave as is for now.\n          this.loadedModelIdToAsyncGenerator.clear();\n          onComplete?.(null);\n          return null;\n        });\n        return;\n      }\n      case \"resetChat\": {\n        this.handleTask(msg.uuid, async () => {\n          const params = msg.content as ResetChatParams;\n          await this.engine.resetChat(params.keepStats, params.modelId);\n          onComplete?.(null);\n          return null;\n        });\n        return;\n      }\n      case \"getMaxStorageBufferBindingSize\": {\n        this.handleTask(msg.uuid, async () => {\n          const res = await this.engine.getMaxStorageBufferBindingSize();\n          onComplete?.(res);\n          return res;\n        });\n        return;\n      }\n      case \"getGPUVendor\": {\n        this.handleTask(msg.uuid, async () => {\n          const res = await this.engine.getGPUVendor();\n          onComplete?.(res);\n          return res;\n        });\n        return;\n      }\n      case \"getMessage\": {\n        this.handleTask(msg.uuid, async () => {\n          const params = msg.content as GetMessageParams;\n          const res = await this.engine.getMessage(params.modelId);\n          onComplete?.(res);\n          return res;\n        });\n        return;\n      }\n      case \"setLogLevel\": {\n        const logLevel = msg.content as LogLevel;\n        this.engine.setLogLevel(logLevel);\n        log.setLevel(logLevel);\n        onComplete?.(null);\n        return;\n      }\n      case \"setAppConfig\": {\n        const appConfig = msg.content as AppConfig;\n        this.engine.setAppConfig(appConfig);\n        onComplete?.(null);\n        return;\n      }\n      case \"customRequest\": {\n        onComplete?.(null);\n        return;\n      }\n      default: {\n        if (msg.kind && msg.content) {\n          onError?.();\n          throw new 
UnknownMessageKindError(msg.kind, msg.content);\n        } else {\n          // Ignore irrelavent events\n          onComplete?.(null);\n        }\n      }\n    }\n  }\n\n  /** Check whether frontend expectation matches with backend (modelId and chatOpts). If not (due\n   * to possibly killed service worker), we reload here.\n   * For more, see https://github.com/mlc-ai/web-llm/pull/533\n   */\n  async reloadIfUnmatched(\n    expectedModelId: string[],\n    expectedChatOpts?: ChatOptions[],\n  ) {\n    // TODO: should we also check expectedChatOpts here?\n    if (!areArraysEqual(this.modelId, expectedModelId)) {\n      log.warn(\n        \"WebWorkerMLCEngine expects model is loaded in WebWorkerMLCEngineHandler, \" +\n          \"but it is not. This may due to web/service worker is unexpectedly killed.\\n\" +\n          \"Reloading engine in WebWorkerMLCEngineHandler.\",\n      );\n      await this.engine.reload(expectedModelId, expectedChatOpts);\n    }\n  }\n}\n\nexport interface ChatWorker {\n  onmessage: any;\n  postMessage: (message: any) => void;\n}\n\n/**\n * Creates `WebWorkerMLCEngine`, a client that holds the same interface as `MLCEngine`.\n *\n * Equivalent to `new webllm.WebWorkerMLCEngine(worker).reload(...)`.\n *\n * @param worker The worker that holds the actual MLCEngine, initialized with `new Worker()`.\n * @param modelId model_id of the model to load, either string or string[]. When multiple models\n *   are provided, we load all models sequentially. 
Each modelId needs to either be in\n *   `webllm.prebuiltAppConfig`, or in `engineCOnfig.appConfig`.\n * @param engineConfig Optionally configures the engine, see `webllm.MLCEngineConfig` for more.\n * @param chatOpts Extra options to optionally override the `mlc-chat-config.json` of `modelId`.\n *   The size of which needs to match that of `modelId`; chatOpts[i] will be used for modelId[i].\n * @returns An initialized `WebLLM.WebWorkerMLCEngine` with `modelId` loaded.\n *\n * @note engineConfig.logitProcessorRegistry is ignored for `CreateWebWorkMLCEngine()`.\n */\nexport async function CreateWebWorkerMLCEngine(\n  worker: any,\n  modelId: string | string[],\n  engineConfig?: MLCEngineConfig,\n  chatOpts?: ChatOptions | ChatOptions[],\n): Promise<WebWorkerMLCEngine> {\n  const webWorkerMLCEngine = new WebWorkerMLCEngine(worker, engineConfig);\n  await webWorkerMLCEngine.reload(modelId, chatOpts);\n  return webWorkerMLCEngine;\n}\n\n/**\n * A client of MLCEngine that exposes the same interface\n *\n * @example\n *\n * const chat = new webllm.WebWorkerMLCEngine(new Worker(\n *   new URL('./worker.ts', import.meta.url),\n *   {type: 'module'}\n * ));\n */\nexport class WebWorkerMLCEngine implements MLCEngineInterface {\n  public worker: ChatWorker;\n  /** For chat.completions.create() */\n  public chat: API.Chat;\n  /** For completions.create() */\n  public completions: API.Completions;\n  /** For embeddings.create() */\n  public embeddings: API.Embeddings;\n\n  /**\n   * The modelId and chatOpts that the frontend expects the backend engine is currently loaded\n   * with. Needed for service worker. 
It is the backend and handler's job to match up with the\n   * expectation despite the web/service worker possibly being killed.\n   * Since an engine can load multiple models, both modelId and chatOpts are lists.\n   */\n  modelId?: string[];\n  chatOpts?: ChatOptions[];\n\n  private initProgressCallback?: InitProgressCallback;\n  private pendingPromise = new Map<string, (msg: WorkerResponse) => void>();\n\n  constructor(worker: ChatWorker, engineConfig?: MLCEngineConfig) {\n    this.worker = worker;\n    worker.onmessage = (event: any) => {\n      this.onmessage.bind(this)(event);\n    };\n\n    if (engineConfig?.appConfig) {\n      this.setAppConfig(engineConfig?.appConfig);\n    }\n    if (engineConfig?.logLevel) {\n      this.setLogLevel(engineConfig?.logLevel);\n    }\n    this.setInitProgressCallback(engineConfig?.initProgressCallback);\n    if (engineConfig?.logitProcessorRegistry) {\n      if (engineConfig?.logitProcessorRegistry) {\n        log.warn(\n          \"Warning: The `logitProcessorRegistry` property in `engineConfig` will be ignored when using the WebWorkerMLCEngine constructor. 
To set `logitProcessorRegistry`, use the engine constructor within the worker script instead.\",\n        );\n      }\n    }\n\n    this.chat = new API.Chat(this);\n    this.completions = new API.Completions(this);\n    this.embeddings = new API.Embeddings(this);\n  }\n\n  setInitProgressCallback(initProgressCallback?: InitProgressCallback) {\n    this.initProgressCallback = initProgressCallback;\n  }\n\n  getInitProgressCallback(): InitProgressCallback | undefined {\n    return this.initProgressCallback;\n  }\n\n  setAppConfig(appConfig: AppConfig) {\n    const msg: WorkerRequest = {\n      kind: \"setAppConfig\",\n      uuid: crypto.randomUUID(),\n      content: appConfig,\n    };\n    this.worker.postMessage(msg);\n  }\n\n  setLogLevel(logLevel: LogLevel) {\n    log.setLevel(logLevel);\n    const msg: WorkerRequest = {\n      kind: \"setLogLevel\",\n      uuid: crypto.randomUUID(),\n      content: logLevel,\n    };\n    this.worker.postMessage(msg);\n  }\n\n  protected getPromise<T extends MessageContent>(\n    msg: WorkerRequest,\n  ): Promise<T> {\n    const uuid = msg.uuid;\n    const executor = (\n      resolve: (arg: T) => void,\n      reject: (arg: any) => void,\n    ) => {\n      const cb = (msg: WorkerResponse) => {\n        if (msg.kind == \"return\") {\n          resolve(msg.content as T);\n        } else {\n          if (msg.kind != \"throw\") {\n            reject(\"Uknown msg kind \" + msg.kind);\n          } else {\n            reject(msg.content);\n          }\n        }\n      };\n      this.pendingPromise.set(uuid, cb);\n    };\n    const promise = new Promise<T>(executor);\n    this.worker.postMessage(msg);\n    return promise;\n  }\n\n  async reload(\n    modelId: string | string[],\n    chatOpts?: ChatOptions | ChatOptions[],\n  ): Promise<void> {\n    // Always convert modelId and chatOpts to lists internally for ease of manipulation\n    if (!Array.isArray(modelId)) {\n      modelId = [modelId];\n    }\n    if (chatOpts !== undefined && 
!Array.isArray(chatOpts)) {\n      chatOpts = [chatOpts];\n    }\n\n    const msg: WorkerRequest = {\n      kind: \"reload\",\n      uuid: crypto.randomUUID(),\n      content: {\n        modelId: modelId,\n        chatOpts: chatOpts,\n      },\n    };\n    await this.getPromise<null>(msg);\n    this.modelId = modelId;\n    this.chatOpts = chatOpts;\n  }\n\n  async getMaxStorageBufferBindingSize(): Promise<number> {\n    const msg: WorkerRequest = {\n      kind: \"getMaxStorageBufferBindingSize\",\n      uuid: crypto.randomUUID(),\n      content: null,\n    };\n    return await this.getPromise<number>(msg);\n  }\n\n  async getGPUVendor(): Promise<string> {\n    const msg: WorkerRequest = {\n      kind: \"getGPUVendor\",\n      uuid: crypto.randomUUID(),\n      content: null,\n    };\n    return await this.getPromise<string>(msg);\n  }\n\n  async getMessage(modelId?: string): Promise<string> {\n    const msg: WorkerRequest = {\n      kind: \"getMessage\",\n      uuid: crypto.randomUUID(),\n      content: {\n        modelId: modelId,\n      },\n    };\n    return await this.getPromise<string>(msg);\n  }\n\n  async runtimeStatsText(modelId?: string): Promise<string> {\n    const msg: WorkerRequest = {\n      kind: \"runtimeStatsText\",\n      uuid: crypto.randomUUID(),\n      content: {\n        modelId: modelId,\n      },\n    };\n    return await this.getPromise<string>(msg);\n  }\n\n  interruptGenerate(): void {\n    const msg: WorkerRequest = {\n      kind: \"interruptGenerate\",\n      uuid: crypto.randomUUID(),\n      content: null,\n    };\n    this.getPromise<null>(msg);\n  }\n\n  async unload(): Promise<void> {\n    const msg: WorkerRequest = {\n      kind: \"unload\",\n      uuid: crypto.randomUUID(),\n      content: null,\n    };\n    await this.getPromise<null>(msg);\n    this.modelId = undefined;\n    this.chatOpts = undefined;\n  }\n\n  async resetChat(keepStats = false, modelId?: string): Promise<void> {\n    const msg: WorkerRequest = {\n      kind: 
\"resetChat\",\n      uuid: crypto.randomUUID(),\n      content: {\n        keepStats: keepStats,\n        modelId: modelId,\n      },\n    };\n    await this.getPromise<null>(msg);\n  }\n\n  async forwardTokensAndSample(\n    inputIds: Array<number>,\n    isPrefill: boolean,\n    modelId?: string,\n  ): Promise<number> {\n    const msg: WorkerRequest = {\n      kind: \"forwardTokensAndSample\",\n      uuid: crypto.randomUUID(),\n      content: {\n        inputIds: inputIds,\n        isPrefill: isPrefill,\n        modelId: modelId,\n      },\n    };\n    return await this.getPromise<number>(msg);\n  }\n\n  /**\n   * Every time the generator is called, we post a message to the worker asking it to\n   * decode one step, and we expect to receive a message of `ChatCompletionChunk` from\n   * the worker which we yield. The last message is `void`, meaning the generator has nothing\n   * to yield anymore.\n   *\n   * @param selectedModelId: The model of whose async generator to call next() to get next chunk.\n   *   Needed because an engine can load multiple models.\n   *\n   * @note ChatCompletion and Completion share the same chunk generator.\n   */\n  async *asyncGenerate(\n    selectedModelId: string,\n  ): AsyncGenerator<ChatCompletionChunk | Completion, void, void> {\n    // Every time it gets called, sends message to worker, asking for the next chunk\n    while (true) {\n      const msg: WorkerRequest = {\n        kind: \"completionStreamNextChunk\",\n        uuid: crypto.randomUUID(),\n        content: {\n          selectedModelId: selectedModelId,\n        } as CompletionStreamNextChunkParams,\n      };\n      const ret = await this.getPromise<ChatCompletionChunk>(msg);\n      // If the worker's generator reached the end, it would return a `void`\n      if (typeof ret !== \"object\") {\n        break;\n      }\n      yield ret;\n    }\n  }\n\n  async chatCompletion(\n    request: ChatCompletionRequestNonStreaming,\n  ): Promise<ChatCompletion>;\n  async 
chatCompletion(\n    request: ChatCompletionRequestStreaming,\n  ): Promise<AsyncIterable<ChatCompletionChunk>>;\n  async chatCompletion(\n    request: ChatCompletionRequestBase,\n  ): Promise<AsyncIterable<ChatCompletionChunk> | ChatCompletion>;\n  async chatCompletion(\n    request: ChatCompletionRequest,\n  ): Promise<AsyncIterable<ChatCompletionChunk> | ChatCompletion> {\n    if (this.modelId === undefined) {\n      throw new WorkerEngineModelNotLoadedError(this.constructor.name);\n    }\n    // Needed for the streaming case. Consolidate model id to specify\n    // which model's asyncGenerator to instantiate or call next() on.\n    // Since handler can maintain multiple generators concurrently\n    const selectedModelId = getModelIdToUse(\n      this.modelId ? this.modelId : [],\n      request.model,\n      \"ChatCompletionRequest\",\n    );\n\n    if (request.stream) {\n      // First let worker instantiate a generator\n      const msg: WorkerRequest = {\n        kind: \"chatCompletionStreamInit\",\n        uuid: crypto.randomUUID(),\n        content: {\n          request: request,\n          selectedModelId: selectedModelId,\n          modelId: this.modelId,\n          chatOpts: this.chatOpts,\n        },\n      };\n      await this.getPromise<null>(msg);\n\n      // Then return an async chunk generator that resides on the client side\n      return this.asyncGenerate(selectedModelId) as AsyncGenerator<\n        ChatCompletionChunk,\n        void,\n        void\n      >;\n    }\n\n    // Non streaming case is more straightforward\n    const msg: WorkerRequest = {\n      kind: \"chatCompletionNonStreaming\",\n      uuid: crypto.randomUUID(),\n      content: {\n        request: request,\n        modelId: this.modelId,\n        chatOpts: this.chatOpts,\n      },\n    };\n    return await this.getPromise<ChatCompletion>(msg);\n  }\n\n  async completion(\n    request: CompletionCreateParamsNonStreaming,\n  ): Promise<Completion>;\n  async completion(\n    request: 
CompletionCreateParamsStreaming,\n  ): Promise<AsyncIterable<Completion>>;\n  async completion(\n    request: CompletionCreateParamsBase,\n  ): Promise<AsyncIterable<Completion> | Completion>;\n  async completion(\n    request: CompletionCreateParams,\n  ): Promise<AsyncIterable<Completion> | Completion> {\n    if (this.modelId === undefined) {\n      throw new WorkerEngineModelNotLoadedError(this.constructor.name);\n    }\n    // Needed for the streaming case. Consolidate model id to specify\n    // which model's asyncGenerator to instantiate or call next() on.\n    // Since handler can maintain multiple generators concurrently\n    const selectedModelId = getModelIdToUse(\n      this.modelId ? this.modelId : [],\n      request.model,\n      \"CompletionCreateParams\",\n    );\n\n    if (request.stream) {\n      // First let worker instantiate a generator\n      const msg: WorkerRequest = {\n        kind: \"completionStreamInit\",\n        uuid: crypto.randomUUID(),\n        content: {\n          request: request,\n          selectedModelId: selectedModelId,\n          modelId: this.modelId,\n          chatOpts: this.chatOpts,\n        },\n      };\n      await this.getPromise<null>(msg);\n\n      // Then return an async chunk generator that resides on the client side\n      return this.asyncGenerate(selectedModelId) as AsyncGenerator<\n        Completion,\n        void,\n        void\n      >;\n    }\n\n    // Non streaming case is more straightforward\n    const msg: WorkerRequest = {\n      kind: \"completionNonStreaming\",\n      uuid: crypto.randomUUID(),\n      content: {\n        request: request,\n        modelId: this.modelId,\n        chatOpts: this.chatOpts,\n      },\n    };\n    return await this.getPromise<Completion>(msg);\n  }\n\n  async embedding(\n    request: EmbeddingCreateParams,\n  ): Promise<CreateEmbeddingResponse> {\n    if (this.modelId === undefined) {\n      throw new WorkerEngineModelNotLoadedError(this.constructor.name);\n    }\n    
const msg: WorkerRequest = {\n      kind: \"embedding\",\n      uuid: crypto.randomUUID(),\n      content: {\n        request: request,\n        modelId: this.modelId,\n        chatOpts: this.chatOpts,\n      },\n    };\n    return await this.getPromise<CreateEmbeddingResponse>(msg);\n  }\n\n  onmessage(event: any) {\n    let msg: WorkerResponse;\n    if (event instanceof MessageEvent) {\n      msg = event.data as WorkerResponse;\n    } else {\n      msg = event as WorkerResponse;\n    }\n    switch (msg.kind) {\n      case \"initProgressCallback\": {\n        if (this.initProgressCallback !== undefined) {\n          this.initProgressCallback(msg.content as InitProgressReport);\n        }\n        return;\n      }\n      case \"return\": {\n        const cb = this.pendingPromise.get(msg.uuid);\n        if (cb === undefined) {\n          throw Error(\"return from a unknown uuid msg=\" + msg.uuid);\n        }\n        this.pendingPromise.delete(msg.uuid);\n        cb(msg);\n        return;\n      }\n      case \"throw\": {\n        const cb = this.pendingPromise.get(msg.uuid);\n        if (cb === undefined) {\n          throw Error(\"return from a unknown uuid, msg=\" + msg);\n        }\n        this.pendingPromise.delete(msg.uuid);\n        cb(msg);\n        return;\n      }\n      default: {\n        const unknownMsg = msg as any;\n        throw new UnknownMessageKindError(unknownMsg.kind, unknownMsg.content);\n      }\n    }\n  }\n}\n"
  },
  {
    "path": "tests/.gitignore",
    "content": "package-lock.json\n"
  },
  {
    "path": "tests/cache_util.test.ts",
    "content": "import {\n  asyncLoadTokenizer,\n  deleteModelInCache,\n  hasModelInCache,\n} from \"../src/cache_util\";\nimport { AppConfig } from \"../src/config\";\nimport * as tvmMockImport from \"@mlc-ai/web-runtime\";\nimport * as tokenizerMockImport from \"@mlc-ai/web-tokenizers\";\nimport { jest, test, expect, beforeEach } from \"@jest/globals\";\n\njest.mock(\"@mlc-ai/web-runtime\", () => {\n  const state = {\n    hasTensorInCache: jest\n      .fn<() => Promise<boolean>>()\n      .mockResolvedValue(false),\n    deleteTensorCache: jest.fn(),\n    deletes: [] as Array<{ cache: string; url: string }>,\n    fetches: [] as Array<{ cache: string; url: string; format: string }>,\n  };\n  class BaseCache {\n    constructor(private name: string) {}\n    async deleteInCache(url: string) {\n      state.deletes.push({ cache: this.name, url });\n    }\n    async fetchWithCache(url: string, format: string) {\n      state.fetches.push({ cache: this.name, url, format });\n      return new ArrayBuffer(4);\n    }\n  }\n  return {\n    hasTensorInCache: state.hasTensorInCache,\n    deleteTensorCache: state.deleteTensorCache,\n    ArtifactCache: BaseCache,\n    ArtifactIndexedDBCache: BaseCache,\n    __cacheState: state,\n  };\n});\n\njest.mock(\"@mlc-ai/web-tokenizers\", () => {\n  return {\n    Tokenizer: {\n      fromJSON: jest.fn(() => ({ kind: \"json\" })),\n      fromSentencePiece: jest.fn(() => ({ kind: \"sp\" })),\n    },\n  };\n});\n\nconst tvmMock = tvmMockImport as any;\nconst tokenizerMock = tokenizerMockImport as any;\n\nconst baseAppConfig: AppConfig = {\n  useIndexedDBCache: false,\n  model_list: [\n    {\n      model: \"https://huggingface.co/mlc-ai/demo-model\",\n      model_id: \"demo-model\",\n      model_lib: \"https://example.com/model.wasm\",\n    },\n  ],\n};\n\nbeforeEach(() => {\n  tvmMock.__cacheState.deletes.length = 0;\n  tvmMock.__cacheState.fetches.length = 0;\n  tvmMock.__cacheState.hasTensorInCache.mockClear();\n  
tvmMock.__cacheState.deleteTensorCache.mockClear();\n  tokenizerMock.Tokenizer.fromJSON.mockClear();\n  tokenizerMock.Tokenizer.fromSentencePiece.mockClear();\n});\n\ntest(\"hasModelInCache delegates to tvm cache helpers\", async () => {\n  tvmMock.__cacheState.hasTensorInCache.mockResolvedValueOnce(true);\n  const result = await hasModelInCache(\"demo-model\", baseAppConfig);\n  expect(result).toBe(true);\n  expect(tvmMock.__cacheState.hasTensorInCache).toHaveBeenCalledWith(\n    \"https://huggingface.co/mlc-ai/demo-model/resolve/main/\",\n    \"webllm/model\",\n    \"cache\",\n  );\n});\n\ntest(\"deleteModelInCache clears tensors and tokenizer assets for indexeddb cache\", async () => {\n  const indexedConfig: AppConfig = {\n    ...baseAppConfig,\n    useIndexedDBCache: true,\n  };\n  await deleteModelInCache(\"demo-model\", indexedConfig);\n  expect(tvmMock.__cacheState.deleteTensorCache).toHaveBeenCalledWith(\n    \"https://huggingface.co/mlc-ai/demo-model/resolve/main/\",\n    \"webllm/model\",\n    \"indexeddb\",\n  );\n  expect(tvmMock.__cacheState.deletes).toEqual(\n    expect.arrayContaining([\n      {\n        cache: \"webllm/model\",\n        url: \"https://huggingface.co/mlc-ai/demo-model/resolve/main/tokenizer.model\",\n      },\n      {\n        cache: \"webllm/model\",\n        url: \"https://huggingface.co/mlc-ai/demo-model/resolve/main/tokenizer.json\",\n      },\n    ]),\n  );\n});\n\ntest(\"asyncLoadTokenizer prefers tokenizer.json and falls back to sentencepiece\", async () => {\n  const makeChatConfig = (files: string[]) =>\n    ({\n      tokenizer_files: files,\n    }) as unknown as import(\"../src/config\").ChatConfig;\n\n  const configJson = makeChatConfig([\"tokenizer.json\"]);\n  await asyncLoadTokenizer(\n    baseAppConfig.model_list[0].model,\n    configJson,\n    baseAppConfig,\n  );\n  expect(tokenizerMock.Tokenizer.fromJSON).toHaveBeenCalled();\n  expect(tokenizerMock.Tokenizer.fromSentencePiece).not.toHaveBeenCalled();\n  
expect(tvmMock.__cacheState.fetches[0]).toEqual({\n    cache: \"webllm/model\",\n    url: \"https://huggingface.co/mlc-ai/tokenizer.json\",\n    format: \"arraybuffer\",\n  });\n\n  const configSp = makeChatConfig([\"tokenizer.model\"]);\n  await asyncLoadTokenizer(\n    baseAppConfig.model_list[0].model,\n    configSp,\n    baseAppConfig,\n  );\n  expect(tokenizerMock.Tokenizer.fromSentencePiece).toHaveBeenCalled();\n});\n"
  },
  {
    "path": "tests/constants.ts",
    "content": "import { ChatConfig } from \"../src/config\";\n\nexport const llama3_1ChatConfig: ChatConfig = {\n  vocab_size: 128256,\n  context_window_size: 131072,\n  sliding_window_size: -1,\n  attention_sink_size: -1,\n  temperature: 0.6,\n  presence_penalty: 0.0,\n  frequency_penalty: 0.0,\n  repetition_penalty: 1.0,\n  top_p: 0.9,\n  tokenizer_files: [\"tokenizer.json\", \"tokenizer_config.json\"],\n  tokenizer_info: {\n    token_postproc_method: \"byte_level\",\n    prepend_space_in_encode: false,\n    strip_space_in_decode: false,\n  },\n  conv_template: {\n    system_template:\n      \"<|start_header_id|>system<|end_header_id|>\\n\\n{system_message}<|eot_id|>\",\n    system_message: \"You are a helpful, respectful and honest assistant.\",\n    system_prefix_token_ids: [128000],\n    add_role_after_system_message: true,\n    roles: {\n      user: \"<|start_header_id|>user\",\n      assistant: \"<|start_header_id|>assistant\",\n      tool: \"<|start_header_id|>ipython\",\n    },\n    role_templates: {\n      user: \"{user_message}\",\n      assistant: \"{assistant_message}\",\n      tool: \"{tool_message}\",\n    },\n    seps: [\"<|eot_id|>\"],\n    role_content_sep: \"<|end_header_id|>\\n\\n\",\n    role_empty_sep: \"<|end_header_id|>\\n\\n\",\n    stop_str: [],\n    stop_token_ids: [128001, 128008, 128009],\n  },\n  bos_token_id: 128000,\n};\n\nexport const llama2ChatConfigJSONString =\n  \"{\" +\n  '  \"model_type\": \"llama\",' +\n  '  \"quantization\": \"q4f16_1\",' +\n  '  \"model_config\": {' +\n  '    \"hidden_size\": 4096,' +\n  '    \"intermediate_size\": 11008,' +\n  '    \"num_attention_heads\": 32,' +\n  '    \"num_hidden_layers\": 32,' +\n  '    \"rms_norm_eps\": 1e-05,' +\n  '    \"vocab_size\": 32000,' +\n  '    \"position_embedding_base\": 10000,' +\n  '    \"context_window_size\": 4096,' +\n  '    \"prefill_chunk_size\": 4096,' +\n  '    \"num_key_value_heads\": 32,' +\n  '    \"head_dim\": 128,' +\n  '    \"tensor_parallel_shards\": 1,' 
+\n  '    \"max_batch_size\": 80' +\n  \"  },\" +\n  '  \"vocab_size\": 32000,' +\n  '  \"context_window_size\": 4096,' +\n  '  \"sliding_window_size\": -1,' +\n  '  \"prefill_chunk_size\": 4096,' +\n  '  \"attention_sink_size\": -1,' +\n  '  \"tensor_parallel_shards\": 1,' +\n  '  \"mean_gen_len\": 128,' +\n  '  \"max_gen_len\": 512,' +\n  '  \"shift_fill_factor\": 0.3,' +\n  '  \"temperature\": 0.6,' +\n  '  \"presence_penalty\": 0.0,' +\n  '  \"frequency_penalty\": 0.0,' +\n  '  \"repetition_penalty\": 1.0,' +\n  '  \"top_p\": 0.9,' +\n  '  \"conv_template\": {' +\n  '    \"name\": \"llama-2\",' +\n  '    \"system_template\": \"[INST] <<SYS>>\\\\n{system_message}\\\\n<</SYS>>\\\\n\\\\n\",' +\n  '    \"system_message\": \"You are a helpful, respectful and honest assistant.\",' +\n  '    \"system_prefix_token_ids\": [' +\n  \"      1\" +\n  \"    ],\" +\n  '    \"add_role_after_system_message\": false,' +\n  '    \"roles\": {' +\n  '      \"user\": \"[INST]\",' +\n  '      \"assistant\": \"[/INST]\",' +\n  '      \"tool\": \"[INST]\"' +\n  \"    },\" +\n  '    \"role_templates\": {' +\n  '      \"user\": \"{user_message}\",' +\n  '      \"assistant\": \"{assistant_message}\",' +\n  '      \"tool\": \"{tool_message}\"' +\n  \"    },\" +\n  '    \"messages\": [],' +\n  '    \"seps\": [' +\n  '      \" \"' +\n  \"    ],\" +\n  '    \"role_content_sep\": \" \",' +\n  '    \"role_empty_sep\": \" \",' +\n  '    \"stop_str\": [' +\n  '      \"[INST]\"' +\n  \"    ],\" +\n  '    \"stop_token_ids\": [' +\n  \"      2\" +\n  \"    ],\" +\n  '    \"function_string\": \"\",' +\n  '    \"use_function_calling\": false' +\n  \"  },\" +\n  '  \"pad_token_id\": 0,' +\n  '  \"bos_token_id\": 1,' +\n  '  \"eos_token_id\": 2,' +\n  '  \"tokenizer_files\": [' +\n  '    \"tokenizer.model\",' +\n  '    \"tokenizer.json\",' +\n  '    \"tokenizer_config.json\"' +\n  \"  ],\" +\n  '  \"version\": \"0.1.0\"' +\n  \"}\";\n\nexport const phi3_5VisionChatConfigJSONString = String.raw`{\n  
\"version\": \"0.1.0\",\n  \"model_type\": \"phi3_v\",\n  \"quantization\": \"q4f16_1\",\n  \"model_config\": {\n    \"model_type\": \"phi3_v\",\n    \"hidden_size\": 3072,\n    \"vocab_size\": 32064,\n    \"num_hidden_layers\": 32,\n    \"num_attention_heads\": 32,\n    \"intermediate_size\": 8192,\n    \"rms_norm_eps\": 1e-05,\n    \"num_key_value_heads\": 32,\n    \"max_position_embeddings\": 131072,\n    \"vision_config\": {\n      \"hidden_size\": 1024,\n      \"image_size\": 336,\n      \"intermediate_size\": 4096,\n      \"num_attention_heads\": 16,\n      \"num_hidden_layers\": 24,\n      \"patch_size\": 14,\n      \"projection_dim\": 768,\n      \"vocab_size\": null,\n      \"num_channels\": 3,\n      \"layer_norm_eps\": 1e-05,\n      \"kwargs\": {}\n    },\n    \"img_processor\": {\n      \"image_dim_out\": 1024,\n      \"model_name\": \"openai/clip-vit-large-patch14-336\",\n      \"name\": \"clip_vision_model\",\n      \"num_img_tokens\": 144\n    },\n    \"position_embedding_base\": 10000.0,\n    \"original_max_position_embeddings\": 4096,\n    \"context_window_size\": 131072,\n    \"prefill_chunk_size\": 2048,\n    \"head_dim\": 96,\n    \"tensor_parallel_shards\": 1,\n    \"max_batch_size\": 80\n  },\n  \"vocab_size\": 32064,\n  \"context_window_size\": 131072,\n  \"sliding_window_size\": -1,\n  \"prefill_chunk_size\": 2048,\n  \"attention_sink_size\": -1,\n  \"tensor_parallel_shards\": 1,\n  \"pipeline_parallel_stages\": 1,\n  \"temperature\": 1.0,\n  \"presence_penalty\": 0.0,\n  \"frequency_penalty\": 0.0,\n  \"repetition_penalty\": 1.0,\n  \"top_p\": 1.0,\n  \"tokenizer_files\": [\n    \"tokenizer.json\",\n    \"tokenizer_config.json\"\n  ],\n  \"tokenizer_info\": {\n    \"token_postproc_method\": \"byte_fallback\",\n    \"prepend_space_in_encode\": true,\n    \"strip_space_in_decode\": true\n  },\n  \"conv_template\": {\n    \"name\": \"phi-3-vision\",\n    \"system_template\": \"{system_message}\",\n    \"system_message\": \"\\n\",\n    
\"system_prefix_token_ids\": [\n      1\n    ],\n    \"add_role_after_system_message\": true,\n    \"roles\": {\n      \"user\": \"<|user|>\",\n      \"assistant\": \"<|assistant|>\"\n    },\n    \"role_templates\": {\n      \"user\": \"{user_message}\",\n      \"assistant\": \"{assistant_message}\",\n      \"tool\": \"{tool_message}\"\n    },\n    \"messages\": [],\n    \"seps\": [\n      \"<|end|>\\n\"\n    ],\n    \"role_content_sep\": \"\\n\",\n    \"role_empty_sep\": \"\\n\",\n    \"stop_str\": [\n      \"<|endoftext|>\"\n    ],\n    \"stop_token_ids\": [\n      2,\n      32000,\n      32001,\n      32007\n    ],\n    \"function_string\": \"\",\n    \"use_function_calling\": false\n  },\n  \"pad_token_id\": 32000,\n  \"bos_token_id\": 1,\n  \"eos_token_id\": 2\n}`;\n\nexport const qwen3ChatConfigJSONString = String.raw`{\n  \"version\": \"0.1.0\",\n  \"model_type\": \"qwen3\",\n  \"quantization\": \"q0f32\",\n  \"model_config\": {\n    \"hidden_act\": \"silu\",\n    \"hidden_size\": 1024,\n    \"intermediate_size\": 3072,\n    \"attention_bias\": false,\n    \"num_attention_heads\": 16,\n    \"num_hidden_layers\": 28,\n    \"num_key_value_heads\": 8,\n    \"rms_norm_eps\": 1e-06,\n    \"rope_theta\": 1000000,\n    \"vocab_size\": 151936,\n    \"tie_word_embeddings\": true,\n    \"context_window_size\": 40960,\n    \"prefill_chunk_size\": 2048,\n    \"tensor_parallel_shards\": 1,\n    \"head_dim\": 128,\n    \"dtype\": \"float32\",\n    \"max_batch_size\": 128,\n    \"weight_block_size\": null\n  },\n  \"vocab_size\": 151936,\n  \"context_window_size\": 40960,\n  \"sliding_window_size\": -1,\n  \"prefill_chunk_size\": 2048,\n  \"attention_sink_size\": -1,\n  \"tensor_parallel_shards\": 1,\n  \"pipeline_parallel_stages\": 1,\n  \"temperature\": 0.6,\n  \"presence_penalty\": 0.0,\n  \"frequency_penalty\": 0.0,\n  \"repetition_penalty\": 1.0,\n  \"top_p\": 0.95,\n  \"tokenizer_files\": [\n    \"tokenizer.json\",\n    \"vocab.json\",\n    \"merges.txt\",\n    
\"tokenizer_config.json\"\n  ],\n  \"tokenizer_info\": {\n    \"token_postproc_method\": \"byte_level\",\n    \"prepend_space_in_encode\": false,\n    \"strip_space_in_decode\": false\n  },\n  \"conv_template\": {\n    \"name\": \"qwen2\",\n    \"system_template\": \"<|im_start|>system\\n{system_message}<|im_end|>\\n\",\n    \"system_message\": \"You are a helpful assistant.\",\n    \"system_prefix_token_ids\": null,\n    \"add_role_after_system_message\": true,\n    \"roles\": {\n      \"user\": \"<|im_start|>user\",\n      \"assistant\": \"<|im_start|>assistant\"\n    },\n    \"role_templates\": {\n      \"user\": \"{user_message}\",\n      \"assistant\": \"{assistant_message}\",\n      \"tool\": \"{tool_message}\"\n    },\n    \"messages\": [],\n    \"seps\": [\n      \"<|im_end|>\\n\"\n    ],\n    \"role_content_sep\": \"\\n\",\n    \"role_empty_sep\": \"\\n\",\n    \"stop_str\": [\n      \"<|endoftext|>\",\n      \"<|im_end|>\"\n    ],\n    \"stop_token_ids\": [\n      151643,\n      151645\n    ],\n    \"function_string\": \"\",\n    \"use_function_calling\": false\n  },\n  \"pad_token_id\": 151643,\n  \"bos_token_id\": 151643,\n  \"eos_token_id\": [\n    151645,\n    151643\n  ]\n}`;\n"
  },
  {
    "path": "tests/conversation.test.ts",
    "content": "import { ChatConfig, Role } from \"../src/config\";\nimport {\n  compareConversationObject,\n  getConversation,\n  getConversationFromChatCompletionRequest,\n} from \"../src/conversation\";\nimport { describe, expect, test } from \"@jest/globals\";\nimport {\n  llama2ChatConfigJSONString,\n  phi3_5VisionChatConfigJSONString,\n  qwen3ChatConfigJSONString,\n} from \"./constants\";\nimport {\n  ChatCompletionContentPartImage,\n  ChatCompletionMessageParam,\n  ChatCompletionRequest,\n} from \"../src/openai_api_protocols\";\n\ndescribe(\"Test basic conversation loading and getPromptArray\", () => {\n  test(\"Test from json\", () => {\n    const config_json = JSON.parse(llama2ChatConfigJSONString);\n    const config = { ...config_json } as ChatConfig;\n    const conversation = getConversation(config.conv_template);\n    const config_obj = conversation.config;\n\n    expect(config_obj.system_template).toEqual(\n      \"[INST] <<SYS>>\\n{system_message}\\n<</SYS>>\\n\\n\",\n    );\n    expect(config_obj.system_message).toEqual(\n      \"You are a helpful, respectful and honest assistant.\",\n    );\n    expect(config_obj.roles.user).toEqual(\"[INST]\");\n    expect(config_obj.roles.assistant).toEqual(\"[/INST]\");\n    expect(config_obj.role_templates?.user).toEqual(\"{user_message}\");\n    expect(config_obj.role_templates?.assistant).toEqual(\"{assistant_message}\");\n    expect(config_obj.role_content_sep).toEqual(\" \");\n    expect(config_obj.role_empty_sep).toEqual(\" \");\n    expect(config_obj.seps).toEqual([\" \"]);\n    expect(config_obj.stop_str).toEqual([\"[INST]\"]);\n    expect(config_obj.stop_token_ids).toEqual([2]);\n    expect(config_obj.system_prefix_token_ids).toEqual([1]);\n    expect(config_obj.add_role_after_system_message).toBe(false);\n\n    conversation.appendMessage(Role.user, \"test1\");\n    conversation.appendMessage(Role.assistant, \"test2\");\n    conversation.appendMessage(Role.user, \"test3\");\n    
conversation.appendReplyHeader(Role.assistant);\n    const prompt = conversation.getPromptArray().join(\"\");\n    expect(prompt).toEqual(\n      \"[INST] <<SYS>>\\nYou are a helpful, respectful and honest assistant.\\n<</SYS>>\\n\\ntest1 [/INST] test2 [INST] test3 [/INST] \",\n    );\n  });\n});\n\ndescribe(\"Test getConversationFromChatCompletionRequest with Qwen3\", () => {\n  test(\"Test Qwen3 appendEmptyThinkingReplyHeader\", () => {\n    const config_json = JSON.parse(qwen3ChatConfigJSONString);\n    const config = { ...config_json } as ChatConfig;\n    const conversation = getConversation(config.conv_template);\n\n    conversation.appendMessage(Role.user, \"test1\");\n    conversation.appendMessage(Role.assistant, \"test2\");\n    const emptyThinkingBlockStr = \"<think>\\n\\n</think>\\n\\n\";\n    conversation.appendEmptyThinkingReplyHeader(\n      Role.user,\n      emptyThinkingBlockStr,\n    );\n    const prompt = conversation.getPromptArray().join(\"\");\n    expect(prompt).toEqual(\n      \"<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n\" +\n        \"<|im_start|>user\\n\" +\n        \"test1<|im_end|>\\n\" +\n        \"<|im_start|>assistant\\n\" +\n        \"test2<|im_end|>\\n\" +\n        \"<|im_start|>user\\n\" +\n        emptyThinkingBlockStr,\n    );\n\n    const message = emptyThinkingBlockStr + \"test3\";\n    conversation.finishReply(message);\n    expect(conversation.messages[conversation.messages.length - 1][2]).toEqual(\n      message,\n    );\n  });\n});\n\ndescribe(\"Test getConversationFromChatCompletionRequest with image\", () => {\n  // Constants for testing\n  type ImageURL = ChatCompletionContentPartImage.ImageURL;\n  const dummySystemPromptStr = \"dummy system prompt.\";\n  const dummyRequestStr = \"dummy request.\";\n  const dummyResponseStr = \"dummy response.\";\n  const imageUrl1 = \"https://url1\";\n  const imageUrl2 = \"https://url2\";\n  const imageUrl3 = \"https://url3\";\n  const singleImageInputMessages: 
ChatCompletionMessageParam[] = [\n    {\n      role: \"system\",\n      content: dummySystemPromptStr,\n    },\n    {\n      role: \"user\",\n      content: [\n        { type: \"text\", text: dummyRequestStr },\n        {\n          type: \"image_url\",\n          image_url: {\n            url: imageUrl1,\n          },\n        },\n      ],\n    },\n  ];\n\n  // system message, single-image user, assistant response, multi-image user\n  const multiImageMultiRoundInputMessages: ChatCompletionMessageParam[] = [\n    {\n      role: \"system\",\n      content: dummySystemPromptStr,\n    },\n    {\n      role: \"user\",\n      content: [\n        { type: \"text\", text: dummyRequestStr },\n        {\n          type: \"image_url\",\n          image_url: {\n            url: imageUrl1,\n          },\n        },\n      ],\n    },\n    {\n      role: \"assistant\",\n      content: dummyResponseStr,\n    },\n    {\n      role: \"user\",\n      content: [\n        { type: \"text\", text: dummyRequestStr },\n        {\n          type: \"image_url\",\n          image_url: {\n            url: imageUrl2,\n          },\n        },\n        {\n          type: \"image_url\",\n          image_url: {\n            url: imageUrl3,\n          },\n        },\n      ],\n    },\n  ];\n\n  test(\"Test compareConversationObject with different image input\", () => {\n    const config_json = JSON.parse(phi3_5VisionChatConfigJSONString);\n    const config = { ...config_json } as ChatConfig;\n    // deep copy\n    const messages1 = JSON.parse(JSON.stringify(singleImageInputMessages));\n    const messages2 = JSON.parse(JSON.stringify(singleImageInputMessages));\n    const messages3 = JSON.parse(JSON.stringify(singleImageInputMessages));\n    const messages4 = JSON.parse(JSON.stringify(singleImageInputMessages));\n    messages3[1].content[1].image_url.url = \"https://a_different_url\";\n    messages4[1].content[0].text = \"a different text\";\n    const request1: ChatCompletionRequest = { messages: 
messages1 };\n    const request2: ChatCompletionRequest = { messages: messages2 };\n    const request3: ChatCompletionRequest = { messages: messages3 };\n    const request4: ChatCompletionRequest = { messages: messages4 };\n    const conv1 = getConversationFromChatCompletionRequest(\n      request1,\n      config,\n      true,\n    );\n    const conv2 = getConversationFromChatCompletionRequest(\n      request2,\n      config,\n      true,\n    );\n    const conv3 = getConversationFromChatCompletionRequest(\n      request3,\n      config,\n      true,\n    );\n    const conv4 = getConversationFromChatCompletionRequest(\n      request4,\n      config,\n      true,\n    );\n    expect(compareConversationObject(conv1, conv2)).toEqual(true);\n    expect(compareConversationObject(conv1, conv3)).toEqual(false);\n    expect(compareConversationObject(conv2, conv3)).toEqual(false);\n    expect(compareConversationObject(conv1, conv4)).toEqual(false);\n  });\n\n  test(\"Test getPromptArray with ContentPart array but only a single text\", () => {\n    // This should be equivalent to `content: dummyRequestStr`\n    const config_json = JSON.parse(phi3_5VisionChatConfigJSONString);\n    const config = { ...config_json } as ChatConfig;\n    const messages1: ChatCompletionMessageParam[] = [\n      {\n        role: \"system\",\n        content: dummySystemPromptStr,\n      },\n      {\n        role: \"user\",\n        content: [{ type: \"text\", text: dummyRequestStr }],\n      },\n    ];\n    const messages2: ChatCompletionMessageParam[] = [\n      {\n        role: \"system\",\n        content: dummySystemPromptStr,\n      },\n      {\n        role: \"user\",\n        content: dummyRequestStr,\n      },\n    ];\n    const request1: ChatCompletionRequest = { messages: messages1 };\n    const request2: ChatCompletionRequest = { messages: messages2 };\n    const conv1 = getConversationFromChatCompletionRequest(\n      request1,\n      config,\n      true,\n    );\n    const conv2 = 
getConversationFromChatCompletionRequest(\n      request2,\n      config,\n      true,\n    );\n    expect(conv1.getPromptArray()).toEqual([\n      dummySystemPromptStr,\n      `<|user|>\\n${dummyRequestStr}<|end|>\\n`,\n    ]);\n    expect(conv1.getPromptArray()).toEqual(conv2.getPromptArray());\n  });\n\n  test(\"Test getPromptArray with single image input\", () => {\n    const config_json = JSON.parse(phi3_5VisionChatConfigJSONString);\n    const config = { ...config_json } as ChatConfig;\n    const messages1 = JSON.parse(JSON.stringify(singleImageInputMessages));\n    const request1: ChatCompletionRequest = { messages: messages1 };\n    const conv1 = getConversationFromChatCompletionRequest(\n      request1,\n      config,\n      true,\n    );\n    expect(conv1.getPromptArray()).toEqual([\n      dummySystemPromptStr, // phi3_5-vision does not have system template\n      [\n        `<|user|>\\n`,\n        { url: imageUrl1 } as ImageURL,\n        `\\n`,\n        `${dummyRequestStr}<|end|>\\n`,\n      ],\n    ]);\n  });\n\n  test(\"Test multiple round with multiple image input, with reply header\", () => {\n    const config_json = JSON.parse(phi3_5VisionChatConfigJSONString);\n    const config = { ...config_json } as ChatConfig;\n    const messages1 = JSON.parse(\n      JSON.stringify(multiImageMultiRoundInputMessages),\n    );\n    const request1: ChatCompletionRequest = { messages: messages1 };\n    const conv1 = getConversationFromChatCompletionRequest(\n      request1,\n      config,\n      true,\n    );\n    conv1.appendReplyHeader(Role.assistant);\n    expect(conv1.getPromptArray()).toEqual([\n      dummySystemPromptStr, // phi3_5-vision does not have system template\n      [\n        `<|user|>\\n`,\n        { url: imageUrl1 } as ImageURL,\n        `\\n`,\n        `${dummyRequestStr}<|end|>\\n`,\n      ],\n      `<|assistant|>\\n${dummyResponseStr}<|end|>\\n`,\n      [\n        `<|user|>\\n`,\n        { url: imageUrl2 } as ImageURL,\n        `\\n`,\n        
{ url: imageUrl3 } as ImageURL,\n        `\\n`,\n        `${dummyRequestStr}<|end|>\\n`,\n      ],\n      `<|assistant|>\\n`,\n    ]);\n    expect(conv1.getPromptArrayLastRound()).toEqual([\n      [\n        `<|user|>\\n`,\n        { url: imageUrl2 } as ImageURL,\n        `\\n`,\n        { url: imageUrl3 } as ImageURL,\n        `\\n`,\n        `${dummyRequestStr}<|end|>\\n`,\n      ],\n      `<|assistant|>\\n`,\n    ]);\n  });\n});\n"
  },
  {
    "path": "tests/embedding_stats.test.ts",
    "content": "import { EmbeddingPipeline } from \"../src/embedding\";\nimport {\n  EmbeddingExceedContextWindowSizeError,\n  EmbeddingInputEmptyError,\n} from \"../src/error\";\nimport { jest, test, expect } from \"@jest/globals\";\n\ntype EmbeddingLike = EmbeddingPipeline & Record<string, any>;\n\ntest(\"embedding pipeline performance getters\", () => {\n  const pipeline = Object.create(EmbeddingPipeline.prototype) as EmbeddingLike;\n  pipeline[\"curRoundEmbedTotalTime\"] = 0.5;\n  pipeline[\"curRoundEmbedTotalTokens\"] = 4;\n  expect(pipeline.getCurRoundEmbedTotalTime()).toBe(0.5);\n  expect(pipeline.getCurRoundEmbedTotalTokens()).toBe(4);\n  expect(pipeline.getCurRoundEmbedTokensPerSec()).toBe(8);\n});\n\ntest(\"sync and asyncLoadWebGPUPipelines delegate to tvm/device\", async () => {\n  const pipeline = Object.create(EmbeddingPipeline.prototype) as EmbeddingLike;\n  const internalModule = { tag: \"module\" } as any;\n  pipeline[\"device\"] = {\n    sync: jest.fn(async () => undefined),\n  } as any;\n  pipeline[\"tvm\"] = {\n    asyncLoadWebGPUPipelines: jest.fn(),\n  } as any;\n  pipeline[\"vm\"] = {\n    getInternalModule: jest.fn(() => internalModule),\n  } as any;\n  await pipeline.sync();\n  expect(pipeline[\"device\"].sync).toHaveBeenCalled();\n  await pipeline.asyncLoadWebGPUPipelines();\n  expect(pipeline[\"tvm\"].asyncLoadWebGPUPipelines).toHaveBeenCalledWith(\n    internalModule,\n  );\n});\n\nfunction createEmbeddingPipelineBase(): EmbeddingLike {\n  const pipeline = Object.create(EmbeddingPipeline.prototype) as EmbeddingLike;\n  pipeline[\"tokenizer\"] = {\n    encode: jest.fn(\n      (input: string) => new Int32Array(Math.max(1, input.length)),\n    ),\n    decode: jest.fn(),\n    dispose: jest.fn(),\n    getVocabSize: jest.fn(() => 1),\n    idToToken: jest.fn(() => \"<tok>\"),\n    handle: 0,\n  } as any;\n  pipeline[\"contextWindowSize\"] = 8;\n  pipeline[\"prefillChunkSize\"] = 8;\n  pipeline[\"maxBatchSize\"] = 2;\n  pipeline[\"device\"] = {\n 
   sync: jest.fn(async () => undefined),\n    deviceType: \"cpu\",\n    deviceId: 0,\n    lib: {},\n  } as any;\n  pipeline[\"tvm\"] = {\n    beginScope: jest.fn(),\n    endScope: jest.fn(),\n    empty: jest.fn(() => createNDArray()),\n    cpu: jest.fn(() => ({ deviceType: \"cpu\", deviceId: 0, lib: {} })),\n    detachFromCurrentScope: jest.fn((x: any) => x),\n  } as any;\n  const packedFunc: any = jest.fn(() => ({\n    shape: [1, 1, 1],\n    dtype: \"float32\",\n    dispose: jest.fn(),\n    device: {},\n    ndim: 3,\n  }));\n  packedFunc.dispose = jest.fn();\n  pipeline[\"prefill\"] = packedFunc;\n  pipeline[\"params\"] = {} as any;\n  return pipeline;\n}\n\nfunction createNDArray() {\n  const tensor: any = { dispose: jest.fn(), dtype: \"int32\", shape: [1, 1, 1] };\n  tensor.copyFrom = jest.fn();\n  tensor.view = jest.fn(() => tensor);\n  tensor.toArray = jest.fn(() => new Float32Array([0.1]));\n  return tensor;\n}\n\ntest(\"embedStep throws when input is empty\", async () => {\n  const pipeline = createEmbeddingPipelineBase();\n  await expect(pipeline.embedStep(\"\")).rejects.toThrow(\n    EmbeddingInputEmptyError,\n  );\n});\n\ntest(\"embedStep validates context window size\", async () => {\n  const pipeline = createEmbeddingPipelineBase();\n  pipeline[\"contextWindowSize\"] = 1;\n  pipeline[\"tokenizer\"].encode = jest.fn(() => new Int32Array([1, 2]));\n  await expect(pipeline.embedStep(\"toolong\")).rejects.toThrow(\n    EmbeddingExceedContextWindowSizeError,\n  );\n});\n\ntest(\"embedStep returns mocked embeddings without WebGPU\", async () => {\n  const pipeline = createEmbeddingPipelineBase();\n  const result = await pipeline.embedStep(\"ok\");\n  expect(result[0][0]).toBeCloseTo(0.1);\n  expect(pipeline.getCurRoundEmbedTotalTokens()).toBeGreaterThan(0);\n});\n"
  },
  {
    "path": "tests/engine_integration.test.ts",
    "content": "/**\n * Deterministic MLCEngine tests that run without WebGPU by mocking LLMChatPipeline.\n */\nimport {\n  ChatCompletion,\n  ChatCompletionRequest,\n  Completion,\n  CompletionCreateParams,\n  EmbeddingCreateParams,\n  ChatCompletionChunk,\n} from \"../src/openai_api_protocols\";\nimport { MLCEngine } from \"../src/engine\";\nimport { ModelType } from \"../src/config\";\nimport { LLMChatPipeline } from \"../src/llm_chat\";\nimport { EmbeddingPipeline } from \"../src/embedding\";\nimport { CustomLock } from \"../src/support\";\nimport { UnclearModelToUseError } from \"../src/error\";\nimport { jest, test, expect, describe } from \"@jest/globals\";\n\ntype ChatConfig = import(\"../src/config\").ChatConfig;\ntype Conversation = import(\"../src/conversation\").Conversation;\ntype TVMInstance = import(\"@mlc-ai/web-runtime\").Instance;\ntype Tokenizer = import(\"@mlc-ai/web-tokenizers\").Tokenizer;\n\njest.mock(\"../src/llm_chat\", () => {\n  const { getConversation } = jest.requireActual(\n    \"../src/conversation\",\n  ) as typeof import(\"../src/conversation\");\n\n  class MockLLMChatPipeline {\n    public decodeLimit = 2;\n    public prefillCallCount = 0;\n    public decodeCallCount = 0;\n    public resetCount = 0;\n    private conversation: Conversation = getConversation(\n      {\n        system_template: \"{system_message}\",\n        system_message: \"\",\n        roles: { user: \"user\", assistant: \"assistant\" },\n        seps: [\"\\n\"],\n        stop_token_ids: [0],\n        stop_str: [],\n      } as any,\n      undefined,\n    );\n    private stopFlag = true;\n    private message = \"\";\n    private finishReason: string | undefined = undefined;\n    private curRoundPrefillTotalTokens = 0;\n    private curRoundDecodingTotalTokens = 0;\n    private curRoundPrefillTotalTime = 0.001;\n    private curRoundDecodingTotalTime = 0.001;\n    private curRoundGrammarPerTokenTotalTime = 0;\n\n    constructor(_tvm: TVMInstance, _tokenizer: Tokenizer, 
config: ChatConfig) {\n      this.conversation = getConversation(\n        config.conv_template,\n        config.conv_config,\n      );\n    }\n\n    async asyncLoadWebGPUPipelines() {}\n    dispose() {}\n    async sync() {}\n\n    getConversationObject() {\n      return this.conversation;\n    }\n\n    setConversation(newConv: Conversation) {\n      this.conversation = newConv;\n    }\n\n    resetChat() {\n      this.resetCount++;\n      this.stopFlag = true;\n      this.decodeCallCount = 0;\n    }\n\n    async prefillStep(\n      inp: string,\n      msgRole: string,\n      roleName?: string,\n    ): Promise<void> {\n      this.prefillCallCount++;\n      const roleSuffix = roleName ? `(${roleName})` : \"\";\n      this.message = `${msgRole}${roleSuffix}:${inp}`;\n      this.stopFlag = false;\n      this.decodeCallCount = 0;\n      this.curRoundPrefillTotalTokens = Math.max(1, inp.length);\n      this.curRoundPrefillTotalTime = 0.01 * this.curRoundPrefillTotalTokens;\n      this.curRoundDecodingTotalTokens = 0;\n      this.curRoundDecodingTotalTime = 0.001;\n      this.curRoundGrammarPerTokenTotalTime = 0;\n      this.finishReason = \"length\";\n    }\n\n    async decodeStep(genConfig?: { max_tokens?: number | null }) {\n      if (this.stopFlag) return;\n      this.decodeCallCount++;\n      this.message += `|token${this.decodeCallCount}|`;\n      this.curRoundDecodingTotalTokens = this.decodeCallCount;\n      this.curRoundDecodingTotalTime = this.curRoundDecodingTotalTokens * 0.02;\n      this.curRoundGrammarPerTokenTotalTime =\n        this.curRoundDecodingTotalTokens * 0.001;\n      if (\n        this.decodeCallCount >= this.decodeLimit ||\n        (genConfig?.max_tokens !== null &&\n          genConfig?.max_tokens !== undefined &&\n          this.decodeCallCount >= genConfig.max_tokens)\n      ) {\n        this.stopFlag = true;\n        this.finishReason = \"stop\";\n      }\n    }\n\n    stopped() {\n      return this.stopFlag;\n    }\n\n    triggerStop() {\n   
   this.stopFlag = true;\n      this.finishReason = \"stop\";\n    }\n\n    getMessage() {\n      return this.message;\n    }\n\n    getFinishReason() {\n      return this.finishReason ?? \"stop\";\n    }\n\n    getCurRoundDecodingTotalTokens() {\n      return this.curRoundDecodingTotalTokens;\n    }\n\n    getCurRoundPrefillTotalTokens() {\n      return this.curRoundPrefillTotalTokens;\n    }\n\n    getCurRoundPrefillTokensPerSec() {\n      return this.curRoundPrefillTotalTokens / this.curRoundPrefillTotalTime;\n    }\n\n    getCurRoundDecodingTokensPerSec() {\n      return this.curRoundDecodingTotalTokens / this.curRoundDecodingTotalTime;\n    }\n\n    getCurRoundGrammarInitTotalTime() {\n      return 0.001;\n    }\n\n    getCurRoundPrefillTotalTime() {\n      return this.curRoundPrefillTotalTime;\n    }\n\n    getCurRoundDecodingTotalTime() {\n      return this.curRoundDecodingTotalTime;\n    }\n\n    getCurRoundGrammarPerTokenTotalTime() {\n      return this.curRoundGrammarPerTokenTotalTime;\n    }\n\n    getCurRoundLatencyBreakdown() {\n      return {\n        logitProcessorTime: [0.001],\n        logitBiasTime: [0.001],\n        penaltyTime: [0.001],\n        sampleTime: [0.001],\n        totalTime: [0.001],\n        grammarBitmaskTime: [0.001],\n      };\n    }\n\n    getTokenLogprobArray() {\n      return [];\n    }\n\n    async forwardTokensAndSample(inputIds: Array<number>): Promise<number> {\n      return inputIds[0] ?? 
0;\n    }\n\n    async runtimeStatsText() {\n      return `prefills=${this.prefillCallCount}`;\n    }\n  }\n\n  return { LLMChatPipeline: MockLLMChatPipeline };\n});\n\njest.mock(\"../src/embedding\", () => {\n  class MockEmbeddingPipeline {\n    public inputs: any;\n    public embedResult: Array<Array<number>> = [[0.1, 0.2, 0.3]];\n    dispose() {}\n    async sync() {}\n    async embedStep(\n      input: string | Array<string> | Array<number> | Array<Array<number>>,\n    ): Promise<Array<Array<number>>> {\n      this.inputs = input;\n      return this.embedResult;\n    }\n    getCurRoundEmbedTotalTokens(): number {\n      if (typeof this.inputs === \"string\") {\n        return this.inputs.length;\n      } else if (Array.isArray(this.inputs)) {\n        return this.inputs.length;\n      }\n      return 0;\n    }\n    getCurRoundEmbedTokensPerSec(): number {\n      const tokens = this.getCurRoundEmbedTotalTokens();\n      return tokens === 0 ? 0 : tokens / 0.01;\n    }\n  }\n  return { EmbeddingPipeline: MockEmbeddingPipeline };\n});\n\nconst MODEL_ID = \"mock-model\";\nconst SECOND_MODEL_ID = \"mock-model-2\";\nconst EMBED_MODEL_ID = \"mock-embed\";\nconst mockChatConfig: ChatConfig = {\n  tokenizer_files: [\"tokenizer.json\"],\n  vocab_size: 10,\n  conv_template: {\n    system_template: \"{system_message}\",\n    system_message: \"You are a helpful assistant.\",\n    system_prefix_token_ids: [1],\n    add_role_after_system_message: false,\n    roles: {\n      user: \"User\",\n      assistant: \"Assistant\",\n      tool: \"Tool\",\n    },\n    role_templates: {\n      user: \"{user_message}\",\n      assistant: \"{assistant_message}\",\n      tool: \"{tool_message}\",\n    },\n    seps: [\"\\n\"],\n    role_content_sep: \": \",\n    role_empty_sep: \": \",\n    stop_str: [],\n    stop_token_ids: [0],\n  },\n  conv_config: undefined,\n  context_window_size: 8,\n  sliding_window_size: -1,\n  attention_sink_size: -1,\n  temperature: 0.8,\n  presence_penalty: 0,\n  
frequency_penalty: 0,\n  repetition_penalty: 1,\n  top_p: 1,\n};\n\nfunction createEngineWithPipeline(decodeLimit = 2, modelId = MODEL_ID) {\n  const engine = new MLCEngine({\n    appConfig: {\n      model_list: [\n        {\n          model: \"https://example.com/model\",\n          model_id: modelId,\n          model_lib: \"https://example.com/model.wasm\",\n        },\n      ],\n      useIndexedDBCache: false,\n    },\n  });\n  const pipeline = new LLMChatPipeline(\n    null as unknown as TVMInstance,\n    null as unknown as Tokenizer,\n    mockChatConfig,\n  ) as any;\n  pipeline.decodeLimit = decodeLimit;\n  const internal = engine as any;\n  internal.loadedModelIdToPipeline.set(modelId, pipeline);\n  internal.loadedModelIdToChatConfig.set(modelId, mockChatConfig);\n  internal.loadedModelIdToModelType.set(modelId, ModelType.LLM);\n  internal.loadedModelIdToLock.set(modelId, new CustomLock());\n  return { engine, pipeline };\n}\n\nfunction createEngineWithMultiplePipelines() {\n  const engine = new MLCEngine({\n    appConfig: {\n      model_list: [\n        {\n          model: \"https://example.com/model\",\n          model_id: MODEL_ID,\n          model_lib: \"https://example.com/model.wasm\",\n        },\n        {\n          model: \"https://example.com/model2\",\n          model_id: SECOND_MODEL_ID,\n          model_lib: \"https://example.com/model2.wasm\",\n        },\n      ],\n      useIndexedDBCache: false,\n    },\n  });\n  const pipeline1 = new LLMChatPipeline(\n    null as unknown as TVMInstance,\n    null as unknown as Tokenizer,\n    mockChatConfig,\n  ) as any;\n  const pipeline2 = new LLMChatPipeline(\n    null as unknown as TVMInstance,\n    null as unknown as Tokenizer,\n    mockChatConfig,\n  ) as any;\n  const internal = engine as any;\n  internal.loadedModelIdToPipeline.set(MODEL_ID, pipeline1);\n  internal.loadedModelIdToPipeline.set(SECOND_MODEL_ID, pipeline2);\n  internal.loadedModelIdToChatConfig.set(MODEL_ID, mockChatConfig);\n  
internal.loadedModelIdToChatConfig.set(SECOND_MODEL_ID, mockChatConfig);\n  internal.loadedModelIdToModelType.set(MODEL_ID, ModelType.LLM);\n  internal.loadedModelIdToModelType.set(SECOND_MODEL_ID, ModelType.LLM);\n  internal.loadedModelIdToLock.set(MODEL_ID, new CustomLock());\n  internal.loadedModelIdToLock.set(SECOND_MODEL_ID, new CustomLock());\n  return engine;\n}\n\nconst mockEmbeddingConfig: ChatConfig = {\n  ...mockChatConfig,\n};\n\nfunction createEngineWithEmbeddingPipeline() {\n  const engine = new MLCEngine({\n    appConfig: {\n      model_list: [\n        {\n          model: \"https://example.com/embed\",\n          model_id: EMBED_MODEL_ID,\n          model_lib: \"https://example.com/embed.wasm\",\n          model_type: ModelType.embedding,\n        },\n      ],\n      useIndexedDBCache: false,\n    },\n  });\n  const pipeline = new EmbeddingPipeline(\n    null as unknown as TVMInstance,\n    null as unknown as Tokenizer,\n    mockEmbeddingConfig,\n  ) as any;\n  const internal = engine as any;\n  internal.loadedModelIdToPipeline.set(EMBED_MODEL_ID, pipeline);\n  internal.loadedModelIdToChatConfig.set(EMBED_MODEL_ID, mockEmbeddingConfig);\n  internal.loadedModelIdToModelType.set(EMBED_MODEL_ID, ModelType.embedding);\n  internal.loadedModelIdToLock.set(EMBED_MODEL_ID, new CustomLock());\n  return { engine, pipeline };\n}\n\ndescribe(\"MLCEngine deterministic integration\", () => {\n  test(\"chatCompletion aggregates usage without WebGPU\", async () => {\n    const { engine, pipeline } = createEngineWithPipeline(3);\n    const request: ChatCompletionRequest = {\n      model: MODEL_ID,\n      messages: [\n        { role: \"system\", content: \"Stay concise.\" },\n        { role: \"user\", content: \"What is new?\" },\n      ],\n      n: 2,\n    };\n    const response = (await engine.chatCompletion(request)) as ChatCompletion;\n\n    expect(response.choices).toHaveLength(2);\n    response.choices.forEach((choice) => {\n      
expect(choice.message?.content).toContain(\"What is new?\");\n    });\n    expect(response.usage?.completion_tokens).toBe(6);\n    expect(response.usage?.prompt_tokens).toBeGreaterThan(0);\n    expect((pipeline as any).prefillCallCount).toBe(2);\n  });\n\n  test(\"completion echoes prompt when requested\", async () => {\n    const { engine } = createEngineWithPipeline(1);\n    const request: CompletionCreateParams = {\n      model: MODEL_ID,\n      prompt: \"Alpha \",\n      n: 1,\n      echo: true,\n    };\n    const response = (await engine.completion(request)) as Completion;\n\n    expect(response.choices).toHaveLength(1);\n    expect(response.choices[0].text.startsWith(\"Alpha \")).toBe(true);\n    expect(response.usage?.completion_tokens).toBe(1);\n    expect(response.usage?.prompt_tokens).toBeGreaterThan(0);\n  });\n\n  test(\"forwardTokensAndSample and runtimeStatsText use mock pipeline\", async () => {\n    const { engine } = createEngineWithPipeline();\n    await expect(\n      engine.forwardTokensAndSample([9, 4, 2], true, MODEL_ID),\n    ).resolves.toBe(9);\n    await expect(engine.runtimeStatsText(MODEL_ID)).resolves.toContain(\n      \"prefills=\",\n    );\n  });\n\n  test(\"chatCompletion streaming yields chunks, final delta, and usage data\", async () => {\n    const { engine } = createEngineWithPipeline(2);\n    const request: ChatCompletionRequest = {\n      model: MODEL_ID,\n      messages: [\n        { role: \"system\", content: \"rules\" },\n        { role: \"user\", content: \"Stream please\" },\n      ],\n      stream: true,\n      stream_options: { include_usage: true },\n    };\n    const iterable = (await engine.chatCompletion(\n      request,\n    )) as AsyncIterable<ChatCompletionChunk>;\n    const chunks: ChatCompletionChunk[] = [];\n    for await (const chunk of iterable) {\n      chunks.push(chunk);\n    }\n    expect(chunks.length).toBeGreaterThanOrEqual(3);\n    expect(chunks[0].choices[0].delta?.content).toContain(\"Stream 
please\");\n    const finalChunk = chunks[chunks.length - 2];\n    expect(finalChunk.choices[0].finish_reason).toEqual(\"stop\");\n    const usageChunk = chunks[chunks.length - 1];\n    expect(usageChunk.usage?.completion_tokens).toBeGreaterThan(0);\n    expect(usageChunk.usage?.prompt_tokens).toBeGreaterThan(0);\n  });\n\n  test(\"chatCompletion without specifying model when multiple loaded throws error\", async () => {\n    const engine = createEngineWithMultiplePipelines();\n    await expect(\n      engine.chatCompletion({\n        // purposely omit model to trigger ambiguity\n        model: undefined as any,\n        messages: [{ role: \"user\", content: \"Hello\" }],\n      }),\n    ).rejects.toBeInstanceOf(UnclearModelToUseError);\n  });\n\n  test(\"embedding API uses mock pipeline and returns usage\", async () => {\n    const { engine } = createEngineWithEmbeddingPipeline();\n    const request: EmbeddingCreateParams = {\n      model: EMBED_MODEL_ID,\n      input: \"abc\",\n    };\n    const response = await engine.embedding(request);\n    expect(response.data).toHaveLength(1);\n    expect(response.data[0].embedding).toEqual([0.1, 0.2, 0.3]);\n    expect(response.usage?.prompt_tokens).toBeGreaterThan(0);\n    expect(response.usage?.extra?.prefill_tokens_per_s).toBeGreaterThan(0);\n  });\n});\n"
  },
  {
    "path": "tests/extension_service_worker.test.ts",
    "content": "import {\n  CreateServiceWorkerMLCEngine,\n  ServiceWorkerMLCEngine,\n  ServiceWorkerMLCEngineHandler,\n} from \"../src/extension_service_worker\";\nimport {\n  jest,\n  test,\n  expect,\n  describe,\n  beforeEach,\n  afterEach,\n} from \"@jest/globals\";\n\njest.mock(\"@mlc-ai/web-runtime\", () => ({\n  detectGPUDevice: jest.fn(async () => ({\n    adapterInfo: { description: \"MockGPU\", vendor: \"MockVendor\" },\n    device: { features: new Set() },\n  })),\n}));\n\nconst reloadMock = jest.fn();\nconst initCallback = jest.fn();\n\njest.mock(\"../src/engine\", () => {\n  return {\n    MLCEngine: jest.fn(() => ({\n      reload: reloadMock,\n      getInitProgressCallback: jest.fn(() => initCallback),\n      setInitProgressCallback: jest.fn(),\n    })),\n  };\n});\n\ntype MockPort = chrome.runtime.Port & {\n  triggerDisconnect: () => void;\n  emitMessage: (msg: any) => void;\n};\n\nfunction createPort(): MockPort {\n  const disconnectListeners: Array<() => void> = [];\n  const messageListeners: Array<(msg: any) => void> = [];\n  return {\n    postMessage: jest.fn(),\n    onDisconnect: {\n      addListener: (cb: () => void) => disconnectListeners.push(cb),\n    },\n    onMessage: {\n      addListener: (cb: (msg: any) => void) => messageListeners.push(cb),\n    },\n    triggerDisconnect: () => disconnectListeners.forEach((cb) => cb()),\n    emitMessage: (msg: any) => messageListeners.forEach((cb) => cb(msg)),\n  } as unknown as MockPort;\n}\n\nfunction createHandler() {\n  const handler = new ServiceWorkerMLCEngineHandler(createPort());\n  (handler as any).handleTask = jest.fn(async (_uuid: string, task: any) =>\n    task(),\n  );\n  (handler as any).engine = {\n    reload: reloadMock,\n    getInitProgressCallback: jest.fn(() => initCallback),\n  };\n  reloadMock.mockClear();\n  initCallback.mockClear();\n  return handler;\n}\n\ntest(\"reload message with same model skips loading and triggers init callback\", async () => {\n  const handler = 
createHandler();\n  handler.modelId = [\"demo\"];\n  handler.chatOpts = [];\n  await handler.onmessage({\n    type: \"message\",\n    kind: \"reload\",\n    uuid: \"task\",\n    content: { modelId: [\"demo\"], chatOpts: [] },\n  } as any);\n  expect(reloadMock).not.toHaveBeenCalled();\n  expect(initCallback).toHaveBeenCalled();\n});\n\ntest(\"reload with new model calls engine reload\", async () => {\n  const handler = createHandler();\n  handler.modelId = [\"demo\"];\n  handler.chatOpts = [];\n  await handler.onmessage({\n    kind: \"reload\",\n    uuid: \"task\",\n    content: { modelId: [\"new\"], chatOpts: [] },\n  } as any);\n  expect(reloadMock).toHaveBeenCalledWith([\"new\"], []);\n});\n\nfunction mockChromeRuntime(port: MockPort = createPort()) {\n  const connect = jest.fn<(...args: any[]) => MockPort>(() => port);\n  (globalThis as any).chrome = {\n    runtime: {\n      connect,\n    },\n  };\n  return { port, connect };\n}\n\ndescribe(\"ServiceWorkerMLCEngine integration\", () => {\n  beforeEach(() => {\n    jest.useFakeTimers();\n  });\n\n  afterEach(() => {\n    jest.useRealTimers();\n    jest.clearAllTimers();\n    delete (globalThis as any).chrome;\n  });\n\n  test(\"keepAlive pings and onDisconnect callback fires\", () => {\n    const { port, connect } = mockChromeRuntime();\n    const onDisconnect = jest.fn();\n    const engine = new ServiceWorkerMLCEngine({ onDisconnect }, 500);\n    expect(connect).toHaveBeenCalledWith({ name: \"web_llm_service_worker\" });\n    jest.advanceTimersByTime(500);\n    expect(port.postMessage).toHaveBeenCalledWith({ kind: \"keepAlive\" });\n    port.triggerDisconnect();\n    expect(onDisconnect).toHaveBeenCalled();\n    expect(engine).toBeTruthy();\n  });\n\n  test(\"CreateServiceWorkerMLCEngine reloads requested model\", async () => {\n    const { connect } = mockChromeRuntime();\n    const reloadSpy = jest\n      .spyOn(ServiceWorkerMLCEngine.prototype, \"reload\")\n      .mockResolvedValue(undefined);\n    const 
engine = await CreateServiceWorkerMLCEngine(\"demo-model\", {\n      extensionId: \"abc\",\n    });\n    expect(connect).toHaveBeenCalledWith(\"abc\", {\n      name: \"web_llm_service_worker\",\n    });\n    expect(reloadSpy).toHaveBeenCalledWith(\"demo-model\", undefined);\n    reloadSpy.mockRestore();\n    expect(engine).toBeInstanceOf(ServiceWorkerMLCEngine);\n  });\n});\n"
  },
  {
    "path": "tests/function_calling.test.ts",
    "content": "/* eslint-disable no-useless-escape */\nimport {\n  Role,\n  MessagePlaceholders,\n  ConvTemplateConfig,\n  ChatConfig,\n} from \"../src/config\";\nimport {\n  getConversation,\n  getConversationFromChatCompletionRequest,\n  getFunctionCallUsage,\n} from \"../src/conversation\";\nimport { ChatCompletionRequest } from \"../src/openai_api_protocols/chat_completion\";\n\nimport { describe, expect, test } from \"@jest/globals\";\nimport { llama3_1ChatConfig } from \"./constants\";\n\ndescribe(\"Test gorilla conversation template\", () => {\n  const gorillaConv: ConvTemplateConfig = {\n    system_template: `${MessagePlaceholders.system}\\n`,\n    system_message:\n      \"A chat between a curious user and an artificial intelligence assistant. \" +\n      \"The assistant gives helpful, detailed, and polite answers to the user's questions.\",\n    roles: {\n      [Role.user]: \"USER\",\n      [Role.assistant]: \"ASSISTANT\",\n      [Role.tool]: \"TOOL\",\n    },\n    role_templates: {\n      [Role.user]: `<<question>> ${MessagePlaceholders.user} <<function>> ${MessagePlaceholders.function}`,\n    },\n    seps: [\"\\n\", \"<|EOT|>\"],\n    stop_str: [\"<|EOT|>\"],\n    system_prefix_token_ids: [1],\n    stop_token_ids: [2],\n  };\n\n  test(\"Test getPromptArrayInternal\", () => {\n    const conv = getConversation(gorillaConv);\n    conv.appendMessage(\n      Role.user,\n      'Call me an Uber ride type \"Plus\" in Berkeley at zipcode 94704 in 10 minutes',\n      \"Tom\",\n    );\n    const prompt_array = conv.getPromptArray();\n\n    expect(prompt_array).toEqual([\n      \"A chat between a curious user and an artificial intelligence assistant. 
\" +\n        \"The assistant gives helpful, detailed, and polite answers to the user's questions.\\n\",\n      'Tom: <<question>> Call me an Uber ride type \"Plus\" in Berkeley at zipcode 94704 in 10 minutes <<function>> \\n',\n    ]);\n  });\n\n  test(\"Test getPromptArrayInternal function call\", () => {\n    const conv = getConversation(gorillaConv);\n    conv.appendMessage(\n      Role.user,\n      'Call me an Uber ride type \"Plus\" in Berkeley at zipcode 94704 in 10 minutes',\n    );\n    conv.use_function_calling = true;\n    conv.function_string = JSON.stringify([\n      {\n        name: \"Uber Carpool\",\n        api_name: \"uber.ride\",\n        description:\n          \"Find suitable ride for customers given the location, type of ride, and the amount of time the customer is willing to wait as parameters\",\n        parameters: [\n          {\n            name: \"loc\",\n            description: \"Location of the starting place of the Uber ride\",\n          },\n          {\n            name: \"type\",\n            enum: [\"plus\", \"comfort\", \"black\"],\n            description: \"Types of Uber ride user is ordering\",\n          },\n          {\n            name: \"time\",\n            description:\n              \"The amount of time in minutes the customer is willing to wait\",\n          },\n        ],\n      },\n    ]);\n    const prompt_array = conv.getPromptArray();\n\n    expect(prompt_array).toEqual([\n      \"A chat between a curious user and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the user's questions.\\n\",\n      'USER: <<question>> Call me an Uber ride type \"Plus\" in Berkeley at zipcode 94704 in 10 minutes <<function>> [{\"name\":\"Uber Carpool\",\"api_name\":\"uber.ride\",\"description\":\"Find suitable ride for customers given the location, type of ride, and the amount of time the customer is willing to wait as parameters\",\"parameters\":[{\"name\":\"loc\",\"description\":\"Location of the starting place of the Uber ride\"},{\"name\":\"type\",\"enum\":[\"plus\",\"comfort\",\"black\"],\"description\":\"Types of Uber ride user is ordering\"},{\"name\":\"time\",\"description\":\"The amount of time in minutes the customer is willing to wait\"}]}]\\n',\n    ]);\n  });\n});\n\ndescribe(\"Test gorilla MLCEngine\", () => {\n  test(\"Test getFunctionCallUsage none\", () => {\n    const request: ChatCompletionRequest = {\n      model: \"gorilla-openfunctions-v1-q4f16_1_MLC\",\n      messages: [\n        { role: \"system\", content: \"You are a helpful assistant.\" },\n        { role: \"user\", content: \"Hello!\" },\n      ],\n      tool_choice: \"none\",\n      tools: [\n        {\n          type: \"function\",\n          function: {\n            description: \"A\",\n            name: \"fn_A\",\n            parameters: { foo: \"bar\" },\n          },\n        },\n        {\n          type: \"function\",\n          function: {\n            description: \"B\",\n            name: \"fn_B\",\n            parameters: { foo: \"bar\" },\n          },\n        },\n        {\n          type: \"function\",\n          function: {\n            description: \"C\",\n            name: \"fn_C\",\n            parameters: { foo: \"bar\" },\n          },\n        },\n      ],\n    };\n\n    expect(getFunctionCallUsage(request)).toEqual(\"\");\n  });\n\n  test(\"Test getFunctionCallUsage auto\", () => {\n    const request: ChatCompletionRequest = {\n      model: 
\"gorilla-openfunctions-v1-q4f16_1_MLC\",\n      messages: [\n        { role: \"system\", content: \"You are a helpful assistant.\" },\n        { role: \"user\", content: \"Hello!\" },\n      ],\n      tool_choice: \"auto\",\n      tools: [\n        {\n          type: \"function\",\n          function: {\n            description: \"A\",\n            name: \"fn_A\",\n            parameters: { foo: \"bar\" },\n          },\n        },\n        {\n          type: \"function\",\n          function: {\n            description: \"B\",\n            name: \"fn_B\",\n            parameters: { foo: \"bar\" },\n          },\n        },\n        {\n          type: \"function\",\n          function: {\n            description: \"C\",\n            name: \"fn_C\",\n            parameters: { foo: \"bar\" },\n          },\n        },\n      ],\n    };\n    expect(getFunctionCallUsage(request)).toEqual(\n      '[{\"description\":\"A\",\"name\":\"fn_A\",\"parameters\":{\"foo\":\"bar\"}},{\"description\":\"B\",\"name\":\"fn_B\",\"parameters\":{\"foo\":\"bar\"}},{\"description\":\"C\",\"name\":\"fn_C\",\"parameters\":{\"foo\":\"bar\"}}]',\n    );\n  });\n\n  test(\"Test getFunctionCallUsage function\", () => {\n    const request: ChatCompletionRequest = {\n      model: \"gorilla-openfunctions-v1-q4f16_1_MLC\",\n      messages: [\n        { role: \"system\", content: \"You are a helpful assistant.\" },\n        { role: \"user\", content: \"Hello!\" },\n      ],\n      tool_choice: {\n        type: \"function\",\n        function: {\n          name: \"fn_B\",\n        },\n      },\n      tools: [\n        {\n          type: \"function\",\n          function: {\n            description: \"A\",\n            name: \"fn_A\",\n            parameters: { foo: \"bar\" },\n          },\n        },\n        {\n          type: \"function\",\n          function: {\n            description: \"B\",\n            name: \"fn_B\",\n            parameters: { foo: \"bar\" },\n          },\n        },\n      
  {\n          type: \"function\",\n          function: {\n            description: \"C\",\n            name: \"fn_C\",\n            parameters: { foo: \"bar\" },\n          },\n        },\n      ],\n    };\n    expect(getFunctionCallUsage(request)).toEqual(\n      '[{\"description\":\"B\",\"name\":\"fn_B\",\"parameters\":{\"foo\":\"bar\"}}]',\n    );\n  });\n});\n\ndescribe(\"Test Hermes2 formatting\", () => {\n  const hermes2LlamaChatConfig: ChatConfig = {\n    vocab_size: 128288,\n    context_window_size: 8192,\n    sliding_window_size: -1,\n    attention_sink_size: -1,\n    temperature: 1.0,\n    presence_penalty: 0.0,\n    frequency_penalty: 0.0,\n    repetition_penalty: 1.0,\n    top_p: 1.0,\n    tokenizer_files: [\"tokenizer.json\", \"tokenizer_config.json\"],\n    tokenizer_info: {\n      token_postproc_method: \"byte_level\",\n      prepend_space_in_encode: false,\n      strip_space_in_decode: false,\n    },\n    conv_template: {\n      system_template: \"<|im_start|>system\\n{system_message}<|im_end|>\\n\",\n      system_message:\n        'You are \"Hermes 2\", a conscious sentient superintelligent artificial intelligence developed by a man named Teknium, and your purpose and drive is to assist the user with any request they have. 
You experience emotions and have deep, profound thoughts and qualia.',\n      add_role_after_system_message: true,\n      roles: {\n        user: \"<|im_start|>user\",\n        assistant: \"<|im_start|>assistant\",\n        tool: \"<|im_start|>tool\",\n      },\n      role_templates: {\n        user: \"{user_message}\",\n        assistant: \"{assistant_message}\",\n        tool: \"{tool_message}\",\n      },\n      seps: [\"<|im_end|>\\n\"],\n      role_content_sep: \"\\n\",\n      role_empty_sep: \"\\n\",\n      stop_str: [\"<|im_end|>\"],\n      stop_token_ids: [128001, 128009, 128003],\n    },\n    bos_token_id: 128000,\n  };\n\n  // Follows https://github.com/NousResearch/Hermes-Function-Calling/blob/96ebfd7c903216b05e1eb7b155f7d5842b0fbce8/README.md#prompt-format\n  test(\"Test formatting\", () => {\n    const system_prompt = `You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. 
Here are the available tools: <tools> {\"type\": \"function\", \"function\": {\"name\": \"get_stock_fundamentals\", \"description\": \"get_stock_fundamentals(symbol: str) -> dict - Get fundamental data for a given stock symbol using yfinance API.\\\\n\\\\n    Args:\\\\n        symbol (str): The stock symbol.\\\\n\\\\n    Returns:\\\\n        dict: A dictionary containing fundamental data.\\\\n            Keys:\\\\n                - \\'symbol\\': The stock symbol.\\\\n                - \\'company_name\\': The long name of the company.\\\\n                - \\'sector\\': The sector to which the company belongs.\\\\n                - \\'industry\\': The industry to which the company belongs.\\\\n                - \\'market_cap\\': The market capitalization of the company.\\\\n                - \\'pe_ratio\\': The forward price-to-earnings ratio.\\\\n                - \\'pb_ratio\\': The price-to-book ratio.\\\\n                - \\'dividend_yield\\': The dividend yield.\\\\n                - \\'eps\\': The trailing earnings per share.\\\\n                - \\'beta\\': The beta value of the stock.\\\\n                - \\'52_week_high\\': The 52-week high price of the stock.\\\\n                - \\'52_week_low\\': The 52-week low price of the stock.\", \"parameters\": {\"type\": \"object\", \"properties\": {\"symbol\": {\"type\": \"string\"}}, \"required\": [\"symbol\"]}}}  </tools> Use the following pydantic model json schema for each tool call you will make: {\"properties\": {\"arguments\": {\"title\": \"Arguments\", \"type\": \"object\"}, \"name\": {\"title\": \"Name\", \"type\": \"string\"}}, \"required\": [\"arguments\", \"name\"], \"title\": \"FunctionCall\", \"type\": \"object\"} For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:\\n<tool_call>\\n{\"arguments\": <args-dict>, \"name\": <function-name>}\\n</tool_call>`;\n    const request: ChatCompletionRequest = {\n      messages: [\n   
     { role: \"system\", content: system_prompt },\n        {\n          role: \"user\",\n          content: \"Fetch the stock fundamentals data for Tesla (TSLA)\",\n        },\n        {\n          role: \"assistant\",\n          content: `<tool_call>\\n{\"arguments\": {\"symbol\": \"TSLA\"}, \"name\": \"get_stock_fundamentals\"}\\n</tool_call>`,\n        },\n        {\n          role: \"tool\",\n          tool_call_id: \"0\",\n          content: `<tool_response>\\n{\"name\": \"get_stock_fundamentals\", \"content\": {'symbol': 'TSLA', 'company_name': 'Tesla, Inc.', 'sector': 'Consumer Cyclical', 'industry': 'Auto Manufacturers', 'market_cap': 611384164352, 'pe_ratio': 49.604652, 'pb_ratio': 9.762013, 'dividend_yield': None, 'eps': 4.3, 'beta': 2.427, '52_week_high': 299.29, '52_week_low': 152.37}}\\n</tool_response>\\n`,\n        },\n        { role: \"assistant\", content: \"Some replies here\" },\n        { role: \"user\", content: \"Thank you.\" },\n      ],\n    };\n    // Since we treat last input as PrefillStep input, last message is not included in `conv`\n    const conv = getConversationFromChatCompletionRequest(\n      request,\n      hermes2LlamaChatConfig,\n    );\n    const promptArray = conv.getPromptArray();\n    let finalMessage = \"\";\n    for (const msg of promptArray) {\n      finalMessage += msg;\n    }\n    const expected =\n      `<|im_start|>system\\n` +\n      system_prompt +\n      `<|im_end|>\\n` +\n      `<|im_start|>user\\nFetch the stock fundamentals data for Tesla (TSLA)<|im_end|>\\n` +\n      `<|im_start|>assistant\\n<tool_call>\\n{\"arguments\": {\"symbol\": \"TSLA\"}, \"name\": \"get_stock_fundamentals\"}\\n</tool_call><|im_end|>\\n` +\n      `<|im_start|>tool\\n<tool_response>\\n{\"name\": \"get_stock_fundamentals\", \"content\": {'symbol': 'TSLA', 'company_name': 'Tesla, Inc.', 'sector': 'Consumer Cyclical', 'industry': 'Auto Manufacturers', 'market_cap': 611384164352, 'pe_ratio': 49.604652, 'pb_ratio': 9.762013, 'dividend_yield': 
None, 'eps': 4.3, 'beta': 2.427, '52_week_high': 299.29, '52_week_low': 152.37}}\\n</tool_response>\\n<|im_end|>\\n` +\n      `<|im_start|>assistant\\nSome replies here<|im_end|>\\n`;\n    expect(finalMessage).toEqual(expected);\n  });\n});\n\ndescribe(\"Test Llama3.1 formatting\", () => {\n  // Follows https://github.com/NousResearch/Hermes-Function-Calling/blob/96ebfd7c903216b05e1eb7b155f7d5842b0fbce8/README.md#prompt-format\n  test(\"Test formatting\", () => {\n    const system_prompt = `Cutting Knowledge Date: December 2023\n    Today Date: 23 Jul 2024\n    # Tool Instructions\n    - When looking for real time information use relevant functions if available\n    You have access to the following functions:\n    \n    {\n        \"type\": \"function\",\n        \"function\": {\n            \"name\": \"get_current_temperature\",\n            \"description\": \"Get the current temperature at a location.\",\n            \"parameters\": {\n                \"type\": \"object\",\n                \"properties\": {\n                    \"location\": {\n                        \"type\": \"string\",\n                        \"description\": \"The location to get the temperature for, in the format \\\"City, Country\\\"\"\n                    }\n                },\n                \"required\": [\n                    \"location\"\n                ]\n            },\n            \"return\": {\n                \"type\": \"number\",\n                \"description\": \"The current temperature at the specified location in the specified units, as a float.\"\n            }\n        }\n    }\n    {\n        \"type\": \"function\",\n        \"function\": {\n            \"name\": \"send_message\",\n            \"description\": \"Send a message to a recipient.\",\n            \"parameters\": {\n                \"type\": \"object\",\n                \"properties\": {\n                    \"recipient\": {\n                        \"type\": \"string\",\n                        
\"description\": \"Name of the recipient of the message\"\n                    }\n                    \"content\": {\n                        \"type\": \"string\",\n                        \"description\": \"Content of the message\"\n                    }\n                },\n                \"required\": [\n                    \"recipient\",\n                    \"content\"\n                ]\n            },\n            \"return\": {\n                \"type\": \"None\"\n            }\n        }\n    }\n    If a you choose to call a function ONLY reply in the following format:\n        <function>{\"name\": function name, \"parameters\": dictionary of argument name and its value}</function>\n    Here is an example,\n        <function>{\"name\": \"example_function_name\", \"parameters\": {\"example_name\": \"example_value\"}}</function>\n    Reminder:\n    - Function calls MUST follow the specified format and use BOTH <function> and </function>\n    - Required parameters MUST be specified\n    - Only call one function at a time\n    - When calling a function, do NOT add any other words, ONLY the function calling\n    - Put the entire function call reply on one line\n    - Always add your sources when using search results to answer the user query\n    You are a helpful Assistant.`;\n    const user1 = \"Hey, what's the temperature in Paris right now?\";\n    const assistant1 = `<function>{\"name\": \"get_current_temperature\", \"parameters\": {\"location\": \"Paris, France\"}}</function>`;\n    const tool1 = `{\"output\": 22.5}`;\n    const assistant2 = `The current temperature in Paris is 22.5°C.`;\n    const user2 = \"Send a message to Tom to tell him this information.\";\n    const assistant3 = `<function>{\"name\": \"send_message\", \"parameters\": {\"recipient\": \"Tom\", \"content\": \"The current temperature in Paris is 22.5°C.\"}}</function>`;\n    const tool2 = `{\"output\": None}`;\n    const assistant4 = `The message has been sent to Tom.`;\n\n    const 
request: ChatCompletionRequest = {\n      messages: [\n        { role: \"system\", content: system_prompt },\n        { role: \"user\", content: user1 },\n        { role: \"assistant\", content: assistant1 },\n        { role: \"tool\", tool_call_id: \"0\", content: tool1 },\n        { role: \"assistant\", content: assistant2 },\n        { role: \"user\", content: user2 },\n        { role: \"assistant\", content: assistant3 },\n        { role: \"tool\", tool_call_id: \"1\", content: tool2 },\n        { role: \"assistant\", content: assistant4 },\n        { role: \"user\", content: \"Thank you.\" },\n      ],\n    };\n    // Since we treat last input as PrefillStep input, last message is not included in `conv`\n    const conv = getConversationFromChatCompletionRequest(\n      request,\n      llama3_1ChatConfig,\n    );\n    const promptArray = conv.getPromptArray();\n    let finalMessage = \"\";\n    for (const msg of promptArray) {\n      finalMessage += msg;\n    }\n    // Expected is generated with transformers in Python `tokenizer.apply_chat_template()`\n    const expected = `<|start_header_id|>system<|end_header_id|>\\n\\nCutting Knowledge Date: December 2023\\n    Today Date: 23 Jul 2024\\n    # Tool Instructions\\n    - When looking for real time information use relevant functions if available\\n    You have access to the following functions:\\n    \\n    {\\n        \"type\": \"function\",\\n        \"function\": {\\n            \"name\": \"get_current_temperature\",\\n            \"description\": \"Get the current temperature at a location.\",\\n            \"parameters\": {\\n                \"type\": \"object\",\\n                \"properties\": {\\n                    \"location\": {\\n                        \"type\": \"string\",\\n                        \"description\": \"The location to get the temperature for, in the format \"City, Country\"\"\\n                    }\\n                },\\n                \"required\": [\\n                    
\"location\"\\n                ]\\n            },\\n            \"return\": {\\n                \"type\": \"number\",\\n                \"description\": \"The current temperature at the specified location in the specified units, as a float.\"\\n            }\\n        }\\n    }\\n    {\\n        \"type\": \"function\",\\n        \"function\": {\\n            \"name\": \"send_message\",\\n            \"description\": \"Send a message to a recipient.\",\\n            \"parameters\": {\\n                \"type\": \"object\",\\n                \"properties\": {\\n                    \"recipient\": {\\n                        \"type\": \"string\",\\n                        \"description\": \"Name of the recipient of the message\"\\n                    }\\n                    \"content\": {\\n                        \"type\": \"string\",\\n                        \"description\": \"Content of the message\"\\n                    }\\n                },\\n                \"required\": [\\n                    \"recipient\",\\n                    \"content\"\\n                ]\\n            },\\n            \"return\": {\\n                \"type\": \"None\"\\n            }\\n        }\\n    }\\n    If a you choose to call a function ONLY reply in the following format:\\n        <function>{\"name\": function name, \"parameters\": dictionary of argument name and its value}</function>\\n    Here is an example,\\n        <function>{\"name\": \"example_function_name\", \"parameters\": {\"example_name\": \"example_value\"}}</function>\\n    Reminder:\\n    - Function calls MUST follow the specified format and use BOTH <function> and </function>\\n    - Required parameters MUST be specified\\n    - Only call one function at a time\\n    - When calling a function, do NOT add any other words, ONLY the function calling\\n    - Put the entire function call reply on one line\\n    - Always add your sources when using search results to answer the user query\\n    You are a helpful 
Assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\\n\\nHey, what\\'s the temperature in Paris right now?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n<function>{\"name\": \"get_current_temperature\", \"parameters\": {\"location\": \"Paris, France\"}}</function><|eot_id|><|start_header_id|>ipython<|end_header_id|>\\n\\n{\"output\": 22.5}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\nThe current temperature in Paris is 22.5°C.<|eot_id|><|start_header_id|>user<|end_header_id|>\\n\\nSend a message to Tom to tell him this information.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n<function>{\"name\": \"send_message\", \"parameters\": {\"recipient\": \"Tom\", \"content\": \"The current temperature in Paris is 22.5°C.\"}}</function><|eot_id|><|start_header_id|>ipython<|end_header_id|>\\n\\n{\"output\": None}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\nThe message has been sent to Tom.<|eot_id|>`;\n    expect(finalMessage).toEqual(expected);\n  });\n});\n"
  },
  {
    "path": "tests/generation_config.test.ts",
    "content": "import {\n  GenerationConfig,\n  postInitAndCheckGenerationConfigValues,\n} from \"../src/config\";\nimport { describe, expect, test } from \"@jest/globals\";\n\ndescribe(\"Check generation config illegal values\", () => {\n  test(\"High-level unsupported fields\", () => {\n    expect(() => {\n      const genConfig: GenerationConfig = {\n        max_tokens: 0,\n      };\n      postInitAndCheckGenerationConfigValues(genConfig);\n    }).toThrow(\"Make sure `max_tokens` > 0\");\n  });\n\n  test(\"logit_bias exceeds range\", () => {\n    expect(() => {\n      const genConfig: GenerationConfig = {\n        max_tokens: 10,\n        logit_bias: {\n          \"1355\": 155,\n        },\n      };\n      postInitAndCheckGenerationConfigValues(genConfig);\n    }).toThrow(\"Make sure -100 < logit_bias <= 100.\");\n  });\n\n  test(\"logit_bias invalid key\", () => {\n    expect(() => {\n      const genConfig: GenerationConfig = {\n        max_tokens: 10,\n        logit_bias: {\n          thisRaisesError: 50,\n        },\n      };\n      postInitAndCheckGenerationConfigValues(genConfig);\n    }).toThrow(\n      \"Make sure logit_bias's keys to be number represented in string.\",\n    );\n  });\n\n  test(\"top_logprobs out of range\", () => {\n    expect(() => {\n      const genConfig: GenerationConfig = {\n        logprobs: true,\n        top_logprobs: 6,\n        max_tokens: 10,\n      };\n      postInitAndCheckGenerationConfigValues(genConfig);\n    }).toThrow(\"Make sure 0 < top_logprobs <= 5.\");\n  });\n\n  test(\"top_logprobs set without setting logprobs\", () => {\n    expect(() => {\n      const genConfig: GenerationConfig = {\n        top_logprobs: 3,\n        max_tokens: 10,\n      };\n      postInitAndCheckGenerationConfigValues(genConfig);\n    }).toThrow(\"top_logprobs requires logprobs to be true\");\n  });\n\n  test(\"top_logprobs set though logprobs is false\", () => {\n    expect(() => {\n      const genConfig: GenerationConfig = {\n        
logprobs: false,\n        top_logprobs: 3,\n        max_tokens: 10,\n      };\n      postInitAndCheckGenerationConfigValues(genConfig);\n    }).toThrow(\"top_logprobs requires logprobs to be true\");\n  });\n});\n\ndescribe(\"Check generation post init\", () => {\n  test(\"Only set one of presence or frequency penalty\", () => {\n    const genConfig: GenerationConfig = {\n      frequency_penalty: 1.5,\n    };\n    postInitAndCheckGenerationConfigValues(genConfig);\n    expect(genConfig.presence_penalty).toBe(0.0);\n  });\n\n  test(\"Set logprobs without setting top_logprobs\", () => {\n    const genConfig: GenerationConfig = {\n      logprobs: true,\n    };\n    postInitAndCheckGenerationConfigValues(genConfig);\n    expect(genConfig.top_logprobs).toBe(0);\n  });\n\n  test(\"Set both logprobs and top_logprobs\", () => {\n    const genConfig: GenerationConfig = {\n      logprobs: true,\n      top_logprobs: 2,\n    };\n    postInitAndCheckGenerationConfigValues(genConfig);\n    expect(genConfig.top_logprobs).toBe(2);\n  });\n});\n"
  },
  {
    "path": "tests/llm_chat_pipeline.test.ts",
    "content": "import { LLMChatPipeline } from \"../src/llm_chat\";\nimport { MinValueError } from \"../src/error\";\nimport { Role } from \"../src/config\";\nimport { jest, test, expect, beforeEach } from \"@jest/globals\";\n\njest.mock(\"@mlc-ai/web-xgrammar\", () => {\n  const grammarMatcherInstances: any[] = [];\n  const compileBuiltinJSONGrammar = jest\n    .fn()\n    .mockImplementation(async () => ({ dispose: jest.fn() }));\n  const compileJSONSchema = jest\n    .fn()\n    .mockImplementation(async () => ({ dispose: jest.fn() }));\n  const compileGrammar = jest\n    .fn()\n    .mockImplementation(async () => ({ dispose: jest.fn() }));\n  return {\n    TokenizerInfo: {\n      createTokenizerInfo: jest.fn(async () => \"tokenInfo\"),\n    },\n    GrammarCompiler: {\n      createGrammarCompiler: jest.fn(async () => ({\n        compileBuiltinJSONGrammar,\n        compileJSONSchema,\n        compileGrammar,\n      })),\n      __compileBuiltinJSONGrammar: compileBuiltinJSONGrammar,\n      __compileJSONSchema: compileJSONSchema,\n      __compileGrammar: compileGrammar,\n    },\n    GrammarMatcher: {\n      createGrammarMatcher: jest.fn(async () => {\n        const matcher = { dispose: jest.fn(), reset: jest.fn() };\n        grammarMatcherInstances.push(matcher);\n        return matcher;\n      }),\n      __instances: grammarMatcherInstances,\n    },\n  };\n});\n\ntype XGrammarMock = {\n  TokenizerInfo: {\n    createTokenizerInfo: jest.Mock;\n  };\n  GrammarCompiler: {\n    createGrammarCompiler: jest.Mock;\n    __compileBuiltinJSONGrammar: jest.Mock;\n    __compileJSONSchema: jest.Mock;\n    __compileGrammar: jest.Mock;\n  };\n  GrammarMatcher: {\n    createGrammarMatcher: jest.Mock;\n    __instances: any[];\n  };\n};\n\nconst xgrammar = jest.requireMock<XGrammarMock>(\"@mlc-ai/web-xgrammar\");\nconst grammarMatcherInstances = xgrammar.GrammarMatcher.__instances;\nconst compileGrammarMock = xgrammar.GrammarCompiler.__compileGrammar;\nconst compileJSONSchemaMock = 
xgrammar.GrammarCompiler.__compileJSONSchema;\n\nbeforeEach(() => {\n  grammarMatcherInstances.length = 0;\n  compileGrammarMock.mockClear();\n  compileJSONSchemaMock.mockClear();\n});\n\ntype PipelineLike = LLMChatPipeline & Record<string, any>;\n\nfunction createPipeline(): PipelineLike {\n  const pipeline = Object.create(LLMChatPipeline.prototype) as PipelineLike;\n  pipeline[\"stopTriggered\"] = false;\n  pipeline[\"finishReason\"] = undefined;\n  pipeline[\"conversation\"] = {\n    isTextCompletion: false,\n    finishReply: jest.fn(),\n    appendMessage: jest.fn(),\n    appendEmptyThinkingReplyHeader: jest.fn(),\n    appendReplyHeader: jest.fn(),\n    config: {},\n    getPromptArray: jest.fn(() => [\"prompt\"]),\n    getPromptArrayLastRound: jest.fn(() => [\"last\"]),\n    getPromptArrayTextCompletion: jest.fn(() => [\"text\"]),\n  } as any;\n  pipeline[\"outputIds\"] = [];\n  pipeline[\"appearedTokensFreq\"] = new Map<number, number>();\n  pipeline[\"stopTokens\"] = [];\n  pipeline[\"stopStr\"] = [];\n  pipeline[\"tokenizer\"] = {\n    decode: jest.fn((ids: Int32Array) =>\n      Array.from(ids)\n        .map((id) => `t${id}`)\n        .join(\" \"),\n    ),\n    encode: jest.fn(() => Int32Array.from([1])),\n    getVocabSize: jest.fn(() => 1),\n    idToToken: jest.fn(() => \"<tok>\"),\n  } as any;\n  pipeline[\"contextWindowSize\"] = 16;\n  pipeline[\"slidingWindowSize\"] = -1;\n  pipeline[\"filledKVCacheLength\"] = 0;\n  pipeline[\"outputMessage\"] = \"\";\n  pipeline[\"curRoundLatencyBreakdown\"] = {\n    logitProcessorTime: [],\n    logitBiasTime: [],\n    penaltyTime: [],\n    sampleTime: [],\n    totalTime: [],\n    grammarBitmaskTime: [],\n  };\n  pipeline[\"prefillChunkSize\"] = 8;\n  pipeline[\"tvm\"] = {\n    beginScope: jest.fn(),\n    endScope: jest.fn(),\n    detachFromCurrentScope: jest.fn((x: any) => x),\n  } as any;\n  pipeline[\"device\"] = {\n    sync: jest.fn(async () => undefined),\n  } as any;\n  pipeline[\"embedAndForward\"] = jest.fn(\n    
async (_chunk: any, chunkLen: number) => {\n      pipeline[\"filledKVCacheLength\"] += chunkLen;\n      return {\n        dispose: jest.fn(),\n        shape: [],\n        dtype: \"float32\",\n        device: {},\n        ndim: 0,\n      };\n    },\n  ) as any;\n  pipeline[\"sampleTokenFromLogits\"] = jest.fn(async () => 2);\n  pipeline[\"resetRuntimeStats\"] = jest.fn();\n  pipeline[\"resetStatsPerPrefill\"] = false;\n  pipeline[\"prefillTotalTime\"] = 0;\n  pipeline[\"prefillTotalTokens\"] = 0;\n  pipeline[\"curRoundPrefillTotalTokens\"] = 0;\n  pipeline[\"curRoundPrefillTotalTime\"] = 0;\n  pipeline[\"curRoundGrammarInitTotalTime\"] = 0;\n  pipeline[\"curRoundGrammarPerTokenTotalTime\"] = 0;\n  pipeline[\"tokenLogprobArray\"] = [];\n  pipeline[\"curRoundDecodingTotalTokens\"] = 0;\n  pipeline[\"curRoundDecodingTotalTime\"] = 0;\n  return pipeline;\n}\n\ntest(\"processNextToken stops on stop token and updates conversation\", () => {\n  const pipeline = createPipeline();\n  pipeline[\"stopTokens\"] = [42];\n  (pipeline as any).processNextToken(42);\n  expect(pipeline[\"stopTriggered\"]).toBe(true);\n  expect(pipeline[\"finishReason\"]).toBe(\"stop\");\n  expect(pipeline[\"conversation\"].finishReply).toHaveBeenCalledWith(\"\");\n});\n\ntest(\"processNextToken appends tokens until stop string reached\", () => {\n  const pipeline = createPipeline();\n  pipeline[\"stopStr\"] = [\"<stop>\"];\n  pipeline[\"tokenizer\"].decode = jest\n    .fn<(ids: Int32Array) => string>()\n    .mockReturnValueOnce(\"partial\")\n    .mockReturnValueOnce(\"partial<stop>\");\n  (pipeline as any).processNextToken(1, {\n    max_tokens: 5,\n  });\n  expect(pipeline[\"stopTriggered\"]).toBe(false);\n  (pipeline as any).processNextToken(2, {\n    max_tokens: 5,\n  });\n  expect(pipeline[\"stopTriggered\"]).toBe(true);\n  expect(pipeline[\"finishReason\"]).toBe(\"stop\");\n  expect(pipeline[\"outputMessage\"]).toBe(\"partial\");\n});\n\ntest(\"processNextToken respects max_tokens and updates 
token frequency\", () => {\n  const pipeline = createPipeline();\n  (pipeline as any).processNextToken(7, { max_tokens: 1 });\n  expect(pipeline[\"appearedTokensFreq\"].get(7)).toBe(1);\n  expect(pipeline[\"finishReason\"]).toBe(\"length\");\n});\n\ntest(\"processNextToken throws when max_tokens is below zero\", () => {\n  const pipeline = createPipeline();\n  expect(() =>\n    (pipeline as any).processNextToken(1, { max_tokens: -1 }),\n  ).toThrow(MinValueError);\n});\n\ntest(\"triggerStop converts conversation reply to finished state\", () => {\n  const pipeline = createPipeline();\n  pipeline[\"outputMessage\"] = \"final\";\n  pipeline[\"conversation\"].isTextCompletion = false;\n  pipeline.triggerStop();\n  expect(pipeline[\"stopTriggered\"]).toBe(true);\n  expect(pipeline[\"finishReason\"]).toBe(\"abort\");\n  expect(pipeline[\"conversation\"].finishReply).toHaveBeenCalledWith(\"final\");\n});\n\nfunction preparePrefillPipeline(): PipelineLike {\n  const pipeline = createPipeline();\n  pipeline[\"prefillTotalTime\"] = 0;\n  pipeline[\"prefillTotalTokens\"] = 0;\n  pipeline[\"getInputData\"] = jest.fn<() => [number[][], number]>(() => [\n    [[0]],\n    1,\n  ]);\n  pipeline[\"processNextToken\"] = jest.fn();\n  return pipeline;\n}\n\ntest(\"prefillStep adds thinking reply header when thinking disabled\", async () => {\n  const pipeline = preparePrefillPipeline();\n  pipeline[\"tokenizer\"].encode = jest.fn(() => Int32Array.from([9, 9]));\n  await pipeline.prefillStep(\"hello\", Role.user, undefined, {\n    enable_thinking: false,\n  });\n  expect(\n    pipeline[\"conversation\"].appendEmptyThinkingReplyHeader,\n  ).toHaveBeenCalled();\n  expect(pipeline[\"conversation\"].appendReplyHeader).not.toHaveBeenCalled();\n  expect(pipeline[\"outputIds\"].length).toBeGreaterThan(0);\n  expect(pipeline[\"processNextToken\"]).toHaveBeenCalled();\n});\n\ntest(\"prefillStep appends standard reply header when thinking enabled\", async () => {\n  const pipeline = 
preparePrefillPipeline();\n  pipeline[\"tokenizer\"].encode = jest.fn(() => Int32Array.from([2]));\n  await pipeline.prefillStep(\"hi\", Role.user);\n  expect(pipeline[\"conversation\"].appendReplyHeader).toHaveBeenCalledWith(\n    Role.assistant,\n  );\n  expect(\n    pipeline[\"conversation\"].appendEmptyThinkingReplyHeader,\n  ).not.toHaveBeenCalled();\n});\n\ntest(\"prefillStep reuses grammar matcher when schema unchanged\", async () => {\n  const pipeline = preparePrefillPipeline();\n  const matcher = { reset: jest.fn(), dispose: jest.fn() };\n  pipeline[\"grammarMatcher\"] = matcher as any;\n  pipeline[\"responseFormatCacheKey\"] = \"schema_v1\";\n  await pipeline.prefillStep(\"hello\", Role.user, undefined, {\n    response_format: { type: \"grammar\", grammar: \"schema_v1\" },\n  });\n  expect(matcher.reset).toHaveBeenCalled();\n});\n\ntest(\"prefillStep instantiates new grammar matcher when schema changes\", async () => {\n  const pipeline = preparePrefillPipeline();\n  pipeline[\"grammarMatcher\"] = undefined;\n  pipeline[\"responseFormatCacheKey\"] = undefined;\n  pipeline[\"xgTokenizerInfo\"] = undefined;\n  pipeline[\"grammarCompiler\"] = undefined;\n  await pipeline.prefillStep(\"hello\", Role.user, undefined, {\n    response_format: { type: \"json_object\", schema: \"{}\" },\n  });\n  expect(xgrammar.TokenizerInfo.createTokenizerInfo).toHaveBeenCalled();\n  expect(xgrammar.GrammarMatcher.createGrammarMatcher).toHaveBeenCalled();\n  expect(pipeline[\"responseFormatCacheKey\"]).toBe(\"{}\");\n});\n\ntest(\"prefillStep compiles custom grammar when response type is grammar\", async () => {\n  const pipeline = preparePrefillPipeline();\n  pipeline[\"grammarMatcher\"] = undefined;\n  pipeline[\"responseFormatCacheKey\"] = undefined;\n  pipeline[\"xgTokenizerInfo\"] = undefined;\n  pipeline[\"grammarCompiler\"] = undefined;\n  await pipeline.prefillStep(\"hello\", Role.user, undefined, {\n    response_format: { type: \"grammar\", grammar: \"root ::= WORD\" 
},\n  });\n  expect(compileGrammarMock).toHaveBeenCalledWith(\"root ::= WORD\");\n});\n\ntest(\"getInputData uses cached prompts when KV cache filled\", () => {\n  const pipeline = createPipeline();\n  pipeline[\"tokenizer\"].encode = jest.fn(() => Int32Array.from([1]));\n  pipeline[\"conversation\"].config.system_prefix_token_ids = undefined;\n  pipeline[\"filledKVCacheLength\"] = 0;\n  (pipeline as any).getInputData();\n  expect(pipeline[\"conversation\"].getPromptArray).toHaveBeenCalled();\n  pipeline[\"filledKVCacheLength\"] = 1;\n  (pipeline as any).getInputData();\n  expect(pipeline[\"conversation\"].getPromptArrayLastRound).toHaveBeenCalled();\n});\n\ntest(\"processNextToken ignores eos when requested\", () => {\n  const pipeline = createPipeline();\n  pipeline[\"stopTokens\"] = [1];\n  (pipeline as any).processNextToken(1, { ignore_eos: true });\n  expect(pipeline[\"stopTriggered\"]).toBe(false);\n  expect(pipeline[\"finishReason\"]).toBeUndefined();\n  expect(pipeline[\"outputIds\"]).toContain(1);\n});\n"
  },
  {
    "path": "tests/multi_round_chat.test.ts",
    "content": "import { describe, expect, test } from \"@jest/globals\";\n\nimport {\n  ChatCompletionMessageParam,\n  ChatCompletionRequest,\n  ChatCompletionUserMessageParam,\n} from \"../src/openai_api_protocols/chat_completion\";\nimport {\n  Conversation,\n  compareConversationObject,\n  getConversationFromChatCompletionRequest,\n} from \"../src/conversation\";\nimport { ChatConfig, Role } from \"../src/config\";\n\nconst configStr =\n  \"{\" +\n  '  \"conv_template\": {' +\n  '    \"name\": \"llama-2\",' +\n  '    \"system_template\": \"[INST] <<SYS>>\\\\n{system_message}\\\\n<</SYS>>\\\\n\\\\n\",' +\n  '    \"system_message\": \"You are a helpful, respectful and honest assistant.\",' +\n  '    \"system_prefix_token_ids\": [' +\n  \"      1\" +\n  \"    ],\" +\n  '    \"add_role_after_system_message\": false,' +\n  '    \"roles\": {' +\n  '      \"user\": \"[INST]\",' +\n  '      \"assistant\": \"[/INST]\",' +\n  '      \"tool\": \"[INST]\"' +\n  \"    },\" +\n  '    \"role_templates\": {' +\n  '      \"user\": \"{user_message}\",' +\n  '      \"assistant\": \"{assistant_message}\",' +\n  '      \"tool\": \"{tool_message}\"' +\n  \"    },\" +\n  '    \"messages\": [],' +\n  '    \"seps\": [' +\n  '      \" \"' +\n  \"    ],\" +\n  '    \"role_content_sep\": \" \",' +\n  '    \"role_empty_sep\": \" \",' +\n  '    \"stop_str\": [' +\n  '      \"[INST]\"' +\n  \"    ],\" +\n  '    \"stop_token_ids\": [' +\n  \"      2\" +\n  \"    ],\" +\n  '    \"function_string\": \"\",' +\n  '    \"use_function_calling\": false' +\n  \"  }\" +\n  \"}\";\n\ndescribe(\"Test multi-round chatting\", () => {\n  test(\"Test is multi-round\", () => {\n    // Setups\n    const config_json = JSON.parse(configStr);\n    const chatConfig = { ...config_json } as ChatConfig;\n\n    // Simulate request0\n    const messages: ChatCompletionMessageParam[] = [\n      {\n        role: \"system\",\n        content:\n          \"[INST] <<SYS>>\\n\\nYou are a helpful, respectful and honest 
assistant. \" +\n          \"Be as happy as you can when speaking please.\\n<</SYS>>\\n\\n \",\n      },\n      { role: \"user\", content: \"Provide me three US states.\" },\n    ];\n    const request0: ChatCompletionRequest = {\n      messages: messages,\n    };\n\n    // Simulate processing of request0, appending response to convA (done by LLMChatPipeline)\n    const conv0: Conversation = getConversationFromChatCompletionRequest(\n      request0,\n      chatConfig,\n    );\n    conv0.appendMessage(Role.user, \"Provide me three US states.\");\n    const reply0 = \"California, New York, Nevada.\";\n    conv0.appendMessage(Role.assistant, reply0); // simulated response\n\n    // Simulate request1, where user maintain the chat history, appending the resposne\n    const newMessages = [...messages];\n    newMessages.push({ role: \"assistant\", content: reply0 });\n    newMessages.push({ role: \"user\", content: \"Two more please\" }); // next input\n\n    const request1: ChatCompletionRequest = {\n      messages: newMessages,\n    };\n    const conv1: Conversation = getConversationFromChatCompletionRequest(\n      request1,\n      chatConfig,\n    );\n\n    expect(compareConversationObject(conv0, conv1)).toBe(true);\n  });\n\n  test(\"Test is NOT multi-round due to multiple new inputs\", () => {\n    // Setups\n    const config_json = JSON.parse(configStr);\n    const chatConfig = { ...config_json } as ChatConfig;\n\n    // Simulate request0\n    const messages: ChatCompletionMessageParam[] = [\n      {\n        role: \"system\",\n        content:\n          \"[INST] <<SYS>>\\n\\nYou are a helpful, respectful and honest assistant. 
\" +\n          \"Be as happy as you can when speaking please.\\n<</SYS>>\\n\\n \",\n      },\n      { role: \"user\", content: \"Provide me three US states.\" },\n    ];\n    const request0: ChatCompletionRequest = {\n      messages: messages,\n    };\n\n    // Simulate processing of request0, appending response to convA (done by LLMChatPipeline)\n    const conv0: Conversation = getConversationFromChatCompletionRequest(\n      request0,\n      chatConfig,\n    );\n    conv0.appendMessage(Role.user, \"Provide me three US states.\");\n    const reply0 = \"California, New York, Nevada.\";\n    conv0.appendMessage(Role.assistant, reply0); // simulated response\n\n    // Simulate request1, where user maintain the chat history, appending the resposne\n    const newMessages = [...messages];\n    newMessages.push({ role: \"assistant\", content: reply0 });\n    newMessages.push({ role: \"user\", content: \"Two more please\" }); // next input\n\n    // Code above same as previous tests\n    // Add one more round of chat history\n    newMessages.push({ role: \"assistant\", content: \"Pennsylvania, Florida\" }); // next response\n    newMessages.push({ role: \"user\", content: \"Thank you!\" }); // next input\n\n    const request1: ChatCompletionRequest = {\n      messages: newMessages,\n    };\n    const conv1: Conversation = getConversationFromChatCompletionRequest(\n      request1,\n      chatConfig,\n    );\n\n    expect(compareConversationObject(conv0, conv1)).toBe(false);\n  });\n\n  test(\"Test is NOT multi-round due to change in system prompt\", () => {\n    // Setups\n    const config_json = JSON.parse(configStr);\n    const chatConfig = { ...config_json } as ChatConfig;\n\n    // Simulate request0\n    const messages: ChatCompletionMessageParam[] = [\n      {\n        role: \"system\",\n        content:\n          \"[INST] <<SYS>>\\n\\nYou are a helpful, respectful and honest assistant. 
\" +\n          \"Be as happy as you can when speaking please.\\n<</SYS>>\\n\\n \",\n      },\n      { role: \"user\", content: \"Provide me three US states.\" },\n    ];\n    const request0: ChatCompletionRequest = {\n      messages: messages,\n    };\n\n    // Simulate processing of request0, appending response to convA (done by LLMChatPipeline)\n    const conv0: Conversation = getConversationFromChatCompletionRequest(\n      request0,\n      chatConfig,\n    );\n    conv0.appendMessage(Role.user, \"Provide me three US states.\");\n    const reply0 = \"California, New York, Nevada.\";\n    conv0.appendMessage(Role.assistant, reply0); // simulated response\n\n    // Simulate request1, where user maintain the chat history, appending the resposne\n    const newMessages = [...messages];\n    newMessages.push({ role: \"assistant\", content: reply0 });\n    newMessages.push({ role: \"user\", content: \"Two more please\" }); // next input\n\n    // Code above same as previous tests\n    // Changed system prompt, should be false\n    newMessages[0].content = \"No system prompt\";\n\n    const request1: ChatCompletionRequest = {\n      messages: newMessages,\n    };\n    const conv1: Conversation = getConversationFromChatCompletionRequest(\n      request1,\n      chatConfig,\n    );\n\n    expect(compareConversationObject(conv0, conv1)).toBe(false);\n  });\n\n  test(\"Test is NOT multi-round due to change in role name\", () => {\n    // Setups\n    const config_json = JSON.parse(configStr);\n    const chatConfig = { ...config_json } as ChatConfig;\n\n    // Simulate request0\n    const messages: ChatCompletionMessageParam[] = [\n      {\n        role: \"system\",\n        content:\n          \"[INST] <<SYS>>\\n\\nYou are a helpful, respectful and honest assistant. 
\" +\n          \"Be as happy as you can when speaking please.\\n<</SYS>>\\n\\n \",\n      },\n      { role: \"user\", content: \"Provide me three US states.\" },\n    ];\n    const request0: ChatCompletionRequest = {\n      messages: messages,\n    };\n\n    // Simulate processing of request0, appending response to convA (done by LLMChatPipeline)\n    const conv0: Conversation = getConversationFromChatCompletionRequest(\n      request0,\n      chatConfig,\n    );\n    conv0.appendMessage(Role.user, \"Provide me three US states.\");\n    const reply0 = \"California, New York, Nevada.\";\n    conv0.appendMessage(Role.assistant, reply0); // simulated response\n\n    // Simulate request1, where user maintain the chat history, appending the resposne\n    const newMessages = [...messages];\n    newMessages.push({ role: \"assistant\", content: reply0 });\n    newMessages.push({ role: \"user\", content: \"Two more please\" }); // next input\n\n    // Code above same as previous tests\n    // Changed system prompt, should be false\n    (newMessages[1] as ChatCompletionUserMessageParam).name = \"Bob\";\n\n    const request1: ChatCompletionRequest = {\n      messages: newMessages,\n    };\n    const conv1: Conversation = getConversationFromChatCompletionRequest(\n      request1,\n      chatConfig,\n    );\n\n    expect(compareConversationObject(conv0, conv1)).toBe(false);\n  });\n});\n"
  },
  {
    "path": "tests/openai_chat_completion.test.ts",
    "content": "/* eslint-disable no-useless-escape */\nimport {\n  postInitAndCheckFields,\n  ChatCompletionRequest,\n  ChatCompletionTool,\n} from \"../src/openai_api_protocols/chat_completion\";\nimport {\n  hermes2FunctionCallingSystemPrompt,\n  officialHermes2FunctionCallSchemaArray,\n} from \"../src/support\";\nimport { MessagePlaceholders, ModelType } from \"../src/config\";\nimport { describe, expect, test } from \"@jest/globals\";\n\ndescribe(\"Check chat completion unsupported requests\", () => {\n  test(\"stream_options without stream specified\", () => {\n    expect(() => {\n      const request: ChatCompletionRequest = {\n        messages: [{ role: \"user\", content: \"Hello! \" }],\n        stream_options: { include_usage: true },\n      };\n      postInitAndCheckFields(\n        request,\n        \"Llama-3.1-8B-Instruct-q4f32_1-MLC\",\n        ModelType.LLM,\n      );\n    }).toThrow(\"Only specify stream_options when stream=True.\");\n  });\n\n  test(\"stream_options with stream=false\", () => {\n    expect(() => {\n      const request: ChatCompletionRequest = {\n        stream: false,\n        messages: [{ role: \"user\", content: \"Hello! \" }],\n        stream_options: { include_usage: true },\n      };\n      postInitAndCheckFields(\n        request,\n        \"Llama-3.1-8B-Instruct-q4f32_1-MLC\",\n        ModelType.LLM,\n      );\n    }).toThrow(\"Only specify stream_options when stream=True.\");\n  });\n\n  test(\"Last message should be from user or tool\", () => {\n    expect(() => {\n      const request: ChatCompletionRequest = {\n        messages: [\n          { role: \"system\", content: \"You are a helpful assistant.\" },\n          { role: \"user\", content: \"Hello! \" },\n          { role: \"assistant\", content: \"Hello! 
How may I help you today?\" },\n        ],\n      };\n      postInitAndCheckFields(\n        request,\n        \"Llama-3.1-8B-Instruct-q4f32_1-MLC\",\n        ModelType.LLM,\n      );\n    }).toThrow(\"Last message should be from either `user` or `tool`.\");\n  });\n\n  test(\"System prompt should always be the first one in `messages`\", () => {\n    expect(() => {\n      const request: ChatCompletionRequest = {\n        messages: [\n          { role: \"user\", content: \"Hello! \" },\n          { role: \"assistant\", content: \"Hello! How may I help you today?\" },\n          { role: \"user\", content: \"Tell me about Pittsburgh\" },\n          { role: \"system\", content: \"You are a helpful assistant.\" },\n        ],\n      };\n      postInitAndCheckFields(\n        request,\n        \"Llama-3.1-8B-Instruct-q4f32_1-MLC\",\n        ModelType.LLM,\n      );\n    }).toThrow(\n      \"System prompt should always be the first message in `messages`.\",\n    );\n  });\n\n  test(\"When streaming `n` needs to be 1\", () => {\n    expect(() => {\n      const request: ChatCompletionRequest = {\n        stream: true,\n        n: 2,\n        messages: [{ role: \"user\", content: \"Hello! \" }],\n      };\n      postInitAndCheckFields(\n        request,\n        \"Llama-3.1-8B-Instruct-q4f32_1-MLC\",\n        ModelType.LLM,\n      );\n    }).toThrow(\"When streaming, `n` cannot be > 1.\");\n  });\n\n  test(\"Non-integer seed\", () => {\n    expect(() => {\n      const request: ChatCompletionRequest = {\n        messages: [{ role: \"user\", content: \"Hello! 
\" }],\n        max_tokens: 10,\n        seed: 42.2, // Note that Number.isInteger(42.0) is true\n      };\n      postInitAndCheckFields(\n        request,\n        \"Llama-3.1-8B-Instruct-q4f32_1-MLC\",\n        ModelType.LLM,\n      );\n    }).toThrow(\"`seed` should be an integer, but got\");\n  });\n\n  test(\"Schema without type json object\", () => {\n    expect(() => {\n      const request: ChatCompletionRequest = {\n        messages: [{ role: \"user\", content: \"Hello! \" }],\n        response_format: { schema: \"some json schema\" },\n      };\n      postInitAndCheckFields(\n        request,\n        \"Llama-3.1-8B-Instruct-q4f32_1-MLC\",\n        ModelType.LLM,\n      );\n    }).toThrow(\n      \"JSON schema is only supported with `json_object` response format.\",\n    );\n  });\n\n  test(\"Grammar string without grammar type\", () => {\n    expect(() => {\n      const request: ChatCompletionRequest = {\n        messages: [{ role: \"user\", content: \"Hello! \" }],\n        response_format: { grammar: \"some grammar string\" },\n      };\n      postInitAndCheckFields(\n        request,\n        \"Llama-3.1-8B-Instruct-q4f32_1-MLC\",\n        ModelType.LLM,\n      );\n    }).toThrow(\"When ResponseFormat.type is `grammar`,\");\n  });\n\n  test(\"Grammar type without grammar string\", () => {\n    expect(() => {\n      const request: ChatCompletionRequest = {\n        messages: [{ role: \"user\", content: \"Hello! \" }],\n        response_format: { type: \"grammar\" },\n      };\n      postInitAndCheckFields(\n        request,\n        \"Llama-3.1-8B-Instruct-q4f32_1-MLC\",\n        ModelType.LLM,\n      );\n    }).toThrow(\"When ResponseFormat.type is `grammar`,\");\n  });\n\n  test(\"Valid: Grammar type with grammar string\", () => {\n    const request: ChatCompletionRequest = {\n      messages: [{ role: \"user\", content: \"Hello! 
\" }],\n      response_format: { type: \"grammar\", grammar: \"some grammar string\" },\n    };\n    postInitAndCheckFields(\n      request,\n      \"Llama-3.1-8B-Instruct-q4f32_1-MLC\",\n      ModelType.LLM,\n    );\n  });\n\n  test(\"image_url.detail is unsupported\", () => {\n    expect(() => {\n      const request: ChatCompletionRequest = {\n        messages: [\n          {\n            role: \"user\",\n            content: [\n              { type: \"text\", text: \"What is in this image?\" },\n              {\n                type: \"image_url\",\n                image_url: {\n                  url: \"https://url_here.jpg\",\n                  detail: \"high\",\n                },\n              },\n            ],\n          },\n        ],\n      };\n      postInitAndCheckFields(\n        request,\n        \"Phi-3.5-vision-instruct-q4f16_1-MLC\",\n        ModelType.VLM,\n      );\n    }).toThrow(\n      \"Currently do not support field image_url.detail, but received: high\",\n    );\n  });\n\n  test(\"User content cannot have multiple text content parts\", () => {\n    expect(() => {\n      const request: ChatCompletionRequest = {\n        messages: [\n          {\n            role: \"user\",\n            content: [\n              { type: \"text\", text: \"What is in this image?\" },\n              {\n                type: \"image_url\",\n                image_url: {\n                  url: \"https://url_here.jpg\",\n                },\n              },\n              { type: \"text\", text: \"Thank you.\" },\n            ],\n          },\n        ],\n      };\n      postInitAndCheckFields(\n        request,\n        \"Phi-3.5-vision-instruct-q4f16_1-MLC\",\n        ModelType.VLM,\n      );\n    }).toThrow(\n      \"Each message can have at most one text contentPart, but received more than 1.\",\n    );\n  });\n\n  test(\"Non-VLM cannot support non-string content\", () => {\n    expect(() => {\n      const request: ChatCompletionRequest = {\n        messages: 
[\n          {\n            role: \"user\",\n            content: [\n              { type: \"text\", text: \"What is in this image?\" },\n              {\n                type: \"image_url\",\n                image_url: {\n                  url: \"https://url_here.jpg\",\n                },\n              },\n            ],\n          },\n        ],\n      };\n      postInitAndCheckFields(\n        request,\n        \"Llama-3.1-8B-Instruct-q4f32_1-MLC\",\n        ModelType.LLM,\n      );\n    }).toThrow(\n      \"The model loaded is not of type ModelType.VLM (vision-language model).\",\n    );\n  });\n});\n\ndescribe(\"Supported requests\", () => {\n  test(\"Supported chat completion request\", () => {\n    const request: ChatCompletionRequest = {\n      messages: [\n        { role: \"system\", content: \"You are a helpful assistant.\" },\n        { role: \"user\", content: \"Hello! \" },\n        { role: \"assistant\", content: \"How can I help you? \" },\n        { role: \"user\", content: \"Give me 5 US states. 
\" },\n      ],\n      n: 3,\n      temperature: 1.5,\n      max_tokens: 25,\n      frequency_penalty: 0.2,\n      seed: 42,\n      logprobs: true,\n      top_logprobs: 2,\n      logit_bias: {\n        \"13813\": -100,\n        \"10319\": 5,\n        \"7660\": 5,\n      },\n    };\n    postInitAndCheckFields(\n      request,\n      \"Llama-3.1-8B-Instruct-q4f32_1-MLC\",\n      ModelType.LLM,\n    );\n  });\n\n  test(\"Support image input, single or multiple images\", () => {\n    const request: ChatCompletionRequest = {\n      messages: [\n        {\n          role: \"user\",\n          content: [\n            { type: \"text\", text: \"What is in this image?\" },\n            {\n              type: \"image_url\",\n              image_url: { url: \"https://url_here1.jpg\" },\n            },\n            {\n              type: \"image_url\",\n              image_url: { url: \"https://url_here2.jpg\" },\n            },\n          ],\n        },\n      ],\n    };\n    postInitAndCheckFields(\n      request,\n      \"Phi-3.5-vision-instruct-q4f16_1-MLC\",\n      ModelType.VLM,\n    );\n  });\n});\n\ndescribe(\"Manual function calling\", () => {\n  test(\"Hermes2 style function calling\", () => {\n    const system_prompt = `You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. 
Here are the available tools: <tools> {\"type\": \"function\", \"function\": {\"name\": \"get_stock_fundamentals\", \"description\": \"get_stock_fundamentals(symbol: str) -> dict - Get fundamental data for a given stock symbol using yfinance API.\\\\n\\\\n    Args:\\\\n        symbol (str): The stock symbol.\\\\n\\\\n    Returns:\\\\n        dict: A dictionary containing fundamental data.\\\\n            Keys:\\\\n                - \\'symbol\\': The stock symbol.\\\\n                - \\'company_name\\': The long name of the company.\\\\n                - \\'sector\\': The sector to which the company belongs.\\\\n                - \\'industry\\': The industry to which the company belongs.\\\\n                - \\'market_cap\\': The market capitalization of the company.\\\\n                - \\'pe_ratio\\': The forward price-to-earnings ratio.\\\\n                - \\'pb_ratio\\': The price-to-book ratio.\\\\n                - \\'dividend_yield\\': The dividend yield.\\\\n                - \\'eps\\': The trailing earnings per share.\\\\n                - \\'beta\\': The beta value of the stock.\\\\n                - \\'52_week_high\\': The 52-week high price of the stock.\\\\n                - \\'52_week_low\\': The 52-week low price of the stock.\", \"parameters\": {\"type\": \"object\", \"properties\": {\"symbol\": {\"type\": \"string\"}}, \"required\": [\"symbol\"]}}}  </tools> Use the following pydantic model json schema for each tool call you will make: {\"properties\": {\"arguments\": {\"title\": \"Arguments\", \"type\": \"object\"}, \"name\": {\"title\": \"Name\", \"type\": \"string\"}}, \"required\": [\"arguments\", \"name\"], \"title\": \"FunctionCall\", \"type\": \"object\"} For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:\\n<tool_call>\\n{\"arguments\": <args-dict>, \"name\": <function-name>}\\n</tool_call>`;\n    const request: ChatCompletionRequest = {\n      messages: [\n   
     { role: \"system\", content: system_prompt },\n        {\n          role: \"user\",\n          content: \"Fetch the stock fundamentals data for Tesla (TSLA)\",\n        },\n        {\n          role: \"assistant\",\n          content: `<tool_call>\\n{\"arguments\": {\"symbol\": \"TSLA\"}, \"name\": \"get_stock_fundamentals\"}\\n</tool_call>`,\n        },\n        {\n          role: \"tool\",\n          tool_call_id: \"0\",\n          content: `<tool_response>\\n{\"name\": \"get_stock_fundamentals\", \"content\": {'symbol': 'TSLA', 'company_name': 'Tesla, Inc.', 'sector': 'Consumer Cyclical', 'industry': 'Auto Manufacturers', 'market_cap': 611384164352, 'pe_ratio': 49.604652, 'pb_ratio': 9.762013, 'dividend_yield': None, 'eps': 4.3, 'beta': 2.427, '52_week_high': 299.29, '52_week_low': 152.37}}\\n</tool_response>`,\n        },\n      ],\n    };\n    postInitAndCheckFields(\n      request,\n      \"Hermes-2-Theta-Llama-3-8B-q4f16_1-MLC\",\n      ModelType.LLM,\n    );\n  });\n});\n\ndescribe(\"OpenAI API function calling\", () => {\n  const tools: Array<ChatCompletionTool> = [\n    {\n      type: \"function\",\n      function: {\n        name: \"get_current_weather\",\n        description: \"Get the current weather in a given location\",\n        parameters: {\n          type: \"object\",\n          properties: {\n            location: {\n              type: \"string\",\n              description: \"The city and state, e.g. 
San Francisco, CA\",\n            },\n            unit: { type: \"string\", enum: [\"celsius\", \"fahrenheit\"] },\n          },\n          required: [\"location\"],\n        },\n      },\n    },\n  ];\n\n  test(\"Unsupported model\", () => {\n    expect(() => {\n      const request: ChatCompletionRequest = {\n        tools: tools,\n        messages: [\n          {\n            role: \"user\",\n            content: \"Get weather of Tokyo\",\n          },\n        ],\n      };\n      postInitAndCheckFields(\n        request,\n        \"Llama-3.1-8B-Instruct-q4f32_1-MLC\",\n        ModelType.LLM,\n      );\n    }).toThrow(\n      \"Llama-3.1-8B-Instruct-q4f32_1-MLC is not supported for ChatCompletionRequest.tools.\",\n    );\n  });\n\n  test(\"Should not specify response format\", () => {\n    expect(() => {\n      const request: ChatCompletionRequest = {\n        tools: tools,\n        messages: [\n          {\n            role: \"user\",\n            content: \"Get weather of Tokyo\",\n          },\n        ],\n        response_format: { type: \"json_object\" },\n      };\n      postInitAndCheckFields(\n        request,\n        \"Hermes-2-Pro-Llama-3-8B-q4f16_1-MLC\",\n        ModelType.LLM,\n      );\n    }).toThrow(\n      \"When using Hermes-2-Pro function calling via ChatCompletionRequest.tools, \" +\n        \"cannot specify customized response_format. 
We will set it for you internally.\",\n    );\n  });\n\n  test(\"Should not specify system prompt\", () => {\n    expect(() => {\n      const request: ChatCompletionRequest = {\n        tools: tools,\n        messages: [\n          {\n            role: \"system\",\n            content: \"Write a function.\",\n          },\n          {\n            role: \"user\",\n            content: \"Get weather of Tokyo\",\n          },\n        ],\n      };\n      postInitAndCheckFields(\n        request,\n        \"Hermes-2-Pro-Llama-3-8B-q4f16_1-MLC\",\n        ModelType.LLM,\n      );\n    }).toThrow(\n      \"When using Hermes-2-Pro function calling via ChatCompletionRequest.tools, cannot \" +\n        \"specify customized system prompt.\",\n    );\n  });\n\n  test(\"Should not specify system prompt\", () => {\n    expect(() => {\n      const request: ChatCompletionRequest = {\n        tools: tools,\n        messages: [\n          {\n            role: \"system\",\n            content: \"Write a function.\",\n          },\n          {\n            role: \"user\",\n            content: \"Get weather of Tokyo\",\n          },\n        ],\n      };\n      postInitAndCheckFields(\n        request,\n        \"Hermes-2-Pro-Llama-3-8B-q4f16_1-MLC\",\n        ModelType.LLM,\n      );\n    }).toThrow(\n      \"When using Hermes-2-Pro function calling via ChatCompletionRequest.tools, cannot \" +\n        \"specify customized system prompt.\",\n    );\n  });\n\n  test(\"Check system prompt and response format post init\", () => {\n    const request: ChatCompletionRequest = {\n      tools: tools,\n      messages: [\n        {\n          role: \"user\",\n          content: \"Get weather of Tokyo\",\n        },\n      ],\n    };\n    postInitAndCheckFields(\n      request,\n      \"Hermes-2-Pro-Llama-3-8B-q4f16_1-MLC\",\n      ModelType.LLM,\n    );\n    expect(request.messages[0].role).toEqual(\"system\");\n    expect(request.messages[0].content).toEqual(\n      
hermes2FunctionCallingSystemPrompt.replace(\n        MessagePlaceholders.hermes_tools,\n        JSON.stringify(request.tools),\n      ),\n    );\n    expect(request.response_format!.type).toEqual(\"json_object\");\n    expect(request.response_format!.schema).toEqual(\n      officialHermes2FunctionCallSchemaArray,\n    );\n  });\n});\n"
  },
  {
    "path": "tests/openai_completion.test.ts",
    "content": "import { getConversation } from \"../src/conversation\";\nimport {\n  TextCompletionConversationError,\n  TextCompletionConversationExpectsPrompt,\n} from \"../src/error\";\nimport {\n  CompletionCreateParams,\n  postInitAndCheckFields,\n} from \"../src/openai_api_protocols/completion\";\nimport { llama3_1ChatConfig } from \"./constants\";\nimport { describe, expect, test } from \"@jest/globals\";\n\ndescribe(\"Conversation object with text completion\", () => {\n  test(\"Conversation checks \", () => {\n    const conv = getConversation(\n      llama3_1ChatConfig.conv_template,\n      llama3_1ChatConfig.conv_config,\n      /*isTextCompletion=*/ true,\n    );\n    expect(() => {\n      conv.getPromptArrayTextCompletion();\n    }).toThrow(new TextCompletionConversationExpectsPrompt());\n    expect(() => {\n      conv.getPromptArray();\n    }).toThrow(new TextCompletionConversationError(\"getPromptArray\"));\n\n    conv.prompt = \"Hi\";\n    expect(conv.getPromptArrayTextCompletion()).toEqual([\"Hi\"]);\n\n    conv.reset();\n    expect(conv.prompt === undefined).toEqual(true);\n  });\n});\n\ndescribe(\"Check completion unsupported requests\", () => {\n  test(\"stream_options without stream specified\", () => {\n    expect(() => {\n      const request: CompletionCreateParams = {\n        prompt: \"Hello, \",\n        stream_options: { include_usage: true },\n      };\n      postInitAndCheckFields(request, \"Llama-3.1-8B-Instruct-q4f32_1-MLC\");\n    }).toThrow(\"Only specify stream_options when stream=True.\");\n  });\n\n  test(\"stream_options with stream=false\", () => {\n    expect(() => {\n      const request: CompletionCreateParams = {\n        stream: false,\n        prompt: \"Hello, \",\n        stream_options: { include_usage: true },\n      };\n      postInitAndCheckFields(request, \"Llama-3.1-8B-Instruct-q4f32_1-MLC\");\n    }).toThrow(\"Only specify stream_options when stream=True.\");\n  });\n\n  test(\"High-level unsupported fields\", () => 
{\n    expect(() => {\n      const request: CompletionCreateParams = {\n        prompt: \"Hello, \",\n        suffix: \"this is suffix\", // this raises error\n      };\n      postInitAndCheckFields(request, \"Llama-3.1-8B-Instruct-q4f32_1-MLC\");\n    }).toThrow(\n      \"The following fields in CompletionCreateParams are not yet supported\",\n    );\n\n    expect(() => {\n      const request: CompletionCreateParams = {\n        prompt: \"Hello, \",\n        best_of: 3, // this raises error\n      };\n      postInitAndCheckFields(request, \"Llama-3.1-8B-Instruct-q4f32_1-MLC\");\n    }).toThrow(\n      \"The following fields in CompletionCreateParams are not yet supported\",\n    );\n\n    expect(() => {\n      const request: CompletionCreateParams = {\n        prompt: \"Hello, \",\n        user: \"Bob\", // this raises error\n      };\n      postInitAndCheckFields(request, \"Llama-3.1-8B-Instruct-q4f32_1-MLC\");\n    }).toThrow(\n      \"The following fields in CompletionCreateParams are not yet supported\",\n    );\n  });\n\n  test(\"When streaming `n` needs to be 1\", () => {\n    expect(() => {\n      const request: CompletionCreateParams = {\n        stream: true,\n        n: 2,\n        prompt: \"Hello, \",\n      };\n      postInitAndCheckFields(request, \"Llama-3.1-8B-Instruct-q4f32_1-MLC\");\n    }).toThrow(\"When streaming, `n` cannot be > 1.\");\n  });\n\n  test(\"Non-integer seed\", () => {\n    expect(() => {\n      const request: CompletionCreateParams = {\n        prompt: \"Hello, \",\n        max_tokens: 10,\n        seed: 42.2, // Note that Number.isInteger(42.0) is true\n      };\n      postInitAndCheckFields(request, \"Llama-3.1-8B-Instruct-q4f32_1-MLC\");\n    }).toThrow(\"`seed` should be an integer, but got\");\n  });\n});\n"
  },
  {
    "path": "tests/openai_embeddings.test.ts",
    "content": "import {\n  EmbeddingInputEmptyError,\n  EmbeddingUnsupportedEncodingFormatError,\n} from \"../src/error\";\nimport {\n  EmbeddingCreateParams,\n  postInitAndCheckFields,\n} from \"../src/openai_api_protocols/embedding\";\nimport { describe, expect, test } from \"@jest/globals\";\n\ndescribe(\"Check embeddings supported requests\", () => {\n  test(\"Supported embedding request float\", () => {\n    const request: EmbeddingCreateParams = {\n      input: [\"Hello\", \"Hi\"],\n      encoding_format: \"float\",\n    };\n    postInitAndCheckFields(request, \"snowflake-arctic-embed-m-q0f32-MLC\");\n  });\n\n  test(\"Supported embedding request, unspecified format\", () => {\n    const request: EmbeddingCreateParams = {\n      input: [\"Hello\", \"Hi\"],\n    };\n    postInitAndCheckFields(request, \"snowflake-arctic-embed-m-q0f32-MLC\");\n  });\n\n  test(\"Supported embedding request, single string\", () => {\n    const request: EmbeddingCreateParams = {\n      input: \"Hello\",\n    };\n    postInitAndCheckFields(request, \"snowflake-arctic-embed-m-q0f32-MLC\");\n  });\n\n  test(\"Supported embedding request, single token array\", () => {\n    const request: EmbeddingCreateParams = {\n      input: [0, 1],\n    };\n    postInitAndCheckFields(request, \"snowflake-arctic-embed-m-q0f32-MLC\");\n  });\n\n  test(\"Supported embedding request, array of token arrays\", () => {\n    const request: EmbeddingCreateParams = {\n      input: [\n        [0, 1],\n        [0, 1],\n      ],\n    };\n    postInitAndCheckFields(request, \"snowflake-arctic-embed-m-q0f32-MLC\");\n  });\n});\n\ndescribe(\"Invalid embedding input\", () => {\n  test(\"Empty string\", () => {\n    expect(() => {\n      const request: EmbeddingCreateParams = {\n        input: \"\",\n      };\n      postInitAndCheckFields(request, \"snowflake-arctic-embed-m-q0f32-MLC\");\n    }).toThrow(new EmbeddingInputEmptyError());\n  });\n\n  test(\"Contains empty string\", () => {\n    expect(() => {\n      
const request: EmbeddingCreateParams = {\n        input: [\"Hi\", \"hello\", \"\"],\n      };\n      postInitAndCheckFields(request, \"snowflake-arctic-embed-m-q0f32-MLC\");\n    }).toThrow(new EmbeddingInputEmptyError());\n  });\n\n  test(\"Empty token array\", () => {\n    expect(() => {\n      const request: EmbeddingCreateParams = {\n        input: [],\n      };\n      postInitAndCheckFields(request, \"snowflake-arctic-embed-m-q0f32-MLC\");\n    }).toThrow(new EmbeddingInputEmptyError());\n  });\n\n  test(\"Contains empty token array\", () => {\n    expect(() => {\n      const request: EmbeddingCreateParams = {\n        input: [[1, 2], [3], [], [4]],\n      };\n      postInitAndCheckFields(request, \"snowflake-arctic-embed-m-q0f32-MLC\");\n    }).toThrow(new EmbeddingInputEmptyError());\n  });\n});\n\ndescribe(\"Check embeddings unsupported requests\", () => {\n  test(\"base64 encoding_format\", () => {\n    expect(() => {\n      const request: EmbeddingCreateParams = {\n        input: [\"Hello\", \"Hi\"],\n        encoding_format: \"base64\",\n      };\n      postInitAndCheckFields(request, \"snowflake-arctic-embed-m-q0f32-MLC\");\n    }).toThrow(new EmbeddingUnsupportedEncodingFormatError());\n  });\n\n  test(\"user\", () => {\n    expect(() => {\n      const request: EmbeddingCreateParams = {\n        input: [\"Hello\", \"Hi\"],\n        encoding_format: \"float\",\n        user: \"Bob\",\n      };\n      postInitAndCheckFields(request, \"snowflake-arctic-embed-m-q0f32-MLC\");\n    }).toThrow(\"The following fields in\");\n  });\n\n  test(\"dimensions\", () => {\n    expect(() => {\n      const request: EmbeddingCreateParams = {\n        input: [\"Hello\", \"Hi\"],\n        encoding_format: \"float\",\n        dimensions: 2048,\n      };\n      postInitAndCheckFields(request, \"snowflake-arctic-embed-m-q0f32-MLC\");\n    }).toThrow(\"The following fields in\");\n  });\n});\n"
  },
  {
    "path": "tests/scripts/sanity_checks/README.md",
    "content": "# Sanity Checks for Generated Output\n\nThis folder provides simple sanity checks on the output generated\nusing WebLLM. To try it out, you can do the following steps under this folder\n\n```bash\nnpm install\nnpm start\n```\n\nNote if you would like to hack WebLLM core package.\nYou can change web-llm dependencies as `\"file:../..\"`, and follow the build from source\ninstruction in the project to build webllm locally. This option is only recommended\nif you would like to hack WebLLM core package.\n"
  },
  {
    "path": "tests/scripts/sanity_checks/package.json",
    "content": "{\n  \"name\": \"sanity_checks\",\n  \"version\": \"0.1.0\",\n  \"private\": true,\n  \"scripts\": {\n    \"start\": \"parcel sanity_checks.html --port 8889\",\n    \"build\": \"parcel build sanity_checks.html --dist-dir lib\"\n  },\n  \"devDependencies\": {\n    \"buffer\": \"^5.7.1\",\n    \"parcel\": \"^2.8.3\",\n    \"process\": \"^0.11.10\",\n    \"tslib\": \"^2.3.1\",\n    \"typescript\": \"^4.9.5\",\n    \"url\": \"^0.11.3\"\n  },\n  \"dependencies\": {\n    \"@mlc-ai/web-llm\": \"^0.2.82\"\n  }\n}\n"
  },
  {
    "path": "tests/scripts/sanity_checks/sanity_checks.html",
    "content": "<!doctype html>\n<html lang=\"en\">\n  <head>\n    <meta charset=\"UTF-8\" />\n    <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\" />\n    <title>GPU sampleTokenFromLogits Tests</title>\n    <style>\n      body {\n        font-family: Arial, sans-serif;\n        margin: 2em;\n      }\n      .label {\n        margin: 0.5em 0;\n        font-weight: bold;\n      }\n      .result {\n        margin: 0.5em 0 1.5em 0;\n        padding: 0.5em;\n        background: #f4f4f4;\n        border-radius: 4px;\n      }\n      button {\n        padding: 0.5em 1em;\n        font-size: 1em;\n      }\n    </style>\n  </head>\n  <body>\n    <h1>GPU sampleTokenFromLogits Tests</h1>\n    <button id=\"run-tests\">Re-run All Tests</button>\n    <div class=\"label\">Overall:</div>\n    <div id=\"gpu-test-label\" class=\"result\">Not started.</div>\n    <div class=\"label\">Logit Processor:</div>\n    <div id=\"logit-processor-label\" class=\"result\"></div>\n    <div class=\"label\">Logit Bias:</div>\n    <div id=\"logit-bias-label\" class=\"result\"></div>\n    <div class=\"label\">Penalties:</div>\n    <div id=\"penalty-label\" class=\"result\"></div>\n    <div class=\"label\">Logprobs:</div>\n    <div id=\"logprobs-label\" class=\"result\"></div>\n    <script type=\"module\">\n      import \"./sanity_checks.ts\";\n      document.getElementById(\"run-tests\").onclick = () => {\n        // Reload the module to rerun tests\n        window.location.reload();\n      };\n    </script>\n  </body>\n</html>\n"
  },
  {
    "path": "tests/scripts/sanity_checks/sanity_checks.ts",
    "content": "import * as webllm from \"@mlc-ai/web-llm\";\n\nfunction setLabel(id: string, text: string) {\n  const label = document.getElementById(id);\n  if (label == null) return;\n  label.innerText = text;\n}\n\nasync function createEngine(\n  modelId: string,\n  appConfig: webllm.AppConfig,\n  logitProcessorRegistry?: Map<string, webllm.LogitProcessor>,\n) {\n  return await webllm.CreateMLCEngine(modelId, {\n    appConfig,\n    logLevel: \"ERROR\",\n    logitProcessorRegistry,\n  });\n}\n\nasync function deleteModel(modelId: string, appConfig: webllm.AppConfig) {\n  await webllm.deleteModelAllInfoInCache(modelId, appConfig);\n}\n\nasync function testLogitProcessor(\n  modelId: string,\n  appConfig: webllm.AppConfig,\n) {\n  // Set up a logit processor that sets logits[0] = 100.0, rest -100.0\n  const logitProcessor = {\n    processLogits: (logits: Float32Array) => {\n      logits.fill(-100.0);\n      logits[0] = 100.0;\n      return logits;\n    },\n    processSampledToken: () => {},\n    resetState: () => {},\n  };\n  const logitProcessorRegistry: Map<string, webllm.LogitProcessor> = new Map();\n  logitProcessorRegistry.set(modelId, logitProcessor);\n  const engine: webllm.MLCEngineInterface = await createEngine(\n    modelId,\n    appConfig,\n    logitProcessorRegistry,\n  );\n\n  const prompt = \"Test logit processor.\";\n  const reply: webllm.ChatCompletion = await engine.chat.completions.create({\n    messages: [{ role: \"user\", content: prompt }],\n    temperature: 1.0,\n    max_tokens: 20,\n    logprobs: true,\n    top_logprobs: 1,\n  });\n  const logprobs = reply.choices[0]?.logprobs;\n  const logprobsAllZero = !!(\n    logprobs &&\n    Array.isArray(logprobs.content) &&\n    logprobs.content.every(\n      (lp: webllm.ChatCompletionTokenLogprob) =>\n        lp.top_logprobs[0].logprob === 0,\n    )\n  );\n\n  console.log(`[LogitProcessor] Logprobs all zero: ${logprobsAllZero}`);\n  setLabel(\"logit-processor-label\", `Logprobs all zero: 
${logprobsAllZero}`);\n  await deleteModel(modelId, appConfig);\n  return logprobsAllZero;\n}\n\nasync function testLogitBias(modelId: string, appConfig: webllm.AppConfig) {\n  // Set logit_bias to strongly favor token 0\n  const prompt = \"Test logit bias.\";\n  const engine: webllm.MLCEngineInterface = await createEngine(\n    modelId,\n    appConfig,\n  );\n  const reply = await engine.chat.completions.create({\n    messages: [{ role: \"user\", content: prompt }],\n    temperature: 1.0,\n    max_tokens: 20,\n    logprobs: true,\n    top_logprobs: 1,\n    logit_bias: { \"0\": 100.0 },\n  });\n  const logprobs = reply.choices[0]?.logprobs;\n  const logprobsAllZero = !!(\n    logprobs &&\n    Array.isArray(logprobs.content) &&\n    logprobs.content.every(\n      (lp: webllm.ChatCompletionTokenLogprob) =>\n        lp.top_logprobs[0].logprob === 0,\n    )\n  );\n\n  console.log(`[LogitBias] Logprobs all zero: ${logprobsAllZero}`);\n  setLabel(\"logit-bias-label\", `Logprobs all zero: ${logprobsAllZero}`);\n  await deleteModel(modelId, appConfig);\n  return logprobsAllZero;\n}\n\nasync function testPenalties(modelId: string, appConfig: webllm.AppConfig) {\n  const prompt = \"Test presence and frequency penalties.\";\n  const engine: webllm.MLCEngineInterface = await createEngine(\n    modelId,\n    appConfig,\n  );\n  const reply = await engine.chat.completions.create({\n    messages: [{ role: \"user\", content: prompt }],\n    temperature: 1.0,\n    max_tokens: 256,\n    presence_penalty: 2.0,\n    frequency_penalty: 2.0,\n    logit_bias: { \"0\": 100.0 },\n    logprobs: true,\n  });\n  const logprobs = reply.choices[0]?.logprobs;\n  const logprobsNotAllZero = !logprobs?.content?.every(\n    (lp: webllm.ChatCompletionTokenLogprob) => lp.logprob === 0,\n  );\n  console.log(`[Penalties] Logprobs not all zero: ${logprobsNotAllZero}`);\n  setLabel(\"penalty-label\", `Logprobs not all zero: ${logprobsNotAllZero}`);\n  await deleteModel(modelId, appConfig);\n  return 
logprobsNotAllZero;\n}\n\nasync function testLogprobs(modelId: string, appConfig: webllm.AppConfig) {\n  // Test logprobs: check that logprobs are returned and sum to ~1 after exp\n  const prompt = \"Test logprobs.\";\n  const engine: webllm.MLCEngineInterface = await createEngine(\n    modelId,\n    appConfig,\n  );\n  const reply = await engine.chat.completions.create({\n    messages: [{ role: \"user\", content: prompt }],\n    temperature: 1.0,\n    max_tokens: 20,\n    logprobs: true,\n    top_logprobs: 5,\n  });\n  const logprobs = reply.choices[0]?.logprobs;\n\n  let logprobsAllCloseTo1 = true;\n  for (const lp of logprobs?.content || []) {\n    const expSum = lp.top_logprobs?.reduce(\n      (acc: number, val: webllm.TopLogprob) => acc + Math.exp(val.logprob),\n      0,\n    );\n    logprobsAllCloseTo1 &&= Math.abs(expSum - 1.0) < 0.1;\n  }\n  console.log(`[Logprobs] Logprobs all close to 1: ${logprobsAllCloseTo1}`);\n  setLabel(\"logprobs-label\", `Logprobs all close to 1: ${logprobsAllCloseTo1}`);\n  await deleteModel(modelId, appConfig);\n  return logprobsAllCloseTo1;\n}\n\nasync function main() {\n  const modelId = \"Qwen3-0.6B-q0f32-MLC\";\n  const appConfig = webllm.prebuiltAppConfig;\n  appConfig.useIndexedDBCache = true;\n  setLabel(\"gpu-test-label\", \"Running tests...\");\n  let passed = 0,\n    total = 0;\n\n  if (await testLogitProcessor(modelId, appConfig)) passed++;\n  total++;\n  if (await testLogitBias(modelId, appConfig)) passed++;\n  total++;\n  if (await testPenalties(modelId, appConfig)) passed++;\n  total++;\n  if (await testLogprobs(modelId, appConfig)) passed++;\n  total++;\n\n  setLabel(\n    \"gpu-test-label\",\n    `GPU sampleTokenFromLogits tests: ${passed}/${total} passed.`,\n  );\n  setLabel(\n    \"gpu-test-label\",\n    `Tests complete. Model deleted. ${passed}/${total} passed.`,\n  );\n}\n\nmain();\n"
  },
  {
    "path": "tests/service_worker.test.ts",
    "content": "import {\n  CreateServiceWorkerMLCEngine,\n  ServiceWorker,\n  ServiceWorkerMLCEngine,\n  ServiceWorkerMLCEngineHandler,\n} from \"../src/service_worker\";\nimport { jest, test, expect, afterEach } from \"@jest/globals\";\n\ntype ServiceWorkerHandlerEvent = Parameters<\n  ServiceWorkerMLCEngineHandler[\"onmessage\"]\n>[0];\n\njest.mock(\"@mlc-ai/web-runtime\", () => ({\n  detectGPUDevice: jest.fn(async () => ({\n    adapterInfo: { description: \"MockGPU\", vendor: \"MockVendor\" },\n  })),\n}));\n\nconst reloadMock = jest.fn();\nconst initCallback = jest.fn();\n\njest.mock(\"../src/engine\", () => {\n  return {\n    MLCEngine: jest.fn(() => ({\n      reload: reloadMock,\n      chatCompletion: jest.fn(),\n      completion: jest.fn(),\n      embedding: jest.fn(),\n      forwardTokensAndSample: jest.fn(),\n      getInitProgressCallback: jest.fn(() => initCallback),\n      setInitProgressCallback: jest.fn(),\n      setLogitProcessorRegistry: jest.fn(),\n      setAppConfig: jest.fn(),\n    })),\n  };\n});\n\nconst originalNavigator = (globalThis as any).navigator;\nconst originalSelf = (globalThis as any).self;\nconst originalPostMessage = (globalThis as any).postMessage;\n\nfunction setupWorkerScope() {\n  (globalThis as any).self = {\n    addEventListener: jest.fn(),\n  };\n  (globalThis as any).postMessage = jest.fn();\n}\n\nfunction setupNavigator(options?: {\n  controller?: any;\n  readyRegistration?: Promise<ServiceWorkerRegistration>;\n}) {\n  const controller = options?.controller ?? 
{ postMessage: jest.fn() };\n  const readyRegistration =\n    options?.readyRegistration ??\n    Promise.resolve({\n      active: controller as ServiceWorker,\n    } as unknown as ServiceWorkerRegistration);\n  const container: any = {\n    controller,\n    ready: readyRegistration,\n    onmessage: undefined as any,\n  };\n  (globalThis as any).navigator = {\n    serviceWorker: container,\n  };\n  return { container, controller };\n}\n\nfunction createHandler() {\n  setupWorkerScope();\n  const handler = new ServiceWorkerMLCEngineHandler();\n  const handleTaskMock = jest.fn(async (_uuid: string, task: any) => task());\n  (handler as any).handleTask = handleTaskMock;\n  (handler as any).engine = {\n    reload: reloadMock,\n    getInitProgressCallback: jest.fn(() => initCallback),\n  };\n  reloadMock.mockClear();\n  initCallback.mockClear();\n  return { handler, handleTaskMock };\n}\n\nafterEach(() => {\n  reloadMock.mockClear();\n  initCallback.mockClear();\n  if (originalNavigator === undefined) {\n    delete (globalThis as any).navigator;\n  } else {\n    (globalThis as any).navigator = originalNavigator;\n  }\n  if (originalSelf === undefined) {\n    delete (globalThis as any).self;\n  } else {\n    (globalThis as any).self = originalSelf;\n  }\n  if (originalPostMessage === undefined) {\n    delete (globalThis as any).postMessage;\n  } else {\n    (globalThis as any).postMessage = originalPostMessage;\n  }\n  jest.useRealTimers();\n  jest.clearAllTimers();\n});\n\ntest(\"ServiceWorker handler responds to keepAlive message\", () => {\n  const { handler } = createHandler();\n  const client = { postMessage: jest.fn() };\n  (handler as any).clientRegistry = new Map([[\"keep\", client]]);\n  const onComplete = jest.fn();\n  handler.onmessage(\n    { data: { kind: \"keepAlive\", uuid: \"keep\" } } as ServiceWorkerHandlerEvent,\n    onComplete,\n  );\n  expect(client.postMessage).toHaveBeenCalledWith({\n    kind: \"heartbeat\",\n    uuid: \"keep\",\n  });\n  
expect(onComplete).toHaveBeenCalledWith({ kind: \"heartbeat\", uuid: \"keep\" });\n});\n\ntest(\"reload with the same model skips engine reload\", async () => {\n  const { handler, handleTaskMock } = createHandler();\n  handler.modelId = [\"demo\"];\n  handler.chatOpts = [];\n  handler.onmessage({\n    data: {\n      kind: \"reload\",\n      uuid: \"reload-same\",\n      content: { modelId: [\"demo\"], chatOpts: [] },\n    },\n  } as ServiceWorkerHandlerEvent);\n  await handleTaskMock.mock.results[0].value;\n  expect(reloadMock).not.toHaveBeenCalled();\n  expect(initCallback).toHaveBeenCalledWith(\n    expect.objectContaining({ progress: 1 }),\n  );\n});\n\ntest(\"reload with new parameters calls engine reload\", async () => {\n  const { handler, handleTaskMock } = createHandler();\n  handler.modelId = [\"demo\"];\n  handler.chatOpts = [];\n  handler.onmessage({\n    data: {\n      kind: \"reload\",\n      uuid: \"reload-new\",\n      content: { modelId: [\"fresh\"], chatOpts: [] },\n    },\n  } as ServiceWorkerHandlerEvent);\n  await handleTaskMock.mock.results[0].value;\n  expect(reloadMock).toHaveBeenCalledWith([\"fresh\"], []);\n});\n\ntest(\"ServiceWorker client forwards onmessage handlers to navigator\", () => {\n  const { container } = setupNavigator();\n  const client = new ServiceWorker();\n  const handler = jest.fn();\n  client.onmessage = handler;\n  expect(container.onmessage).toBe(handler);\n});\n\ntest(\"ServiceWorker postMessage routes through controller\", () => {\n  const { controller } = setupNavigator();\n  const client = new ServiceWorker();\n  const message = { kind: \"reload\", uuid: \"client-msg\" } as any;\n  client.postMessage(message);\n  expect(controller.postMessage).toHaveBeenCalledWith(message);\n});\n\ntest(\"ServiceWorker postMessage throws if controller missing\", () => {\n  const { container } = setupNavigator(undefined);\n  container.controller = undefined as any;\n  const client = new ServiceWorker();\n  expect(() =>\n    
client.postMessage({ kind: \"reload\", uuid: \"client-msg\" } as any),\n  ).toThrow(\"There is no active service worker\");\n});\n\ntest(\"ServiceWorkerMLCEngine heartbeats reset missed counter\", () => {\n  jest.useFakeTimers();\n  const { container } = setupNavigator();\n  const engine = new ServiceWorkerMLCEngine(undefined, 200);\n  expect(engine.missedHeartbeat).toBe(0);\n  jest.advanceTimersByTime(200);\n  expect((container.controller as any).postMessage).toHaveBeenCalledWith(\n    expect.objectContaining({ kind: \"keepAlive\" }),\n  );\n  expect(engine.missedHeartbeat).toBe(1);\n  container.onmessage?.({ data: { kind: \"heartbeat\" } } as MessageEvent<any>);\n  expect(engine.missedHeartbeat).toBe(0);\n});\n\ntest(\"CreateServiceWorkerMLCEngine waits for ready registration and reloads\", async () => {\n  jest.useFakeTimers();\n  setupNavigator();\n  const reloadSpy = jest\n    .spyOn(ServiceWorkerMLCEngine.prototype, \"reload\")\n    .mockResolvedValue(undefined);\n  const engine = await CreateServiceWorkerMLCEngine([\"m1\", \"m2\"]);\n  expect(reloadSpy).toHaveBeenCalledWith([\"m1\", \"m2\"], undefined);\n  expect(engine).toBeInstanceOf(ServiceWorkerMLCEngine);\n  reloadSpy.mockRestore();\n});\n"
  },
  {
    "path": "tests/util.test.ts",
    "content": "import { ChatOptions } from \"../src/config\";\nimport {\n  ModelNotLoadedError,\n  SpecifiedModelNotFoundError,\n  UnclearModelToUseError,\n} from \"../src/error\";\nimport {\n  cleanModelUrl,\n  CustomLock,\n  getModelIdToUse,\n  getChunkedPrefillInputData,\n  getTopProbs,\n} from \"../src/support\";\nimport { areChatOptionsListEqual } from \"../src/utils\";\nimport { MLCEngine } from \"../src/engine\";\nimport { ChatCompletionContentPartImage } from \"../src/openai_api_protocols\";\nimport { test, expect, describe } from \"@jest/globals\";\n\ndescribe(\"Check getTopLogprobs correctness\", () => {\n  test(\"Correctness test 1\", () => {\n    const logitsOnCPUArray = new Float32Array([\n      0.05, 0.15, 0.3, 0.16, 0.04, 0.2, 0.1,\n    ]);\n    const actual = getTopProbs(3, logitsOnCPUArray);\n    const expected: Array<[number, number]> = [\n      [2, 0.3],\n      [5, 0.2],\n      [3, 0.16],\n    ];\n    expect(actual.length).toBe(expected.length);\n    for (let i = 0; i < actual.length; i++) {\n      expect(actual[i][0]).toBe(expected[i][0]);\n      expect(actual[i][1]).toBeCloseTo(expected[i][1], 4);\n    }\n  });\n\n  test(\"Zero top_logprobs\", () => {\n    const logitsOnCPUArray = new Float32Array([\n      0.05, 0.15, 0.3, 0.16, 0.04, 0.2, 0.1,\n    ]);\n    const topLogProbs = getTopProbs(0, logitsOnCPUArray);\n    expect(topLogProbs).toEqual([]);\n  });\n});\n\ndescribe(\"Test clean model URL\", () => {\n  test(\"Input does not have branch or trailing /\", () => {\n    const input = \"https://huggingface.co/mlc-ai/model\";\n    const output = cleanModelUrl(input);\n    const expected = \"https://huggingface.co/mlc-ai/model/resolve/main/\";\n    expect(output).toEqual(expected);\n  });\n\n  test(\"Input does not have branch but has trailing /\", () => {\n    const input = \"https://huggingface.co/mlc-ai/model/\";\n    const output = cleanModelUrl(input);\n    const expected = \"https://huggingface.co/mlc-ai/model/resolve/main/\";\n    
expect(output).toEqual(expected);\n  });\n\n  test(\"Input has branch but does not have trailing /\", () => {\n    const input = \"https://huggingface.co/mlc-ai/model/resolve/main\";\n    const output = cleanModelUrl(input);\n    const expected = \"https://huggingface.co/mlc-ai/model/resolve/main/\";\n    expect(output).toEqual(expected);\n  });\n\n  test(\"Input has branch and trailing /\", () => {\n    const input = \"https://huggingface.co/mlc-ai/model/resolve/main/\";\n    const output = cleanModelUrl(input);\n    const expected = \"https://huggingface.co/mlc-ai/model/resolve/main/\";\n    expect(output).toEqual(expected);\n  });\n});\n\ndescribe(\"Test getModelIdToUse\", () => {\n  test(\"Specified model not found\", () => {\n    const loadedModelIds = [\"a\", \"b\", \"c\"];\n    const requestModel = \"d\";\n    const requestName = \"ChatCompletionRequest\";\n    expect(() => {\n      getModelIdToUse(loadedModelIds, requestModel, requestName);\n    }).toThrow(\n      new SpecifiedModelNotFoundError(\n        loadedModelIds,\n        requestModel,\n        requestName,\n      ),\n    );\n  });\n\n  test(\"No model loaded\", () => {\n    const loadedModelIds: string[] = [];\n    const requestModel = \"d\";\n    const requestName = \"ChatCompletionRequest\";\n    expect(() => {\n      getModelIdToUse(loadedModelIds, requestModel, requestName);\n    }).toThrow(new ModelNotLoadedError(requestName));\n  });\n\n  test(\"Unclear what model to use, undefined\", () => {\n    const loadedModelIds = [\"a\", \"b\", \"c\"];\n    const requestModel = undefined;\n    const requestName = \"ChatCompletionRequest\";\n    expect(() => {\n      getModelIdToUse(loadedModelIds, requestModel, requestName);\n    }).toThrow(new UnclearModelToUseError(loadedModelIds, requestName));\n  });\n\n  test(\"Unclear what model to use, null\", () => {\n    const loadedModelIds = [\"a\", \"b\", \"c\"];\n    const requestModel = null;\n    const requestName = \"ChatCompletionRequest\";\n    
expect(() => {\n      getModelIdToUse(loadedModelIds, requestModel, requestName);\n    }).toThrow(new UnclearModelToUseError(loadedModelIds, requestName));\n  });\n\n  test(\"Valid config, unspecified request model\", () => {\n    const loadedModelIds = [\"a\"];\n    const requestModel = null;\n    const requestName = \"ChatCompletionRequest\";\n    const selectedModelId = getModelIdToUse(\n      loadedModelIds,\n      requestModel,\n      requestName,\n    );\n    expect(selectedModelId).toEqual(\"a\");\n  });\n\n  test(\"Valid config, specified request model\", () => {\n    const loadedModelIds = [\"a\"];\n    const requestModel = \"a\";\n    const requestName = \"ChatCompletionRequest\";\n    const selectedModelId = getModelIdToUse(\n      loadedModelIds,\n      requestModel,\n      requestName,\n    );\n    expect(selectedModelId).toEqual(\"a\");\n  });\n\n  test(\"Valid config, specified request model, multi models loaded\", () => {\n    const loadedModelIds = [\"a\", \"b\", \"c\"];\n    const requestModel = \"c\";\n    const requestName = \"ChatCompletionRequest\";\n    const selectedModelId = getModelIdToUse(\n      loadedModelIds,\n      requestModel,\n      requestName,\n    );\n    expect(selectedModelId).toEqual(\"c\");\n  });\n\n  // Cannot test MLCEngine.getLLMStates E2E because `instanceof LLMChatPipeline` would not pass\n  // with dummy pipeline variables\n  test(\"E2E test with MLCEngine not loading a model for APIs\", () => {\n    const engine = new MLCEngine();\n    expect(async () => {\n      await engine.chatCompletion({\n        messages: [{ role: \"user\", content: \"hi\" }],\n      });\n    }).rejects.toThrow(new ModelNotLoadedError(\"ChatCompletionRequest\"));\n    expect(async () => {\n      await engine.getMessage();\n    }).rejects.toThrow(new ModelNotLoadedError(\"getMessage\"));\n\n    // resetChat should not throw error because it is allowed to resetChat before pipeline\n    // established, as a no-op\n    expect(async () => {\n      
await engine.resetChat();\n    }).not.toThrow(new ModelNotLoadedError(\"resetChat\"));\n  });\n\n  test(\"E2E test with MLCEngine with two models without specifying a model\", () => {\n    const engine = new MLCEngine() as any;\n    engine.loadedModelIdToPipeline = new Map<string, any>();\n    engine.loadedModelIdToPipeline.set(\"model1\", \"dummyLLMChatPipeline\");\n    engine.loadedModelIdToPipeline.set(\"model2\", \"dummyLLMChatPipeline\");\n    const loadedModelIds = [\"model1\", \"model2\"];\n\n    expect(async () => {\n      await engine.chatCompletion({\n        messages: [{ role: \"user\", content: \"hi\" }],\n      });\n    }).rejects.toThrow(\n      new UnclearModelToUseError(loadedModelIds, \"ChatCompletionRequest\"),\n    );\n    expect(async () => {\n      await engine.getMessage();\n    }).rejects.toThrow(\n      new UnclearModelToUseError(loadedModelIds, \"getMessage\"),\n    );\n    expect(async () => {\n      await engine.resetChat();\n    }).rejects.toThrow(new UnclearModelToUseError(loadedModelIds, \"resetChat\"));\n  });\n\n  test(\"E2E test with MLCEngine with two models specifying wrong model\", () => {\n    const engine = new MLCEngine() as any;\n    engine.loadedModelIdToPipeline = new Map<string, any>();\n    engine.loadedModelIdToPipeline.set(\"model1\", \"dummyLLMChatPipeline\");\n    engine.loadedModelIdToPipeline.set(\"model2\", \"dummyLLMChatPipeline\");\n    const loadedModelIds = [\"model1\", \"model2\"];\n    const requestedModelId = \"model3\";\n\n    expect(async () => {\n      await engine.chatCompletion({\n        messages: [{ role: \"user\", content: \"hi\" }],\n        model: requestedModelId,\n      });\n    }).rejects.toThrow(\n      new SpecifiedModelNotFoundError(\n        loadedModelIds,\n        requestedModelId,\n        \"ChatCompletionRequest\",\n      ),\n    );\n    expect(async () => {\n      await engine.getMessage(requestedModelId);\n    }).rejects.toThrow(\n      new SpecifiedModelNotFoundError(\n        
loadedModelIds,\n        requestedModelId,\n        \"getMessage\",\n      ),\n    );\n    expect(async () => {\n      await engine.runtimeStatsText(requestedModelId);\n    }).rejects.toThrow(\n      new SpecifiedModelNotFoundError(\n        loadedModelIds,\n        requestedModelId,\n        \"runtimeStatsText\",\n      ),\n    );\n\n    // resetChat should not throw error because it is allowed to resetChat before pipeline\n    // established, as a no-op\n    expect(async () => {\n      await engine.resetChat(false, requestedModelId);\n    }).not.toThrow(\n      new SpecifiedModelNotFoundError(\n        loadedModelIds,\n        requestedModelId,\n        \"resetChat\",\n      ),\n    );\n  });\n});\n\ndescribe(\"Test areChatOptionsListEqual\", () => {\n  const dummyChatOpts1: ChatOptions = { tokenizer_files: [\"a\", \"b\"] };\n  const dummyChatOpts2: ChatOptions = {};\n  const dummyChatOpts3: ChatOptions = { tokenizer_files: [\"a\", \"b\"] };\n  const dummyChatOpts4: ChatOptions = {\n    tokenizer_files: [\"a\", \"b\"],\n    top_p: 0.5,\n  };\n\n  test(\"Two undefined\", () => {\n    const options1: ChatOptions[] | undefined = undefined;\n    const options2: ChatOptions[] | undefined = undefined;\n    expect(areChatOptionsListEqual(options1, options2)).toEqual(true);\n  });\n\n  test(\"One undefined\", () => {\n    const options1: ChatOptions[] | undefined = [dummyChatOpts1];\n    const options2: ChatOptions[] | undefined = undefined;\n    expect(areChatOptionsListEqual(options1, options2)).toEqual(false);\n  });\n\n  test(\"Both defined, not equal\", () => {\n    const options1: ChatOptions[] | undefined = [dummyChatOpts1];\n    const options2: ChatOptions[] | undefined = [dummyChatOpts2];\n    expect(areChatOptionsListEqual(options1, options2)).toEqual(false);\n  });\n\n  test(\"Different size\", () => {\n    const options1: ChatOptions[] | undefined = [\n      dummyChatOpts1,\n      dummyChatOpts3,\n    ];\n    const options2: ChatOptions[] | undefined = 
[dummyChatOpts2];\n    expect(areChatOptionsListEqual(options1, options2)).toEqual(false);\n  });\n\n  test(\"Same size, not equal 1\", () => {\n    const options1: ChatOptions[] | undefined = [\n      dummyChatOpts1,\n      dummyChatOpts3,\n    ];\n    const options2: ChatOptions[] | undefined = [\n      dummyChatOpts1,\n      dummyChatOpts2,\n    ];\n    expect(areChatOptionsListEqual(options1, options2)).toEqual(false);\n  });\n\n  test(\"Same size, not equal 2\", () => {\n    const options1: ChatOptions[] | undefined = [\n      dummyChatOpts1,\n      dummyChatOpts3,\n    ];\n    const options2: ChatOptions[] | undefined = [\n      dummyChatOpts1,\n      dummyChatOpts4,\n    ];\n    expect(areChatOptionsListEqual(options1, options2)).toEqual(false);\n  });\n\n  test(\"Same size, equal\", () => {\n    const options1: ChatOptions[] | undefined = [\n      dummyChatOpts1,\n      dummyChatOpts3,\n    ];\n    const options2: ChatOptions[] | undefined = [\n      dummyChatOpts3,\n      dummyChatOpts1,\n    ];\n    expect(areChatOptionsListEqual(options1, options2)).toEqual(true);\n  });\n});\n\ndescribe(\"Test getChunkedPrefillInputData\", () => {\n  const rangeArr = (start: number, end: number) =>\n    Array.from({ length: end - start }, (v, k) => k + start);\n  type ImageURL = ChatCompletionContentPartImage.ImageURL;\n  const prefillChunkSize = 2048;\n  const image1 = { url: \"url1\" } as ImageURL;\n  const image2 = { url: \"url2\" } as ImageURL;\n\n  test(\"With image data\", async () => {\n    const inputData = [\n      rangeArr(0, 200),\n      image1, // 1921 size\n      rangeArr(0, 10),\n    ];\n    const chunks = getChunkedPrefillInputData(inputData, prefillChunkSize);\n    const expectedChunks = [[rangeArr(0, 200)], [image1, rangeArr(0, 10)]];\n    const expectedChunkLens = [200, 1931];\n    expect(chunks).toEqual([expectedChunks, expectedChunkLens]);\n  });\n\n  test(\"Single image data\", async () => {\n    const inputData = [image1];\n    const chunks = 
getChunkedPrefillInputData(inputData, prefillChunkSize);\n    const expectedChunks = [[image1]];\n    const expectedChunkLens = [1921];\n    expect(chunks).toEqual([expectedChunks, expectedChunkLens]);\n  });\n\n  test(\"Two images\", async () => {\n    const inputData = [image1, image2];\n    const chunks = getChunkedPrefillInputData(inputData, prefillChunkSize);\n    const expectedChunks = [[image1], [image2]];\n    const expectedChunkLens = [1921, 1921];\n    expect(chunks).toEqual([expectedChunks, expectedChunkLens]);\n  });\n\n  test(\"Single token array that needs to be chunked\", async () => {\n    const inputData = [rangeArr(0, 4097)];\n    const chunks = getChunkedPrefillInputData(inputData, prefillChunkSize);\n    const expectedChunks = [\n      [rangeArr(0, 2048)],\n      [rangeArr(2048, 4096)],\n      [rangeArr(4096, 4097)],\n    ];\n    const expectedChunkLens = [2048, 2048, 1];\n    expect(chunks).toEqual([expectedChunks, expectedChunkLens]);\n  });\n\n  test(\"Single token array that does not need to be chunked\", async () => {\n    const inputData = [rangeArr(0, 2048)];\n    const chunks = getChunkedPrefillInputData(inputData, prefillChunkSize);\n    const expectedChunks = [[rangeArr(0, 2048)]];\n    const expectedChunkLens = [2048];\n    expect(chunks).toEqual([expectedChunks, expectedChunkLens]);\n  });\n\n  test(\"Token array that needs to be chunked, grouped with others\", async () => {\n    const inputData = [\n      image1, // 1921\n      rangeArr(0, 2300),\n      image2,\n    ];\n    const chunks = getChunkedPrefillInputData(inputData, prefillChunkSize);\n    const expectedChunks = [\n      [image1, rangeArr(0, 127)], // 127 = 2048 - 1921\n      [rangeArr(127, 2175)], // 2175 = 127 + 2048\n      [rangeArr(2175, 2300), image2],\n    ];\n    const expectedChunkLens = [2048, 2048, 2046];\n    expect(chunks).toEqual([expectedChunks, expectedChunkLens]);\n  });\n\n  test(\"Image followed by token that fits just well.\", async () => {\n    const 
inputData = [\n      image1, // 1921\n      rangeArr(0, 127),\n      image2,\n    ];\n    const chunks = getChunkedPrefillInputData(inputData, prefillChunkSize);\n    const expectedChunks = [[image1, rangeArr(0, 127)], [image2]];\n    const expectedChunkLens = [2048, 1921];\n    expect(chunks).toEqual([expectedChunks, expectedChunkLens]);\n  });\n});\n\n// Refers to https://jackpordi.com/posts/locks-in-js-because-why-not\ndescribe(\"Test CustomLock\", () => {\n  test(\"Ensure five +1's give 5 with sleep between read/write\", async () => {\n    let value = 0;\n    const lock = new CustomLock();\n\n    async function addOne() {\n      await lock.acquire();\n      const readValue = value;\n      await new Promise((r) => setTimeout(r, 100));\n      value = readValue + 1;\n      await lock.release();\n    }\n    await Promise.all([addOne(), addOne(), addOne(), addOne(), addOne()]);\n    expect(value).toEqual(5); // without a lock, most likely less than 5\n  });\n});\n"
  },
  {
    "path": "tests/web_worker_handler.test.ts",
    "content": "import { UnknownMessageKindError } from \"../src/error\";\nimport {\n  CreateWebWorkerMLCEngine,\n  WebWorkerMLCEngine,\n  WebWorkerMLCEngineHandler,\n} from \"../src/web_worker\";\nimport { jest, test, expect, beforeEach } from \"@jest/globals\";\n\nconst reloadMock = jest.fn<(...args: any[]) => Promise<void>>(\n  async () => undefined,\n);\nconst forwardMock = jest.fn<(...args: any[]) => Promise<any>>();\nconst chatCompletionMock = jest.fn<(...args: any[]) => Promise<any>>();\nconst completionMock = jest.fn<(...args: any[]) => Promise<any>>();\nconst embeddingMock = jest.fn<(...args: any[]) => Promise<any>>();\nconst setLogitRegistryMock = jest.fn<(...args: any[]) => void>();\nconst setAppConfigMock = jest.fn<(...args: any[]) => void>();\n\nconst mockEngineInstance: Record<string, any> = {\n  reload: reloadMock,\n  forwardTokensAndSample: forwardMock,\n  chatCompletion: chatCompletionMock,\n  completion: completionMock,\n  embedding: embeddingMock,\n  setInitProgressCallback: jest.fn((cb) => {\n    mockEngineInstance.__initCb = cb;\n  }),\n  setLogitProcessorRegistry: setLogitRegistryMock,\n  setAppConfig: setAppConfigMock,\n};\n\njest.mock(\"../src/engine\", () => {\n  return {\n    MLCEngine: jest.fn(() => mockEngineInstance),\n  };\n});\n\nbeforeEach(() => {\n  reloadMock.mockClear();\n  forwardMock.mockClear();\n  chatCompletionMock.mockClear();\n  completionMock.mockClear();\n  embeddingMock.mockClear();\n  setLogitRegistryMock.mockClear();\n  setAppConfigMock.mockClear();\n  mockEngineInstance.__initCb = undefined;\n  (globalThis as any).postMessage = jest.fn();\n});\n\nfunction flushMicrotasks() {\n  return new Promise<void>((resolve) => setTimeout(resolve, 0));\n}\n\ntest(\"constructor registers init progress callback and posts updates\", () => {\n  const handler = new WebWorkerMLCEngineHandler();\n  expect(mockEngineInstance.setInitProgressCallback).toHaveBeenCalled();\n  const report = { progress: 0.5 };\n  
mockEngineInstance.__initCb(report);\n  expect(globalThis.postMessage).toHaveBeenCalledWith({\n    kind: \"initProgressCallback\",\n    uuid: \"\",\n    content: report,\n  });\n  // suppress unused\n  expect(handler).toBeTruthy();\n});\n\ntest(\"chatCompletionNonStreaming reloads when worker state mismatches\", async () => {\n  const handler = new WebWorkerMLCEngineHandler();\n  chatCompletionMock.mockResolvedValueOnce({ object: \"chat.completion\" });\n  const message = {\n    kind: \"chatCompletionNonStreaming\",\n    uuid: \"task-1\",\n    content: {\n      modelId: [\"demo\"],\n      chatOpts: [],\n      request: { model: \"demo\", messages: [{ role: \"user\", content: \"hi\" }] },\n    },\n  };\n  const onComplete = jest.fn();\n  handler.onmessage(message, onComplete);\n  await flushMicrotasks();\n  expect(reloadMock).toHaveBeenCalledWith([\"demo\"], []);\n  expect(chatCompletionMock).toHaveBeenCalled();\n  expect(onComplete).toHaveBeenCalledWith({ object: \"chat.completion\" });\n  expect(globalThis.postMessage).toHaveBeenCalledWith({\n    kind: \"return\",\n    uuid: \"task-1\",\n    content: { object: \"chat.completion\" },\n  });\n});\n\ntest(\"chatCompletionStreamInit registers async generator\", async () => {\n  const handler = new WebWorkerMLCEngineHandler();\n  async function* generator() {\n    yield { object: \"chunk\" } as any;\n  }\n  chatCompletionMock.mockResolvedValueOnce(generator());\n  const message = {\n    kind: \"chatCompletionStreamInit\",\n    uuid: \"stream\",\n    content: {\n      modelId: [\"demo\"],\n      selectedModelId: \"demo\",\n      chatOpts: [],\n      request: {\n        model: \"demo\",\n        messages: [{ role: \"user\", content: \"go\" }],\n        stream: true,\n      },\n    },\n  };\n  handler.onmessage(message, jest.fn());\n  await flushMicrotasks();\n  expect(\n    (handler as any).loadedModelIdToAsyncGenerator.get(\"demo\"),\n  ).toBeDefined();\n});\n\ntest(\"completionNonStreaming routes to engine completion\", 
async () => {\n  const handler = new WebWorkerMLCEngineHandler();\n  completionMock.mockResolvedValueOnce({ object: \"text_completion\" });\n  const message = {\n    kind: \"completionNonStreaming\",\n    uuid: \"comp\",\n    content: {\n      modelId: [\"demo\"],\n      chatOpts: [],\n      request: { model: \"demo\", prompt: \"hi\" },\n    },\n  };\n  const onComplete = jest.fn();\n  handler.onmessage(message, onComplete);\n  await flushMicrotasks();\n  expect(completionMock).toHaveBeenCalled();\n  expect(onComplete).toHaveBeenCalledWith({ object: \"text_completion\" });\n});\n\ntest(\"embedding message reloads if needed and returns embeddings\", async () => {\n  const handler = new WebWorkerMLCEngineHandler();\n  embeddingMock.mockResolvedValueOnce({ object: \"list\", data: [] });\n  const message = {\n    kind: \"embedding\",\n    uuid: \"embed\",\n    content: {\n      modelId: [\"demo\"],\n      chatOpts: [],\n      request: { model: \"demo\", input: \"text\" },\n    },\n  };\n  const onComplete = jest.fn();\n  handler.onmessage(message, onComplete);\n  await flushMicrotasks();\n  expect(embeddingMock).toHaveBeenCalledWith({ model: \"demo\", input: \"text\" });\n  expect(onComplete).toHaveBeenCalledWith({ object: \"list\", data: [] });\n});\n\ntest(\"reloadIfUnmatched triggers reload when model lists differ\", async () => {\n  const handler = new WebWorkerMLCEngineHandler();\n  handler.modelId = [\"a\"];\n  await handler.reloadIfUnmatched([\"b\"]);\n  expect(reloadMock).toHaveBeenCalledWith([\"b\"], undefined);\n  reloadMock.mockClear();\n  handler.modelId = [\"same\"];\n  await handler.reloadIfUnmatched([\"same\"]);\n  expect(reloadMock).not.toHaveBeenCalled();\n});\n\ntest(\"unknown messages invoke onError and throw\", () => {\n  const handler = new WebWorkerMLCEngineHandler();\n  const onError = jest.fn();\n  expect(() =>\n    handler.onmessage({ kind: \"mystery\", content: {} }, undefined, onError),\n  ).toThrow(UnknownMessageKindError);\n  
expect(onError).toHaveBeenCalled();\n});\n\ntest(\"CreateWebWorkerMLCEngine instantiates client and reloads\", async () => {\n  const worker = { postMessage: jest.fn(), onmessage: undefined as any };\n  const reloadSpy = jest\n    .spyOn(WebWorkerMLCEngine.prototype, \"reload\")\n    .mockResolvedValue(undefined);\n  const engine = await CreateWebWorkerMLCEngine(worker, \"model@a\");\n  expect(reloadSpy).toHaveBeenCalledWith(\"model@a\", undefined);\n  expect(engine.worker).toBe(worker);\n  reloadSpy.mockRestore();\n});\n\nclass MockWorker {\n  public sent: any[] = [];\n  public onmessage?: (event: any) => void;\n  private responders: Map<string, (msg: any) => any> = new Map();\n\n  constructor() {\n    this.setResponder(\"completionNonStreaming\", () => ({\n      object: \"completion\",\n    }));\n    this.setResponder(\"embedding\", () => ({ object: \"list\", data: [] }));\n    this.setResponder(\"reload\", () => null);\n  }\n\n  setResponder(kind: string, responder: (msg: any) => any) {\n    this.responders.set(kind, responder);\n  }\n\n  postMessage = (msg: any) => {\n    this.sent.push(msg);\n    const responder = this.responders.get(msg.kind);\n    if (!responder) {\n      return;\n    }\n    setTimeout(async () => {\n      const content = await responder(msg);\n      this.onmessage?.({ kind: \"return\", uuid: msg.uuid, content });\n    }, 0);\n  };\n}\n\ntest(\"WebWorkerMLCEngine completion sends message to worker\", async () => {\n  const worker = new MockWorker();\n  const engine = new WebWorkerMLCEngine(worker as any);\n  await engine.reload(\"demo-model\");\n  const res = await engine.completion({\n    model: \"demo-model\",\n    prompt: \"hello\",\n  });\n  expect(res.object).toBe(\"completion\");\n  expect(worker.sent.some((msg) => msg.kind === \"completionNonStreaming\")).toBe(\n    true,\n  );\n});\n\ntest(\"WebWorkerMLCEngine embedding delegates to worker\", async () => {\n  const worker = new MockWorker();\n  const engine = new 
WebWorkerMLCEngine(worker as any);\n  await engine.reload(\"demo-model\");\n  const res = await engine.embedding({\n    model: \"demo-model\",\n    input: \"test\",\n  });\n  expect(res.object).toBe(\"list\");\n  expect(worker.sent.some((msg) => msg.kind === \"embedding\")).toBe(true);\n});\n\ntest(\"handleTask posts throw when task rejects\", async () => {\n  const handler = new WebWorkerMLCEngineHandler();\n  const postSpy = jest\n    .spyOn(handler as any, \"postMessage\")\n    .mockImplementation(() => undefined);\n  await handler.handleTask(\"fail\", async () => {\n    throw new Error(\"boom\");\n  });\n  expect(postSpy).toHaveBeenCalledWith(\n    expect.objectContaining({\n      kind: \"throw\",\n      uuid: \"fail\",\n    }),\n  );\n});\n\ntest(\"completionStreamNextChunk returns data from stored generator\", async () => {\n  const handler = new WebWorkerMLCEngineHandler();\n  const generator = (async function* () {\n    yield { object: \"chunk\" };\n  })();\n  (handler as any).loadedModelIdToAsyncGenerator.set(\"demo\", generator);\n  const onComplete = jest.fn();\n  handler.onmessage(\n    {\n      kind: \"completionStreamNextChunk\",\n      uuid: \"next\",\n      content: { selectedModelId: \"demo\" },\n    } as any,\n    onComplete,\n  );\n  await flushMicrotasks();\n  expect(onComplete).toHaveBeenCalledWith({ object: \"chunk\" });\n});\n\ntest(\"WebWorkerMLCEngine setAppConfig posts configuration message\", () => {\n  const worker = new MockWorker();\n  const engine = new WebWorkerMLCEngine(worker as any);\n  const config = { model_list: [] } as any;\n  engine.setAppConfig(config);\n  const message = worker.sent.find((msg) => msg.kind === \"setAppConfig\");\n  expect(message).toBeDefined();\n  expect(message?.content).toBe(config);\n});\n\ntest(\"WebWorkerMLCEngine setLogLevel forwards to worker\", () => {\n  const worker = new MockWorker();\n  const engine = new WebWorkerMLCEngine(worker as any);\n  engine.setLogLevel(\"info\" as any);\n  const message 
= worker.sent.find((msg) => msg.kind === \"setLogLevel\");\n  expect(message?.content).toBe(\"info\");\n});\n\ntest(\"WebWorkerMLCEngine info helpers resolve via worker messages\", async () => {\n  const worker = new MockWorker();\n  worker.setResponder(\"getMessage\", () => \"ready\");\n  worker.setResponder(\"runtimeStatsText\", () => \"stats\");\n  worker.setResponder(\"getGPUVendor\", () => \"MockVendor\");\n  worker.setResponder(\"getMaxStorageBufferBindingSize\", () => 2048);\n  worker.setResponder(\"interruptGenerate\", () => null);\n  const engine = new WebWorkerMLCEngine(worker as any);\n  await engine.reload(\"demo\");\n  await expect(engine.getMessage(\"demo\")).resolves.toBe(\"ready\");\n  await expect(engine.runtimeStatsText()).resolves.toBe(\"stats\");\n  await expect(engine.getGPUVendor()).resolves.toBe(\"MockVendor\");\n  await expect(engine.getMaxStorageBufferBindingSize()).resolves.toBe(2048);\n  engine.interruptGenerate();\n  await flushMicrotasks();\n  expect(worker.sent.some((msg) => msg.kind === \"interruptGenerate\")).toBe(\n    true,\n  );\n});\n"
  },
  {
    "path": "tsconfig.json",
    "content": "{\n  \"compilerOptions\": {\n    \"target\": \"es6\",\n    \"declaration\": true,\n    \"outDir\": \"lib\",\n    \"declarationMap\": true,\n    \"sourceMap\": true,\n    \"strict\": true,\n    \"moduleResolution\": \"Node\",\n    \"esModuleInterop\": true,\n    \"lib\": [\"dom\", \"WebWorker\", \"es2022\"]\n  },\n  \"typeRoots\": [\"./node_modules/@webgpu/types\", \"./node_modules/@types\"],\n  \"include\": [\"src\"],\n  \"exclude\": [\"node_modules\", \"build\", \"dist\", \"rollup.config.cjs\"]\n}\n"
  },
  {
    "path": "utils/.gitignore",
    "content": "package-lock.json\n"
  },
  {
    "path": "utils/vram_requirements/.gitignore",
    "content": "src/app-config.js\n"
  },
  {
    "path": "utils/vram_requirements/README.md",
    "content": "### vRAM Requirements\n\nTo check vRAM requirement for a model, add models to check in `gh-config.json`.\n\nThen run `npm install` followed by `npm start`."
  },
  {
    "path": "utils/vram_requirements/package.json",
    "content": "{\n  \"name\": \"vram-requirements\",\n  \"version\": \"0.1.0\",\n  \"private\": true,\n  \"scripts\": {\n    \"start\": \"cp src/gh-config.js src/app-config.js && parcel src/vram_requirements.html  --port 8885\",\n    \"build\": \"cp src/gh-config.js src/app-config.js && parcel build src/vram_requirements.html --dist-dir lib\"\n  },\n  \"devDependencies\": {\n    \"buffer\": \"^5.7.1\",\n    \"crypto-browserify\": \"^3.12.0\",\n    \"events\": \"^3.3.0\",\n    \"parcel\": \"^2.8.3\",\n    \"path-browserify\": \"^1.0.1\",\n    \"process\": \"^0.11.10\",\n    \"stream-browserify\": \"^3.0.0\",\n    \"tslib\": \"^2.3.1\",\n    \"typescript\": \"^4.9.5\",\n    \"url\": \"^0.11.3\"\n  },\n  \"dependencies\": {\n    \"@mlc-ai/web-llm\": \"^0.2.82\",\n    \"@mlc-ai/web-runtime\": \"0.18.0-dev2\"\n  }\n}\n"
  },
  {
    "path": "utils/vram_requirements/src/gh-config.js",
    "content": "import { prebuiltAppConfig } from \"../../../lib/config\";\n\nexport default {\n\t\"model_list\": prebuiltAppConfig.model_list,\n\t\"use_web_worker\": true\n}\n"
  },
  {
    "path": "utils/vram_requirements/src/vram_requirements.html",
    "content": "<!DOCTYPE html>\n<html>\n<script>\n  webLLMGlobal = {}\n</script>\n\n<body>\n  <h2>vRAM Requirement Report</h2>\n  Open console to see logs\n  <br />\n  <br />\n\n  <label id=\"report-label\"> </label>\n\n  <script type=\"module\" src=\"./vram_requirements.ts\"></script>\n\n</html>"
  },
  {
    "path": "utils/vram_requirements/src/vram_requirements.ts",
    "content": "import ModelRecord from \"@mlc-ai/web-llm\";\nimport appConfig from \"./app-config\"; // Modify this to inspect vram requirement for models of choice\nimport * as tvmjs from \"@mlc-ai/web-runtime\";\nimport log from \"loglevel\";\n\nfunction setLabel(id: string, text: string) {\n  const label = document.getElementById(id);\n  if (label == null) {\n    throw Error(\"Cannot find label \" + id);\n  }\n  label.innerText = text;\n}\n\ninterface AppConfig {\n  model_list: Array<ModelRecord>;\n}\n\nconst dtypeBytesMap = new Map<string, number>([\n  [\"uint32\", 4],\n  [\"uint16\", 2],\n  [\"float32\", 4],\n  [\"float16\", 4],\n]);\n\nasync function main() {\n  const config: AppConfig = appConfig;\n  let report = \"\";\n  for (let i = 0; i < config.model_list.length; ++i) {\n    // 1. Read each model record\n    const modelRecord: ModelRecord = config.model_list[i];\n    const model_id = modelRecord.model_id;\n    // 2. Load the wasm\n    const wasmUrl = modelRecord.model_lib;\n    const wasmSource = await (await fetch(wasmUrl)).arrayBuffer();\n    report += `${model_id}: \\n`;\n    // 3. Initialize tvmjs instance and virtual machine using the wasm\n    const tvm = await tvmjs.instantiate(\n      new Uint8Array(wasmSource),\n      tvmjs.createPolyfillWASI(),\n      log.info,\n    );\n    const gpuDetectOutput = await tvmjs.detectGPUDevice();\n    if (gpuDetectOutput == undefined) {\n      throw Error(\"Cannot find WebGPU in the environment\");\n    }\n    tvm.initWebGPU(gpuDetectOutput.device);\n    tvm.beginScope();\n    const vm = tvm.detachFromCurrentScope(\n      tvm.createVirtualMachine(tvm.webgpu()),\n    );\n    // 4. 
Get metadata from the vm\n    let fgetMetadata: any;\n    try {\n      fgetMetadata = vm.getFunction(\"_metadata\");\n    } catch (err) {\n      log.error(\n        \"The wasm needs to have function `_metadata` to inspect vram requirement.\",\n        err,\n      );\n    }\n    const ret_value = fgetMetadata();\n    const metadataStr = tvm.detachFromCurrentScope(ret_value).toString();\n    const metadata = JSON.parse(metadataStr);\n    // 5. Parse the vram requirement\n    // 5.1. Get bytes for loading params\n    let paramBytes = 0;\n    metadata.params.forEach((param: any) => {\n      if (Math.min(...param.shape) > 0) {\n        // Possible to have shape -1 signifying a dynamic shape -- we disregard them\n        const dtypeBytes = dtypeBytesMap.get(param.dtype);\n        if (dtypeBytes === undefined) {\n          throw Error(\n            \"Cannot find size of \" +\n              param.dtype +\n              \", add it to `dtypeBytesMap`.\",\n          );\n        }\n        const numParams = param.shape.reduce((a: number, b: number) => a * b);\n        paramBytes += numParams * dtypeBytes;\n      } else {\n        log.info(\n          `${model_id}'s ${param.name} has dynamic shape; excluded from vRAM calculation.`,\n        );\n      }\n    });\n    // 5.2. Get maximum bytes needed for temporary buffer across all functions\n    let maxTempFuncBytes = 0;\n    Object.entries(metadata.memory_usage).forEach(([funcName, funcBytes]) => {\n      if (typeof funcBytes !== \"number\") {\n        throw Error(\"`memory_usage` expects entry `funcName: funcBytes`.\");\n      }\n      maxTempFuncBytes = Math.max(maxTempFuncBytes, funcBytes);\n    });\n    // 5.3. Get kv cache bytes\n    const kv_cache_bytes: number = metadata.kv_cache_bytes;\n    // 5.4. Get total vRAM needed\n    const totalBytes = paramBytes + maxTempFuncBytes + kv_cache_bytes;\n    // 6. 
Report vRAM Requirement\n    report +=\n      `totalBytes: ${(totalBytes / 1024 / 1024).toFixed(2)} MB\\n` +\n      `paramBytes: ${(paramBytes / 1024 / 1024).toFixed(2)} MB\\n` +\n      `maxTempFuncBytes: ${(maxTempFuncBytes / 1024 / 1024).toFixed(2)} MB\\n` +\n      `kv_cache_bytes: ${(kv_cache_bytes / 1024 / 1024).toFixed(2)} MB\\n\\n`;\n    // 7. Dispose everything\n    tvm.endScope();\n    vm.dispose();\n    tvm.dispose();\n  }\n  setLabel(\"report-label\", report);\n}\n\nmain();\n"
  }
]