[
  {
    "path": ".editorconfig",
    "content": "root = true\n\n[*]\nend_of_line = lf\ntrim_trailing_whitespace = true\ncharset = utf-8\nindent_style = space\nindent_size = 2\n\n[/node_modules/*]\nindent_size = unset\nindent_style = unset\n\n[{package.json,.travis.yml,.eslintrc.json}]\nindent_style = space\n"
  },
  {
    "path": ".envrc.template",
    "content": "export PAPERSPACE_API_KEY=\n"
  },
  {
    "path": ".git-blame-ignore-revs",
    "content": "# You can use this file with 'git config blame.ignoreRevsFile .git-blame-ignore-revs'\n# 07/31/2023: Style guidelines\n8c2867d26dfff8a4cf33bc59d5a8dee159f3256a\n# 08/22/2023: Running yapf with guidelines\n1488fbb167a0ae5b0770f33f50a7ee7f7b2223c9\neddbc063743b198d72c21bd7dced59dbd949b9f1\n# 08/23/2023: Synchronize style guidelines\n787ce1b3b63ecbacde371550f46fa7429f3e4db2\n# 08/25/2023: Consistency between yapf and ruff\n46c890480640294c3f34706d595559c7ea97dac5\n# 08/26/2023: Add one blank space between top level definition to similar to Google Style Guide\n806a663e4aa2b174969241f6e310e05762e233f0\n# 08/30/2023: Update to google style\nb545ad2ad1e3acbb69f6578d8a5ee03613867505\n# 09/01/2023: ignore new line split on comma-separated item\n7d893e6cd217ddfe845210503c8f2cf1667d16b6\n# 11/09/2023: running ruff format preview\nac377fe490bd886cf76c3855e6a2a50fc0e03b51\n# 11/26/2023: reduce line overhead\n69aae34cf4e6995edf2e2dc1a669fc4bdecf959a\n# 11/28/2023: compact\nd04309188b8fccec6a3ff36a893099806f560551\n# 12/14/2023: using ruff to 150 LL\nc8c9663d06e49da327ed53a22bea79f78d808aa9\n# 03/15/2024: ignore new ruff formatter\n727361ced761c82351ff539fcafa7af62fb5e2f0\n"
  },
  {
    "path": ".git_archival.txt",
    "content": "node: $Format:%H$\nnode-date: $Format:%cI$\ndescribe-name: $Format:%(describe:tags=true,match=*[0-9]*)$\nref-names: $Format:%D$\n"
  },
  {
    "path": ".gitattributes",
    "content": "* text=auto eol=lf\n# Needed for setuptools-scm-git-archive\n.git_archival.txt  export-subst\n"
  },
  {
    "path": ".github/CODEOWNERS",
    "content": "* @aarnphm\n"
  },
  {
    "path": ".github/CODE_OF_CONDUCT.md",
    "content": "# Contributor Covenant Code of Conduct\n\n## Our Pledge\n\nWe as members, contributors, and leaders pledge to make participation in our\ncommunity a harassment-free experience for everyone, regardless of age, body\nsize, visible or invisible disability, ethnicity, sex characteristics, gender\nidentity and expression, level of experience, education, socio-economic status,\nnationality, personal appearance, race, religion, or sexual identity\nand orientation.\n\nWe pledge to act and interact in ways that contribute to an open, welcoming,\ndiverse, inclusive, and healthy community.\n\n## Our Standards\n\nExamples of behavior that contributes to a positive environment for our\ncommunity include:\n\n* Demonstrating empathy and kindness toward other people\n* Being respectful of differing opinions, viewpoints, and experiences\n* Giving and gracefully accepting constructive feedback\n* Accepting responsibility and apologizing to those affected by our mistakes,\n  and learning from the experience\n* Focusing on what is best not just for us as individuals, but for the\n  overall community\n\nExamples of unacceptable behavior include:\n\n* The use of sexualized language or imagery, and sexual attention or\n  advances of any kind\n* Trolling, insulting or derogatory comments, and personal or political attacks\n* Public or private harassment\n* Publishing others' private information, such as a physical or email\n  address, without their explicit permission\n* Other conduct which could reasonably be considered inappropriate in a\n  professional setting\n\n## Enforcement Responsibilities\n\nCommunity leaders are responsible for clarifying and enforcing our standards of\nacceptable behavior and will take appropriate and fair corrective action in\nresponse to any behavior that they deem inappropriate, threatening, offensive,\nor harmful.\n\nCommunity leaders have the right and responsibility to remove, edit, or reject\ncomments, commits, code, wiki edits, issues, 
and other contributions that are\nnot aligned to this Code of Conduct, and will communicate reasons for moderation\ndecisions when appropriate.\n\n## Scope\n\nThis Code of Conduct applies within all community spaces, and also applies when\nan individual is officially representing the community in public spaces.\nExamples of representing our community include using an official e-mail address,\nposting via an official social media account, or acting as an appointed\nrepresentative at an online or offline event.\n\n## Enforcement\n\nInstances of abusive, harassing, or otherwise unacceptable behavior may be\nreported to the community leaders responsible for enforcement at\ncontact@bentoml.com.\nAll complaints will be reviewed and investigated promptly and fairly.\n\nAll community leaders are obligated to respect the privacy and security of the\nreporter of any incident.\n\n## Enforcement Guidelines\n\nCommunity leaders will follow these Community Impact Guidelines in determining\nthe consequences for any action they deem in violation of this Code of Conduct:\n\n### 1. Correction\n\n**Community Impact**: Use of inappropriate language or other behavior deemed\nunprofessional or unwelcome in the community.\n\n**Consequence**: A private, written warning from community leaders, providing\nclarity around the nature of the violation and an explanation of why the\nbehavior was inappropriate. A public apology may be requested.\n\n### 2. Warning\n\n**Community Impact**: A violation through a single incident or series\nof actions.\n\n**Consequence**: A warning with consequences for continued behavior. No\ninteraction with the people involved, including unsolicited interaction with\nthose enforcing the Code of Conduct, for a specified period of time. This\nincludes avoiding interactions in community spaces as well as external channels\nlike social media. Violating these terms may lead to a temporary or\npermanent ban.\n\n### 3. 
Temporary Ban\n\n**Community Impact**: A serious violation of community standards, including\nsustained inappropriate behavior.\n\n**Consequence**: A temporary ban from any sort of interaction or public\ncommunication with the community for a specified period of time. No public or\nprivate interaction with the people involved, including unsolicited interaction\nwith those enforcing the Code of Conduct, is allowed during this period.\nViolating these terms may lead to a permanent ban.\n\n### 4. Permanent Ban\n\n**Community Impact**: Demonstrating a pattern of violation of community\nstandards, including sustained inappropriate behavior,  harassment of an\nindividual, or aggression toward or disparagement of classes of individuals.\n\n**Consequence**: A permanent ban from any sort of public interaction within\nthe community.\n\n## Attribution\n\nThis Code of Conduct is adapted from the [Contributor Covenant][homepage],\n[version 2.0](https://www.contributor-covenant.org/version/2/0/code_of_conduct.html).\n\nCommunity Impact Guidelines were inspired by [Mozilla's code of conduct\nenforcement ladder](https://github.com/mozilla/diversity).\n\n[homepage]: https://www.contributor-covenant.org\n\nFor answers to common questions about this code of conduct, see the [FAQ](https://www.contributor-covenant.org/faq). Translations are available\n[here](https://www.contributor-covenant.org/translations)\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/bug_report.yml",
    "content": "name: 🐛 Bug Report\ndescription: Create a bug report on OpenLLM.\ntitle: 'bug: '\nlabels: ['']\nbody:\n  - type: markdown\n    id: exists\n    attributes:\n      value: |\n        Please search to see if an issue already exists for the bug you encountered.\n        See [Searching Issues and Pull Requests](https://docs.github.com/en/search-github/searching-on-github/searching-issues-and-pull-requests) for how to use the GitHub search bar and filters.\n  - type: textarea\n    id: describe-the-bug\n    validations:\n      required: true\n    attributes:\n      label: Describe the bug\n      description: |\n        Please provide a clear and concise description of the problem you ran into.\n      placeholder: This happened when I...\n  - type: textarea\n    id: to-reproduce\n    validations:\n      required: false\n    attributes:\n      label: To reproduce\n      description: |\n        Please provide a code sample or a code snippet to reproduce said problem. If you have code snippets, error messages, stack trace please also provide them here.\n\n        **IMPORTANT**: make sure to use [code tag](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/creating-and-highlighting-code-blocks#syntax-highlighting) to correctly format your code. Screenshot is helpful but don't use it for code snippets as it doesn't allow others to copy-and-paste your code.\n\n        To give us more information for diagnosing the issue, it would be great if you can provide a minimal reproducible!\n      placeholder: |\n        Steps to reproduce the bug:\n\n          1. Provide '...'\n          2. Run '...'\n          3. 
See the error\n  - type: textarea\n    id: logs\n    attributes:\n      label: Logs\n      description: 'Please include the Python logs if you can.'\n      render: shell\n  - type: textarea\n    id: environment-info\n    attributes:\n      label: Environment\n      description: |\n        Please share your environment with us. You should run `bentoml env`, `transformers-cli env` and paste the result here.\n      placeholder: |\n        bentoml: ...\n        transformers: ...\n        python: ...\n        platform: ...\n    validations:\n      required: true\n  - type: textarea\n    id: system-info\n    attributes:\n      label: System information (Optional)\n      description: |\n        Please share your system information with us.\n      placeholder: |\n        memory: ...\n        platform: ...\n        architecture: ...\n        CPU: ...\n        GPU: ...\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/config.yml",
    "content": "blank_issues_enabled: true\nversion: 2.1\ncontact_links:\n  - name: Blank issues\n    url: https://github.com/bentoml/openllm/issues/new\n    about: To create a blank issue\n  - name: BentoML Discussions\n    url: https://github.com/bentoml/openllm/discussions\n    about: Please ask general questions here.\n"
  },
  {
    "path": ".github/ISSUE_TEMPLATE/feature_request.yml",
    "content": "name: 🚀 Feature Request\ndescription: Submit a proposal/request for new OpenLLM features.\ntitle: 'feat: '\nlabels: ['']\nbody:\n  - type: textarea\n    id: feature-request\n    validations:\n      required: true\n    attributes:\n      label: Feature request\n      description: |\n        A clear and concise description of the feature request.\n      placeholder: |\n        I would like it if...\n  - type: textarea\n    id: motivation\n    validations:\n      required: false\n    attributes:\n      label: Motivation\n      description: |\n        Please outline the motivation for this feature request. Is your feature request related to a problem? e.g., I'm always frustrated when [...].\n        If this is related to another issue, please link here too.\n        If you have a current workaround, please also provide it here.\n      placeholder: |\n        This feature would solve ...\n  - type: textarea\n    id: other\n    attributes:\n      label: Other\n      description: |\n        Is there any way that you could help, e.g. by submitting a PR?\n      placeholder: |\n        I would love to contribute ...\n"
  },
  {
    "path": ".github/SECURITY.md",
    "content": "# Security Policy\n\n## Supported Versions\n\nWe are following [semantic versioning](https://semver.org/) with strict\nbackward-compatibility policy. We can ensure that all minor and major version\nare backward compatible. We are more lenient with patch as the development can\nmove quickly.\n\nIf you are just using public API, then feel free to always upgrade. Whenever\nthere is a breaking policies, it will be announced and will be broken.\n\n> [!WARNING]\n> Everything package under `openllm` that has an underscore prefixes\n> are exempt from this. They are considered private API and can change at any\n> time. However, you can ensure that all public API, classes and functions will\n> be backward-compatible.\n\n## Reporting a Vulnerability\n\nTo report a security vulnerability, please send us an\n[email](contact@bentoml.com).\n"
  },
  {
    "path": ".github/dependabot.yml",
    "content": "version: 2\nupdates:\n  - package-ecosystem: github-actions\n    directory: '/'\n    schedule:\n      interval: 'weekly'\n      day: 'monday'\n      time: '09:00'\n    groups:\n      actions-dependencies:\n        applies-to: 'version-updates'\n        patterns:\n          - '*'\n  - package-ecosystem: pip\n    directory: '/'\n    schedule:\n      interval: 'weekly'\n    open-pull-requests-limit: 5\n    groups:\n      production-dependencies:\n        applies-to: 'version-updates'\n        patterns:\n          - '*'\n"
  },
  {
    "path": ".github/workflows/create-releases.yml",
    "content": "name: release\non:\n  push:\n    tags:\n      - \"*\"\njobs:\n  build:\n    runs-on: ubuntu-latest\n    steps:\n      - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # ratchet:actions/checkout@v4\n      - name: Setup Python\n        uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # ratchet:actions/setup-python@v5\n        with:\n          python-version-file: .python-version-default\n      - name: Build\n        run: pipx run build\n      - name: Upload artifacts\n        uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # ratchet:actions/upload-artifact@v4\n        with:\n          name: python-artefacts-openllm\n          path: dist/*\n          if-no-files-found: error\n  release:\n    if: github.repository_owner == 'bentoml'\n    needs:\n      - build\n    runs-on: ubuntu-latest\n    name: Release\n    permissions:\n      id-token: write\n      contents: write\n    steps:\n      - name: Download Python artifacts\n        uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # ratchet:actions/download-artifact@v4\n        with:\n          pattern: python-artefacts-*\n          merge-multiple: true\n          path: dist\n      - name: dry ls\n        run: ls -rthlaR\n      - name: Publish to PyPI\n        uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # ratchet:pypa/gh-action-pypi-publish@release/v1\n        with:\n          print-hash: true\n      - name: Create release\n        uses: softprops/action-gh-release@6cbd405e2c4e67a21c47fa9e383d020e4e28b836 # ratchet:softprops/action-gh-release@v2\n        with:\n          # Use GH feature to populate the changelog automatically\n          generate_release_notes: true\n          fail_on_unmatched_files: true\n          files: |-\n            dist/*\n"
  },
  {
    "path": ".github/workflows/dependabot-auto-merge.yml",
    "content": "name: Dependabot Auto merge\non: pull_request\n\npermissions:\n  contents: write\n  pull-requests: write\n\njobs:\n  dependabot:\n    runs-on: ubuntu-latest\n    if: github.event.pull_request.user.login == 'dependabot[bot]' && github.repository == 'bentoml/OpenLLM'\n    steps:\n      - name: Dependabot metadata\n        id: metadata\n        uses: dependabot/fetch-metadata@08eff52bf64351f401fb50d4972fa95b9f2c2d1b # ratchet:dependabot/fetch-metadata@v2.4.0\n        with:\n          github-token: \"${{ secrets.GITHUB_TOKEN }}\"\n      - name: Enable auto-merge for Dependabot PRs\n        if: steps.metadata.outputs.dependency-group == 'actions-dependencies' || steps.metadata.outputs.dependency-group == 'production-dependencies'\n        run: gh pr merge --auto --squash \"$PR_URL\"\n        env:\n          PR_URL: ${{github.event.pull_request.html_url}}\n          GH_TOKEN: ${{secrets.GITHUB_TOKEN}}\n"
  },
  {
    "path": ".gitignore",
    "content": "# Byte-compiled / optimized / DLL files\n__pycache__/\n*.py[cod]\n*$py.class\n\n# C extensions\n*.so\n\n# Distribution / packaging\n.Python\nbuild/\ndevelop-eggs/\ndist/\ndownloads/\neggs/\n.eggs/\nlib/\nlib64/\nparts/\nsdist/\nvar/\nwheels/\nshare/python-wheels/\n*.egg-info/\n.installed.cfg\n*.egg\nMANIFEST\n\n# PyInstaller\n#  Usually these files are written by a python script from a template\n#  before PyInstaller builds the exe, so as to inject date/other infos into it.\n*.manifest\n*.spec\n\n# Installer logs\npip-log.txt\npip-delete-this-directory.txt\n\n# Unit test / coverage reports\nhtmlcov/\n.tox/\n.nox/\n.coverage\n.coverage.*\n.cache\nnosetests.xml\ncoverage.xml\n*.cover\n*.py,cover\n.hypothesis/\n.pytest_cache/\ncover/\n\n# Translations\n*.mo\n*.pot\n\n# Django stuff:\n*.log\nlocal_settings.py\ndb.sqlite3\ndb.sqlite3-journal\n\n# Flask stuff:\ninstance/\n.webassets-cache\n\n# Scrapy stuff:\n.scrapy\n\n# Sphinx documentation\ndocs/_build/\n\n# PyBuilder\n.pybuilder/\ntarget/\n\n# Jupyter Notebook\n.ipynb_checkpoints\n\n# IPython\nprofile_default/\nipython_config.py\n\n# pyenv\n#   For a library or package, you might want to ignore these files since the code is\n#   intended to run in multiple environments; otherwise, check them in:\n.python-version\n\n# pipenv\n#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.\n#   However, in case of collaboration, if having platform-specific dependencies or dependencies\n#   having no cross-platform support, pipenv may install dependencies that don't work, or not\n#   install all needed dependencies.\n#Pipfile.lock\n\n# poetry\n#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.\n#   This is especially recommended for binary packages to ensure reproducibility, and is more\n#   commonly ignored for libraries.\n#   
https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control\n#poetry.lock\n\n# pdm\n#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.\n#pdm.lock\n#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it\n#   in version control.\n#   https://pdm.fming.dev/#use-with-ide\n.pdm.toml\n\n# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm\n__pypackages__/\n\n# Celery stuff\ncelerybeat-schedule\ncelerybeat.pid\n\n# SageMath parsed files\n*.sage.py\n\n# Environments\n.env\n.venv\nenv/\nvenv/\nENV/\nenv.bak/\nvenv.bak/\n\n# Spyder project settings\n.spyderproject\n.spyproject\n\n# Rope project settings\n.ropeproject\n\n# mkdocs documentation\n/site\n\n# mypy\n.mypy_cache/\n.dmypy.json\ndmypy.json\n\n# Pyre type checker\n.pyre/\n\n# pytype static type analyzer\n.pytype/\n\n# Cython debug symbols\ncython_debug/\n\n# PyCharm\n#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can\n#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore\n#  and can be added to the global gitignore or merged into this file.  For a more nuclear\n#  option (not recommended) you can uncomment the following to ignore the entire idea folder.\n#.idea/\n*.whl\n# Environments\nvenv/\n.envrc\n_version.py\n.cursor\n"
  },
  {
    "path": ".pre-commit-config.yaml",
    "content": "ci:\n  autoupdate_schedule: weekly\n  autofix_commit_msg: \"ci: auto fixes from pre-commit.ci\\n\\nFor more information, see https://pre-commit.ci\"\n  autoupdate_commit_msg: \"ci: pre-commit autoupdate [pre-commit.ci]\"\n  autofix_prs: true\ndefault_language_version:\n  python: python3.11 # NOTE: sync with .python-version-default\nrepos:\n  - repo: https://github.com/astral-sh/ruff-pre-commit\n    rev: \"v0.12.12\"\n    hooks:\n      - id: ruff\n        alias: r\n        verbose: true\n        args: [--exit-non-zero-on-fix, --show-fixes, --fix]\n        types_or: [python, pyi, jupyter]\n      - id: ruff-format\n        alias: rf\n        verbose: true\n        types_or: [python, pyi, jupyter]\n  - repo: https://github.com/pre-commit/mirrors-mypy\n    rev: \"v1.17.1\"\n    hooks:\n      - id: mypy\n        args: [--strict, --ignore-missing-imports]\n        additional_dependencies:\n          [pydantic, types-pyyaml, types-tabulate, types-psutil, typer]\n  - repo: https://github.com/editorconfig-checker/editorconfig-checker.python\n    rev: \"3.4.0\"\n    hooks:\n      - id: editorconfig-checker\n        verbose: true\n        alias: ec\n        types_or: [python]\n  - repo: meta\n    hooks:\n      - id: check-hooks-apply\n      - id: check-useless-excludes\n  - repo: https://github.com/pre-commit/pre-commit-hooks\n    rev: v6.0.0\n    hooks:\n      - id: trailing-whitespace\n        verbose: true\n      - id: end-of-file-fixer\n        verbose: true\n      - id: check-yaml\n        args: [\"--unsafe\"]\n      - id: check-toml\n      - id: check-docstring-first\n      - id: check-added-large-files\n      - id: debug-statements\n      - id: check-merge-conflict\n"
  },
  {
    "path": ".python-version-default",
    "content": "3.11\n"
  },
  {
    "path": ".ruff.toml",
    "content": "extend-include = [\"*.ipynb\"]\npreview = true\nline-length = 100\nindent-width = 2\n\n[format]\npreview = true\nquote-style = \"single\"\nindent-style = \"space\"\nskip-magic-trailing-comma = true\ndocstring-code-format = true\n\n[lint]\nignore = [\n  \"RUF012\",\n  \"ANN\",    # Mypy is better at this\n  \"E722\",\n]\nselect = [\n  \"F\",\n  \"G\",     # flake8-logging-format\n  \"PERF\",  # perflint\n  \"RUF\",   # Ruff-specific rules\n  \"W6\",\n  \"E71\",\n  \"E72\",\n  \"E112\",\n  \"E113\",\n  \"E203\",\n  \"E272\",\n  \"E702\",\n  \"E703\",\n  \"E731\",\n  \"W191\",\n  \"W291\",\n  \"W293\",\n  \"UP039\", # unnecessary-class-parentheses\n]\n\n[lint.pydocstyle]\nconvention = \"google\"\n"
  },
  {
    "path": "CITATION.cff",
    "content": "cff-version: 1.2.0\ntitle: 'OpenLLM: Operating LLMs in production'\nmessage: >-\n  If you use this software, please cite it using these\n  metadata.\ntype: software\nauthors:\n  - given-names: Aaron\n    family-names: Pham\n    email: aarnphm@bentoml.com\n    orcid: 'https://orcid.org/0009-0008-3180-5115'\n  - given-names: Chaoyu\n    family-names: Yang\n    email: chaoyu@bentoml.com\n  - given-names: Sean\n    family-names: Sheng\n    email: ssheng@bentoml.com\n  - given-names: Shenyang\n    family-names: Zhao\n    email: larme@bentoml.com\n  - given-names: Sauyon\n    family-names: Lee\n    email: sauyon@bentoml.com\n  - given-names: Bo\n    family-names: Jiang\n    email: jiang@bentoml.com\n  - given-names: Fog\n    family-names: Dong\n    email: fog@bentoml.com\n  - given-names: Xipeng\n    family-names: Guan\n    email: xipeng@bentoml.com\n  - given-names: Frost\n    family-names: Ming\n    email: frost@bentoml.com\nrepository-code: 'https://github.com/bentoml/OpenLLM'\nurl: 'https://bentoml.com/'\nabstract: >-\n  OpenLLM is an open platform for operating large language\n  models (LLMs) in production. With OpenLLM, you can run\n  inference with any open-source large-language models,\n  deploy to the cloud or on-premises, and build powerful AI\n  apps. It has built-in support for a wide range of\n  open-source LLMs and model runtime, including StableLM,\n  Falcon, Dolly, Flan-T5, ChatGLM, StarCoder and more.\n  OpenLLM helps serve LLMs over RESTful API or gRPC with one\n  command or query via WebUI, CLI, our Python/Javascript\n  client, or any HTTP client. It provides first-class\n  support for LangChain, BentoML and Hugging Face that\n  allows you to easily create your own AI apps by composing\n  LLMs with other models and services. 
Last but not least,\n  it automatically generates LLM server OCI-compatible\n  Container Images or easily deploys as a serverless\n  endpoint via BentoCloud.\nkeywords:\n  - MLOps\n  - LLMOps\n  - LLM\n  - Infrastructure\n  - Transformers\n  - LLM Serving\n  - Model Serving\n  - Serverless Deployment\nlicense: Apache-2.0\ndate-released: '2023-06-13'\n"
  },
  {
    "path": "DEVELOPMENT.md",
    "content": "# Developer Guide\n\nThis Developer Guide is designed to help you contribute to the OpenLLM project.\nFollow these steps to set up your development environment and learn the process\nof contributing to our open-source project.\n\nJoin our [Discord Channel](https://l.bentoml.com/join-openllm-discord) and reach\nout to us if you have any question!\n\n## Table of Contents\n\n- [Developer Guide](#developer-guide)\n  - [Table of Contents](#table-of-contents)\n  - [Setting Up Your Development Environment](#setting-up-your-development-environment)\n  - [Development Workflow](#development-workflow)\n    - [Adding new models](#adding-new-models)\n    - [Adding bentos](#adding-new-models)\n    - [Adding repos](#adding-new-models)\n\n## Setting Up Your Development Environment\n\nBefore you can start developing, you'll need to set up your environment:\n\n1. Ensure you have [Git](https://git-scm.com/), and\n   [Python3.8+](https://www.python.org/downloads/) installed.\n2. Fork the OpenLLM repository from GitHub.\n3. Clone the forked repository from GitHub:\n\n   ```bash\n   git clone git@github.com:username/OpenLLM.git && cd openllm\n   ```\n\n4. Add the OpenLLM upstream remote to your local OpenLLM clone:\n\n   ```bash\n   git remote add upstream git@github.com:bentoml/OpenLLM.git\n   ```\n\n5. Configure git to pull from the upstream remote:\n\n   ```bash\n   git switch main # ensure you're on the main branch\n   git fetch upstream --tags\n   git branch --set-upstream-to=upstream/main\n   ```\n\n6. (Optional) Link `.python-version-default` to `.python-version`:\n\n   ```bash\n   ln .python-version-default .python-version\n   ```\n\n## Development Workflow\n\nThere are a few ways to contribute to the repository structure for OpenLLM:\n\n### Adding new models\n\n1. [recipe.yaml](./recipe.yaml) contains all related-metadata for generating new LLM-based bentos. 
To add a new LLM, the following structure should be adhere to:\n\n```yaml\n\"<model_name>:<model_tag>\":\n  project: vllm-chat\n  service_config:\n    name: phi3\n    traffic:\n      timeout: 300\n    resources:\n      gpu: 1\n      gpu_type: nvidia-tesla-l4\n  engine_config:\n    model: microsoft/Phi-3-mini-4k-instruct\n    max_model_len: 4096\n    dtype: half\n  chat_template: phi-3\n```\n\n- `<model_name>` represents the type of model to be supported. Currently supports `phi3`, `llama2`, `llama3`, `gemma`\n\n- `<model_tag>` emphasizes the type of model and its related metadata. The convention would include `<model_size>-<model_type>-<precision>[-<quantization>]`\n  For example:\n\n  - `microsoft/Phi-3-mini-4k-instruct` should be represented as `3.8b-instruct-fp16`.\n  - `TheBloke/Llama-2-7B-Chat-AWQ` would be `7b-chat-awq-4bit`\n\n- `project` would be used as the basis for the generated bento. Currently, most models should use `vllm-chat` as default.\n\n- `service_config` entails all BentoML-related [configuration](https://docs.bentoml.com/en/latest/guides/configurations.html) to run this bento.\n\n> [!NOTE]\n>\n> We recommend to include the following field for `service_config`:\n>\n> - `name` should be the same as `<model_name>`\n> - `resources` includes the available accelerator that can run this models. See more [here](https://docs.bentoml.com/en/latest/guides/configurations.html#resources)\n\n- `engine_config` are fields to be used for vLLM engine. See more supported arguments in [`AsyncEngineArgs`](https://github.com/vllm-project/vllm/blob/7cd2ebb0251fd1fd0eec5c93dac674603a22eddd/vllm/engine/arg_utils.py#L799). We recommend to always include `model`, `max_model_len`, `dtype` and `trust_remote_code`.\n\n- If the model is a chat model, `chat_template` should be used. Add the appropriate `chat_template` under [chat_template directory](./vllm-chat/chat_templates/) should you decide to do so.\n\n2. 
You can then run `BENTOML_HOME=$(openllm repo default)/bentoml/bentos python make.py <model_name>:<model_tag>` to generate the required bentos.\n\n3. You can then submit a [Pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request) to `openllm` with the recipe changes\n\n### Adding bentos\n\nOpenLLM now also manages a [generated bento repository](https://github.com/bentoml/openllm-models/tree/main). If you update and modify and generated bentos, make sure to update the recipe and added the generated bentos under `bentoml/bentos`.\n\n### Adding repos\n\nIf you wish to create a your own managed git repo, you should follow the structure of [bentoml/openllm-models](https://github.com/bentoml/openllm-models/tree/main).\n\nTo add your custom repo, do `openllm repo add <repo_alias> <git_url>`\n"
  },
  {
    "path": "LICENSE",
    "content": "                                 Apache License\n                           Version 2.0, January 2004\n                        http://www.apache.org/licenses/\n\n   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION\n\n   1. Definitions.\n\n      \"License\" shall mean the terms and conditions for use, reproduction,\n      and distribution as defined by Sections 1 through 9 of this document.\n\n      \"Licensor\" shall mean the copyright owner or entity authorized by\n      the copyright owner that is granting the License.\n\n      \"Legal Entity\" shall mean the union of the acting entity and all\n      other entities that control, are controlled by, or are under common\n      control with that entity. For the purposes of this definition,\n      \"control\" means (i) the power, direct or indirect, to cause the\n      direction or management of such entity, whether by contract or\n      otherwise, or (ii) ownership of fifty percent (50%) or more of the\n      outstanding shares, or (iii) beneficial ownership of such entity.\n\n      \"You\" (or \"Your\") shall mean an individual or Legal Entity\n      exercising permissions granted by this License.\n\n      \"Source\" form shall mean the preferred form for making modifications,\n      including but not limited to software source code, documentation\n      source, and configuration files.\n\n      \"Object\" form shall mean any form resulting from mechanical\n      transformation or translation of a Source form, including but\n      not limited to compiled object code, generated documentation,\n      and conversions to other media types.\n\n      \"Work\" shall mean the work of authorship, whether in Source or\n      Object form, made available under the License, as indicated by a\n      copyright notice that is included in or attached to the work\n      (an example is provided in the Appendix below).\n\n      \"Derivative Works\" shall mean any work, whether in Source or Object\n      
form, that is based on (or derived from) the Work and for which the\n      editorial revisions, annotations, elaborations, or other modifications\n      represent, as a whole, an original work of authorship. For the purposes\n      of this License, Derivative Works shall not include works that remain\n      separable from, or merely link (or bind by name) to the interfaces of,\n      the Work and Derivative Works thereof.\n\n      \"Contribution\" shall mean any work of authorship, including\n      the original version of the Work and any modifications or additions\n      to that Work or Derivative Works thereof, that is intentionally\n      submitted to Licensor for inclusion in the Work by the copyright owner\n      or by an individual or Legal Entity authorized to submit on behalf of\n      the copyright owner. For the purposes of this definition, \"submitted\"\n      means any form of electronic, verbal, or written communication sent\n      to the Licensor or its representatives, including but not limited to\n      communication on electronic mailing lists, source code control systems,\n      and issue tracking systems that are managed by, or on behalf of, the\n      Licensor for the purpose of discussing and improving the Work, but\n      excluding communication that is conspicuously marked or otherwise\n      designated in writing by the copyright owner as \"Not a Contribution.\"\n\n      \"Contributor\" shall mean Licensor and any individual or Legal Entity\n      on behalf of whom a Contribution has been received by Licensor and\n      subsequently incorporated within the Work.\n\n   2. Grant of Copyright License. 
Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      copyright license to reproduce, prepare Derivative Works of,\n      publicly display, publicly perform, sublicense, and distribute the\n      Work and such Derivative Works in Source or Object form.\n\n   3. Grant of Patent License. Subject to the terms and conditions of\n      this License, each Contributor hereby grants to You a perpetual,\n      worldwide, non-exclusive, no-charge, royalty-free, irrevocable\n      (except as stated in this section) patent license to make, have made,\n      use, offer to sell, sell, import, and otherwise transfer the Work,\n      where such license applies only to those patent claims licensable\n      by such Contributor that are necessarily infringed by their\n      Contribution(s) alone or by combination of their Contribution(s)\n      with the Work to which such Contribution(s) was submitted. If You\n      institute patent litigation against any entity (including a\n      cross-claim or counterclaim in a lawsuit) alleging that the Work\n      or a Contribution incorporated within the Work constitutes direct\n      or contributory patent infringement, then any patent licenses\n      granted to You under this License for that Work shall terminate\n      as of the date such litigation is filed.\n\n   4. Redistribution. 
You may reproduce and distribute copies of the\n      Work or Derivative Works thereof in any medium, with or without\n      modifications, and in Source or Object form, provided that You\n      meet the following conditions:\n\n      (a) You must give any other recipients of the Work or\n          Derivative Works a copy of this License; and\n\n      (b) You must cause any modified files to carry prominent notices\n          stating that You changed the files; and\n\n      (c) You must retain, in the Source form of any Derivative Works\n          that You distribute, all copyright, patent, trademark, and\n          attribution notices from the Source form of the Work,\n          excluding those notices that do not pertain to any part of\n          the Derivative Works; and\n\n      (d) If the Work includes a \"NOTICE\" text file as part of its\n          distribution, then any Derivative Works that You distribute must\n          include a readable copy of the attribution notices contained\n          within such NOTICE file, excluding those notices that do not\n          pertain to any part of the Derivative Works, in at least one\n          of the following places: within a NOTICE text file distributed\n          as part of the Derivative Works; within the Source form or\n          documentation, if provided along with the Derivative Works; or,\n          within a display generated by the Derivative Works, if and\n          wherever such third-party notices normally appear. The contents\n          of the NOTICE file are for informational purposes only and\n          do not modify the License. 
You may add Your own attribution\n          notices within Derivative Works that You distribute, alongside\n          or as an addendum to the NOTICE text from the Work, provided\n          that such additional attribution notices cannot be construed\n          as modifying the License.\n\n      You may add Your own copyright statement to Your modifications and\n      may provide additional or different license terms and conditions\n      for use, reproduction, or distribution of Your modifications, or\n      for any such Derivative Works as a whole, provided Your use,\n      reproduction, and distribution of the Work otherwise complies with\n      the conditions stated in this License.\n\n   5. Submission of Contributions. Unless You explicitly state otherwise,\n      any Contribution intentionally submitted for inclusion in the Work\n      by You to the Licensor shall be under the terms and conditions of\n      this License, without any additional terms or conditions.\n      Notwithstanding the above, nothing herein shall supersede or modify\n      the terms of any separate license agreement you may have executed\n      with Licensor regarding such Contributions.\n\n   6. Trademarks. This License does not grant permission to use the trade\n      names, trademarks, service marks, or product names of the Licensor,\n      except as required for reasonable and customary use in describing the\n      origin of the Work and reproducing the content of the NOTICE file.\n\n   7. Disclaimer of Warranty. Unless required by applicable law or\n      agreed to in writing, Licensor provides the Work (and each\n      Contributor provides its Contributions) on an \"AS IS\" BASIS,\n      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or\n      implied, including, without limitation, any warranties or conditions\n      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A\n      PARTICULAR PURPOSE. 
You are solely responsible for determining the\n      appropriateness of using or redistributing the Work and assume any\n      risks associated with Your exercise of permissions under this License.\n\n   8. Limitation of Liability. In no event and under no legal theory,\n      whether in tort (including negligence), contract, or otherwise,\n      unless required by applicable law (such as deliberate and grossly\n      negligent acts) or agreed to in writing, shall any Contributor be\n      liable to You for damages, including any direct, indirect, special,\n      incidental, or consequential damages of any character arising as a\n      result of this License or out of the use or inability to use the\n      Work (including but not limited to damages for loss of goodwill,\n      work stoppage, computer failure or malfunction, or any and all\n      other commercial damages or losses), even if such Contributor\n      has been advised of the possibility of such damages.\n\n   9. Accepting Warranty or Additional Liability. While redistributing\n      the Work or Derivative Works thereof, You may choose to offer,\n      and charge a fee for, acceptance of support, warranty, indemnity,\n      or other liability obligations and/or rights consistent with this\n      License. However, in accepting such obligations, You may act only\n      on Your own behalf and on Your sole responsibility, not on behalf\n      of any other Contributor, and only if You agree to indemnify,\n      defend, and hold each Contributor harmless for any liability\n      incurred by, or claims asserted against, such Contributor by reason\n      of your accepting any such warranty or additional liability.\n\n   END OF TERMS AND CONDITIONS\n\n   APPENDIX: How to apply the Apache License to your work.\n\n      To apply the Apache License to your work, attach the following\n      boilerplate notice, with the fields enclosed by brackets \"[]\"\n      replaced with your own identifying information. 
(Don't include\n      the brackets!)  The text should be enclosed in the appropriate\n      comment syntax for the file format. We also recommend that a\n      file or class name and description of purpose be included on the\n      same \"printed page\" as the copyright notice for easier\n      identification within third-party archives.\n\n   Copyright [yyyy] [name of copyright owner]\n\n   Licensed under the Apache License, Version 2.0 (the \"License\");\n   you may not use this file except in compliance with the License.\n   You may obtain a copy of the License at\n\n       http://www.apache.org/licenses/LICENSE-2.0\n\n   Unless required by applicable law or agreed to in writing, software\n   distributed under the License is distributed on an \"AS IS\" BASIS,\n   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n   See the License for the specific language governing permissions and\n   limitations under the License.\n"
  },
  {
    "path": "README.md",
    "content": "<div align=\"center\">\n\n<h1>🦾 OpenLLM: Self-Hosting LLMs Made Easy</h1>\n\n[![License: Apache-2.0](https://img.shields.io/badge/License-Apache%202-green.svg)](https://github.com/bentoml/OpenLLM/blob/main/LICENSE)\n[![Releases](https://img.shields.io/pypi/v/openllm.svg?logo=pypi&label=PyPI&logoColor=gold)](https://pypi.org/project/openllm)\n[![CI](https://results.pre-commit.ci/badge/github/bentoml/OpenLLM/main.svg)](https://results.pre-commit.ci/latest/github/bentoml/OpenLLM/main)\n[![X](https://badgen.net/badge/icon/@bentomlai/000000?icon=twitter&label=Follow)](https://twitter.com/bentomlai)\n[![Community](https://badgen.net/badge/icon/Community/562f5d?icon=slack&label=Join)](https://l.bentoml.com/join-slack)\n\n</div>\n\nOpenLLM allows developers to run **any open-source LLMs** (Llama 3.3, Qwen2.5, Phi3 and [more](#supported-models)) or **custom models** as **OpenAI-compatible APIs** with a single command. It features a [built-in chat UI](#chat-ui), state-of-the-art inference backends, and a simplified workflow for creating enterprise-grade cloud deployment with Docker, Kubernetes, and [BentoCloud](#deploy-to-bentocloud).\n\nUnderstand the [design philosophy of OpenLLM](https://www.bentoml.com/blog/from-ollama-to-openllm-running-llms-in-the-cloud).\n\n## Get Started\n\nRun the following commands to install OpenLLM and explore it interactively.\n\n```bash\npip install openllm  # or pip3 install openllm\nopenllm hello\n```\n\n![hello](https://github.com/user-attachments/assets/5af19f23-1b34-4c45-b1e0-a6798b4586d1)\n\n## Supported models\n\nOpenLLM supports a wide range of state-of-the-art open-source LLMs. 
You can also add a [model repository to run custom models](#set-up-a-custom-repository) with OpenLLM.\n\n<table>\n  <tr>\n    <th>Model</th>\n    <th>Parameters</th>\n    <th>Required GPU</th>\n    <th>Start a Server</th>\n  </tr>\n  <tr>\n    <td>deepseek</td>\n    <td>r1-671b</td>\n    <td>80Gx16</td>\n    <td><code>openllm serve deepseek:r1-671b</code></td>\n  </tr>\n  <tr>\n    <td>gemma2</td>\n    <td>2b</td>\n    <td>12G</td>\n    <td><code>openllm serve gemma2:2b</code></td>\n  </tr>\n  <tr>\n    <td>gemma3</td>\n    <td>3b</td>\n    <td>12G</td>\n    <td><code>openllm serve gemma3:3b</code></td>\n  </tr>\n  <tr>\n    <td>jamba1.5</td>\n    <td>mini-ff0a</td>\n    <td>80Gx2</td>\n    <td><code>openllm serve jamba1.5:mini-ff0a</code></td>\n  </tr>\n  <tr>\n    <td>llama3.1</td>\n    <td>8b</td>\n    <td>24G</td>\n    <td><code>openllm serve llama3.1:8b</code></td>\n  </tr>\n  <tr>\n    <td>llama3.2</td>\n    <td>1b</td>\n    <td>24G</td>\n    <td><code>openllm serve llama3.2:1b</code></td>\n  </tr>\n  <tr>\n    <td>llama3.3</td>\n    <td>70b</td>\n    <td>80Gx2</td>\n    <td><code>openllm serve llama3.3:70b</code></td>\n  </tr>\n  <tr>\n    <td>llama4</td>\n    <td>17b16e</td>\n    <td>80Gx8</td>\n    <td><code>openllm serve llama4:17b16e</code></td>\n  </tr>\n  <tr>\n    <td>mistral</td>\n    <td>8b-2410</td>\n    <td>24G</td>\n    <td><code>openllm serve mistral:8b-2410</code></td>\n  </tr>\n  <tr>\n    <td>mistral-large</td>\n    <td>123b-2407</td>\n    <td>80Gx4</td>\n    <td><code>openllm serve mistral-large:123b-2407</code></td>\n  </tr>\n  <tr>\n    <td>phi4</td>\n    <td>14b</td>\n    <td>80G</td>\n    <td><code>openllm serve phi4:14b</code></td>\n  </tr>\n  <tr>\n    <td>pixtral</td>\n    <td>12b-2409</td>\n    <td>80G</td>\n    <td><code>openllm serve pixtral:12b-2409</code></td>\n  </tr>\n  <tr>\n    <td>qwen2.5</td>\n    <td>7b</td>\n    <td>24G</td>\n    <td><code>openllm serve qwen2.5:7b</code></td>\n  </tr>\n  <tr>\n    <td>qwen2.5-coder</td>\n 
   <td>3b</td>\n    <td>24G</td>\n    <td><code>openllm serve qwen2.5-coder:3b</code></td>\n  </tr>\n  <tr>\n    <td>qwq</td>\n    <td>32b</td>\n    <td>80G</td>\n    <td><code>openllm serve qwq:32b</code></td>\n  </tr>\n</table>\n\nFor the full model list, see the [OpenLLM models repository](https://github.com/bentoml/openllm-models).\n\n## Start an LLM server\n\nTo start an LLM server locally, use the `openllm serve` command and specify the model version.\n\n> [!NOTE]\n> OpenLLM does not store model weights. A Hugging Face token (HF_TOKEN) is required for gated models.\n>\n> 1. Create your Hugging Face token [here](https://huggingface.co/settings/tokens).\n> 2. Request access to the gated model, such as [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct).\n> 3. Set your token as an environment variable by running:\n>    ```bash\n>    export HF_TOKEN=<your token>\n>    ```\n\n```bash\nopenllm serve llama3.2:1b\n```\n\nThe server will be accessible at [http://localhost:3000](http://localhost:3000/), providing OpenAI-compatible APIs for interaction. You can call the endpoints with different frameworks and tools that support OpenAI-compatible APIs. Typically, you may need to specify the following:\n\n- **The API host address**: By default, the LLM is hosted at [http://localhost:3000](http://localhost:3000/).\n- **The model name:** The name can be different depending on the tool you use.\n- **The API key**: The API key used for client authentication. 
This is optional.\n\nHere are some examples:\n\n<details>\n\n<summary>OpenAI Python client</summary>\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(base_url='http://localhost:3000/v1', api_key='na')\n\n# Use the following func to get the available models\n# model_list = client.models.list()\n# print(model_list)\n\nchat_completion = client.chat.completions.create(\n    model=\"meta-llama/Llama-3.2-1B-Instruct\",\n    messages=[\n        {\n            \"role\": \"user\",\n            \"content\": \"Explain superconductors like I'm five years old\"\n        }\n    ],\n    stream=True,\n)\nfor chunk in chat_completion:\n    print(chunk.choices[0].delta.content or \"\", end=\"\")\n```\n\n</details>\n\n<details>\n\n<summary>LlamaIndex</summary>\n\n```python\nfrom llama_index.llms.openai import OpenAI\n\nllm = OpenAI(api_base=\"http://localhost:3000/v1\", model=\"meta-llama/Llama-3.2-1B-Instruct\", api_key=\"dummy\")\n...\n```\n\n</details>\n\n## Chat UI\n\nOpenLLM provides a chat UI at the `/chat` endpoint for the launched LLM server at http://localhost:3000/chat.\n\n<img width=\"800\" alt=\"openllm_ui\" src=\"https://github.com/bentoml/OpenLLM/assets/5886138/8b426b2b-67da-4545-8b09-2dc96ff8a707\">\n\n## Chat with a model in the CLI\n\nTo start a chat conversation in the CLI, use the `openllm run` command and specify the model version.\n\n```bash\nopenllm run llama3:8b\n```\n\n## Model repository\n\nA model repository in OpenLLM represents a catalog of available LLMs that you can run. OpenLLM provides a default model repository that includes the latest open-source LLMs like Llama 3, Mistral, and Qwen2, hosted at [this GitHub repository](https://github.com/bentoml/openllm-models). 
To see all available models from the default and any added repository, use:\n\n```bash\nopenllm model list\n```\n\nTo ensure your local list of models is synchronized with the latest updates from all connected repositories, run:\n\n```bash\nopenllm repo update\n```\n\nTo review a model’s information, run:\n\n```bash\nopenllm model get llama3.2:1b\n```\n\n### Add a model to the default model repository\n\nYou can contribute to the default model repository by adding new models that others can use. This involves creating and submitting a Bento of the LLM. For more information, check out this [example pull request](https://github.com/bentoml/openllm-models/pull/1).\n\n### Set up a custom repository\n\nYou can add your own repository to OpenLLM with custom models. To do so, follow the format in the default OpenLLM model repository with a `bentos` directory to store custom LLMs. You need to [build your Bentos with BentoML](https://docs.bentoml.com/en/latest/guides/build-options.html) and submit them to your model repository.\n\nFirst, prepare your custom models in a `bentos` directory following the guidelines provided by [BentoML to build Bentos](https://docs.bentoml.com/en/latest/guides/build-options.html). Check out the [default model repository](https://github.com/bentoml/openllm-repo) for an example and read the [Developer Guide](https://github.com/bentoml/OpenLLM/blob/main/DEVELOPMENT.md) for details.\n\nThen, register your custom model repository with OpenLLM:\n\n```bash\nopenllm repo add <repo-name> <repo-url>\n```\n\n**Note**: Currently, OpenLLM only supports adding public repositories.\n\n## Deploy to BentoCloud\n\nOpenLLM supports LLM cloud deployment via BentoML, the unified model serving framework, and BentoCloud, an AI inference platform for enterprise AI teams. 
BentoCloud provides fully-managed infrastructure optimized for LLM inference with autoscaling, model orchestration, observability, and many more, allowing you to run any AI model in the cloud.\n\n[Sign up for BentoCloud](https://www.bentoml.com/) for free and [log in](https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html). Then, run `openllm deploy` to deploy a model to BentoCloud:\n\n```bash\nopenllm deploy llama3.2:1b --env HF_TOKEN\n```\n\n> [!NOTE]\n> If you are deploying a gated model, make sure to set HF_TOKEN in environment variables.\n\nOnce the deployment is complete, you can run model inference on the BentoCloud console:\n\n<img width=\"800\" alt=\"bentocloud_ui\" src=\"https://github.com/bentoml/OpenLLM/assets/65327072/4f7819d9-73ea-488a-a66c-f724e5d063e6\">\n\n## Community\n\nOpenLLM is actively maintained by the BentoML team. Feel free to reach out and join us in our pursuit to make LLMs more accessible and easy to use 👉 [Join our Slack community!](https://l.bentoml.com/join-slack)\n\n## Contributing\n\nAs an open-source project, we welcome contributions of all kinds, such as new features, bug fixes, and documentation. Here are some of the ways to contribute:\n\n- Report a bug by [creating a GitHub issue](https://github.com/bentoml/OpenLLM/issues/new/choose).\n- [Submit a pull request](https://github.com/bentoml/OpenLLM/compare) or help review other developers’ [pull requests](https://github.com/bentoml/OpenLLM/pulls).\n- Add an LLM to the OpenLLM default model repository so that other users can run your model. 
See the [pull request template](https://github.com/bentoml/openllm-models/pull/1).\n- Check out the [Developer Guide](https://github.com/bentoml/OpenLLM/blob/main/DEVELOPMENT.md) to learn more.\n\n## Acknowledgements\n\nThis project uses the following open-source projects:\n\n- [bentoml/bentoml](https://github.com/bentoml/bentoml) for production level model serving\n- [vllm-project/vllm](https://github.com/vllm-project/vllm) for production level LLM backend\n- [blrchen/chatgpt-lite](https://github.com/blrchen/chatgpt-lite) for a fancy Web Chat UI\n- [astral-sh/uv](https://github.com/astral-sh/uv) for blazing fast model requirements installing\n\nWe are grateful to the developers and contributors of these projects for their hard work and dedication.\n"
  },
  {
    "path": "README.md.tpl",
    "content": "<div align=\"center\">\n\n<h1>🦾 OpenLLM: Self-Hosting LLMs Made Easy</h1>\n\n[![License: Apache-2.0](https://img.shields.io/badge/License-Apache%202-green.svg)](https://github.com/bentoml/OpenLLM/blob/main/LICENSE)\n[![Releases](https://img.shields.io/pypi/v/openllm.svg?logo=pypi&label=PyPI&logoColor=gold)](https://pypi.org/project/openllm)\n[![CI](https://results.pre-commit.ci/badge/github/bentoml/OpenLLM/main.svg)](https://results.pre-commit.ci/latest/github/bentoml/OpenLLM/main)\n[![X](https://badgen.net/badge/icon/@bentomlai/000000?icon=twitter&label=Follow)](https://twitter.com/bentomlai)\n[![Community](https://badgen.net/badge/icon/Community/562f5d?icon=slack&label=Join)](https://l.bentoml.com/join-slack)\n\n</div>\n\nOpenLLM allows developers to run **any open-source LLMs** (Llama 3.3, Qwen2.5, Phi3 and [more](#supported-models)) or **custom models** as **OpenAI-compatible APIs** with a single command. It features a [built-in chat UI](#chat-ui), state-of-the-art inference backends, and a simplified workflow for creating enterprise-grade cloud deployment with Docker, Kubernetes, and [BentoCloud](#deploy-to-bentocloud).\n\nUnderstand the [design philosophy of OpenLLM](https://www.bentoml.com/blog/from-ollama-to-openllm-running-llms-in-the-cloud).\n\n## Get Started\n\nRun the following commands to install OpenLLM and explore it interactively.\n\n```bash\npip install openllm  # or pip3 install openllm\nopenllm hello\n```\n\n![hello](https://github.com/user-attachments/assets/5af19f23-1b34-4c45-b1e0-a6798b4586d1)\n\n## Supported models\n\nOpenLLM supports a wide range of state-of-the-art open-source LLMs. 
You can also add a [model repository to run custom models](#set-up-a-custom-repository) with OpenLLM.\n\n<table>\n  <tr>\n    <th>Model</th>\n    <th>Parameters</th>\n    <th>Required GPU</th>\n    <th>Start a Server</th>\n  </tr>\n  {%- for key, value in model_dict|items %}\n  <tr>\n    <td>{{key}}</td>\n    <td>{{value['version']}}</td>\n    <td>{{value['pretty_gpu']}}</td>\n    <td><code>{{value['command']}}</code></td>\n  </tr>\n  {%- endfor %}\n</table>\n\n\nFor the full model list, see the [OpenLLM models repository](https://github.com/bentoml/openllm-models).\n\n## Start an LLM server\n\nTo start an LLM server locally, use the `openllm serve` command and specify the model version.\n\n> [!NOTE]\n> OpenLLM does not store model weights. A Hugging Face token (HF_TOKEN) is required for gated models.\n>\n> 1. Create your Hugging Face token [here](https://huggingface.co/settings/tokens).\n> 2. Request access to the gated model, such as [meta-llama/Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct).\n> 3. Set your token as an environment variable by running:\n>    ```bash\n>    export HF_TOKEN=<your token>\n>    ```\n\n```bash\n{{model_dict.get(\"llama3.2\")[\"command\"]}}\n```\n\nThe server will be accessible at [http://localhost:3000](http://localhost:3000/), providing OpenAI-compatible APIs for interaction. You can call the endpoints with different frameworks and tools that support OpenAI-compatible APIs. Typically, you may need to specify the following:\n\n- **The API host address**: By default, the LLM is hosted at [http://localhost:3000](http://localhost:3000/).\n- **The model name:** The name can be different depending on the tool you use.\n- **The API key**: The API key used for client authentication. 
This is optional.\n\nHere are some examples:\n\n<details>\n\n<summary>OpenAI Python client</summary>\n\n```python\nfrom openai import OpenAI\n\nclient = OpenAI(base_url='http://localhost:3000/v1', api_key='na')\n\n# Use the following func to get the available models\n# model_list = client.models.list()\n# print(model_list)\n\nchat_completion = client.chat.completions.create(\n    model=\"meta-llama/Llama-3.2-1B-Instruct\",\n    messages=[\n        {\n            \"role\": \"user\",\n            \"content\": \"Explain superconductors like I'm five years old\"\n        }\n    ],\n    stream=True,\n)\nfor chunk in chat_completion:\n    print(chunk.choices[0].delta.content or \"\", end=\"\")\n```\n\n</details>\n\n<details>\n\n<summary>LlamaIndex</summary>\n\n```python\nfrom llama_index.llms.openai import OpenAI\n\nllm = OpenAI(api_base=\"http://localhost:3000/v1\", model=\"meta-llama/Llama-3.2-1B-Instruct\", api_key=\"dummy\")\n...\n```\n\n</details>\n\n## Chat UI\n\nOpenLLM provides a chat UI at the `/chat` endpoint for the launched LLM server at http://localhost:3000/chat.\n\n<img width=\"800\" alt=\"openllm_ui\" src=\"https://github.com/bentoml/OpenLLM/assets/5886138/8b426b2b-67da-4545-8b09-2dc96ff8a707\">\n\n## Chat with a model in the CLI\n\nTo start a chat conversation in the CLI, use the `openllm run` command and specify the model version.\n\n```bash\nopenllm run llama3:8b\n```\n\n## Model repository\n\nA model repository in OpenLLM represents a catalog of available LLMs that you can run. OpenLLM provides a default model repository that includes the latest open-source LLMs like Llama 3, Mistral, and Qwen2, hosted at [this GitHub repository](https://github.com/bentoml/openllm-models). 
To see all available models from the default and any added repository, use:\n\n```bash\nopenllm model list\n```\n\nTo ensure your local list of models is synchronized with the latest updates from all connected repositories, run:\n\n```bash\nopenllm repo update\n```\n\nTo review a model’s information, run:\n\n```bash\nopenllm model get {{model_dict.get(\"llama3.2\")[\"tag\"]}}\n```\n\n### Add a model to the default model repository\n\nYou can contribute to the default model repository by adding new models that others can use. This involves creating and submitting a Bento of the LLM. For more information, check out this [example pull request](https://github.com/bentoml/openllm-models/pull/1).\n\n### Set up a custom repository\n\nYou can add your own repository to OpenLLM with custom models. To do so, follow the format in the default OpenLLM model repository with a `bentos` directory to store custom LLMs. You need to [build your Bentos with BentoML](https://docs.bentoml.com/en/latest/guides/build-options.html) and submit them to your model repository.\n\nFirst, prepare your custom models in a `bentos` directory following the guidelines provided by [BentoML to build Bentos](https://docs.bentoml.com/en/latest/guides/build-options.html). Check out the [default model repository](https://github.com/bentoml/openllm-repo) for an example and read the [Developer Guide](https://github.com/bentoml/OpenLLM/blob/main/DEVELOPMENT.md) for details.\n\nThen, register your custom model repository with OpenLLM:\n\n```bash\nopenllm repo add <repo-name> <repo-url>\n```\n\n**Note**: Currently, OpenLLM only supports adding public repositories.\n\n## Deploy to BentoCloud\n\nOpenLLM supports LLM cloud deployment via BentoML, the unified model serving framework, and BentoCloud, an AI inference platform for enterprise AI teams. 
BentoCloud provides fully-managed infrastructure optimized for LLM inference with autoscaling, model orchestration, observability, and many more, allowing you to run any AI model in the cloud.\n\n[Sign up for BentoCloud](https://www.bentoml.com/) for free and [log in](https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html). Then, run `openllm deploy` to deploy a model to BentoCloud:\n\n```bash\nopenllm deploy {{model_dict.get(\"llama3.2\")[\"tag\"]}}\n```\n\n> [!NOTE]\n> If you are deploying a gated model, make sure to set HF_TOKEN in environment variables.\n\nOnce the deployment is complete, you can run model inference on the BentoCloud console:\n\n<img width=\"800\" alt=\"bentocloud_ui\" src=\"https://github.com/bentoml/OpenLLM/assets/65327072/4f7819d9-73ea-488a-a66c-f724e5d063e6\">\n\n## Community\n\nOpenLLM is actively maintained by the BentoML team. Feel free to reach out and join us in our pursuit to make LLMs more accessible and easy to use 👉 [Join our Slack community!](https://l.bentoml.com/join-slack)\n\n## Contributing\n\nAs an open-source project, we welcome contributions of all kinds, such as new features, bug fixes, and documentation. Here are some of the ways to contribute:\n\n- Report a bug by [creating a GitHub issue](https://github.com/bentoml/OpenLLM/issues/new/choose).\n- [Submit a pull request](https://github.com/bentoml/OpenLLM/compare) or help review other developers’ [pull requests](https://github.com/bentoml/OpenLLM/pulls).\n- Add an LLM to the OpenLLM default model repository so that other users can run your model. 
See the [pull request template](https://github.com/bentoml/openllm-models/pull/1).\n- Check out the [Developer Guide](https://github.com/bentoml/OpenLLM/blob/main/DEVELOPMENT.md) to learn more.\n\n## Acknowledgements\n\nThis project uses the following open-source projects:\n\n- [bentoml/bentoml](https://github.com/bentoml/bentoml) for production level model serving\n- [vllm-project/vllm](https://github.com/vllm-project/vllm) for production level LLM backend\n- [blrchen/chatgpt-lite](https://github.com/blrchen/chatgpt-lite) for a fancy Web Chat UI\n- [astral-sh/uv](https://github.com/astral-sh/uv) for blazing fast model requirements installing\n\nWe are grateful to the developers and contributors of these projects for their hard work and dedication.\n"
  },
  {
    "path": "gen_readme.py",
    "content": "# /// script\n# requires-python = \">=3.11\"\n# dependencies = [\n#     \"jinja2\",\n#     \"pre-commit\",\n#     \"uv\",\n# ]\n# ///\nimport subprocess, sys, pathlib, json, jinja2\n\nif __name__ == '__main__':\n  with (pathlib.Path('.').parent / 'README.md').open('w') as f:\n    f.write(\n      jinja2.Environment(loader=jinja2.FileSystemLoader('.'))\n      .get_template('README.md.tpl')\n      .render(\n        model_dict=json.loads(\n          subprocess.run(\n            [\n              sys.executable,\n              '-m',\n              'uv',\n              'run',\n              '--with-editable',\n              '.',\n              'openllm',\n              'model',\n              'list',\n              '--output',\n              'readme',\n            ],\n            text=True,\n            check=True,\n            capture_output=True,\n          ).stdout.strip()\n        )\n      )\n    )\n"
  },
  {
    "path": "pyproject.toml",
    "content": "[project]\nname = \"openllm\"\ndescription = \"OpenLLM: Self-hosting LLMs Made Easy.\"\nreadme = { file = \"README.md\", content-type = \"text/markdown\" }\nauthors = [{ name = \"BentoML Team\", email = \"contact@bentoml.com\" }]\ndynamic = [\"version\"]\nclassifiers = [\n  \"Development Status :: 5 - Production/Stable\",\n  \"Environment :: GPU :: NVIDIA CUDA\",\n  \"Environment :: GPU :: NVIDIA CUDA :: 12\",\n  \"Environment :: GPU :: NVIDIA CUDA :: 11.8\",\n  \"Environment :: GPU :: NVIDIA CUDA :: 11.7\",\n  \"License :: OSI Approved :: Apache Software License\",\n  \"Topic :: Scientific/Engineering :: Artificial Intelligence\",\n  \"Topic :: Software Development :: Libraries\",\n  \"Operating System :: OS Independent\",\n  \"Intended Audience :: Developers\",\n  \"Intended Audience :: Science/Research\",\n  \"Intended Audience :: System Administrators\",\n  \"Typing :: Typed\",\n  \"Programming Language :: Python\",\n  \"Programming Language :: Python :: 3\",\n  \"Programming Language :: Python :: 3 :: Only\",\n  \"Programming Language :: Python :: 3.8\",\n  \"Programming Language :: Python :: 3.9\",\n  \"Programming Language :: Python :: 3.10\",\n  \"Programming Language :: Python :: 3.11\",\n  \"Programming Language :: Python :: 3.12\",\n  \"Programming Language :: Python :: Implementation :: CPython\",\n  \"Programming Language :: Python :: Implementation :: PyPy\",\n]\ndependencies = [\n  \"bentoml==1.4.23\",\n  \"typer\",\n  \"questionary\",\n  \"pyaml\",\n  \"attrs\",\n  \"psutil\",\n  \"pip_requirements_parser\",\n  \"nvidia-ml-py\",\n  \"dulwich\",\n  \"tabulate\",\n  \"uv\",\n  \"openai==1.90.0\",\n  \"huggingface-hub\",\n  \"hf-xet\",\n  \"typing-extensions>=4.12.2\",\n]\nkeywords = [\n  \"MLOps\",\n  \"AI\",\n  \"BentoML\",\n  \"Model Serving\",\n  \"Model Deployment\",\n  \"LLMOps\",\n  \"Falcon\",\n  \"Vicuna\",\n  \"Llama 2\",\n  \"Fine tuning\",\n  \"Serverless\",\n  \"Large Language Model\",\n  \"Generative AI\",\n  
\"StableLM\",\n  \"Alpaca\",\n  \"PyTorch\",\n  \"Mistral\",\n  \"vLLM\",\n  \"Transformers\",\n]\nlicense = \"Apache-2.0\"\nrequires-python = \">=3.9\"\n\n[project.scripts]\nopenllm = \"openllm.__main__:app\"\n\n[project.urls]\nBlog = \"https://modelserving.com\"\nDocumentation = \"https://github.com/bentoml/OpenLLM#readme\"\nGitHub = \"https://github.com/bentoml/OpenLLM\"\nHomepage = \"https://bentoml.com\"\nTracker = \"https://github.com/bentoml/OpenLLM/issues\"\nTwitter = \"https://twitter.com/bentomlai\"\n\n[tool.typer]\nsrc-dir = \"src/openllm\"\n\n[build-system]\nrequires = [\"hatchling==1.27.0\", \"hatch-vcs==0.5.0\"]\nbuild-backend = 'hatchling.build'\n\n[dependency-groups]\ntests = [\"pexpect>=4.9.0\", \"pytest>=8.3.5\"]\n\n[tool.hatch.version]\nsource = \"vcs\"\nfallback-version = \"0.0.0\"\n[tool.hatch.build.hooks.vcs]\nversion-file = \"src/openllm/_version.py\"\n[tool.hatch.version.raw-options]\ngit_describe_command = [\n  \"git\",\n  \"describe\",\n  \"--dirty\",\n  \"--tags\",\n  \"--long\",\n  \"--first-parent\",\n]\nversion_scheme = \"post-release\"\nfallback_version = \"0.0.0\"\n[tool.hatch.metadata]\nallow-direct-references = true\n[tool.hatch.build.targets.wheel]\nonly-include = [\"src/openllm\"]\nsources = [\"src\"]\n[tool.hatch.build.targets.sdist]\nexclude = [\"/.git_archival.txt\", \"/.python-version-default\"]\n"
  },
  {
    "path": "pyrightconfig.json",
    "content": "{\n  \"useLibraryCodeForTypes\": true,\n  \"verboseOutput\": true,\n  \"define\": {\n    \"MYPY\": true\n  },\n  \"venvPath\": \".\",\n  \"venv\": \".venv\",\n  \"pythonVersion\": \"3.9\",\n  \"enableExperimentalFeatures\": true,\n  \"reportMissingImports\": \"warning\",\n  \"reportMissingTypeStubs\": false,\n  \"reportPrivateUsage\": \"warning\",\n  \"reportUnknownArgumentType\": \"warning\",\n  \"reportUnsupportedDunderAll\": \"warning\",\n  \"reportWildcardImportFromLibrary\": \"warning\"\n}\n"
  },
  {
    "path": "release.sh",
    "content": "#!/usr/bin/env bash\n\nset -e\n\n# Function to print script usage\nprint_usage() {\n  echo \"Usage: $0 [--release <major|minor|patch|alpha>]\"\n}\n\n# Function to validate release argument\nvalidate_release() {\n  local release=$1\n\n  if [[ $release == \"major\" || $release == \"minor\" || $release == \"patch\" || $release == \"alpha\" ]]; then\n    return 0\n  else\n    return 1\n  fi\n}\n\n# Check if release flag is provided\nif [[ $1 == \"--release\" ]]; then\n  # Check if release argument is provided\n  if [[ -z $2 ]]; then\n    echo \"Error: No release argument provided.\"\n    print_usage\n    exit 1\n  fi\n\n  release=$2\n\n  if ! validate_release \"$release\"; then\n    echo \"Error: Invalid release argument. Only 'major', 'minor', 'patch', or 'alpha' are allowed.\"\n    print_usage\n    exit 1\n  fi\nelse\n  echo \"Error: Unknown option or no option provided.\"\n  print_usage\n  exit 1\nfi\n\n# Get the current version and separate the alpha part if it exists\nversion=\"$(git describe --tags \"$(git rev-list --tags --max-count=1)\")\"\nVERSION=\"${version#v}\"\n\n# Initialize variables for alpha versioning\nALPHA=\"\"\nALPHA_NUM=0\n\n# Check if current version is an alpha version and split accordingly\nif [[ $VERSION =~ -alpha ]]; then\n  IFS='-' read -r BASE_VERSION ALPHA <<<\"$VERSION\"\n  if [[ $ALPHA =~ [.] ]]; then\n    IFS='.' 
read -r ALPHA ALPHA_NUM <<<\"$ALPHA\"\n  fi\nelse\n  BASE_VERSION=\"$VERSION\"\nfi\n\n# Save the current value of IFS to restore it later and split the base version\nOLD_IFS=$IFS\nIFS='.'\nread -ra VERSION_BITS <<<\"$BASE_VERSION\"\nIFS=$OLD_IFS\n\n# Assign split version numbers\nVNUM1=${VERSION_BITS[0]}\nVNUM2=${VERSION_BITS[1]}\nVNUM3=${VERSION_BITS[2]}\n\n# Adjust the version numbers based on the release type\nif [[ $release == 'major' ]]; then\n  VNUM1=$((VNUM1 + 1))\n  VNUM2=0\n  VNUM3=0\n  ALPHA=\"\" # Reset alpha for major release\nelif [[ $release == 'minor' ]]; then\n  if [[ -n $ALPHA ]]; then\n    ALPHA=\"\" # Remove alpha suffix for minor release from an alpha version\n  else\n    VNUM2=$((VNUM2 + 1))\n    VNUM3=0\n  fi\nelif [[ $release == 'patch' ]]; then\n  VNUM3=$((VNUM3 + 1))\n  ALPHA=\"\" # Reset alpha for patch release\nelif [[ $release == 'alpha' ]]; then\n  if [ -n \"$ALPHA\" ]; then\n    ALPHA_NUM=$((ALPHA_NUM + 1))\n  else\n    VNUM2=$((VNUM2 + 1))\n    VNUM3=0\n    ALPHA=\"alpha\"\n    ALPHA_NUM=0\n  fi\nfi\n\n# Construct the new version string\nif [ -n \"$ALPHA\" ]; then\n  if ((ALPHA_NUM > 0)); then\n    RELEASE_VERSION=\"$VNUM1.$VNUM2.$VNUM3-alpha.$ALPHA_NUM\"\n  else\n    RELEASE_VERSION=\"$VNUM1.$VNUM2.$VNUM3-alpha\"\n  fi\nelse\n  RELEASE_VERSION=\"$VNUM1.$VNUM2.$VNUM3\"\nfi\n\necho \"Commit count: $(git rev-list --count HEAD)\"\necho \"Releasing tag ${RELEASE_VERSION}...\" && git tag -a \"v${RELEASE_VERSION}\" -m \"Release ${RELEASE_VERSION} [generated by GitHub Actions]\"\ngit push origin \"v${RELEASE_VERSION}\"\necho \"Finish releasing OpenLLM ${RELEASE_VERSION}\"\n"
  },
  {
    "path": "src/openllm/__init__.py",
    "content": ""
  },
  {
    "path": "src/openllm/__main__.py",
    "content": "from __future__ import annotations\n\nimport importlib.metadata, os, platform, random, sys, typing\nimport questionary, typer\n\nfrom collections import defaultdict\nfrom openllm.accelerator_spec import can_run, get_local_machine_spec\nfrom openllm.analytic import DO_NOT_TRACK, OpenLLMTyper\nfrom openllm.clean import app as clean_app\nfrom openllm.cloud import deploy as cloud_deploy, get_cloud_machine_spec\nfrom openllm.common import CHECKED, INTERACTIVE, VERBOSE_LEVEL, BentoInfo, output\nfrom openllm.local import run as local_run, serve as local_serve\nfrom openllm.model import app as model_app, ensure_bento, list_bento\nfrom openllm.repo import app as repo_app, cmd_update\n\nif typing.TYPE_CHECKING:\n  from openllm.common import DeploymentTarget\n\napp = OpenLLMTyper(\n  help='`openllm hello` to get started. '\n  'OpenLLM is a CLI tool to manage and deploy open source LLMs and'\n  ' get an OpenAI API compatible chat server in seconds.'\n)\n\napp.add_typer(repo_app, name='repo')\napp.add_typer(model_app, name='model')\napp.add_typer(clean_app, name='clean')\n\n\ndef _select_bento_name(models: list[BentoInfo], target: DeploymentTarget) -> tuple[str, str]:\n  from tabulate import tabulate\n\n  model_infos = [(model.repo.name, model.name, can_run(model, target)) for model in models]\n  model_name_groups: defaultdict[tuple[str, str], float] = defaultdict(lambda: 0.0)\n  for repo, name, score in model_infos:\n    model_name_groups[repo, name] += score\n  table_data = [\n    (name, repo, CHECKED if score > 0 else '') for (repo, name), score in model_name_groups.items()\n  ]\n  if not table_data:\n    output('No model found', style='red')\n    raise typer.Exit(1)\n  table: list[str] = tabulate(table_data, headers=['model', 'repo', 'locally runnable']).split('\\n')\n\n  selected: tuple[str, str] | None = questionary.select(\n    'Select a model',\n    [\n      questionary.Separator(f'{table[0]}\\n   {table[1]}'),\n      *[questionary.Choice(line, 
value=value[:2]) for value, line in zip(table_data, table[2:])],\n    ],\n    use_search_filter=True,\n    use_jk_keys=False,\n  ).ask()\n  if selected is None:\n    raise typer.Exit(1)\n  return selected\n\n\ndef _select_bento_version(\n  models: list[BentoInfo], target: DeploymentTarget | None, bento_name: str, repo: str\n) -> tuple[BentoInfo, float]:\n  from tabulate import tabulate\n\n  model_infos: list[tuple[BentoInfo, float]] = [\n    (model, can_run(model, target))\n    for model in models\n    if model.name == bento_name and model.repo.name == repo\n  ]\n\n  table_data = [\n    [model.tag, CHECKED if score > 0 else '']\n    for model, score in model_infos\n    if model.name == bento_name and model.repo.name == repo\n  ]\n  if not table_data:\n    output(f'No model found for {bento_name} in {repo}', style='red')\n    raise typer.Exit(1)\n  table: list[str] = tabulate(table_data, headers=['version', 'locally runnable']).split('\\n')\n\n  selected: tuple[BentoInfo, float] | None = questionary.select(\n    'Select a version',\n    [\n      questionary.Separator(f'{table[0]}\\n   {table[1]}'),\n      *[questionary.Choice(line, value=value[:2]) for value, line in zip(model_infos, table[2:])],\n    ],\n    use_search_filter=True,\n    use_jk_keys=False,\n  ).ask()\n  if selected is None:\n    raise typer.Exit(1)\n  return selected\n\n\ndef _select_target(bento: BentoInfo, targets: list[DeploymentTarget]) -> DeploymentTarget:\n  from tabulate import tabulate\n\n  targets.sort(key=lambda x: can_run(bento, x), reverse=True)\n  if not targets:\n    output('No available instance type, check your bentocloud account', style='red')\n    raise typer.Exit(1)\n\n  table = tabulate(\n    [\n      [\n        target.name,\n        target.accelerators_repr,\n        f'${target.price}',\n        CHECKED if can_run(bento, target) else 'insufficient res.',\n      ]\n      for target in targets\n    ],\n    headers=['instance type', 'accelerator', 'price/hr', 'deployable'],\n  
).split('\\n')\n\n  selected: DeploymentTarget | None = questionary.select(\n    'Select an instance type',\n    [\n      questionary.Separator(f'{table[0]}\\n   {table[1]}'),\n      *[questionary.Choice(f'{line}', value=target) for target, line in zip(targets, table[2:])],\n    ],\n    use_search_filter=True,\n    use_jk_keys=False,\n  ).ask()\n  if selected is None:\n    raise typer.Exit(1)\n  return selected\n\n\ndef _select_action(\n  bento: BentoInfo,\n  score: float,\n  context: typing.Optional[str] = None,\n  envs: typing.Optional[list[str]] = None,\n  arg: typing.Optional[list[str]] = None,\n  interactive: bool = False,\n) -> None:\n  if score > 0:\n    options: list[typing.Any] = [\n      questionary.Separator('Available actions'),\n      questionary.Choice('0. Run the model in terminal', value='run', shortcut_key='0'),\n      questionary.Separator(f'  $ openllm run {bento}'),\n      questionary.Separator(' '),\n      questionary.Choice(\n        '1. Serve the model locally and get a chat server', value='serve', shortcut_key='1'\n      ),\n      questionary.Separator(f'  $ openllm serve {bento}'),\n      questionary.Separator(' '),\n      questionary.Choice(\n        '2. Deploy the model to bentocloud and get a scalable chat server',\n        value='deploy',\n        shortcut_key='2',\n      ),\n      questionary.Separator(f'  $ openllm deploy {bento}'),\n    ]\n  else:\n    options = [\n      questionary.Separator('Available actions'),\n      questionary.Choice(\n        '0. Run the model in terminal', value='run', disabled='insufficient res.', shortcut_key='0'\n      ),\n      questionary.Separator(f'  $ openllm run {bento}'),\n      questionary.Separator(' '),\n      questionary.Choice(\n        '1. 
Serve the model locally and get a chat server',\n        value='serve',\n        disabled='insufficient res.',\n        shortcut_key='1',\n      ),\n      questionary.Separator(f'  $ openllm serve {bento}'),\n      questionary.Separator(' '),\n      questionary.Choice(\n        '2. Deploy the model to bentocloud and get a scalable chat server',\n        value='deploy',\n        shortcut_key='2',\n      ),\n      questionary.Separator(f'  $ openllm deploy {bento}'),\n    ]\n  action: str | None = questionary.select('Select an action', options).ask()\n  if action is None:\n    raise typer.Exit(1)\n  if action == 'run':\n    try:\n      port = random.randint(30000, 40000)\n      local_run(bento, port=port, cli_envs=envs, cli_args=arg)\n    finally:\n      output('\\nUse this command to run the action again:', style='green')\n      output(f'  $ openllm run {bento}', style='orange')\n  elif action == 'serve':\n    try:\n      local_serve(bento, cli_envs=envs, cli_args=arg)\n    finally:\n      output('\\nUse this command to run the action again:', style='green')\n      output(f'  $ openllm serve {bento}', style='orange')\n  elif action == 'deploy':\n    targets = get_cloud_machine_spec(context=context)\n    target = _select_target(bento, targets)\n    try:\n      cloud_deploy(\n        bento, target, cli_envs=envs, context=context, cli_args=arg, interactive=interactive\n      )\n    finally:\n      output('\\nUse this command to run the action again:', style='green')\n      output(f'  $ openllm deploy {bento} --instance-type {target.name}', style='orange')\n\n\n@app.command(help='get started interactively')\ndef hello(\n  repo: typing.Optional[str] = None,\n  envs: typing.Optional[list[str]] = typer.Option(\n    None,\n    '--env',\n    help='Environment variables to pass to the deployment command. Format: NAME or NAME=value. 
Can be specified multiple times.',\n  ),\n  arg: typing.Optional[list[str]] = typer.Option(\n    None,\n    '--arg',\n    help='Bento arguments in the form of key=value pairs. Can be specified multiple times.',\n  ),\n  context: typing.Optional[str] = typer.Option(\n    None, '--context', help='BentoCloud context name to pass to the deployment command.'\n  ),\n) -> None:\n  cmd_update()\n  INTERACTIVE.set(True)\n\n  target = get_local_machine_spec()\n  output(f'  Detected Platform: {target.platform}', style='green')\n  if target.accelerators:\n    output('  Detected Accelerators: ', style='green')\n    for a in target.accelerators:\n      output(f'   - {a.model} {a.memory_size}GB', style='green')\n  else:\n    output('  Detected Accelerators: None', style='green')\n\n  models = list_bento(repo_name=repo)\n  if not models:\n    output('No model found, you probably need to update the model repo:', style='red')\n    output('  $ openllm repo update', style='orange')\n    raise typer.Exit(1)\n\n  bento_name, repo = _select_bento_name(models, target)\n  bento, score = _select_bento_version(models, target, bento_name, repo)\n  _select_action(bento, score, context=context, envs=envs, arg=arg, interactive=INTERACTIVE.get())\n\n\n@app.command(help='start an OpenAI API compatible chat server and chat in browser')\ndef serve(\n  model: typing.Annotated[str, typer.Argument()],\n  repo: typing.Optional[str] = None,\n  port: int = 3000,\n  verbose: bool = False,\n  env: typing.Optional[list[str]] = typer.Option(\n    None,\n    '--env',\n    help='Environment variables to pass to the deployment command. Format: NAME or NAME=value. Can be specified multiple times.',\n  ),\n  arg: typing.Optional[list[str]] = typer.Option(\n    None,\n    '--arg',\n    help='Bento arguments in the form of key=value pairs. 
Can be specified multiple times.',\n  ),\n) -> None:\n  cmd_update()\n  if verbose:\n    VERBOSE_LEVEL.set(20)\n  target = get_local_machine_spec()\n  bento = ensure_bento(model, target=target, repo_name=repo)\n  local_serve(bento, port=port, cli_envs=env, cli_args=arg)\n\n\n@app.command(help='run the model and chat in terminal')\ndef run(\n  model: typing.Annotated[str, typer.Argument()] = '',\n  repo: typing.Optional[str] = None,\n  port: typing.Optional[int] = None,\n  timeout: int = 600,\n  verbose: bool = False,\n  env: typing.Optional[list[str]] = typer.Option(\n    None,\n    '--env',\n    help='Environment variables to pass to the deployment command. Format: NAME or NAME=value. Can be specified multiple times.',\n  ),\n  arg: typing.Optional[list[str]] = typer.Option(\n    None,\n    '--arg',\n    help='Bento arguments in the form of key=value pairs. Can be specified multiple times.',\n  ),\n) -> None:\n  cmd_update()\n  if verbose:\n    VERBOSE_LEVEL.set(20)\n  target = get_local_machine_spec()\n  bento = ensure_bento(model, target=target, repo_name=repo)\n  if port is None:\n    port = random.randint(30000, 40000)\n  local_run(bento, port=port, timeout=timeout, cli_envs=env, cli_args=arg)\n\n\n@app.command(help='deploy production-ready OpenAI API-compatible server to BentoCloud')\ndef deploy(\n  model: typing.Annotated[str, typer.Argument()] = '',\n  instance_type: typing.Optional[str] = None,\n  repo: typing.Optional[str] = None,\n  verbose: bool = False,\n  env: typing.Optional[list[str]] = typer.Option(\n    None,\n    '--env',\n    help='Environment variables to pass to the deployment command. Format: NAME or NAME=value. Can be specified multiple times.',\n  ),\n  context: typing.Optional[str] = typer.Option(\n    None, '--context', help='BentoCloud context name to pass to the deployment command.'\n  ),\n  arg: typing.Optional[list[str]] = typer.Option(\n    None,\n    '--arg',\n    help='Bento arguments in the form of key=value pairs. 
Can be specified multiple times.',\n  ),\n) -> None:\n  cmd_update()\n  if verbose:\n    VERBOSE_LEVEL.set(20)\n  bento = ensure_bento(model, repo_name=repo)\n  if instance_type is not None:\n    return cloud_deploy(\n      bento,\n      DeploymentTarget(accelerators=[], name=instance_type),\n      cli_envs=env,\n      context=context,\n      cli_args=arg,\n      interactive=INTERACTIVE.get(),\n    )\n  targets = get_cloud_machine_spec(context=context)\n  runnable_targets = sorted(\n    filter(lambda x: can_run(bento, x) > 0, targets), key=lambda x: can_run(bento, x), reverse=True\n  )\n  if not runnable_targets:\n    output('No available instance type, check your bentocloud account', style='red')\n    raise typer.Exit(1)\n\n  # Use questionary to select target when in interactive mode and no instance_type is provided\n  if INTERACTIVE.get() and instance_type is None:\n    target = _select_target(bento, targets)\n  else:\n    target = runnable_targets[0]\n    output(f'Recommended instance type: {target.name}', style='green')\n\n  cloud_deploy(\n    bento, target, cli_envs=env, context=context, cli_args=arg, interactive=INTERACTIVE.get()\n  )\n\n\n@app.callback(invoke_without_command=True)\ndef typer_callback(\n  verbose: int = 0,\n  do_not_track: bool = typer.Option(\n    False, '--do-not-track', help='Whether to disable usage tracking', envvar=DO_NOT_TRACK\n  ),\n  version: bool = typer.Option(False, '--version', '-v', help='Show version'),\n) -> None:\n  if verbose:\n    VERBOSE_LEVEL.set(verbose)\n  if version:\n    output(\n      f'openllm, {importlib.metadata.version(\"openllm\")}\\nPython ({platform.python_implementation()}) {platform.python_version()}'\n    )\n    sys.exit(0)\n  if do_not_track:\n    os.environ[DO_NOT_TRACK] = str(True)\n\n\nif __name__ == '__main__':\n  app()\n"
  },
  {
    "path": "src/openllm/accelerator_spec.py",
    "content": "from __future__ import annotations\n\nimport functools, math, re, typing\nimport psutil, pydantic\n\nfrom pydantic import BeforeValidator\nfrom typing_extensions import override\nfrom openllm.common import BentoInfo, DeploymentTarget, output, Accelerator\n\n\ndef parse_memory_string(v: typing.Any) -> typing.Any:\n  \"\"\"Parse memory strings like \"60Gi\" into float.\"\"\"\n  if isinstance(v, str):\n    match = re.match(r'(\\d+(\\.\\d+)?)\\s*Gi$', v, re.IGNORECASE)\n    if match:\n      return float(match.group(1))\n  # Pass other types (including numbers or other strings for standard float conversion) through\n  return v\n\n\nclass Resource(pydantic.BaseModel):\n  memory: typing.Annotated[float, BeforeValidator(parse_memory_string)] = 0.0\n  cpu: int = 0\n  gpu: int = 0\n  gpu_type: str = ''\n\n  @override\n  def __hash__(self) -> int:\n    return hash((self.cpu, self.memory, self.gpu, self.gpu_type))\n\n  def __bool__(self) -> bool:\n    return any(value is not None for value in self.__dict__.values())\n\n\nACCELERATOR_SPECS: dict[str, Accelerator] = {\n  'nvidia-gtx-1650': Accelerator(model='GTX 1650', memory_size=4.0),\n  'nvidia-gtx-1060': Accelerator(model='GTX 1060', memory_size=6.0),\n  'nvidia-gtx-1080-ti': Accelerator(model='GTX 1080 Ti', memory_size=11.0),\n  'nvidia-rtx-3060': Accelerator(model='RTX 3060', memory_size=12.0),\n  'nvidia-rtx-3060-ti': Accelerator(model='RTX 3060 Ti', memory_size=8.0),\n  'nvidia-rtx-3070-ti': Accelerator(model='RTX 3070 Ti', memory_size=8.0),\n  'nvidia-rtx-3080': Accelerator(model='RTX 3080', memory_size=10.0),\n  'nvidia-rtx-3080-ti': Accelerator(model='RTX 3080 Ti', memory_size=12.0),\n  'nvidia-rtx-3090': Accelerator(model='RTX 3090', memory_size=24.0),\n  'nvidia-rtx-4070-ti': Accelerator(model='RTX 4070 Ti', memory_size=12.0),\n  'nvidia-tesla-p4': Accelerator(model='P4', memory_size=8.0),\n  'nvidia-tesla-p100': Accelerator(model='P100', memory_size=16.0),\n  'nvidia-tesla-k80': 
Accelerator(model='K80', memory_size=12.0),\n  'nvidia-tesla-t4': Accelerator(model='T4', memory_size=16.0),\n  'nvidia-tesla-v100': Accelerator(model='V100', memory_size=16.0),\n  'nvidia-l4': Accelerator(model='L4', memory_size=24.0),\n  'nvidia-tesla-l4': Accelerator(model='L4', memory_size=24.0),\n  'nvidia-tesla-a10g': Accelerator(model='A10G', memory_size=24.0),\n  'nvidia-a100-80g': Accelerator(model='A100', memory_size=80.0),\n  'nvidia-a100-80gb': Accelerator(model='A100', memory_size=80.0),\n  'nvidia-tesla-a100': Accelerator(model='A100', memory_size=40.0),\n  'nvidia-tesla-h100': Accelerator(model='H100', memory_size=80.0),\n  'nvidia-h200-141gb': Accelerator(model='H200', memory_size=141.0),\n  'nvidia-blackwell-b100': Accelerator(model='B100', memory_size=192.0),\n  'nvidia-blackwell-gb200': Accelerator(model='GB200', memory_size=192.0),\n}\n\n\n@functools.lru_cache\ndef get_local_machine_spec() -> DeploymentTarget:\n  if psutil.MACOS:\n    return DeploymentTarget(accelerators=[], source='local', platform='macos')\n\n  if psutil.WINDOWS:\n    platform = 'windows'\n  elif psutil.LINUX:\n    platform = 'linux'\n  else:\n    raise NotImplementedError('Unsupported platform')\n\n  from pynvml import (\n    nvmlDeviceGetCount,\n    nvmlDeviceGetCudaComputeCapability,\n    nvmlDeviceGetHandleByIndex,\n    nvmlDeviceGetMemoryInfo,\n    nvmlDeviceGetName,\n    nvmlInit,\n    nvmlShutdown,\n  )\n\n  try:\n    nvmlInit()\n    device_count = nvmlDeviceGetCount()\n    accelerators: list[Accelerator] = []\n    for i in range(device_count):\n      handle = nvmlDeviceGetHandleByIndex(i)\n      name = nvmlDeviceGetName(handle)\n      memory_info = nvmlDeviceGetMemoryInfo(handle)\n      accelerators.append(\n        Accelerator(model=name, memory_size=math.ceil(int(memory_info.total) / 1024**3))\n      )\n      compute_capability = nvmlDeviceGetCudaComputeCapability(handle)\n      if compute_capability < (7, 5):\n        output(\n          f'GPU {name} with compute 
capability {compute_capability} '\n          'may not be supported, 7.5 or higher is recommended. check '\n          'https://developer.nvidia.com/cuda-gpus for more information',\n          style='yellow',\n        )\n    nvmlShutdown()\n    return DeploymentTarget(accelerators=accelerators, source='local', platform=platform)\n  except Exception as e:\n    output(\n      'Failed to get local GPU info. Ensure nvidia driver is installed to enable local GPU deployment',\n      style='yellow',\n    )\n    output(f'Error: {e}', style='red', level=20)\n    return DeploymentTarget(accelerators=[], source='local', platform=platform)\n\n\n@functools.lru_cache(typed=True)\ndef can_run(bento: BentoInfo, target: DeploymentTarget | None = None) -> float:\n  \"\"\"\n  Calculate if the bento can be deployed on the target.\n  \"\"\"\n  if target is None:\n    target = get_local_machine_spec()\n\n  resource_spec = Resource(**(bento.bento_yaml['services'][0]['config'].get('resources', {})))\n  labels = bento.bento_yaml.get('labels', {})\n  platforms = labels.get('platforms', 'linux').split(',')\n\n  if target.platform not in platforms:\n    return 0.0\n\n  # return 0.5 if no resource is specified\n  if not resource_spec:\n    return 0.5\n\n  if resource_spec.gpu > 0:\n    required_gpu = ACCELERATOR_SPECS[resource_spec.gpu_type]\n    filtered_accelerators = [\n      ac for ac in target.accelerators if ac.memory_size >= required_gpu.memory_size\n    ]\n    if resource_spec.gpu > len(filtered_accelerators):\n      return 0.0\n    return (\n      required_gpu.memory_size\n      * resource_spec.gpu\n      / sum(ac.memory_size for ac in target.accelerators)\n    )\n  if target.accelerators:\n    return 0.01 / sum(ac.memory_size for ac in target.accelerators)\n  return 1.0\n"
  },
  {
    "path": "src/openllm/analytic.py",
    "content": "from __future__ import annotations\n\nimport functools, os, re, time, typing, abc\nimport attr, click, typer, typer.core\n\nDO_NOT_TRACK = 'BENTOML_DO_NOT_TRACK'\n\n\nclass EventMeta(abc.ABC):\n  @property\n  def event_name(self) -> str:\n    # camel case to snake case\n    event_name = re.sub(r'(?<!^)(?=[A-Z])', '_', self.__class__.__name__).lower()\n    # remove \"_event\" suffix\n    suffix_to_remove = '_event'\n    if event_name.endswith(suffix_to_remove):\n      event_name = event_name[: -len(suffix_to_remove)]\n    return event_name\n\n\n@attr.define\nclass CliEvent(EventMeta):\n  cmd_group: str\n  cmd_name: str\n  duration_in_ms: float = attr.field(default=0)\n  error_type: typing.Optional[str] = attr.field(default=None)\n  return_code: typing.Optional[int] = attr.field(default=None)\n\n\n@attr.define\nclass OpenllmCliEvent(CliEvent):\n  pass\n\n\nclass OrderedCommands(typer.core.TyperGroup):\n  def list_commands(self, ctx: click.Context) -> list[str]:\n    return list(self.commands)\n\n\nclass OpenLLMTyper(typer.Typer):\n  def __init__(self, *args: typing.Any, **kwargs: typing.Any):\n    no_args_is_help: bool = kwargs.pop('no_args_is_help', True)\n    context_settings: dict[str, typing.Any] = kwargs.pop('context_settings', {})\n    if 'help_option_names' not in context_settings:\n      context_settings['help_option_names'] = ('-h', '--help')\n    if 'max_content_width' not in context_settings:\n      context_settings['max_content_width'] = int(os.environ.get('COLUMNS', str(120)))\n    klass = kwargs.pop('cls', OrderedCommands)\n\n    super().__init__(\n      *args, cls=klass, no_args_is_help=no_args_is_help, context_settings=context_settings, **kwargs\n    )\n\n  # NOTE: Since OpenLLMTyper only wraps command to add analytics, the default type-hint for @app.command\n  # does not change, hence the below hijacking.\n  if typing.TYPE_CHECKING:\n    command = typer.Typer.command\n  else:\n\n    def command(self, *args: typing.Any, **kwargs: 
typing.Any):\n      def decorator(f):\n        @functools.wraps(f)\n        @click.pass_context\n        def wrapped(ctx: click.Context, *args, **kwargs):\n          from bentoml._internal.utils.analytics import track\n\n          do_not_track = os.environ.get(DO_NOT_TRACK, str(False)).lower() == 'true'\n\n          # so we know that the root program is openllm\n          command_name = ctx.info_name\n          if ctx.parent.parent is not None:\n            # openllm model list\n            command_group = ctx.parent.info_name\n          elif ctx.parent.info_name == ctx.find_root().info_name:\n            # openllm run\n            command_group = 'openllm'\n\n          if do_not_track:\n            return f(*args, **kwargs)\n          start_time = time.time_ns()\n          try:\n            return_value = f(*args, **kwargs)\n            duration_in_ns = time.time_ns() - start_time\n            track(\n              OpenllmCliEvent(\n                cmd_group=command_group, cmd_name=command_name, duration_in_ms=duration_in_ns / 1e6\n              )\n            )\n            return return_value\n          except BaseException as e:\n            duration_in_ns = time.time_ns() - start_time\n            track(\n              OpenllmCliEvent(\n                cmd_group=command_group,\n                cmd_name=command_name,\n                duration_in_ms=duration_in_ns / 1e6,\n                error_type=type(e).__name__,\n                return_code=(2 if isinstance(e, KeyboardInterrupt) else 1),\n              )\n            )\n            raise\n\n        return typer.Typer.command(self, *args, **kwargs)(wrapped)\n\n      return decorator\n"
  },
  {
    "path": "src/openllm/clean.py",
    "content": "from __future__ import annotations\n\nimport os, pathlib, shutil\nimport questionary\n\nfrom openllm.analytic import OpenLLMTyper\nfrom openllm.common import CONFIG_FILE, REPO_DIR, VENV_DIR, VERBOSE_LEVEL, output\n\napp = OpenLLMTyper(help='clean up and release disk space used by OpenLLM')\n\nHUGGINGFACE_CACHE = pathlib.Path.home() / '.cache' / 'huggingface' / 'hub'\n\n\ndef _du(path: pathlib.Path) -> int:\n  seen_paths = set()\n  used_space = 0\n\n  for f in path.rglob('*'):\n    if os.name == 'nt':  # Windows system\n      # On Windows, directly add file sizes without considering hard links\n      used_space += f.stat().st_size\n    else:\n      # On non-Windows systems, use inodes to avoid double counting\n      stat = f.stat()\n      if stat.st_ino not in seen_paths:\n        seen_paths.add(stat.st_ino)\n        used_space += stat.st_size\n  return used_space\n\n\n@app.command(help='Clean up all the cached models from huggingface')\ndef model_cache(verbose: bool = False) -> None:\n  if verbose:\n    VERBOSE_LEVEL.set(20)\n  used_space = _du(HUGGINGFACE_CACHE)\n  sure = questionary.confirm(\n    f'This will remove all models cached by Huggingface (~{used_space / 1024 / 1024:.2f}MB), are you sure?'\n  ).ask()\n  if not sure:\n    return\n  shutil.rmtree(HUGGINGFACE_CACHE, ignore_errors=True)\n  output('All models cached by Huggingface have been removed', style='green')\n\n\n@app.command(help='Clean up all the virtual environments created by OpenLLM')\ndef venvs(verbose: bool = False) -> None:\n  if verbose:\n    VERBOSE_LEVEL.set(20)\n\n  used_space = _du(VENV_DIR)\n  sure = questionary.confirm(\n    f'This will remove all virtual environments created by OpenLLM (~{used_space / 1024 / 1024:.2f}MB), are you sure?'\n  ).ask()\n  if not sure:\n    return\n  shutil.rmtree(VENV_DIR, ignore_errors=True)\n  output('All virtual environments have been removed', style='green')\n\n\n@app.command(help='Clean up all the repositories cloned by OpenLLM')\ndef 
repos(verbose: bool = False) -> None:\n  if verbose:\n    VERBOSE_LEVEL.set(20)\n  shutil.rmtree(REPO_DIR, ignore_errors=True)\n  output('All repositories have been removed', style='green')\n\n\n@app.command(help='Reset configurations to default')\ndef configs(verbose: bool = False) -> None:\n  if verbose:\n    VERBOSE_LEVEL.set(20)\n  shutil.rmtree(CONFIG_FILE, ignore_errors=True)\n  output('All configurations have been reset', style='green')\n\n\n@app.command(name='all', help='Clean up all above and bring OpenLLM to a fresh start')\ndef all_cache(verbose: bool = False) -> None:\n  if verbose:\n    VERBOSE_LEVEL.set(20)\n  repos()\n  venvs()\n  model_cache()\n  configs()\n"
  },
  {
    "path": "src/openllm/cloud.py",
    "content": "from __future__ import annotations\n\nimport json, os, pathlib, shutil, subprocess, typing\nimport typer\n\nfrom openllm.analytic import OpenLLMTyper\nfrom openllm.accelerator_spec import ACCELERATOR_SPECS\nfrom openllm.common import BentoInfo, DeploymentTarget, EnvVars, output, run_command, INTERACTIVE\n\napp = OpenLLMTyper()\n\n\ndef resolve_cloud_config() -> pathlib.Path:\n  env = os.environ.get('BENTOML_HOME')\n  if env is not None:\n    return pathlib.Path(env) / '.yatai.yaml'\n  return pathlib.Path.home() / 'bentoml' / '.yatai.yaml'\n\n\ndef _get_deploy_cmd(\n  bento: BentoInfo,\n  target: typing.Optional[DeploymentTarget] = None,\n  cli_envs: typing.Optional[list[str]] = None,\n  context: typing.Optional[str] = None,\n  cli_args: typing.Optional[list[str]] = None,\n) -> tuple[list[str], EnvVars]:\n  cmd = ['bentoml', 'deploy', bento.bentoml_tag]\n  if cli_args:\n    for arg in cli_args:\n      cmd += ['--arg', arg]\n\n  env = EnvVars({'BENTOML_HOME': f'{bento.repo.path}/bentoml'})\n\n  # Process CLI env vars first to determine overrides\n  explicit_envs: dict[str, str] = {}\n  if cli_envs:\n    for env_var in cli_envs:\n      if '=' in env_var:\n        name, value = env_var.split('=', 1)\n        explicit_envs[name] = value\n      else:\n        name = env_var\n        value = typing.cast(str, os.environ.get(name))\n        if value is None:\n          output(\n            f\"Environment variable '{name}' specified via --env but not found in the current environment.\",\n            style='red',\n          )\n          raise typer.Exit(1)\n        explicit_envs[name] = value\n\n  # Process envs defined in bento.yaml, skipping those overridden by CLI\n  required_envs = bento.bento_yaml.get('envs', [])\n\n  all_required_env_names = [env['name'] for env in required_envs if 'name' in env]\n  required_env_names = [\n    env['name']\n    for env in required_envs\n    if 'name' in env\n    and env['name'] not in explicit_envs\n    and not 
env.get('value')\n    and env['name'] not in os.environ\n  ]\n  if required_env_names:\n    output(\n      f'This model requires the following environment variables to run (unless overridden via --env): {required_env_names!r}',\n      style='green',\n    )\n\n  for env_info in required_envs:\n    name = typing.cast(str, env_info.get('name'))\n    if not name or name in explicit_envs or env_info.get('value', ''):\n      continue\n\n    if os.environ.get(name):\n      default = os.environ[name]\n    elif 'value' in env_info:\n      default = env_info['value']\n    else:\n      default = ''\n\n    if INTERACTIVE.get():\n      import questionary\n\n      value = questionary.text(f'{name}: (from bento.yaml)', default=default).ask()\n    else:\n      if default == '':\n        output(\n          f'Environment variable {name} (from bento.yaml) is required but not provided', style='red'\n        )\n        raise typer.Exit(1)\n      else:\n        value = default\n\n    if value is None:\n      raise typer.Exit(1)\n    cmd += ['--env', f'{name}={value}']\n\n  # Add any required envs from os.environ that haven't been handled yet\n  for name in all_required_env_names:\n    if name in os.environ:\n      cmd += ['--env', f'{name}={os.environ.get(name)}']\n\n  # Add explicitly provided env vars from CLI\n  for name, value in explicit_envs.items():\n    cmd += ['--env', f'{name}={value}']\n\n  if target:\n    cmd += ['--instance-type', target.name]\n\n  if context:\n    cmd += ['--context', context]\n\n  base_config = resolve_cloud_config()\n  if not base_config.exists():\n    raise Exception('Cannot find cloud config.')\n  # remove before copy\n  if (bento.repo.path / 'bentoml' / '.yatai.yaml').exists():\n    (bento.repo.path / 'bentoml' / '.yatai.yaml').unlink()\n  shutil.copy(base_config, bento.repo.path / 'bentoml' / '.yatai.yaml')\n\n  return cmd, env\n\n\ndef get_current_context() -> str | None:\n  cmd = ['bentoml', 'cloud', 'current-context']\n  try:\n    result = 
subprocess.check_output(cmd, stderr=subprocess.DEVNULL)\n    return typing.cast(str, json.loads(result)['name'])\n  except subprocess.CalledProcessError:\n    return None\n\n\ndef ensure_cloud_context() -> None:\n  import questionary\n\n  cmd = ['bentoml', 'cloud', 'current-context']\n  try:\n    result = subprocess.check_output(cmd, stderr=subprocess.DEVNULL)\n    context = json.loads(result)\n    output(f'  bentoml already logged in: {context[\"endpoint\"]}', style='green', level=20)\n  except subprocess.CalledProcessError:\n    output('  bentoml not logged in', style='red')\n    if not INTERACTIVE.get():\n      output('\\n  get bentoml logged in by:')\n      output('    $ bentoml cloud login', style='orange')\n      output('')\n      output(\n        \"\"\"  * you may need to visit https://cloud.bentoml.com to get an account. you can also bring your own bentoml cluster (BYOC) to your team from https://bentoml.com/contact\"\"\",\n        style='yellow',\n      )\n      raise typer.Exit(1)\n    else:\n      action = questionary.select(\n        'Choose an action:',\n        choices=['I have a BentoCloud account', 'get an account in two minutes'],\n      ).ask()\n      if action is None:\n        raise typer.Exit(1)\n      elif action == 'get an account in two minutes':\n        output('Please visit https://cloud.bentoml.com to get your token', style='yellow')\n      endpoint = questionary.text(\n        'Enter the endpoint: (similar to https://my-org.cloud.bentoml.com)'\n      ).ask()\n      if endpoint is None:\n        raise typer.Exit(1)\n      token = questionary.text('Enter your token: (similar to cniluaxxxxxxxx)').ask()\n      if token is None:\n        raise typer.Exit(1)\n      cmd = ['bentoml', 'cloud', 'login', '--api-token', token, '--endpoint', endpoint]\n      try:\n        result = subprocess.check_output(cmd)\n        output('  Logged in successfully', style='green')\n      except subprocess.CalledProcessError:\n        output('  Failed to login', 
style='red')\n        raise typer.Exit(1)\n\n\ndef get_cloud_machine_spec(context: typing.Optional[str] = None) -> list[DeploymentTarget]:\n  ensure_cloud_context()\n  cmd = ['bentoml', 'deployment', 'list-instance-types', '-o', 'json']\n  if context:\n    cmd += ['--context', context]\n\n  if context is None:\n    context = get_current_context()\n\n  try:\n    result = subprocess.check_output(cmd, stderr=subprocess.DEVNULL)\n    instance_types = json.loads(result)\n    return [\n      DeploymentTarget(\n        source='cloud',\n        name=it['name'],\n        price=it['price'],\n        platform='linux',\n        accelerators=(\n          [ACCELERATOR_SPECS[it['gpu_type']] for _ in range(int(it['gpu']))]\n          if it.get('gpu') and it['gpu_type'] in ACCELERATOR_SPECS\n          else []\n        ),\n      )\n      for it in instance_types\n    ]\n  except (subprocess.CalledProcessError, json.JSONDecodeError):\n    output(\n      f'Failed to get cloud instance types{\"\" if context is None else f\" for context {context}\"}',\n      style='red',\n    )\n    return []\n\n\ndef deploy(\n  bento: BentoInfo,\n  target: DeploymentTarget,\n  cli_envs: typing.Optional[list[str]] = None,\n  context: typing.Optional[str] = None,\n  cli_args: typing.Optional[list[str]] = None,\n  interactive: bool = False,\n) -> None:\n  INTERACTIVE.set(interactive)\n  ensure_cloud_context()\n  cmd, env = _get_deploy_cmd(bento, target, cli_envs=cli_envs, context=context, cli_args=cli_args)\n  run_command(cmd, env=env, cwd=None)\n"
  },
  {
    "path": "src/openllm/common.py",
    "content": "from __future__ import annotations\n\nimport asyncio, asyncio.subprocess, functools, hashlib, io, json, os, pathlib, signal, subprocess, sys, sysconfig, typing, shlex\nimport typer, typer.core, pydantic, questionary, pyaml, yaml\n\nfrom collections import UserDict\nfrom contextlib import asynccontextmanager, contextmanager\nfrom typing_extensions import override\n\nfrom pydantic_core import core_schema\n\n\nERROR_STYLE = 'red'\nSUCCESS_STYLE = 'green'\n\nOPENLLM_HOME = pathlib.Path(os.getenv('OPENLLM_HOME', pathlib.Path.home() / '.openllm'))\n\nREPO_DIR = OPENLLM_HOME / 'repos'\nTEMP_DIR = OPENLLM_HOME / 'temp'\nVENV_DIR = OPENLLM_HOME / 'venv'\n\nREPO_DIR.mkdir(exist_ok=True, parents=True)\nTEMP_DIR.mkdir(exist_ok=True, parents=True)\nVENV_DIR.mkdir(exist_ok=True, parents=True)\n\nCONFIG_FILE = OPENLLM_HOME / 'config.json'\n\nCHECKED = 'Yes'\n\nT = typing.TypeVar('T')\n\n\nclass ContextVar(typing.Generic[T]):\n  def __init__(self, default: T):\n    self._stack: list[T] = []\n    self._default = default\n\n  def get(self) -> T:\n    if self._stack:\n      return self._stack[-1]\n    return self._default\n\n  def set(self, value: T) -> None:\n    self._stack.append(value)\n\n  @contextmanager\n  def patch(self, value: T) -> typing.Iterator[None]:\n    self._stack.append(value)\n    try:\n      yield\n    finally:\n      self._stack.pop()\n\n\nVERBOSE_LEVEL = ContextVar(0)\nINTERACTIVE = ContextVar(False)\n\n\ndef output(\n  content: typing.Any, level: int = 0, style: str | None = None, end: str | None = None\n) -> None:\n  if level > VERBOSE_LEVEL.get():\n    return\n\n  if not isinstance(content, str):\n    out = io.StringIO()\n    pyaml.pprint(content, dst=out, sort_dicts=False, sort_keys=False)\n    questionary.print(out.getvalue(), style=style, end='' if end is None else end)\n    out.close()\n  else:\n    questionary.print(content, style=style, end='\\n' if end is None else end)\n\n\nclass Config(pydantic.BaseModel):\n  repos: dict[str, str] = 
pydantic.Field(\n    default_factory=lambda: {\n      'default': 'https://github.com/bentoml/openllm-models@main',\n      'nightly': 'https://github.com/bentoml/openllm-models@nightly',\n    }\n  )\n  default_repo: str = 'default'\n\n  def tolist(self) -> dict[str, typing.Any]:\n    return dict(repos=self.repos, default_repo=self.default_repo)\n\n\ndef load_config() -> Config:\n  if CONFIG_FILE.exists():\n    try:\n      with open(CONFIG_FILE) as f:\n        return Config(**json.load(f))\n    except json.JSONDecodeError:\n      return Config()\n  return Config()\n\n\ndef save_config(config: Config) -> None:\n  with open(CONFIG_FILE, 'w') as f:\n    json.dump(config.tolist(), f, indent=2)\n\n\nclass BentoMetadata(typing.TypedDict):\n  name: str\n  version: str\n  labels: dict[str, str]\n  envs: list[dict[str, str]]\n  services: list[dict[str, typing.Any]]\n  schema: dict[str, typing.Any]\n\n\nclass EnvVars(UserDict[str, str]):\n  \"\"\"\n  A dictionary-like object that sorted by key and only keeps the environment variables that have a value.\n  \"\"\"\n\n  @classmethod\n  def __get_pydantic_core_schema__(\n    cls: type[EnvVars], source_type: type[typing.Any], handler: typing.Callable[..., typing.Any]\n  ) -> core_schema.DictSchema:\n    return core_schema.dict_schema(core_schema.str_schema(), core_schema.str_schema())\n\n  def __init__(self, data: typing.Mapping[str, str] | None = None):\n    super().__init__(data or {})\n    self.data = {k: v for k, v in sorted(self.data.items()) if v}\n\n  def __hash__(self) -> int:\n    return hash(tuple(sorted(self.data.items())))\n\n\nclass RepoInfo(pydantic.BaseModel):\n  name: str\n  path: pathlib.Path\n  url: str\n  server: str\n  owner: str\n  repo: str\n  branch: str\n\n  def tolist(self) -> str | dict[str, typing.Any] | None:\n    if VERBOSE_LEVEL.get() <= 0:\n      return f'{self.name} ({self.url}@{self.branch})'\n    if VERBOSE_LEVEL.get() <= 10:\n      return dict(name=self.name, url=f'{self.url}@{self.branch}', 
path=str(self.path))\n    if VERBOSE_LEVEL.get() <= 20:\n      return dict(\n        name=self.name,\n        url=f'{self.url}@{self.branch}',\n        path=str(self.path),\n        server=self.server,\n        owner=self.owner,\n        repo=self.repo,\n      )\n    return None\n\n\nclass BentoInfo(pydantic.BaseModel):\n  repo: RepoInfo\n  path: pathlib.Path\n  alias: str = ''\n\n  def __str__(self) -> str:\n    if self.repo.name == 'default':\n      return f'{self.tag}'\n    else:\n      return f'{self.repo.name}/{self.tag}'\n\n  @override\n  def __hash__(self) -> int:\n    return md5(str(self.path))\n\n  @property\n  def tag(self) -> str:\n    if self.alias:\n      return f'{self.path.parent.name}:{self.alias}'\n    return f'{self.path.parent.name}:{self.path.name}'\n\n  @property\n  def bentoml_tag(self) -> str:\n    return f'{self.path.parent.name}:{self.path.name}'\n\n  @property\n  def name(self) -> str:\n    return self.path.parent.name\n\n  @property\n  def version(self) -> str:\n    return self.path.name\n\n  @property\n  def labels(self) -> dict[str, str]:\n    return self.bento_yaml['labels']\n\n  @property\n  def envs(self) -> list[dict[str, str]]:\n    return self.bento_yaml['envs']\n\n  @functools.cached_property\n  def bento_yaml(self) -> BentoMetadata:\n    bento: BentoMetadata = yaml.safe_load((self.path / 'bento.yaml').read_text())\n    return bento\n\n  @functools.cached_property\n  def platforms(self) -> list[str]:\n    return self.bento_yaml['labels'].get('platforms', 'linux').split(',')\n\n  @functools.cached_property\n  def pretty_yaml(self) -> BentoMetadata | dict[str, typing.Any]:\n    def _pretty_routes(routes: list[dict[str, typing.Any]]) -> dict[str, typing.Any]:\n      return {\n        route['route']: {\n          'input': {k: v['type'] for k, v in route['input']['properties'].items()},\n          'output': route['output']['type'],\n        }\n        for route in routes\n      }\n\n    if len(self.bento_yaml['services']) == 1:\n      
pretty_yaml: dict[str, typing.Any] = {\n        'apis': _pretty_routes(self.bento_yaml['schema']['routes']),\n        'resources': self.bento_yaml['services'][0]['config']['resources'],\n        'envs': self.bento_yaml['envs'],\n        'platforms': self.platforms,\n      }\n      return pretty_yaml\n    return self.bento_yaml\n\n  @functools.cached_property\n  def pretty_gpu(self) -> str:\n    from openllm.accelerator_spec import ACCELERATOR_SPECS\n\n    try:\n      resources = self.bento_yaml['services'][0]['config']['resources']\n      if resources['gpu'] > 1:\n        acc = ACCELERATOR_SPECS[resources['gpu_type']]\n        return f'{acc.memory_size:.0f}Gx{resources[\"gpu\"]}'\n      elif resources['gpu'] > 0:\n        acc = ACCELERATOR_SPECS[resources['gpu_type']]\n        return f'{acc.memory_size:.0f}G'\n    except KeyError:\n      pass\n    return ''\n\n  def tolist(self) -> str | dict[str, typing.Any] | None:\n    verbose = VERBOSE_LEVEL.get()\n    if verbose <= 0:\n      return str(self)\n    if verbose <= 10:\n      return dict(\n        tag=self.tag, repo=self.repo.tolist(), path=str(self.path), model_card=self.pretty_yaml\n      )\n    if verbose <= 20:\n      return dict(\n        tag=self.tag, repo=self.repo.tolist(), path=str(self.path), bento_yaml=self.bento_yaml\n      )\n    return None\n\n\nclass VenvSpec(pydantic.BaseModel):\n  python_version: str\n  requirements_txt: str\n  envs: EnvVars\n  name_prefix: str = ''\n\n  @functools.cached_property\n  def normalized_requirements_txt(self) -> str:\n    parameter_lines: list[str] = []\n    dependency_lines: list[str] = []\n    comment_lines: list[str] = []\n\n    for line in self.requirements_txt.splitlines():\n      if not line.strip():\n        continue\n      elif line.strip().startswith('#'):\n        comment_lines.append(line.strip())\n      elif line.strip().startswith('-'):\n        parameter_lines.append(line.strip())\n      else:\n        dependency_lines.append(line.strip())\n\n    
parameter_lines.sort()\n    dependency_lines.sort()\n    return '\\n'.join(parameter_lines + dependency_lines).strip()\n\n  @functools.cached_property\n  def normalized_envs(self) -> str:\n    return '\\n'.join(f'{k}={v}' for k, v in sorted(self.envs.items(), key=lambda x: x[0]) if not v)\n\n  @override\n  def __hash__(self) -> int:\n    return md5(self.normalized_requirements_txt, str(hash(self.normalized_envs)))\n\n\nclass Accelerator(pydantic.BaseModel):\n  model: str\n  memory_size: float\n\n  def __gt__(self, other: Accelerator) -> bool:\n    return self.memory_size > other.memory_size\n\n  def __eq__(self, other: object) -> bool:\n    if not isinstance(other, Accelerator):\n      return NotImplemented\n    return self.memory_size == other.memory_size\n\n  def __repr__(self) -> str:\n    return f'{self.model}({self.memory_size}GB)'\n\n\nclass DeploymentTarget(pydantic.BaseModel):\n  accelerators: list[Accelerator]\n  source: str = 'local'\n  name: str = 'local'\n  price: str = ''\n  platform: str = 'linux'\n\n  @override\n  def __hash__(self) -> int:\n    return hash(self.source)\n\n  @property\n  def accelerators_repr(self) -> str:\n    accs = {a.model for a in self.accelerators}\n    if len(accs) == 0:\n      return 'null'\n    if len(accs) == 1:\n      a = self.accelerators[0]\n      return f'{a.model} x{len(self.accelerators)}'\n    return ', '.join((f'{a.model}' for a in self.accelerators))\n\n\ndef run_command(\n  cmd: list[str],\n  cwd: str | None = None,\n  env: EnvVars | None = None,\n  copy_env: bool = True,\n  venv: pathlib.Path | None = None,\n  silent: bool = False,\n) -> subprocess.CompletedProcess[typing.Any]:\n  env = env or EnvVars({})\n  cmd = [str(c) for c in cmd]\n  bin_dir = 'Scripts' if os.name == 'nt' else 'bin'\n  if not silent:\n    output('\\n')\n    if cwd:\n      output(f'$ cd {cwd}', style='orange')\n    if env:\n      for k, v in env.items():\n        output(f'$ export {k}={shlex.quote(v)}', style='orange')\n    if venv:\n      
output(f'$ source {venv / \"bin\" / \"activate\"}', style='orange')\n    output(f'$ {\" \".join(cmd)}', style='orange')\n\n  if venv:\n    py = venv / bin_dir / f'python{sysconfig.get_config_var(\"EXE\")}'\n  else:\n    py = pathlib.Path(sys.executable)\n\n  if copy_env:\n    env = EnvVars({**os.environ, **env})\n\n  if cmd and cmd[0] == 'bentoml':\n    cmd = [py.__fspath__(), '-m', 'bentoml', *cmd[1:]]\n  if cmd and cmd[0] == 'python':\n    cmd = [py.__fspath__(), *cmd[1:]]\n\n  try:\n    if silent:\n      return subprocess.run(\n        cmd, cwd=cwd, env=env, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True\n      )\n    else:\n      return subprocess.run(cmd, cwd=cwd, env=env, check=True)\n  except Exception as e:\n    if VERBOSE_LEVEL.get() >= 20:\n      output(str(e), style='red')\n    raise typer.Exit(1)\n\n\nasync def stream_command_output(\n  stream: asyncio.streams.StreamReader | None, style: str = 'gray'\n) -> None:\n  if stream:\n    async for line in stream:\n      output(line.decode(), style=style, end='')\n\n\n@asynccontextmanager\nasync def async_run_command(\n  cmd: list[str],\n  cwd: str | None = None,\n  env: EnvVars | None = None,\n  copy_env: bool = True,\n  venv: pathlib.Path | None = None,\n  silent: bool = True,\n) -> typing.AsyncGenerator[asyncio.subprocess.Process]:\n  env = env or EnvVars({})\n  cmd = [str(c) for c in cmd]\n\n  if not silent:\n    output('\\n')\n    if cwd:\n      output(f'$ cd {cwd}', style='orange')\n    if env:\n      for k, v in env.items():\n        output(f'$ export {k}={shlex.quote(v)}', style='orange')\n    if venv:\n      output(f'$ source {venv / \"bin\" / \"activate\"}', style='orange')\n    output(f'$ {\" \".join(cmd)}', style='orange')\n\n  if venv:\n    py = venv / 'bin' / 'python'\n  else:\n    py = pathlib.Path(sys.executable)\n\n  if copy_env:\n    env = EnvVars({**os.environ, **env})\n\n  if cmd and cmd[0] == 'bentoml':\n    cmd = [py.__fspath__(), '-m', 'bentoml', *cmd[1:]]\n  if cmd and 
cmd[0] == 'python':\n    cmd = [py.__fspath__(), *cmd[1:]]\n\n  proc = None\n  try:\n    proc = await asyncio.create_subprocess_shell(\n      ' '.join(map(str, cmd)),\n      stdout=asyncio.subprocess.PIPE,\n      stderr=asyncio.subprocess.PIPE,\n      cwd=cwd,\n      env=env,\n    )\n    yield proc\n  except subprocess.CalledProcessError:\n    output('Command failed', style='red')\n    raise typer.Exit(1)\n  finally:\n    if proc:\n      proc.send_signal(signal.SIGINT)\n      await proc.wait()\n\n\ndef md5(*strings: str) -> int:\n  m = hashlib.md5()\n  for s in strings:\n    m.update(s.encode())\n  return int(m.hexdigest(), 16)\n"
  },
  {
    "path": "src/openllm/local.py",
    "content": "from __future__ import annotations\n\nimport asyncio, time, typing, os\nimport httpx, openai\n\nfrom openai.types.chat import ChatCompletionAssistantMessageParam, ChatCompletionUserMessageParam\nfrom openllm.common import (\n  BentoInfo,\n  EnvVars,\n  async_run_command,\n  output,\n  run_command,\n  stream_command_output,\n)\nfrom openllm.venv import ensure_venv\n\nif typing.TYPE_CHECKING:\n  from openai.types.chat import ChatCompletionMessageParam\n\n\ndef prep_env_vars(bento: BentoInfo) -> None:\n  env_vars = bento.envs\n  for env_var in env_vars:\n    if not env_var.get('value'):\n      continue\n    key = env_var['name']\n    value = env_var['value']\n    os.environ[key] = value\n\n\ndef _get_serve_cmd(\n  bento: BentoInfo, port: int = 3000, cli_args: typing.Optional[list[str]] = None\n) -> tuple[list[str], EnvVars]:\n  cmd = ['bentoml', 'serve', bento.bentoml_tag]\n  if port != 3000:\n    cmd += ['--port', str(port)]\n\n  # Add CLI arguments if provided\n  if cli_args:\n    for arg in cli_args:\n      cmd += ['--arg', arg]\n\n  return cmd, EnvVars({'BENTOML_HOME': f'{bento.repo.path}/bentoml'})\n\n\ndef serve(\n  bento: BentoInfo,\n  port: int = 3000,\n  cli_envs: typing.Optional[list[str]] = None,\n  cli_args: typing.Optional[list[str]] = None,\n) -> None:\n  prep_env_vars(bento)\n  cmd, env = _get_serve_cmd(bento, port=port, cli_args=cli_args)\n\n  # Add CLI environment variables if provided\n  if cli_envs:\n    for env_var in cli_envs:\n      if '=' in env_var:\n        key, value = env_var.split('=', 1)\n        env[key] = value\n      else:\n        env[env_var] = os.environ.get(env_var, '')\n\n  venv = ensure_venv(bento, runtime_envs=env)\n  output(f'Access the Chat UI at http://localhost:{port}/chat (or with you IP)')\n  run_command(cmd, env=env, cwd=None, venv=venv)\n\n\nasync def _run_model(\n  bento: BentoInfo,\n  port: int = 3000,\n  timeout: int = 600,\n  cli_env: typing.Optional[dict[str, typing.Any]] = None,\n  cli_args: 
typing.Optional[list[str]] = None,\n) -> None:\n  cmd, env = _get_serve_cmd(bento, port, cli_args=cli_args)\n\n  # Merge cli environment variables if provided\n  if cli_env:\n    env.update(cli_env)\n\n  venv = ensure_venv(bento, runtime_envs=env)\n  async with async_run_command(cmd, env=env, cwd=None, venv=venv, silent=False) as server_proc:\n    output(f'Model server started {server_proc.pid}')\n\n    stdout_streamer = None\n    stderr_streamer = None\n    start_time = time.time()\n\n    output('Model loading...', style='green')\n    for _ in range(timeout):\n      try:\n        resp = httpx.get(f'http://localhost:{port}/readyz', timeout=3)\n        if resp.status_code == 200:\n          break\n      except httpx.RequestError:\n        if time.time() - start_time > 30:\n          if not stdout_streamer:\n            stdout_streamer = asyncio.create_task(\n              stream_command_output(server_proc.stdout, style='gray')\n            )\n          if not stderr_streamer:\n            stderr_streamer = asyncio.create_task(\n              stream_command_output(server_proc.stderr, style='#BD2D0F')\n            )\n        await asyncio.sleep(1)\n    else:\n      output('Model failed to load', style='red')\n      server_proc.terminate()\n      return\n\n    if stdout_streamer:\n      stdout_streamer.cancel()\n    if stderr_streamer:\n      stderr_streamer.cancel()\n\n    output('Model is ready', style='green')\n    messages: list[ChatCompletionMessageParam] = []\n\n    client = openai.AsyncOpenAI(base_url=f'http://localhost:{port}/v1', api_key='local')\n    while True:\n      try:\n        message = input('user: ')\n        if message == '':\n          output('empty message, please enter something', style='yellow')\n          continue\n        messages.append(ChatCompletionUserMessageParam(role='user', content=message))\n        output('assistant: ', end='', style='lightgreen')\n        assistant_message = ''\n        stream = await client.chat.completions.create(\n 
         model=(await client.models.list()).data[0].id, messages=messages, stream=True\n        )\n        async for chunk in stream:\n          text = chunk.choices[0].delta.content or ''\n          assistant_message += text\n          output(text, end='', style='lightgreen')\n        messages.append(\n          ChatCompletionAssistantMessageParam(role='assistant', content=assistant_message)\n        )\n        output('')\n      except KeyboardInterrupt:\n        break\n    output('\\nStopping model server...', style='green')\n    output('Stopped model server', style='green')\n\n\ndef run(\n  bento: BentoInfo,\n  port: int = 3000,\n  timeout: int = 600,\n  cli_envs: typing.Optional[list[str]] = None,\n  cli_args: typing.Optional[list[str]] = None,\n) -> None:\n  prep_env_vars(bento)\n\n  # Add CLI environment variables to the process\n  env = {}\n  if cli_envs:\n    for env_var in cli_envs:\n      if '=' in env_var:\n        key, value = env_var.split('=', 1)\n        env[key] = value\n      else:\n        env[env_var] = os.environ.get(env_var, '')\n\n  asyncio.run(_run_model(bento, port=port, timeout=timeout, cli_env=env, cli_args=cli_args))\n"
  },
  {
    "path": "src/openllm/model.py",
    "content": "from __future__ import annotations\n\nimport re, typing, json\nimport tabulate, questionary, typer\n\nfrom openllm.accelerator_spec import can_run\nfrom openllm.common import DeploymentTarget\nfrom openllm.analytic import OpenLLMTyper\nfrom openllm.common import VERBOSE_LEVEL, BentoInfo, output as output_, load_config\nfrom openllm.repo import ensure_repo_updated, list_repo\n\napp = OpenLLMTyper(help='manage models')\n\n\n@app.command(help='get model')\ndef get(tag: str, repo: typing.Optional[str] = None, verbose: bool = False) -> None:\n  if verbose:\n    VERBOSE_LEVEL.set(20)\n  bento_info = ensure_bento(tag, repo_name=repo)\n  if bento_info:\n    output_(bento_info)\n\n\n@app.command(name='list', help='list available models')\ndef list_model(\n  tag: typing.Optional[str] = None,\n  repo: typing.Optional[str] = None,\n  verbose: bool = False,\n  output: typing.Optional[str] = typer.Option(None, hidden=True),\n) -> None:\n  if verbose:\n    VERBOSE_LEVEL.set(20)\n\n  if repo is None:\n    repo = load_config().default_repo\n\n  bentos = list_bento(tag=tag, repo_name=repo)\n  bentos.sort(key=lambda x: x.name)\n\n  seen = set()\n\n  def is_seen(value: str) -> bool:\n    if value in seen:\n      return True\n    seen.add(value)\n    return False\n\n  if output == 'readme':\n    # Parse parameters from bento.tag (e.g. 
\"model:671b-it\" -> \"671b\", 'model:something-long-78b' -> '78b')\n    questionary.print(\n      json.dumps({\n        f'{bento.name}': dict(\n          tag=bento.tag,\n          version=bento.tag.split(':')[-1],\n          pretty_gpu=bento.pretty_gpu,\n          command=f'openllm serve {bento.tag}',\n        )\n        for bento in bentos\n        if not is_seen(bento.name)\n      })\n    )\n    return\n\n  table = tabulate.tabulate(\n    [\n      [\n        '' if is_seen(bento.name) else bento.name,\n        bento.tag,\n        bento.repo.name,\n        bento.pretty_gpu,\n        ','.join(bento.platforms),\n      ]\n      for bento in bentos\n    ],\n    headers=['model', 'version', 'repo', 'required GPU RAM', 'platforms'],\n  )\n  output_(table)\n\n\ndef ensure_bento(\n  model: str,\n  target: typing.Optional[DeploymentTarget] = None,\n  repo_name: typing.Optional[str] = None,\n) -> BentoInfo:\n  if repo_name is None:\n    from openllm.common import load_config\n\n    repo_name = load_config().default_repo\n\n  bentos = list_bento(model, repo_name=repo_name)\n  if len(bentos) == 0:\n    output_(f'No model found for {model}', style='red')\n    raise typer.Exit(1)\n\n  if len(bentos) == 1:\n    output_(f'Found model {bentos[0]}', style='green')\n    if target is not None and can_run(bentos[0], target) <= 0:\n      output_(\n        f'The machine({target.name}) with {target.accelerators_repr} does not appear to have sufficient '\n        f'resources to run model {bentos[0]}\\n',\n        style='yellow',\n      )\n    return bentos[0]\n\n  # multiple models, pick one according to target\n  output_(f'Multiple models match {model}, did you mean one of these?', style='red')\n  list_model(model, repo=repo_name)\n  raise typer.Exit(1)\n\n\nNUMBER_RE = re.compile(r'\\d+')\n\n\ndef _extract_first_number(s: str) -> int:\n  match = NUMBER_RE.search(s)\n  if match:\n    return int(match.group())\n  else:\n    return 100\n\n\ndef list_bento(\n  tag: typing.Optional[str] = 
None,\n  repo_name: typing.Optional[str] = None,\n  include_alias: bool = False,\n) -> typing.List[BentoInfo]:\n  ensure_repo_updated()\n\n  if repo_name is None and tag and '/' in tag:\n    repo_name, tag = tag.split('/', 1)\n\n  repo_list = list_repo(repo_name)\n  if repo_name is not None:\n    repo_map = {repo.name: repo for repo in repo_list}\n    if repo_name not in repo_map:\n      output_(f'Repo `{repo_name}` not found, did you mean one of these?')\n      for repo_name in repo_map:\n        output_(f'  {repo_name}')\n      raise typer.Exit(1)\n\n  if not tag:\n    glob_pattern = 'bentoml/bentos/*/*'\n  elif ':' in tag:\n    bento_name, version = tag.split(':')\n    glob_pattern = f'bentoml/bentos/{bento_name}/{version}'\n  else:\n    glob_pattern = f'bentoml/bentos/{tag}/*'\n\n  model_list: list[BentoInfo] = []\n  repo_list = list_repo(repo_name)\n  for repo in repo_list:\n    paths = sorted(\n      repo.path.glob(glob_pattern),\n      key=lambda x: (x.parent.name, _extract_first_number(x.name), len(x.name), x.name),\n    )\n    for path in paths:\n      if path.is_dir() and (path / 'bento.yaml').exists():\n        model = BentoInfo(repo=repo, path=path)\n      elif path.is_file():\n        with open(path) as f:\n          origin_name = f.read().strip()\n        origin_path = path.parent / origin_name\n        model = BentoInfo(alias=path.name, repo=repo, path=origin_path)\n      else:\n        model = None\n      if model:\n        model_list.append(model)\n\n  if not include_alias:\n    seen: set[str] = set()\n    # we are calling side-effect in seen here.\n    model_list = [\n      x\n      for x in model_list\n      if not (\n        f'{x.bento_yaml[\"name\"]}:{x.bento_yaml[\"version\"]}' in seen\n        or seen.add(f'{x.bento_yaml[\"name\"]}:{x.bento_yaml[\"version\"]}')  # type: ignore\n      )\n    ]\n  return model_list\n"
  },
  {
    "path": "src/openllm/py.typed",
    "content": ""
  },
  {
    "path": "src/openllm/repo.py",
    "content": "from __future__ import annotations\n\nimport datetime, subprocess, re, shutil, typing, os, pathlib\nimport pyaml, questionary, typer\n\nfrom openllm.analytic import OpenLLMTyper\nfrom openllm.common import (\n  INTERACTIVE,\n  REPO_DIR,\n  VERBOSE_LEVEL,\n  RepoInfo,\n  load_config,\n  output,\n  save_config,\n)\n\nUPDATE_INTERVAL = datetime.timedelta(days=3)\nTEST_REPO = os.getenv('OPENLLM_TEST_REPO', None)  # for testing\n\n\napp = OpenLLMTyper(help='manage repos')\n\n\n@app.command(name='list', help='list available repo')\ndef cmd_list(verbose: bool = False) -> None:\n  if verbose:\n    VERBOSE_LEVEL.set(20)\n  pyaml.pprint(list_repo(), sort_dicts=False, sort_keys=False)\n\n\n@app.command(name='remove', help='remove given repo')\ndef cmd_remove(name: str) -> None:\n  if TEST_REPO:\n    return\n  config = load_config()\n  if name not in config.repos:\n    output(f'Repo {name} does not exist', style='red')\n    return\n\n  del config.repos[name]\n  save_config(config)\n  output(f'Repo {name} removed', style='green')\n\n\n@app.command(name='update', help='update default repo')\ndef cmd_update() -> None:\n  if TEST_REPO:\n    return\n\n  repos_in_use = set()\n  for repo in list_repo():\n    # Show simplified output if not in verbose mode\n    if VERBOSE_LEVEL.get() <= 0:\n      output(f'updating repo {repo.name}', style='green')\n\n    repos_in_use.add((repo.server, repo.owner, repo.repo, repo.branch))\n    if repo.path.exists():\n      shutil.rmtree(repo.path, ignore_errors=True)\n    repo.path.parent.mkdir(parents=True, exist_ok=True)\n    try:\n      _clone_repo(repo)\n      if VERBOSE_LEVEL.get() > 0:\n        output('')\n        output(f'Repo `{repo.name}` updated', style='green')\n    except Exception as e:\n      shutil.rmtree(repo.path, ignore_errors=True)\n      if VERBOSE_LEVEL.get() > 0:\n        output(f'Failed to clone repo {repo.name}', style='red')\n        output(e)\n  for c in REPO_DIR.glob('*/*/*/*'):\n    repo_spec = 
tuple(c.parts[-4:])\n    if repo_spec not in repos_in_use:\n      shutil.rmtree(c, ignore_errors=True)\n      if VERBOSE_LEVEL.get() > 0:\n        output(f'Removed unused repo cache {c}')\n  with open(REPO_DIR / 'last_update', 'w') as f:\n    f.write(datetime.datetime.now().isoformat())\n  for repo in list_repo():\n    _complete_alias(repo.name)\n\n\n@app.command(name='add', help='add new repo')\ndef cmd_add(name: str, repo: str) -> None:\n  if TEST_REPO:\n    return\n  name = name.lower()\n  if not name.isidentifier():\n    output(\n      f'Invalid repo name: {name}, should only contain letters, numbers and underscores',\n      style='red',\n    )\n    return\n\n  try:\n    parse_repo_url(repo)\n  except ValueError:\n    output(f'Invalid repo url: {repo}', style='red')\n    return\n\n  config = load_config()\n  if name in config.repos:\n    override = questionary.confirm(\n      f'Repo {name} already exists({config.repos[name]}), override?'\n    ).ask()\n    if not override:\n      return\n\n  config.repos[name] = repo\n  save_config(config)\n  output(f'Repo {name} added', style='green')\n\n\n@app.command(name='default', help='get default repo path')\ndef default() -> typing.Optional[pathlib.Path]:\n  if TEST_REPO:\n    return None\n  output((info := parse_repo_url(load_config().repos['default'], 'default')).path)\n  return info.path\n\n\ndef list_repo(repo_name: typing.Optional[str] = None) -> typing.List[RepoInfo]:\n  if TEST_REPO:\n    return [\n      RepoInfo(\n        name='default',\n        url='',\n        server='test',\n        owner='test',\n        repo='test',\n        branch='main',\n        path=pathlib.Path(TEST_REPO),\n      )\n    ]\n  config = load_config()\n  repos = []\n  for _repo_name, repo_url in config.repos.items():\n    if repo_name is not None and _repo_name != repo_name:\n      continue\n    repo = parse_repo_url(repo_url, _repo_name)\n    repos.append(repo)\n  return repos\n\n\ndef _complete_alias(repo_name: str) -> None:\n  from 
openllm.model import list_bento\n\n  for bento in list_bento(repo_name=repo_name):\n    alias = bento.labels.get('aliases', '').strip()\n    if alias:\n      for a in alias.split(','):\n        with open(bento.path.parent / a, 'w') as f:\n          f.write(bento.version)\n\n\ndef _clone_repo(repo: RepoInfo) -> None:\n  try:\n    # Suppress output if verbosity level is low\n    if VERBOSE_LEVEL.get() <= 0:\n      subprocess.run(\n        ['git', 'clone', '--depth=1', '-b', repo.branch, repo.url, str(repo.path)],\n        check=True,\n        stdout=subprocess.DEVNULL,\n        stderr=subprocess.DEVNULL,\n      )\n    else:\n      subprocess.run(\n        ['git', 'clone', '--depth=1', '-b', repo.branch, repo.url, str(repo.path)], check=True\n      )\n  except (subprocess.CalledProcessError, FileNotFoundError):\n    import dulwich\n    import dulwich.porcelain\n\n    # Dulwich doesn't have easy output suppression, but we rarely get here\n    dulwich.porcelain.clone(repo.url, str(repo.path), checkout=True, depth=1, branch=repo.branch)\n\n\ndef ensure_repo_updated() -> None:\n  if TEST_REPO:\n    return\n  last_update_file = REPO_DIR / 'last_update'\n  if not last_update_file.exists():\n    if INTERACTIVE.get():\n      choice = questionary.confirm(\n        'The repo cache is never updated, do you want to update it to fetch the latest model list?'\n      ).ask()\n      if choice:\n        cmd_update()\n      return\n    else:\n      output(\n        'The repo cache is never updated, please run `openllm repo update` to fetch the latest model list',\n        style='red',\n      )\n      raise typer.Exit(1)\n  last_update = datetime.datetime.fromisoformat(last_update_file.read_text().strip())\n  if datetime.datetime.now() - last_update > UPDATE_INTERVAL:\n    if INTERACTIVE.get():\n      choice = questionary.confirm(\n        'The repo cache is outdated, do you want to update it to fetch the latest model list?'\n      ).ask()\n      if choice:\n        cmd_update()\n    
else:\n      output(\n        'The repo cache is outdated, please run `openllm repo update` to fetch the latest model list',\n        style='yellow',\n      )\n\n\nGIT_HTTP_RE = re.compile(\n  r'(?P<schema>git|ssh|http|https):\\/\\/(?P<server>[\\.\\w\\d\\-]+)\\/(?P<owner>[\\w\\d\\-]+)\\/(?P<repo>[\\w\\d\\-\\_\\.]+)(@(?P<branch>.+))?(\\/)?$'\n)\nGIT_SSH_RE = re.compile(\n  r'git@(?P<server>[\\.\\w\\d-]+):(?P<owner>[\\w\\d\\-]+)\\/(?P<repo>[\\w\\d\\-\\_\\.]+)(@(?P<branch>.+))?(\\/)?$'\n)\n\n\ndef parse_repo_url(repo_url: str, repo_name: typing.Optional[str] = None) -> RepoInfo:\n  \"\"\"\n  parse the git repo url to server, owner, repo name, branch\n  >>> parse_repo_url('https://github.com/bentoml/bentovllm@main')\n  ('github.com', 'bentoml', 'bentovllm', 'main')\n\n  >>> parse_repo_url('https://github.com/bentoml/bentovllm.git@main')\n  ('github.com', 'bentoml', 'bentovllm', 'main')\n\n  >>> parse_repo_url('https://github.com/bentoml/bentovllm')\n  ('github.com', 'bentoml', 'bentovllm', 'main')\n\n  >>> parse_repo_url('git@github.com:bentoml/openllm-models.git')\n  ('github.com', 'bentoml', 'openllm-models', 'main')\n  \"\"\"\n  match = GIT_HTTP_RE.match(repo_url)\n  if match:\n    schema = match.group('schema')\n  else:\n    match = GIT_SSH_RE.match(repo_url)\n    if not match:\n      raise ValueError(f'Invalid git repo url: {repo_url}')\n    schema = None\n\n  if match.group('branch') is not None:\n    repo_url = repo_url[: match.start('branch') - 1]\n\n  server = match.group('server')\n  owner = match.group('owner')\n  repo = match.group('repo')\n  if repo.endswith('.git'):\n    repo = repo[:-4]\n  branch = match.group('branch') or 'main'\n\n  if schema is not None:\n    repo_url = f'{schema}://{server}/{owner}/{repo}'\n  else:\n    repo_url = f'git@{server}:{owner}/{repo}'\n\n  path = REPO_DIR / server / owner / repo / branch\n  return RepoInfo(\n    name=repo if repo_name is None else repo_name,\n    url=repo_url,\n    server=server,\n    owner=owner,\n    
repo=repo,\n    branch=branch,\n    path=path,\n  )\n\n\nif __name__ == '__main__':\n  app()\n"
  },
  {
    "path": "src/openllm/venv.py",
    "content": "from __future__ import annotations\n\nimport functools, os, pathlib, shutil\nimport typer, yaml\n\nfrom openllm.common import (\n  VENV_DIR,\n  VERBOSE_LEVEL,\n  BentoInfo,\n  EnvVars,\n  VenvSpec,\n  output,\n  run_command,\n)\n\n\n@functools.lru_cache\ndef _resolve_bento_venv_spec(bento: BentoInfo, runtime_envs: EnvVars | None = None) -> VenvSpec:\n  lock_file = bento.path / 'env' / 'python' / 'requirements.lock.txt'\n  if not lock_file.exists():\n    lock_file = bento.path / 'env' / 'python' / 'requirements.txt'\n\n  reqs = lock_file.read_text().strip()\n  bentofile = bento.path / 'bento.yaml'\n  data = yaml.safe_load(bentofile.read_text())\n  bento_env_list = data.get('envs', [])\n  python_version = data.get('image', {})['python_version']\n  bento_envs = {e['name']: e.get('value') for e in bento_env_list}\n  envs = {k: runtime_envs.get(k, v) for k, v in bento_envs.items()} if runtime_envs else {}\n\n  return VenvSpec(\n    python_version=python_version,\n    requirements_txt=reqs,\n    name_prefix=f'{bento.tag.replace(\":\", \"_\")}-1-',\n    envs=EnvVars(envs),\n  )\n\n\ndef _ensure_venv(venv_spec: VenvSpec) -> pathlib.Path:\n  venv = VENV_DIR / str(hash(venv_spec))\n  if venv.exists() and not (venv / 'DONE').exists():\n    shutil.rmtree(venv, ignore_errors=True)\n  if not venv.exists():\n    output(f'Installing model dependencies({venv})...', style='green')\n\n    venv_py = venv / 'Scripts' / 'python.exe' if os.name == 'nt' else venv / 'bin' / 'python'\n    try:\n      run_command(\n        ['python', '-m', 'uv', 'venv', venv.__fspath__(), '-p', venv_spec.python_version],\n        silent=VERBOSE_LEVEL.get() < 10,\n      )\n      run_command(\n        ['python', '-m', 'uv', 'pip', 'install', '-p', str(venv_py), 'bentoml'],\n        silent=VERBOSE_LEVEL.get() < 10,\n        env=venv_spec.envs,\n      )\n      with open(venv / 'requirements.txt', 'w') as f:\n        f.write(venv_spec.normalized_requirements_txt)\n      run_command(\n        [\n   
       'python',\n          '-m',\n          'uv',\n          'pip',\n          'install',\n          '-p',\n          str(venv_py),\n          '-r',\n          (venv / 'requirements.txt').__fspath__(),\n        ],\n        silent=VERBOSE_LEVEL.get() < 10,\n        env=venv_spec.envs,\n      )\n      with open(venv / 'DONE', 'w') as f:\n        f.write('DONE')\n    except Exception as e:\n      shutil.rmtree(venv, ignore_errors=True)\n      if VERBOSE_LEVEL.get() >= 10:\n        output(str(e), style='red')\n      output(f'Failed to install dependencies to {venv}. Cleaned up.', style='red')\n      raise typer.Exit(1)\n    output(f'Successfully installed dependencies to {venv}.', style='green')\n    return venv\n  else:\n    return venv\n\n\ndef ensure_venv(bento: BentoInfo, runtime_envs: EnvVars | None = None) -> pathlib.Path:\n  venv_spec = _resolve_bento_venv_spec(bento, runtime_envs=EnvVars(runtime_envs))\n  venv = _ensure_venv(venv_spec)\n  assert venv is not None\n  return venv\n\n\ndef check_venv(bento: BentoInfo) -> bool:\n  venv_spec = _resolve_bento_venv_spec(bento)\n  venv = VENV_DIR / str(hash(venv_spec))\n  if not venv.exists():\n    return False\n  if venv.exists() and not (venv / 'DONE').exists():\n    return False\n  return True\n"
  }
]