Repository: dottxt-ai/outlines
Branch: main
Commit: 54827e6d539b
Files: 239
Total size: 1.2 MB

Directory structure:
gitextract_sobc03i9/

├── .devcontainer/
│   └── devcontainer.json
├── .editorconfig
├── .github/
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug_report.yml
│   │   └── config.yml
│   ├── PULL_REQUEST_TEMPLATE/
│   │   └── pull_request_template.md
│   ├── scripts/
│   │   └── build_sdist_and_wheel.sh
│   └── workflows/
│       ├── build_documentation.yml
│       ├── publish_documentation.yml
│       ├── release_pypi.yaml
│       ├── tests.yml
│       └── tests_api_models.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .pydocstyle
├── .readthedocs.yaml
├── .vscode/
│   └── settings.json
├── LICENSE
├── README.md
├── docs/
│   ├── api_reference/
│   │   └── index.md
│   ├── blog/
│   │   └── index.md
│   ├── community/
│   │   ├── contribute.md
│   │   ├── examples.md
│   │   ├── feedback.md
│   │   ├── index.md
│   │   └── versioning.md
│   ├── core_concepts.md
│   ├── examples/
│   │   ├── chain_of_density.md
│   │   ├── chain_of_thought.md
│   │   ├── classification.md
│   │   ├── dating_profiles.md
│   │   ├── deploy-using-bentoml.md
│   │   ├── deploy-using-cerebrium.md
│   │   ├── deploy-using-modal.md
│   │   ├── earnings-reports.md
│   │   ├── extract_event_details.md
│   │   ├── extract_event_details.py
│   │   ├── extraction.md
│   │   ├── index.md
│   │   ├── knowledge_graph_extraction.md
│   │   ├── models_playing_chess.md
│   │   ├── prompt_templates/
│   │   │   ├── chain_of_density.txt
│   │   │   ├── classification.txt
│   │   │   ├── react_agent.txt
│   │   │   ├── simtom_prospective_taking.txt
│   │   │   └── simtom_simulation.txt
│   │   ├── qa-with-citations.md
│   │   ├── react_agent.md
│   │   ├── read-pdfs.md
│   │   ├── receipt-digitization.md
│   │   ├── simtom.md
│   │   └── structured_generation_workflow.md
│   ├── features/
│   │   ├── advanced/
│   │   │   ├── backends.md
│   │   │   └── logits_processors.md
│   │   ├── core/
│   │   │   ├── generator.md
│   │   │   ├── inputs.md
│   │   │   └── output_types.md
│   │   ├── index.md
│   │   ├── models/
│   │   │   ├── anthropic.md
│   │   │   ├── dottxt.md
│   │   │   ├── gemini.md
│   │   │   ├── index.md
│   │   │   ├── llamacpp.md
│   │   │   ├── mistral.md
│   │   │   ├── mlxlm.md
│   │   │   ├── ollama.md
│   │   │   ├── openai.md
│   │   │   ├── openai_compatible.md
│   │   │   ├── openrouter.md
│   │   │   ├── sglang.md
│   │   │   ├── tgi.md
│   │   │   ├── transformers.md
│   │   │   ├── transformers_multimodal.md
│   │   │   ├── vllm.md
│   │   │   └── vllm_offline.md
│   │   └── utility/
│   │       ├── application.md
│   │       ├── regex_dsl.md
│   │       └── template.md
│   ├── guide/
│   │   ├── architecture.md
│   │   ├── chat_templating.md
│   │   ├── core_concepts.md
│   │   ├── fastapi_vllm_deployment.md
│   │   ├── getting_started.md
│   │   ├── installation.md
│   │   ├── migration.md
│   │   ├── selecting_an_inference_backend.md
│   │   └── vlm.md
│   ├── index.md
│   ├── overrides/
│   │   ├── home.html
│   │   └── main.html
│   └── stylesheets/
│       └── extra.css
├── environment.yml
├── examples/
│   ├── babyagi.py
│   ├── beam-cloud/
│   │   ├── README.md
│   │   └── app.py
│   ├── bentoml/
│   │   ├── .bentoignore
│   │   ├── bentofile.yaml
│   │   ├── import_model.py
│   │   ├── requirements.txt
│   │   └── service.py
│   ├── cerebrium/
│   │   ├── cerebrium.toml
│   │   └── main.py
│   ├── dating_profile.py
│   ├── llamacpp_example.py
│   ├── llamacpp_processor.py
│   ├── math_generate_code.py
│   ├── meta_prompting.py
│   ├── modal_example.py
│   ├── pick_odd_one_out.py
│   ├── prompts/
│   │   ├── babyagi_create_task.txt
│   │   ├── babyagi_perform_task.txt
│   │   ├── babyagi_prioritize_task.txt
│   │   ├── dating_profile.txt
│   │   ├── pick_odd_one_out.txt
│   │   └── self_consistency.txt
│   ├── react.py
│   ├── sampling.ipynb
│   ├── self_consistency.py
│   ├── simulation_based_inference.ipynb
│   └── vllm_offline_integration.py
├── flake.nix
├── llm.txt
├── mkdocs.yml
├── outlines/
│   ├── __init__.py
│   ├── applications.py
│   ├── backends/
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── llguidance.py
│   │   ├── outlines_core.py
│   │   └── xgrammar.py
│   ├── caching.py
│   ├── generator.py
│   ├── grammars/
│   │   ├── arithmetic.lark
│   │   ├── common.lark
│   │   └── json.lark
│   ├── grammars.py
│   ├── inputs.py
│   ├── models/
│   │   ├── __init__.py
│   │   ├── anthropic.py
│   │   ├── base.py
│   │   ├── dottxt.py
│   │   ├── gemini.py
│   │   ├── llamacpp.py
│   │   ├── lmstudio.py
│   │   ├── mistral.py
│   │   ├── mlxlm.py
│   │   ├── ollama.py
│   │   ├── openai.py
│   │   ├── sglang.py
│   │   ├── tgi.py
│   │   ├── tokenizer.py
│   │   ├── transformers.py
│   │   ├── utils.py
│   │   ├── vllm.py
│   │   └── vllm_offline.py
│   ├── processors/
│   │   ├── __init__.py
│   │   ├── base_logits_processor.py
│   │   └── tensor_adapters/
│   │       ├── __init__.py
│   │       ├── base.py
│   │       ├── mlx.py
│   │       ├── numpy.py
│   │       └── torch.py
│   ├── py.typed
│   ├── release_note.md
│   ├── templates.py
│   └── types/
│       ├── __init__.py
│       ├── airports.py
│       ├── countries.py
│       ├── dsl.py
│       ├── json_schema_utils.py
│       ├── locale/
│       │   ├── __init__.py
│       │   └── us.py
│       └── utils.py
├── pyproject.toml
├── requirements-doc.txt
├── scripts/
│   └── gen_ref_pages.py
├── setup.cfg
├── shell.nix
└── tests/
    ├── __init__.py
    ├── backends/
    │   ├── test_backends.py
    │   ├── test_backends_utils.py
    │   ├── test_llguidance.py
    │   ├── test_outlines_core.py
    │   └── test_xgrammar.py
    ├── cfg_samples/
    │   ├── arithmetic/
    │   │   ├── lots_of_ops.arithmetic.test
    │   │   └── simple_math.arithmetic.test
    │   └── json/
    │       ├── outlines.generate.samplers.mypy.json.test
    │       ├── simple_fruit.json.test
    │       └── simple_fruit_no_indent.json.test
    ├── conftest.py
    ├── models/
    │   ├── test_anthopic_type_adapter.py
    │   ├── test_anthropic.py
    │   ├── test_dottxt.py
    │   ├── test_dottxt_type_adapter.py
    │   ├── test_gemini.py
    │   ├── test_gemini_type_adapter.py
    │   ├── test_llamacpp.py
    │   ├── test_llamacpp_tokenizer.py
    │   ├── test_llamacpp_type_adapter.py
    │   ├── test_lmstudio.py
    │   ├── test_lmstudio_type_adapter.py
    │   ├── test_mistral.py
    │   ├── test_mistral_type_adapter.py
    │   ├── test_mlxlm.py
    │   ├── test_mlxlm_type_adapter.py
    │   ├── test_ollama.py
    │   ├── test_ollama_type_adapter.py
    │   ├── test_openai.py
    │   ├── test_openai_type_adapter.py
    │   ├── test_sglang.py
    │   ├── test_sglang_type_adapter.py
    │   ├── test_tgi.py
    │   ├── test_tgi_model_adapter.py
    │   ├── test_tokenizer.py
    │   ├── test_transformers.py
    │   ├── test_transformers_multimodal.py
    │   ├── test_transformers_multimodal_type_adapter.py
    │   ├── test_transformers_tokenizer.py
    │   ├── test_transformers_type_adapter.py
    │   ├── test_utils.py
    │   ├── test_vllm.py
    │   ├── test_vllm_offline.py
    │   ├── test_vllm_offline_type_adapter.py
    │   └── test_vllm_type_adapter.py
    ├── processors/
    │   ├── test_base_processor.py
    │   └── test_tensor_adapters.py
    ├── test_applications.py
    ├── test_cache.py
    ├── test_generator.py
    ├── test_inputs.py
    ├── test_templates.py
    ├── test_utils/
    │   ├── mock_lmstudio_client.py
    │   ├── mock_openai_client.py
    │   ├── mock_tgi_client.py
    │   └── utils.py
    └── types/
        ├── test_custom_types.py
        ├── test_dsl.py
        ├── test_json_schema_utils.py
        ├── test_to_regex.py
        └── test_types_utils.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .devcontainer/devcontainer.json
================================================
{
  "name": "dottxt-ai",
  "image": "mcr.microsoft.com/devcontainers/python:3.12",
  "runArgs": [
    "--device=nvidia.com/gpu=all"
  ],
  "hostRequirements": {
    "gpu": "optional"
  },
  "features": {
    "ghcr.io/devcontainers/features/conda:1": {},
    "ghcr.io/devcontainers/features/nvidia-cuda:1": {
      "installCudnn": true,
      "installToolkit": true,
      "cudaVersion": "12.4"
    },
    "ghcr.io/devcontainers/features/rust:1": {}
  }
}


================================================
FILE: .editorconfig
================================================
# EditorConfig is awesome: https://EditorConfig.org

# top-most EditorConfig file
root = true

[*]
indent_style = space
indent_size = 4
end_of_line = lf
charset = utf-8
trim_trailing_whitespace = true
insert_final_newline = true

[*.yaml]
indent_size = 2


================================================
FILE: .github/ISSUE_TEMPLATE/bug_report.yml
================================================
# Issue template inspired by NumPy's excellent template:
# https://github.com/numpy/numpy/edit/main/.github/ISSUE_TEMPLATE/bug-report.yml
name: 🐞 Bug report
description: Create a bug report to help us reproduce and fix it.
title: "<Please write a descriptive title>"
labels: ["bug"]

body:
  - type: markdown
    attributes:
      value: >-
        Thank you for taking the time to file a bug report. First, carefully read
        the following before everything else:

          - Does your issue only arise in a library that uses Outlines? If so,
            submit your issue to that library's issue tracker.
          - Did you check the issue tracker for open and closed issues that may be
            related to your bug?

  - type: textarea
    attributes:
      label: "Describe the issue as clearly as possible:"
    validations:
      required: true

  - type: textarea
    attributes:
      label: "Steps/code to reproduce the bug:"
      description: >
        A short code example that reproduces the problem/missing feature. It
        should be self-contained, i.e., can be copy-pasted into the Python
        interpreter or run as-is via `python myproblem.py`.
      placeholder: |
        import outlines

        << your code here >>
      render: python
    validations:
      required: true

  - type: textarea
    attributes:
      label: "Expected result:"
      description: >
        Please describe what you expect the above example to output.
      placeholder: |
        << the expected result here >>
      render: shell
    validations:
      required: true

  - type: textarea
    attributes:
      label: "Error message:"
      description: >
        Please include the full error message, if any.
      placeholder: |
        << Full traceback starting from `Traceback: ...` >>
      render: shell

  - type: textarea
    attributes:
      label: "Outlines/Python version information:"
      description: |
          Please run the following code and paste the output here.
          python -c "from outlines import _version; print(_version.__version__)";
          python -c "import sys; print('Python', sys.version)";
          pip freeze;
      value: |
          Version information
          <details>
          ```
          (command output here)
          ```
          </details>
    validations:
      required: true

  - type: textarea
    attributes:
      label: "Context for the issue:"
      description: |
        Please explain how this issue affects your work or why it should be prioritized.
      placeholder: |
        << your explanation here >>
    validations:
      required: false


================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
contact_links:
  - name: 🤔 Questions & Help
    url: https://github.com/dottxt-ai/outlines/discussions/new
    about: "If you have a question about how to use Outlines, please start a discussion."


================================================
FILE: .github/PULL_REQUEST_TEMPLATE/pull_request_template.md
================================================
# 🚧 Thank you for opening a PR!

A few important guidelines and requirements before we can merge your PR:

- [ ] We should be able to understand what the PR does from its title only;
- [ ] There is a high-level description of the changes;
- [ ] *If I add a new feature*, there is an [issue][issues] discussing it already;
- [ ] There are links to *all* the relevant issues, discussions and PRs;
- [ ] The branch is rebased on the latest `main` commit;
- [ ] **Commit messages** follow these [guidelines][git-guidelines];
- [ ] One commit per logical change;
- [ ] The code respects the current **naming conventions**;
- [ ] Docstrings follow the [numpy style guide][docstring-guidelines];
- [ ] `pre-commit` is installed and configured on your machine, and you ran it before opening the PR;
- [ ] There are tests covering the changes;
- [ ] The documentation is up to date.

Consider opening a **Draft PR** if your work is still in progress but you would
like some feedback from other contributors.

[issues]: https://github.com/dottxt-ai/outlines/issues
[git-guidelines]: https://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html
[docstring-guidelines]: https://numpydoc.readthedocs.io/en/latest/format.html


================================================
FILE: .github/scripts/build_sdist_and_wheel.sh
================================================
#!/bin/bash
#
# Build the source distribution and the wheel, then check that each artifact
# can be installed into a fresh virtual environment and imported.
#
# Abort on the first failing command, on use of an unset variable, and on a
# failure anywhere in a pipeline. Without this the script's exit status is
# that of the final `cd ..`, so a broken build or a failing import would
# still be reported as a success to the calling workflow.
set -euo pipefail

# Build sdist and wheel
python -m pip install -U pip
python -m pip install build
python -m build

# Check sdist install and imports.
# The check runs from inside a scratch directory so that `import outlines`
# resolves to the installed package rather than the source checkout.
mkdir -p test-sdist
cd test-sdist
python -m venv venv-sdist
venv-sdist/bin/python -m pip install ../dist/outlines-*.tar.gz
venv-sdist/bin/python -c "import outlines"
cd ..

# Check wheel install and imports (same procedure, separate environment).
mkdir -p test-wheel
cd test-wheel
python -m venv venv-wheel
venv-wheel/bin/python -m pip install ../dist/outlines-*.whl
venv-wheel/bin/python -c "import outlines"
cd ..


================================================
FILE: .github/workflows/build_documentation.yml
================================================
name: Build the documentation

on:
  pull_request:
    types: [opened, synchronize, reopened, closed]
    branches: [main]
  workflow_dispatch:

permissions:
  contents: write
  pull-requests: write

jobs:
  build:
    name: Build and Deploy Documentation Preview
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v4
        with:
          python-version: "3.10"

      - name: Install dependencies
        if: github.event.action != 'closed'
        run: pip install -r requirements-doc.txt

      - name: Build the documentation
        if: github.event.action != 'closed'
        env:
          GOOGLE_ANALYTICS_KEY: ${{ secrets.GOOGLE_ANALYTICS_KEY }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
        run: |
          sed -i "1i site_url: https://dottxt-ai.github.io/outlines/pr-preview/pr-${PR_NUMBER}/" mkdocs.yml
          mkdocs build

      - name: Deploy to PR preview
        if: github.event_name == 'pull_request'
        uses: rossjrw/pr-preview-action@v1
        with:
          source-dir: site/
          preview-branch: gh-pages
          umbrella-dir: pr-preview
          comment: false

      - name: Comment PR with preview link
        if: github.event_name == 'pull_request' && github.event.action != 'closed'
        uses: actions/github-script@v7
        with:
          script: |
            const prNumber = context.issue.number;
            const previewUrl = `https://dottxt-ai.github.io/outlines/pr-preview/pr-${prNumber}/`;

            // Find existing preview comment
            const comments = await github.rest.issues.listComments({
              issue_number: prNumber,
              owner: context.repo.owner,
              repo: context.repo.repo,
            });

            const botComment = comments.data.find(comment =>
              comment.user.type === 'Bot' &&
              comment.body.includes('Documentation preview')
            );

            const commentBody = `📚 **Documentation preview**: ${previewUrl}\n\n*Preview updates automatically with each commit.*`;

            // Update existing comment or create new one
            if (botComment) {
              await github.rest.issues.updateComment({
                comment_id: botComment.id,
                owner: context.repo.owner,
                repo: context.repo.repo,
                body: commentBody
              });
            } else {
              await github.rest.issues.createComment({
                issue_number: prNumber,
                owner: context.repo.owner,
                repo: context.repo.repo,
                body: commentBody
              });
            }


================================================
FILE: .github/workflows/publish_documentation.yml
================================================
name: Publish the documentation

on:
  workflow_dispatch:
  push:
    branches:
      - main
  release:
    types:
      - created

permissions:
  contents: write

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - uses: actions/setup-python@v4
        with:
          python-version: 3.x
      - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
      - uses: actions/cache@v3
        with:
          key: mkdocs-material-${{ env.cache_id }}
          path: .cache
          restore-keys: |
            mkdocs-material-
      - run: pip install -r requirements-doc.txt
      - run: mkdocs build

      - name: Set up Git
        run: |
          git config user.name ${{ github.actor }}
          git config user.email ${{ github.actor }}@users.noreply.github.com

      - name: Publish Tag as latest
        env:
          GOOGLE_ANALYTICS_KEY: ${{ secrets.GOOGLE_ANALYTICS_KEY }}
        if: github.event_name == 'release'
        run: |
          mike deploy --push --update-aliases ${{ github.ref_name }} latest
          mike set-default --push latest

      - name: Publish main as unstable
        env:
          GOOGLE_ANALYTICS_KEY: ${{ secrets.GOOGLE_ANALYTICS_KEY }}
        if: github.event_name == 'push'
        run: |
          mike deploy --push --update-aliases ${{ github.ref_name }} unstable


================================================
FILE: .github/workflows/release_pypi.yaml
================================================
name: Release PyPi

on:
  release:
    types:
      - created
jobs:
  release-job:
    name: Build and publish on PyPi
    runs-on: ubuntu-latest
    steps:
    # Use current major versions of the official actions, in line with the
    # other workflows in this repository; the v2 releases target a Node.js
    # runtime that GitHub has deprecated.
    - name: Checkout
      uses: actions/checkout@v4
    - name: Set up Python
      uses: actions/setup-python@v4
      with:
        python-version: "3.10"
    - name: Build SDist and Wheel
      run: ./.github/scripts/build_sdist_and_wheel.sh
    - name: Check that the package version matches the Release name
      run: |
        grep -Rq "^Version: ${GITHUB_REF:10}$" outlines.egg-info/PKG-INFO
    - name: Publish to PyPi
      uses: pypa/gh-action-pypi-publish@v1.4.2
      with:
        user: __token__
        password: ${{ secrets.PYPI_TOKEN }}


================================================
FILE: .github/workflows/tests.yml
================================================
name: Tests

on:
  pull_request:
    branches: [main,v1.0]
  push:
    branches: [main]

jobs:
  style:
    name: Check the code style
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v3
    - uses: actions/setup-python@v4
      with:
        python-version: "3.13"
    - uses: pre-commit/action@v3.0.0

  tests:
    name: Run the tests
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.10", "3.13"]
    steps:
    - uses: actions/checkout@v3
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v4
      with:
        python-version: ${{ matrix.python-version }}
        cache: 'pip'
        cache-dependency-path: 'pyproject.toml'
    - name: Free disk space
      run: |
        set -eux
        sudo rm -rf /usr/share/dotnet || true
        sudo rm -rf /opt/ghc || true
        sudo rm -rf /usr/local/lib/android || true
        sudo apt-get clean
        df -h
    - name: Install Ollama
      run: |
        curl -fsSL https://ollama.com/install.sh | sh
        ollama --version
        ollama pull tinyllama
    - name: Set up test environment
      run: |
        python -m pip install --upgrade pip
        pip install uv
        uv sync --no-group test-gpu --extra test
    - name: cache HuggingFace models
      uses: actions/cache@v4
      with:
        path: ~/.cache/huggingface
        key: hf-${{ runner.os }}-${{ hashFiles('**/pyproject.toml') }}
        restore-keys: |
          hf-${{ runner.os }}-
    # Hash the matrix context into a short identifier and expose it as the
    # step output `id` (steps.matrix-id.outputs.id).
    - name: Create matrix id
      id: matrix-id
      env:
        MATRIX_CONTEXT: ${{ toJson(matrix) }}
      run: |
        echo $MATRIX_CONTEXT
        export MATRIX_ID=`echo $MATRIX_CONTEXT | md5sum | cut -c 1-32`
        echo $MATRIX_ID
        # The `::set-output` workflow command is deprecated; step outputs are
        # written to the file named by $GITHUB_OUTPUT instead.
        echo "id=$MATRIX_ID" >> "$GITHUB_OUTPUT"
    - name: Run tests
      run: |
        rm -f .coverage*
        uv run coverage erase
        uv run python -m coverage run --branch --source=outlines --parallel-mode -m pytest -x -m 'not api_call'
    - name: Upload coverage data
      uses: actions/upload-artifact@v4
      with:
        name: coverage-data-${{ matrix.python-version }}
        path: .coverage.*
        if-no-files-found: ignore
        include-hidden-files: true

  coverage:
    name: Combine & check coverage.
    needs: tests
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0

      - uses: actions/setup-python@v4
        with:
          cache: pip
          python-version: "3.11"

      - name: Set up environment
        run: |
          pip install --upgrade "coverage[toml]>=5.1" diff-cover

      - uses: actions/download-artifact@v4
        with:
          pattern: coverage-data-*
          merge-multiple: true

      - name: Combine coverage & fail if it's <100%.
        run: |
          python -m coverage combine
          python -m coverage html --skip-covered --skip-empty
          python -m coverage xml
          python -m coverage report --fail-under=100 || (python -m coverage report && exit 1)

      - name: Upload HTML report if check failed.
        uses: actions/upload-artifact@v4
        with:
          name: html-report
          path: htmlcov
          overwrite: true
        if: ${{ failure() }}

  build-wheel:
    name: Build Wheel and Test SDist
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Build SDist and Wheel
        run: ./.github/scripts/build_sdist_and_wheel.sh


================================================
FILE: .github/workflows/tests_api_models.yml
================================================
name: API Models Tests

on:
  workflow_dispatch:

jobs:
  tests:
    name: Run API Models Tests
    runs-on: ubuntu-latest
    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
      ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
      GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
      DOTTXT_API_KEY: ${{ secrets.DOTTXT_API_KEY }}
    strategy:
      fail-fast: false
      matrix:
        python-version: ["3.10"]

    steps:
    - uses: actions/checkout@v3
      with:
        ref: ${{ github.ref }}

    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v4
      with:
        python-version: ${{ matrix.python-version }}
        cache: 'pip'
        cache-dependency-path: 'pyproject.toml'

    - name: Free disk space
      run: |
        set -eux
        sudo rm -rf /usr/share/dotnet || true
        sudo rm -rf /opt/ghc || true
        sudo rm -rf /usr/local/lib/android || true
        sudo apt-get clean
        df -h

    - name: Install Ollama
      run: |
        curl -fsSL https://ollama.com/install.sh | sh
        ollama --version
        ollama pull tinyllama

    - name: Set up test environment
      run: |
        python -m pip install --upgrade pip
        pip install uv
        uv sync --no-group test-gpu --extra test

    - name: cache HuggingFace models
      uses: actions/cache@v4
      with:
        path: ~/.cache/huggingface
        key: hf-${{ runner.os }}-${{ hashFiles('**/pyproject.toml') }}
        restore-keys: |
          hf-${{ runner.os }}-

    # Hash the matrix context into a short identifier and expose it as the
    # step output `id`; the "Run tests" step reads it to name its coverage file.
    - name: Create matrix id
      id: matrix-id
      env:
        MATRIX_CONTEXT: ${{ toJson(matrix) }}
      run: |
        echo $MATRIX_CONTEXT
        export MATRIX_ID=`echo $MATRIX_CONTEXT | md5sum | cut -c 1-32`
        echo $MATRIX_ID
        # The `::set-output` workflow command is deprecated; step outputs are
        # written to the file named by $GITHUB_OUTPUT instead.
        echo "id=$MATRIX_ID" >> "$GITHUB_OUTPUT"

    - name: Run tests
      run: |
        uv run pytest -m 'api_call' --ignore=tests/models/test_dottxt.py
      env:
        COVERAGE_FILE: .coverage.${{ steps.matrix-id.outputs.id }}


================================================
FILE: .gitignore
================================================
__pycache__
.benchmarks
.cache
.coverage
.direnv
.env
.idea
.pytest_cache
.python-version
.venv
*_version.py
*.egg-info
*.gguf
benchmarks/results
build
docs/build
logs
.worktrees/


================================================
FILE: .pre-commit-config.yaml
================================================
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
  rev: v5.0.0
  hooks:
    -   id: check-merge-conflict
    -   id: debug-statements
    -   id: end-of-file-fixer
    -   id: trailing-whitespace
- repo: https://github.com/pre-commit/mirrors-mypy
  rev: v1.14.1
  hooks:
    - id: mypy
      args: [--allow-redefinition]
      exclude: ^examples/
      additional_dependencies: [types-tqdm, types-Pillow]
- repo: https://github.com/astral-sh/ruff-pre-commit
  rev: v0.9.1
  hooks:
    - id: ruff
      args: ["--config=pyproject.toml"]


================================================
FILE: .pydocstyle
================================================
[pydocstyle]
convention = numpy


================================================
FILE: .readthedocs.yaml
================================================
version: 2

python:
  version: "3.8"
  install:
      - method: pip
        path: .
        extra_requirements:
          - rtd
      - requirements: requirements-doc.txt

sphinx:
  builder: html
  configuration: docs/source/conf.py
  fail_on_warning: true


================================================
FILE: .vscode/settings.json
================================================
{
    "python.testing.pytestArgs": [
        "tests"
    ],
    "python.testing.unittestEnabled": false,
    "python.testing.pytestEnabled": true
}


================================================
FILE: LICENSE
================================================
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright 2023- The Outlines developers

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: README.md
================================================
<div align="center" style="margin-bottom: 1em;">

<img src="./docs/assets/images/logo-light-mode.svg#gh-light-mode-only" alt="Outlines Logo" width=300></img>
<img src="./docs/assets/images/logo-dark-mode.svg#gh-dark-mode-only" alt="Outlines Logo" width=300></img>


 🗒️ *Structured outputs for LLMs* 🗒️

Made with ❤👷️ by the team at [.txt](https://dottxt.co)
<br>Trusted by NVIDIA, Cohere, HuggingFace, vLLM, etc.

<!-- Project Badges -->
[![PyPI Version][pypi-version-badge]][pypi]
[![Downloads][downloads-badge]][pypistats]
[![Stars][stars-badge]][stars]

<!-- Community Badges -->
[![Discord][discord-badge]][discord]
[![Blog][dottxt-blog-badge]][dottxt-blog]
[![Twitter][twitter-badge]][twitter]

</div>

## 🚀 Building the future of structured generation

We're working with select partners to develop new interfaces to structured generation.

Need XML, FHIR, custom schemas or grammars? Let's talk.

Audit your schema: share one schema and we'll show you what breaks under generation, the constraints that fix it, and the compliance rates before and after. Sign up [here](https://h1xbpbfsf0w.typeform.com/to/rtFUraA2?typeform).

## Table of Contents

- [Why Outlines?](#why-outlines)
- [Quickstart](#quickstart)
- [Real-World Examples](#real-world-examples)
  - [🙋‍♂️ Customer Support Triage](#customer-support-triage)
  - [📦 E-commerce Product Categorization](#e-commerce-product-categorization)
  - [📊 Parse Event Details with Incomplete Data](#parse-event-details-with-incomplete-data)
  - [🗂️ Categorize Documents into Predefined Types](#categorize-documents-into-predefined-types)
  - [📅 Schedule a Meeting with Function Calling](#schedule-a-meeting-with-function-calling)
  - [📝 Dynamically Generate Prompts with Re-usable Templates](#dynamically-generate-prompts-with-re-usable-templates)
- [They Use Outlines](#they-use-outlines)
- [Model Integrations](#model-integrations)
- [Core Features](#core-features)
- [Other Features](#other-features)
- [About .txt](#about-txt)
- [Community](#community)

<div align="center"><img src="./docs/assets/images/install.png" width=300></img></div>

## Why Outlines?

LLMs are powerful but their outputs are unpredictable. Most solutions attempt to fix bad outputs after generation using parsing, regex, or fragile code that breaks easily.

Outlines guarantees structured outputs during generation — directly from any LLM.

- **Works with any model** - Same code runs across OpenAI, Ollama, vLLM, and more
- **Simple integration** - Just pass your desired output type: `model(prompt, output_type)`
- **Guaranteed valid structure** - No more parsing headaches or broken JSON
- **Provider independence** - Switch models without changing code


### The Outlines Philosophy

<div align="center"><img src="./docs/assets/images/use_philosophy.png" width=300></img></div>

Outlines follows a simple pattern that mirrors Python's own type system. Simply specify the desired output type, and Outlines will ensure your data matches that structure exactly:

- For a yes/no response, use `Literal["Yes", "No"]`
- For numerical values, use `int`
- For complex objects, define a structure with a [Pydantic model](https://docs.pydantic.dev/latest/)

## Quickstart

Getting started with Outlines is simple:

### 1. Install outlines

``` shell
pip install outlines
```

### 2. Connect to your preferred model

``` python
import outlines
from transformers import AutoTokenizer, AutoModelForCausalLM


MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
model = outlines.from_transformers(
    AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto"),
    AutoTokenizer.from_pretrained(MODEL_NAME)
)
```

### 3. Start with simple structured outputs

``` python
from typing import Literal
from pydantic import BaseModel


# Simple classification
sentiment = model(
    "Analyze: 'This product completely changed my life!'",
    Literal["Positive", "Negative", "Neutral"]
)
print(sentiment)  # "Positive"

# Extract specific types
temperature = model("What's the boiling point of water in Celsius?", int)
print(temperature)  # 100
```

### 4. Create complex structures

``` python
from pydantic import BaseModel
from enum import Enum

class Rating(Enum):
    poor = 1
    fair = 2
    good = 3
    excellent = 4

class ProductReview(BaseModel):
    rating: Rating
    pros: list[str]
    cons: list[str]
    summary: str

review = model(
    "Review: The XPS 13 has great battery life and a stunning display, but it runs hot and the webcam is poor quality.",
    ProductReview,
    max_new_tokens=200,
)

review = ProductReview.model_validate_json(review)
print(f"Rating: {review.rating.name}")  # "Rating: good"
print(f"Pros: {review.pros}")           # "Pros: ['great battery life', 'stunning display']"
print(f"Summary: {review.summary}")     # "Summary: Good laptop with great display but thermal issues"
```

## Real-world examples

Here are production-ready examples showing how Outlines solves common problems:

<details id="customer-support-triage"><summary><b>🙋‍♂️ Customer Support Triage</b>
<br>This example shows how to convert a free-form customer email into a structured service ticket. By parsing attributes like priority, category, and escalation flags, the code enables automated routing and handling of support issues.
</summary>

``` python
import outlines
from enum import Enum
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from typing import List


MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
model = outlines.from_transformers(
    AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto"),
    AutoTokenizer.from_pretrained(MODEL_NAME)
)


def alert_manager(ticket):
    print("Alert!", ticket)


class TicketPriority(str, Enum):
    low = "low"
    medium = "medium"
    high = "high"
    urgent = "urgent"

class ServiceTicket(BaseModel):
    priority: TicketPriority
    category: str
    requires_manager: bool
    summary: str
    action_items: List[str]


customer_email = """
Subject: URGENT - Cannot access my account after payment

I paid for the premium plan 3 hours ago and still can't access any features.
I've tried logging out and back in multiple times. This is unacceptable as I
have a client presentation in an hour and need the analytics dashboard.
Please fix this immediately or refund my payment.
"""

prompt = f"""
<|im_start|>user
Analyze this customer email:

{customer_email}
<|im_end|>
<|im_start|>assistant
"""

ticket = model(
    prompt,
    ServiceTicket,
    max_new_tokens=500
)

# Use structured data to route the ticket
ticket = ServiceTicket.model_validate_json(ticket)
if ticket.priority == "urgent" or ticket.requires_manager:
    alert_manager(ticket)
```
</details>

<details id="e-commerce-product-categorization"><summary><b>📦 E-commerce product categorization</b>
<br>This use case demonstrates how outlines can transform product descriptions into structured categorization data (e.g., main category, sub-category, and attributes) to streamline tasks such as inventory management. Each product description is processed automatically, reducing manual categorization overhead.
</summary>

```python
import outlines
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from typing import List, Optional


MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
model = outlines.from_transformers(
    AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto"),
    AutoTokenizer.from_pretrained(MODEL_NAME)
)


def update_inventory(product, category, sub_category):
    print(f"Updated {product.split(',')[0]} in category {category}/{sub_category}")


class ProductCategory(BaseModel):
    main_category: str
    sub_category: str
    attributes: List[str]
    brand_match: Optional[str]

# Process product descriptions in batches
product_descriptions = [
    "Apple iPhone 15 Pro Max 256GB Titanium, 6.7-inch Super Retina XDR display with ProMotion",
    "Organic Cotton T-Shirt, Men's Medium, Navy Blue, 100% Sustainable Materials",
    "KitchenAid Stand Mixer, 5 Quart, Red, 10-Speed Settings with Dough Hook Attachment"
]

template = outlines.Template.from_string("""
<|im_start|>user
Categorize this product:

{{ description }}
<|im_end|>
<|im_start|>assistant
""")

# Get structured categorization for all products
categories = model(
    [template(description=desc) for desc in product_descriptions],
    ProductCategory,
    max_new_tokens=200
)

# Use categorization for inventory management
categories = [
    ProductCategory.model_validate_json(category) for category in categories
]
for product, category in zip(product_descriptions, categories):
    update_inventory(product, category.main_category, category.sub_category)
```
</details>

<details id="parse-event-details-with-incomplete-data"><summary><b>📊 Parse event details with incomplete data</b>
<br>This example uses outlines to parse event descriptions into structured information (like event name, date, location, type, and topics), even handling cases where the data is incomplete. It leverages union types to return either structured event data or a fallback “I don’t know” answer, ensuring robust extraction in varying scenarios.
</summary>

```python
import outlines
from typing import Union, List, Literal
from pydantic import BaseModel
from enum import Enum
from transformers import AutoTokenizer, AutoModelForCausalLM


MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
model = outlines.from_transformers(
    AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto"),
    AutoTokenizer.from_pretrained(MODEL_NAME)
)

class EventType(str, Enum):
    conference = "conference"
    webinar = "webinar"
    workshop = "workshop"
    meetup = "meetup"
    other = "other"


class EventInfo(BaseModel):
    """Structured information about a tech event"""
    name: str
    date: str
    location: str
    event_type: EventType
    topics: List[str]
    registration_required: bool

# Create a union type that can either be a structured EventInfo or "I don't know"
EventResponse = Union[EventInfo, Literal["I don't know"]]

# Sample event descriptions
event_descriptions = [
    # Complete information
    """
    Join us for DevCon 2023, the premier developer conference happening on November 15-17, 2023
    at the San Francisco Convention Center. Topics include AI/ML, cloud infrastructure, and web3.
    Registration is required.
    """,

    # Insufficient information
    """
    Tech event next week. More details coming soon!
    """
]

# Process events
results = []
for description in event_descriptions:
    prompt = f"""
<|im_start|>system
You are a helpful assistant
<|im_end|>
<|im_start|>user
Extract structured information about this tech event:

{description}

If there is enough information, return a JSON object with the following fields:

- name: The name of the event
- date: The date where the event is taking place
- location: Where the event is taking place
- event_type: either 'conference', 'webinar', 'workshop', 'meetup' or 'other'
- topics: a list of topics of the conference
- registration_required: a boolean that indicates whether registration is required

If the information available does not allow you to fill this JSON, and only then, answer 'I don't know'.
<|im_end|>
<|im_start|>assistant
"""
    # Union type allows the model to return structured data or "I don't know"
    result = model(prompt, EventResponse, max_new_tokens=200)
    results.append(result)

# Display results
for i, result in enumerate(results):
    print(f"Event {i+1}:")
    if isinstance(result, str):
        print(f"  {result}")
    else:
        # It's an EventInfo object
        print(f"  Name: {result.name}")
        print(f"  Type: {result.event_type}")
        print(f"  Date: {result.date}")
        print(f"  Topics: {', '.join(result.topics)}")
    print()

# Use structured data in downstream processing
structured_count = sum(1 for r in results if isinstance(r, EventInfo))
print(f"Successfully extracted data for {structured_count} of {len(results)} events")
```
</details>

<details id="categorize-documents-into-predefined-types"><summary><b>🗂️ Categorize documents into predefined types</b>
<br>In this case, outlines classifies documents into predefined categories (e.g., “Financial Report,” “Legal Contract”) using a literal type specification. The resulting classifications are displayed in both a table format and through a category distribution summary, illustrating how structured outputs can simplify content management.
</summary>

```python
import outlines
from typing import Literal, List
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM


MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
model = outlines.from_transformers(
    AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto"),
    AutoTokenizer.from_pretrained(MODEL_NAME)
)


# Define classification categories using Literal
DocumentCategory = Literal[
    "Financial Report",
    "Legal Contract",
    "Technical Documentation",
    "Marketing Material",
    "Personal Correspondence"
]

# Sample documents to classify
documents = [
    "Q3 Financial Summary: Revenue increased by 15% year-over-year to $12.4M. EBITDA margin improved to 23% compared to 19% in Q3 last year. Operating expenses...",

    "This agreement is made between Party A and Party B, hereinafter referred to as 'the Parties', on this day of...",

    "The API accepts POST requests with JSON payloads. Required parameters include 'user_id' and 'transaction_type'. The endpoint returns a 200 status code on success."
]

template = outlines.Template.from_string("""
<|im_start|>user
Classify the following document into exactly one category among the following categories:
- Financial Report
- Legal Contract
- Technical Documentation
- Marketing Material
- Personal Correspondence

Document:
{{ document }}
<|im_end|>
<|im_start|>assistant
""")

# Classify documents
def classify_documents(texts: List[str]) -> List[DocumentCategory]:
    results = []

    for text in texts:
        prompt = template(document=text)
        # The model must return one of the predefined categories
        category = model(prompt, DocumentCategory, max_new_tokens=200)
        results.append(category)

    return results

# Perform classification
classifications = classify_documents(documents)

# Create a simple results table
results_df = pd.DataFrame({
    "Document": [doc[:50] + "..." for doc in documents],
    "Classification": classifications
})

print(results_df)

# Count documents by category
category_counts = pd.Series(classifications).value_counts()
print("\nCategory Distribution:")
print(category_counts)
```
</details>

<details>
<summary id="schedule-a-meeting-with-function-calling"><b>📅 Schedule a meeting from requests with Function Calling</b>
<br>This example demonstrates how outlines can interpret a natural language meeting request and translate it into a structured format matching a predefined function’s parameters. Once the meeting details are extracted (e.g., title, date, duration, attendees), they are used to automatically schedule the meeting.
</summary>

```python
import outlines
import json
from typing import List, Optional
from datetime import date
from transformers import AutoTokenizer, AutoModelForCausalLM


MODEL_NAME = "microsoft/phi-4"
model = outlines.from_transformers(
    AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto"),
    AutoTokenizer.from_pretrained(MODEL_NAME)
)


# Define a function with typed parameters
def schedule_meeting(
    title: str,
    date: date,
    duration_minutes: int,
    attendees: List[str],
    location: Optional[str] = None,
    agenda_items: Optional[List[str]] = None
):
    """Schedule a meeting with the specified details"""
    # In a real app, this would create the meeting
    meeting = {
        "title": title,
        "date": date,
        "duration_minutes": duration_minutes,
        "attendees": attendees,
        "location": location,
        "agenda_items": agenda_items
    }
    return f"Meeting '{title}' scheduled for {date} with {len(attendees)} attendees"

# Natural language request
user_request = """
I need to set up a product roadmap review with the engineering team for next
Tuesday at 2pm. It should last 90 minutes. Please invite john@example.com,
sarah@example.com, and the product team at product@example.com.
"""

# Outlines automatically infers the required structure from the function signature
prompt = f"""
<|im_start|>user
Extract the meeting details from this request:

{user_request}
<|im_end|>
<|im_start|>assistant
"""
meeting_params = model(prompt, schedule_meeting, max_new_tokens=200)

# The result is a JSON string; parse it into a dictionary matching the function parameters
meeting_params = json.loads(meeting_params)
print(meeting_params)

# Call the function with the extracted parameters
result = schedule_meeting(**meeting_params)
print(result)
# "Meeting 'Product Roadmap Review' scheduled for 2023-10-17 with 3 attendees"
```
</details>

<details>
<summary id="dynamically-generate-prompts-with-re-usable-templates"><b>📝 Dynamically generate prompts with re-usable templates</b>
<br>Using Jinja-based templates, this example shows how to generate dynamic prompts for tasks like sentiment analysis. It illustrates how to easily re-use and customize prompts—including few-shot learning strategies—for different content types while ensuring the outputs remain structured.
</summary>

```python
import outlines
from typing import List, Literal
from transformers import AutoTokenizer, AutoModelForCausalLM


MODEL_NAME = "microsoft/phi-4"
model = outlines.from_transformers(
    AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto"),
    AutoTokenizer.from_pretrained(MODEL_NAME)
)


# 1. Create a reusable template with Jinja syntax
sentiment_template = outlines.Template.from_string("""
<|im_start|>user
Analyze the sentiment of the following {{ content_type }}:

{{ text }}

Provide your analysis as either "Positive", "Negative", or "Neutral".
<|im_end|>
<|im_start|>assistant
""")

# 2. Generate prompts with different parameters
review = "This restaurant exceeded all my expectations. Fantastic service!"
prompt = sentiment_template(content_type="review", text=review)

# 3. Use the templated prompt with structured generation
result = model(prompt, Literal["Positive", "Negative", "Neutral"])
print(result)  # "Positive"

# Templates can also be loaded from files
example_template = outlines.Template.from_file("templates/few_shot.txt")

# Use with examples for few-shot learning
examples = [
    ("The food was cold", "Negative"),
    ("The staff was friendly", "Positive")
]
few_shot_prompt = example_template(examples=examples, query="Service was slow")
print(few_shot_prompt)
```
</details>

## They use outlines

<div align="center">
<img src="./docs/assets/images/readme-light.png#gh-light-mode-only" alt="Users Logo"></img>
<img src="./docs/assets/images/readme-dark.png#gh-dark-mode-only" alt="Users Logo"></img>
</div>

## Model Integrations

| Model type | Description | Documentation |
|---------|-------------|:-------------:|
| **Server Support** | vLLM and Ollama | [Server Integrations →](https://dottxt-ai.github.io/outlines/latest/features/models/) |
| **Local Model Support** | transformers and llama.cpp | [Model Integrations →](https://dottxt-ai.github.io/outlines/latest/features/models/) |
| **API Support** | OpenAI and Gemini | [API Integrations →](https://dottxt-ai.github.io/outlines/latest/features/models/) |

## Core Features

| Feature | Description | Documentation |
|---------|-------------|:-------------:|
| **Multiple Choices** | Constrain outputs to predefined options | [Multiple Choices Guide →](https://dottxt-ai.github.io/outlines/latest/features/core/output_types/#multiple-choices) |
| **Function Calls** | Infer structure from function signatures | [Function Guide →](https://dottxt-ai.github.io/outlines/latest/features/core/output_types/#json-schemas) |
| **JSON/Pydantic** | Generate outputs matching JSON schemas | [JSON Guide →](https://dottxt-ai.github.io/outlines/latest/features/core/output_types/#json-schemas) |
| **Regular Expressions** | Generate text following a regex pattern | [Regex Guide →](https://dottxt-ai.github.io/outlines/latest/features/core/output_types/#regex-patterns) |
| **Grammars** | Enforce complex output structures | [Grammar Guide →](https://dottxt-ai.github.io/outlines/latest/features/core/output_types/#context-free-grammars) |

## Other Features

| Feature | Description | Documentation |
|---------|-------------|:-------------:|
| **Prompt templates** | Separate complex prompts from code | [Template Guide →](https://dottxt-ai.github.io/outlines/latest/features/utility/template/) |
| **Custom types** | Intuitive interface to build complex types | [Python Types Guide →](https://dottxt-ai.github.io/outlines/latest/features/core/output_types/#basic-python-types) |
| **Applications** | Encapsulate templates and types into functions | [Application Guide →](https://dottxt-ai.github.io/outlines/latest/features/utility/application/) |

## About .txt

<div align="center">
<img src="./docs/assets/images/dottxt-light.svg#gh-light-mode-only" alt="dottxt logo" width=100></img>
<img src="./docs/assets/images/dottxt-dark.svg#gh-dark-mode-only" alt="dottxt logo" width=100></img>
</div>

Outlines is developed and maintained by [.txt](https://dottxt.co), a company dedicated to making LLMs more reliable for production applications.

Our focus is on advancing structured generation technology through:

- 🧪 **Cutting-edge Research**: We publish our findings on [structured generation](http://blog.dottxt.co/performance-gsm8k.html)
- 🚀 **Enterprise-grade solutions**: You can license [our enterprise-grade libraries](https://docs.dottxt.co).
- 🧩 **Open Source Collaboration**: We believe in building in public and contributing to the community

Follow us on [Twitter](https://twitter.com/dottxtai) or check out our [blog](https://blog.dottxt.co/) to stay updated on our latest work in making LLMs more reliable.

## Community

<div align="center" style="margin-bottom: 1em;">

[![Contributors][contributors-badge]][contributors]
[![Stars][stars-badge]][stars]
[![Downloads][downloads-badge]][pypistats]
[![Discord badge][discord-badge]][discord]

</div>

- 💡 **Have an idea?** Come chat with us on [Discord][discord]
- 🐞 **Found a bug?** Open an [issue](https://github.com/dottxt-ai/outlines/issues)
- 🧩  **Want to contribute?** Consult our [contribution guide](https://dottxt-ai.github.io/outlines/latest/community/contribute/).


## Cite Outlines

```
@article{willard2023efficient,
  title={Efficient Guided Generation for Large Language Models},
  author={Willard, Brandon T and Louf, R{\'e}mi},
  journal={arXiv preprint arXiv:2307.09702},
  year={2023}
}
```

[contributors]: https://github.com/dottxt-ai/outlines/graphs/contributors
[contributors-badge]: https://img.shields.io/github/contributors/dottxt-ai/outlines?style=flat-square&logo=github&logoColor=white&color=ECEFF4
[dottxt-blog]: https://blog.dottxt.co/
[dottxt-blog-badge]: https://img.shields.io/badge/dottxt%20blog-a6b4a3
[dottxt-twitter]: https://twitter.com/dottxtai
[dottxt-twitter-badge]: https://img.shields.io/twitter/follow/dottxtai?style=social
[discord]: https://discord.gg/R9DSu34mGd
[discord-badge]: https://img.shields.io/discord/1182316225284554793?color=ddb8ca&logo=discord&logoColor=white&style=flat-square
[downloads-badge]: https://img.shields.io/pypi/dm/outlines?color=A6B4A3&logo=python&logoColor=white&style=flat-square
[pypistats]: https://pypistats.org/packages/outlines
[pypi-version-badge]: https://img.shields.io/pypi/v/outlines?style=flat-square&logoColor=white&color=ddb8ca
[pypi]: https://pypi.org/project/outlines/
[stars]: https://github.com/dottxt-ai/outlines/stargazers
[stars-badge]: https://img.shields.io/github/stars/dottxt-ai/outlines?style=flat-square&logo=github&color=BD932F&logoColor=white
[twitter-badge]: https://img.shields.io/twitter/follow/dottxtai?style=flat-square&logo=x&logoColor=white&color=bd932f
[twitter]: https://x.com/dottxtai


================================================
FILE: docs/api_reference/index.md
================================================
# API Reference


================================================
FILE: docs/blog/index.md
================================================
# Blog


================================================
FILE: docs/community/contribute.md
================================================
---
title: Contribute
---

## What contributions?

- **Documentation** contributions are very valuable to us!
- **Examples.** Show us what you did with Outlines :)
- **Bug reports** with a minimal working example in the [issue tracker][issues]
- **Bug fixes** are always a pleasure to review.
- **New features**. Please start a new [discussion][discussions], or [come chat with us][discord] beforehand!

Note that the [issue tracker][issues] is only intended for actionable items. When in doubt, open a [discussion][discussions] or [come talk to us][discord].

## How to contribute?

### Setup

First, [fork the repository on GitHub](https://github.com/dottxt-ai/outlines/fork) and clone the fork locally:

```shell
git clone git@github.com:YourUserName/outlines.git
cd outlines
```

Create a new virtual environment:

*If you are using `uv`*:

```shell
uv venv
source .venv/bin/activate
alias pip="uv pip" # ... or just remember to prepend any pip command with uv in the rest of this guide
```

*If you are using `venv`*:

```shell
python -m venv .venv
source .venv/bin/activate
```

*If you are using `conda`*:

```shell
conda env create -f environment.yml
```

Then install the dependencies in editable mode, and install the `pre-commit` hooks:

```shell
pip install -e ".[test]"
pre-commit install
```
If you own a GPU and want to run the vLLM tests you will have to run:

```shell
pip install -e ".[test-gpu]"
```

instead.

Outlines provides optional dependencies for different supported backends, which you can install with

```shell
pip install ".[vllm]"
```

A list of supported optional dependencies can be found in the [installation guide](/installation).

### Using VSCode DevContainer / GitHub Codespaces

If you want a fully pre-configured development environment, you can use VSCode DevContainers or GitHub Codespaces.

#### VSCode DevContainer

1. Ensure that the [Docker](https://www.docker.com/get-started/) daemon is running on your machine.
2. Install the [Dev Containers](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers) extension in VSCode.
3. Open the Outlines repository in VSCode. When prompted, **Reopen in Container** (or press `F1` and select "Remote-Containers: Reopen in Container").
4. Run the normal setup steps. Your environment will not complain about missing system dependencies!

#### GitHub Codespaces

1. Navigate to the Outlines repository on GitHub.
2. Click on the **Code** button and select the **Codespaces** tab.
3. Click **Create codespace on main** (or another branch you are working on).
4. GitHub will launch a pre-configured cloud development environment.

You will not have access to a GPU, but you'll be able to make basic contributions to the project on the go while using a fully featured web-based IDE.

### Before pushing your code

Run the tests:

```shell
pytest
```

And run the code style checks:

```shell
pre-commit run --all-files
```

### Benchmarking

Outlines uses [asv](https://asv.readthedocs.io) for automated benchmark testing. Benchmarks are run automatically before pull requests are merged to prevent performance degradation.

You can run the benchmark test suite locally with the following command:

```shell
asv run --config benchmarks/asv.conf.json
```

Caveats:

- If you're on a device with CUDA, you must add the argument `--launch-method spawn`
- Uncommitted code will not be benchmarked, you must first commit your changes.

#### Run a specific test:

```shell
asv run --config benchmarks/asv.conf.json -b bench_json_schema.JsonSchemaBenchmark.time_json_schema_to_fsm
```

#### Profile a specific test:

```shell
asv run --config benchmarks/asv.conf.json --profile -b bench_json_schema.JsonSchemaBenchmark.time_json_schema_to_fsm
```

#### Compare to `origin/main`

```shell
git fetch origin
asv continuous origin/main HEAD --config benchmarks/asv.conf.json
```

#### ASV PR Behavior

- **View ASV Benchmark Results:** Open the workflow, view `BENCHMARK RESULTS` section.
- Merging is blocked unless benchmarks are run for the latest commit.
- Benchmarks fail if performance degrades by more than 10% for any individual benchmark.
- The "Benchmark PR" workflow runs when it is manually dispatched. If the `run_benchmarks` label is added to the PR, it runs for every commit.

### Contribute to the documentation

To work on the *documentation* you will need to install the related dependencies:

```shell
pip install -r requirements-doc.txt
```

To build the documentation and serve it locally, run the following command in the repository's root folder:

```shell
mkdocs serve
```

By following the instructions printed in the terminal, you will be able to view the documentation locally.
It will be updated every time you make a change.

## Open a Pull Request

Create a new branch on your fork, commit and push the changes:

```shell
git checkout -b new-branch
git add .
git commit -m "Changes I made"
git push origin new-branch
```

Then you can [open a pull request][pull-requests] on GitHub. It should prompt you to do so. Every subsequent change that you make on your branch will update the pull request.

Do not hesitate to open a draft PR before your contribution is ready, especially if you have questions and/or need feedback. If you need help, come tell us on [Discord][discord].

[discord]: https://discord.gg/R9DSu34mGd
[discussions]: https://github.com/dottxt-ai/outlines/discussions
[issues]: https://github.com/dottxt-ai/outlines/issues
[pull-requests]: https://github.com/dottxt-ai/outlines/pulls


================================================
FILE: docs/community/examples.md
================================================
# Community projects and articles

Publishing examples and articles about Outlines is a meaningful way to contribute to the community. Here is a list of projects we are aware of. Drop us a line if we forgot yours!

[MMSG](https://github.com/leloykun/mmsg) is a Python library for generating interleaved text and image content in a structured format you can directly pass to downstream APIs.

[Multimodal Structured Generation: CVPR's 2nd MMFM Challenge Technical Report](https://arxiv.org/abs/2406.11403) shows that Structured Generation can outperform finetuning, and maybe even multimodality, in document-image understanding tasks as part of CVPR's 2nd MMFM Challenge.

[Chess LLM Arena](https://huggingface.co/spaces/mlabonne/chessllm) is a HuggingFace Space where you can make LLMs compete in a chess match.

[LLM Data Gen](https://huggingface.co/spaces/lhoestq/LLM_DataGen) is a HuggingFace Space that generates synthetic dataset files in JSONLines format.

[Fast, High-Fidelity LLM Decoding with Regex Constraints](https://vivien000.github.io/blog/journal/llm-decoding-with-regex-constraints.html) presents an efficient alternative to Outlines' structured generation.

[gigax](https://github.com/GigaxGames/gigax) is an Open-Source library that allows you to create real-time LLM-powered NPCs for video games.

[Improving Prompt Consistency with Structured Generations](https://huggingface.co/blog/evaluation-structured-outputs) shows how structured generation can improve consistency of evaluation runs by reducing sensitivity to changes in prompt format.

[AskNews](https://asknews.app) is a news curation service processing 300k news articles per day in a structured way, with Outlines.


================================================
FILE: docs/community/feedback.md
================================================
---
title: Feedback
---

# Feedback

If Outlines has been helpful to you, let us know on [Discord][discord] or give us a shoutout on [Twitter][twitter]! It's always heartwarming ❤️


<head>
  <!-- From Marvin AI's documentation -->
  <!-- Their library is also awesome -->
  <!-- https://www.askmarvin.ai/ -->
  <style>
    .tweet-masonry {
      column-count: 2;
      column-gap: 20px;
      padding: 20px;
    }

    .twitter-tweet {
      display: inline-block;
      width: 100%;
      margin-bottom: 20px;
      margin-top: 0px !important;
      break-inside: avoid;
    }

    @media (max-width: 600px) {
      .tweet-masonry {
        column-count: 1;
      }
    }
  </style>
</head>
<body>

<div class="tweet-masonry">

<blockquote class="twitter-tweet"><p lang="en" dir="ltr">I am once again reminding you that structured extraction using LLMs is going to transform every single industry in the next 10 years <a href="https://t.co/xQ3tcWnrZ8">https://t.co/xQ3tcWnrZ8</a></p>&mdash; Sam Hogan (@0xSamHogan) <a href="https://twitter.com/0xSamHogan/status/1780637917737816323?ref_src=twsrc%5Etfw">April 17, 2024</a></blockquote> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>

<blockquote class="twitter-tweet"><p lang="en" dir="ltr">outline&#39;s growth is insane, using is an understatement! <a href="https://t.co/rHCNWhZdCs">https://t.co/rHCNWhZdCs</a></p>&mdash; jason liu (@jxnlco) <a href="https://twitter.com/jxnlco/status/1780618454040797554?ref_src=twsrc%5Etfw">April 17, 2024</a></blockquote> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>

<blockquote class="twitter-tweet"><p lang="en" dir="ltr">Outlines is an amazing lib and more popular than <a href="https://twitter.com/remilouf?ref_src=twsrc%5Etfw">@remilouf</a>’s modesty will admit. <a href="https://t.co/DfHbMPIlX1">https://t.co/DfHbMPIlX1</a> <a href="https://t.co/mDHIWJrD0C">https://t.co/mDHIWJrD0C</a></p>&mdash; Delip Rao e/σ (@deliprao) <a href="https://twitter.com/deliprao/status/1780780217180598377?ref_src=twsrc%5Etfw">April 18, 2024</a></blockquote> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>

<blockquote class="twitter-tweet"><p lang="en" dir="ltr">Impressive implementation of a true regex / json / grammar guided text generation <a href="https://t.co/RX5RVYaVIx">pic.twitter.com/RX5RVYaVIx</a></p>&mdash; Rohan Paul (@rohanpaul_ai) <a href="https://twitter.com/rohanpaul_ai/status/1741099984299135403?ref_src=twsrc%5Etfw">December 30, 2023</a></blockquote> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>

<blockquote class="twitter-tweet"><p lang="en" dir="ltr">Most underrated Github Repo in AI + LLM JSON guided Generation: <a href="https://t.co/lSB8KIet1H">https://t.co/lSB8KIet1H</a></p>&mdash; 🎙Jean-Louis Queguiner (@JiliJeanlouis) <a href="https://twitter.com/JiliJeanlouis/status/1736857292581093706?ref_src=twsrc%5Etfw">December 18, 2023</a></blockquote> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>

<blockquote class="twitter-tweet"><p lang="en" dir="ltr">Nice and useful. <a href="https://t.co/LX72AE0lgt">https://t.co/LX72AE0lgt</a></p>&mdash; Dan Roy (@roydanroy) <a href="https://twitter.com/roydanroy/status/1691556956941525458?ref_src=twsrc%5Etfw">August 15, 2023</a></blockquote> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>

<blockquote class="twitter-tweet"><p lang="en" dir="ltr">HUGE dub for open source AI <a href="https://t.co/bYKuiEUZ1j">https://t.co/bYKuiEUZ1j</a></p>&mdash; kenneth 🖇 (@k3nnethfrancis) <a href="https://twitter.com/k3nnethfrancis/status/1691304781732843521?ref_src=twsrc%5Etfw">August 15, 2023</a></blockquote> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>

<blockquote class="twitter-tweet"><p lang="en" dir="ltr">This is amazing - glad to see more outp guidance modules! <br><br>Will try this out soon I&#39;m wondering how they translate from regex automatons to token boundaries<br><br>Also why Open Source will succeed. Even today I don&#39;t see any guided output functionality from the big providers. <a href="https://t.co/Ity2H25Klf">https://t.co/Ity2H25Klf</a></p>&mdash; Hrishi (@hrishioa) <a href="https://twitter.com/hrishioa/status/1691181499671080960?ref_src=twsrc%5Etfw">August 14, 2023</a></blockquote> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>

<blockquote class="twitter-tweet"><p lang="en" dir="ltr">Outlines - a library to help LLM developers guide text generation in a fast and reliable way.<br><br>&quot;Provides generation methods that guarantee that the output will match a regular expressions, or follow a JSON schema.&quot;<br><br>Need to check this out. Reliable JSON output is a common use… <a href="https://t.co/Bkbh8vKogN">pic.twitter.com/Bkbh8vKogN</a></p>&mdash; elvis (@omarsar0) <a href="https://twitter.com/omarsar0/status/1691179888214966273?ref_src=twsrc%5Etfw">August 14, 2023</a></blockquote> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>

<blockquote class="twitter-tweet"><p lang="en" dir="ltr">Woah this is cool! Makes open source models more usable.<br><br>Give any LLM Function Call capability (and more) with Outlines: <a href="https://t.co/PtPykR5ZGR">https://t.co/PtPykR5ZGR</a> <a href="https://t.co/RRQjWHnIxv">https://t.co/RRQjWHnIxv</a> <a href="https://t.co/BwNnH8SMwv">pic.twitter.com/BwNnH8SMwv</a></p>&mdash; Yohei (@yoheinakajima) <a href="https://twitter.com/yoheinakajima/status/1691231912466223104?ref_src=twsrc%5Etfw">August 14, 2023</a></blockquote> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>

<blockquote class="twitter-tweet"><p lang="en" dir="ltr">This is awesome! Being able to guarantee the output&#39;s structure unblocks so many applications. This is a great milestone and a fundamental building block for more advanced AI apps. <a href="https://t.co/WdwMOc7hE8">https://t.co/WdwMOc7hE8</a></p>&mdash; Guilherme Castro (@skastr052) <a href="https://twitter.com/skastr052/status/1691239359494619136?ref_src=twsrc%5Etfw">August 15, 2023</a></blockquote> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>

<blockquote class="twitter-tweet"><p lang="en" dir="ltr">Juggling with the unpredictable outputs of ChatGPT API lately while building my product. 😓 <br><br>Tried prompt engineering to channel its wisdom into a neat JSON, but it&#39;s like asking a cat to fetch. 🐱<br><br>Luckily, stumbled upon &quot;Outlines&quot; – looks like a promising way to tame the LLM… <a href="https://t.co/oYQ6q8exAS">pic.twitter.com/oYQ6q8exAS</a></p>&mdash; Charlie (@14435635Sun) <a href="https://twitter.com/14435635Sun/status/1691439342689095680?ref_src=twsrc%5Etfw">August 15, 2023</a></blockquote> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>

<blockquote class="twitter-tweet"><p lang="en" dir="ltr">A complex system of LLM input-outputs interacting with non-LLM agents and models benefits immeasurably from structured outputs. The outlines package saves so much time, <a href="https://t.co/NhVQ6NpKDR">https://t.co/NhVQ6NpKDR</a></p>&mdash; Amir Sani (@amirsani) <a href="https://twitter.com/amirsani/status/1728734266568376433?ref_src=twsrc%5Etfw">November 26, 2023</a></blockquote> <script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>
</div>
</body>
</html>

# Let us know!

We highly value the insights of our users, and we would love to hear from you. If you are using Outlines for your projects and would like to share your experience with us, let's connect:

- What are you building with it?
- What do you like about it?
- What challenges are you facing?
- What do you think could be improved?

To schedule an appointment follow [this link](https://cal.com/dottxt/outlines). This is exclusively intended to share your experience, please go on [Discord][discord] or [GitHub](https://github.com/dottxt-ai/outlines/discussions) for support.

[discord]: https://discord.gg/UppQmhEpe8
[twitter]: https://twitter.com/dottxtai


================================================
FILE: docs/community/index.md
================================================
# Community

Outlines exists for a community of users who believe software doesn't need to be complicated. Who share the same passion for Large Language Models but don't want to compromise on robustness. Together, we are bringing these powerful models back to the world of software.

## Connect on Discord

The Outlines community lives on our Discord server. There you can ask questions, share ideas or just chat with people like you. Don't be a stranger and [join us][discord].

[discord]: https://discord.gg/UppQmhEpe8


================================================
FILE: docs/community/versioning.md
================================================
---
title: Versioning Guide
---

# Versioning Guide


The Outlines project follows a structured versioning scheme designed to provide clarity and minimize risk for downstream dependents.

Each part of the version number (`major.minor.patch`) conveys information about the nature and impact of the changes included in the release.

- **Major Releases** include compatibility-breaking changes to core interfaces, such as `LogitsProcessor`s and `Guides`.
- **Minor Releases** introduce changes of substance to internal or unexposed functionality. These changes are well tested and intended to maintain compatibility with existing use of core interfaces.
- **Patch Releases** address bug fixes and incorporate low-risk changes to improve stability and performance.

!!! note "Breaking Changes"

    Outlines v1.0 introduced several breaking changes to the core interface. See [the migration guide](/user_guide/migration) for more details.

## Releases

Releases along with release notes can be found on the [Outlines Releases GitHub Page](https://github.com/dottxt-ai/outlines/releases).

## Version Pinning Recommendations

Here are our recommendations for managing dependencies on the Outlines package:

**Small, Risk-Tolerant Projects:** Pin to a specific major version.

**Large, Conservative Projects:** Pin to a specific minor version.


================================================
FILE: docs/core_concepts.md
================================================
---
title: Core concepts
---

# Core concepts

Coming soon. This will document various concepts at a high level, so users can understand Outlines before diving into specific implementations.

1. Constrained decoding, tokens, and the basics of logit biasing
2. Different ways to define output structure (regex, JSON schema, Pydantic models, context-free grammars)
3. How finite state machines are used to guarantee output structure
4. `Generator`, `Application`, and `Template`
5. Prompt engineering vs. structured generation


================================================
FILE: docs/examples/chain_of_density.md
================================================
# Summarize documents using Chain of Density prompting

A good summary should be informative, concise and clear. While large language models are generally good at summarizing documents, their summaries tend to be long and contain redundant information; their information density tends to be on the lower end. This is where [Chain of Density](https://arxiv.org/abs/2309.04269), a new prompting technique, comes in. In this example we will show how one can implement Chain of Density with a few lines of code using Outlines, leveraging both Outlines' prompt templating and its structured generation capabilities.

The article we will try to summarize is the first three paragraphs of the [Alan Turing page on Wikipedia](https://en.wikipedia.org/wiki/Alan_Turing):

```python
article = """
Alan Mathison Turing OBE FRS (/ˈtjʊərɪŋ/; 23 June 1912 – 7 June 1954) was an English mathematician, computer scientist, logician, cryptanalyst, philosopher and theoretical biologist.[5] Turing was highly influential in the development of theoretical computer science, providing a formalisation of the concepts of algorithm and computation with the Turing machine, which can be considered a model of a general-purpose computer.[6][7][8] He is widely considered to be the father of theoretical computer science and artificial intelligence.[9]

Born in Maida Vale, London, Turing was raised in southern England. He graduated at King's College, Cambridge, with a degree in mathematics. Whilst he was a fellow at Cambridge, he published a proof demonstrating that some purely mathematical yes–no questions can never be answered by computation. He defined a Turing machine and proved that the halting problem for Turing machines is undecidable. In 1938, he obtained his PhD from the Department of Mathematics at Princeton University. During the Second World War, Turing worked for the Government Code and Cypher School at Bletchley Park, Britain's codebreaking centre that produced Ultra intelligence. For a time he led Hut 8, the section that was responsible for German naval cryptanalysis. Here, he devised a number of techniques for speeding the breaking of German ciphers, including improvements to the pre-war Polish bomba method, an electromechanical machine that could find settings for the Enigma machine. Turing played a crucial role in cracking intercepted coded messages that enabled the Allies to defeat the Axis powers in many crucial engagements, including the Battle of the Atlantic.[10][11]

After the war, Turing worked at the National Physical Laboratory, where he designed the Automatic Computing Engine, one of the first designs for a stored-program computer. In 1948, Turing joined Max Newman's Computing Machine Laboratory at the Victoria University of Manchester, where he helped develop the Manchester computers[12] and became interested in mathematical biology. He wrote a paper on the chemical basis of morphogenesis[1] and predicted oscillating chemical reactions such as the Belousov–Zhabotinsky reaction, first observed in the 1960s. Despite these accomplishments, Turing was never fully recognised in Britain during his lifetime because much of his work was covered by the Official Secrets Act.[13]
"""
```

## How Chain Of Density works

Chain Of Density starts with asking the model to generate a first long and non-specific summary. Then it asks the model to generate 4 extra summaries by proceeding in the following way:

1. Identify 1-3 entities missing in the previous summary;
2. Add all entities marked as missing in the previous step, while not dropping entities;
3. Make the summary more concise;

The prompt also asks the model to return a list of JSON objects that contain the missing entities and the new summary. This is where structured generation will come in handy :) The paper provides the prompt and an example:

![Figure 2 in the paper](./images/chain_of_density.png)

We can now implement the prompt provided in the paper. We stored the prompt template in a text file, and we can load it using the `Template` class:

```python
from outlines import Template

chain_of_density = Template.from_file("prompt_templates/chain_of_density.txt")
```

??? Note

    Note that we modified the prompt slightly so it returns a JSON object that contains the summaries, instead of a list of summaries.


## Outlines implementation

We will use Outlines' JSON-structured generation to ensure that the model's output is consistent with the format specified in the prompt. We start with defining the JSON objects that the model is asked to return using Pydantic. The output is a single JSON object that contains a list of `Summary` objects, each holding the missing entities and the new summary:

```python
from pydantic import BaseModel, conlist

class Summary(BaseModel):
    missing_entities: str
    denser_summary: str

class Summaries(BaseModel):
    summaries: conlist(Summary, max_length=5, min_length=5)
```

We now generate the prompt by passing the article we want to summarize to the prompt template previously loaded. We load a quantized version of Mistral-7B using the AutoAWQ library, and then use the `Summaries` schema to generate the summaries with structured generation:

```python
import outlines
import transformers

MODEL_NAME = "TheBloke/Mistral-7B-OpenOrca-AWQ"

model = outlines.from_transformers(
    transformers.AutoModelForCausalLM.from_pretrained(MODEL_NAME),
    transformers.AutoTokenizer.from_pretrained(MODEL_NAME)
)
prompt = chain_of_density(article=article)
result = model(prompt, Summaries, max_new_tokens=2000)
```

We can now check the results:

```python
print(result)
# {'summaries': [
#     {
#       'missing_entities': 'English mathematician, cryptanalyst, philosopher',
#       'denser_summary': 'Alan Mathison Turing was an English mathematician, cryptanalyst, philosopher.'
#     },
#     {
#       'missing_entities': '',
#       'denser_summary': "Alan Mathison Turing was an English mathematician who was a crucial figure in WW2's Bletchley Park codebreaking centre and designed one of the first computers."
#     },
#     {
#       'missing_entities': 'cryptanalyst, studied, biology, father',
#       'denser_summary': 'Alan Mathison Turing was an English cryptanalyst, studied theoretical computer science, and contributed to mathematical biology.'
#     },
#     {
#       'missing_entities': 'biology, morphogenesis, chemical',
#       'denser_summary': 'Alan Mathison Turing was an English cryptanalyst, studied theoretical computer science, and predicted chemical reactions in morphogenesis.
#     '},
#     {
#       'missing_entities': '',
#       'denser_summary': 'Alan Mathison Turing was an English cryptanalyst, developed computer science, and made strides in mathematical biology research.'
#       }
# ]}
```

Not bad, considering we used a smallish model to generate the summary! Chain of Density seems to be a very effective prompting technique to generate dense summaries, even with small quantized models. Its implementation in Outlines is also very short.

Note that this is the first article I tried and it worked out of the box. Try it out on other articles, and please share the results on Twitter, or by opening [a new discussion](https://github.com/dottxt-ai/outlines/discussions/categories/show-and-tell) on the Outlines repository!


================================================
FILE: docs/examples/chain_of_thought.md
================================================
# Chain of thought


Chain of thought is a prompting technique introduced in the paper ["Chain-of-Thought Prompting Elicits Reasoning in Large Language Models"](https://arxiv.org/abs/2201.11903) where, through prompting, the authors generate a series of intermediate reasoning steps which improves the ability of LLMs to perform complex reasoning.

In this guide, we use [outlines](https://dottxt-ai.github.io/outlines/) to apply chain of thought through structured output.

We use [llama.cpp](https://github.com/ggerganov/llama.cpp) using the [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) library. Outlines supports llama-cpp-python, but we need to install it ourselves:

```shell
pip install llama-cpp-python
```

To create an outlines `LlamaCpp` model, you first need to create a `Llama` object from the `llama-cpp-python` library. Then you can create the outlines model by calling `outlines.from_llamacpp` with the `Llama` object instance as argument. To create the `Llama` object, you need to provide the model weights by passing the name of the repository on the HuggingFace Hub, and the filenames or glob pattern (it will automatically download the weights from the hub):

```python
import llama_cpp
import outlines

llm = llama_cpp.Llama(
    "NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF",
    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
        "NousResearch/Hermes-2-Pro-Llama-3-8B"
    ),
    n_gpu_layers=-1,
    flash_attn=True,
    n_ctx=8192,
    verbose=False
)
model = outlines.from_llamacpp(llm)
```

??? note "(Optional) Store the model weights in a custom folder"

    By default the model weights are downloaded to the hub cache, but if we want to store the weights in a custom folder, we pull a quantized GGUF model [Hermes-2-Pro-Llama-3-8B](https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF) by [NousResearch](https://nousresearch.com/) from [HuggingFace](https://huggingface.co/):

    ```shell
    wget https://hf.co/NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/resolve/main/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
    ```

    We initialize the model:

    ```python
    from llama_cpp import Llama

    llm = Llama("/path/to/model/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf", ...)
    ```

## Chain of thought

We first define our Pydantic class for a reasoning step:

```python
from pydantic import BaseModel, Field

class Reasoning_Step(BaseModel):
    reasoning_step: str = Field(..., description="Reasoning step")
```

We then define the Pydantic class for reasoning which will consist of a list of reasoning steps and a conclusion, and we get its JSON schema:

```python
from typing import List

class Reasoning(BaseModel):
    reasoning: List[Reasoning_Step] = Field(..., description="List of reasoning steps")
    conclusion: str = Field(..., description="Conclusion")

json_schema = Reasoning.model_json_schema()
```

We then need to adapt our prompt to the [Hermes prompt format for JSON schema](https://github.com/NousResearch/Hermes-Function-Calling?tab=readme-ov-file#prompt-format-for-json-mode--structured-outputs):

```python
from outlines import Template

generate_hermes_prompt = Template.from_string(
    """
    <|im_start|>system
    You are a world class AI model who answers questions in JSON
    Here's the json schema you must adhere to:
    <schema>
    {{ json_schema }}
    </schema>
    <|im_end|>
    <|im_start|>user
    {{ user_prompt }}
    <|im_end|>
    <|im_start|>assistant
    <schema>
    """
)
```

For a given user prompt:

```python
user_prompt = "9.11 and 9.9 -- which is bigger?"
```

We can use `outlines.Generator` with the Pydantic class we previously defined, and call the generator with the Hermes prompt:

```python
generator = outlines.Generator(model, Reasoning)
prompt = generate_hermes_prompt(json_schema=json_schema, user_prompt=user_prompt)
response = generator(prompt, max_tokens=1024, temperature=0, seed=42)
```

We obtain a series of intermediate reasoning steps as well as the conclusion:

```python
import json

json_response = json.loads(response)

print(json_response["reasoning"])
print(json_response["conclusion"])
# [{'reasoning_step': 'Both 9.11 and 9.9 are decimal numbers.'},
#  {'reasoning_step': 'When comparing decimal numbers, we look at the numbers after the decimal point.'},
#  {'reasoning_step': 'In this case, 9.11 has the number 1 after the decimal point, while 9.9 has the number 9.'},
#  {'reasoning_step': 'Since 1 is greater than 9, 9.11 is greater than 9.9.'}]
# '9.11 is bigger.'
```

We notice that the 4th reasoning step is wrong ("Since 1 is greater than 9, 9.11 is greater than 9.9."), so we should probably give the model some examples for this particular task.

This example was originally contributed by [Alonso Silva](https://github.com/alonsosilvaallende).


================================================
FILE: docs/examples/classification.md
================================================
# Classification

Classification is a classic problem in NLP and finds many applications: spam detection, sentiment analysis, triaging of incoming requests, etc. We will use the example of a company that wants to sort support requests between those that require immediate attention (`URGENT`) and those that can wait a little (`STANDARD`). You could easily extend the example by adding new labels.


This tutorial shows how one can implement multi-label classification using Outlines.

As always, we start with initializing the model. Since we are GPU poor we will be using a quantized version of Mistral-7B-v0.1:

```python
import outlines
import transformers

MODEL_NAME = "TheBloke/Mistral-7B-OpenOrca-AWQ"

model = outlines.from_transformers(
    transformers.AutoModelForCausalLM.from_pretrained(MODEL_NAME),
    transformers.AutoTokenizer.from_pretrained(MODEL_NAME)
)
```

We will use a prompt template stored in a text file:

```python
from outlines import Template

customer_support = Template.from_file("prompt_templates/classification.txt")
```

## Choosing between multiple choices

Outlines provides a convenient way to do multi-label classification, passing a Literal type hint to the `outlines.Generator` object:

```python
from typing import Literal
import outlines

generator = outlines.Generator(model, Literal["URGENT", "STANDARD"])

```
Outlines supports batched requests, so we will pass two requests to the model:

```python
# Two sample support requests: one urgent, one that can wait.
requests = [
    "My hair is on fire! Please help me!!!",
    "Just wanted to say hi"
]

# Render one prompt per request with the `customer_support` template.
prompts = [customer_support(request=request) for request in requests]
```

We can now ask the model to classify the requests:

```python
labels = generator(prompts)
print(labels)
# ['URGENT', 'STANDARD']
```

## Using JSON-structured generation

Another (convoluted) way to do multi-label classification is to use JSON-structured generation in Outlines. We first need to define our Pydantic schema that contains the labels:

```python
from enum import Enum
from pydantic import BaseModel


class Label(str, Enum):
    urgent = "URGENT"
    standard = "STANDARD"


class Classification(BaseModel):
    label: Label
```

We can then create a generator with the Pydantic model we just defined and call it:

```python
generator = outlines.Generator(model, Classification)
labels = generator(prompts)
print(labels)
# ['{"label":"URGENT"}', '{ "label": "STANDARD" }']
```


================================================
FILE: docs/examples/dating_profiles.md
================================================
# Generate a synthetic dating profile from a description

In this example we will see how we can use Outlines to generate synthetic data for a dating application. This example was originally contributed by [Vibhor Kumar](https://github.com/veezbo).

```python
import json
from dataclasses import dataclass
from enum import Enum

import torch
import transformers
from pydantic import BaseModel, conlist, constr

import outlines
```

## Defining the profile with Pydantic

Here a dating profile will consist of a biography, a job, a list of interests and two question-answer pairs. The questions are written in advance by the team, and the users are asked to provide an answer:

```python
class QuestionChoice(str, Enum):
    A = "The key to my heart is"
    B = "The first item on my bucket list is"
    C = "Perks of dating me"
    D = "Message me if you also love"
    E = "People would describe me as"
    F = "I can beat you in a game of"

@dataclass
class QuestionAnswer:
    question: QuestionChoice
    answer: str
```

Users need to provide a short biography, with a minimum of 10 and a maximum of 300 characters. The application also limits job descriptions to 50 characters. In addition to the question-answer pairs, the user is required to provide a list of between 1 and 5 interests:

```python
class DatingProfile(BaseModel):
    # `constr` only accepts keyword constraints (the constrained type is always `str`).
    # Biography: between 10 and 300 characters.
    bio: constr(min_length=10, max_length=300)  # type: ignore
    # Job description: at most 50 characters.
    job: constr(max_length=50)  # type: ignore
    # Between 1 and 5 interests.
    interests: conlist(str, min_length=1, max_length=5)  # type: ignore
    qna1: QuestionAnswer
    qna2: QuestionAnswer
```

## Prompt template and examples

We will ask the model to generate profiles from a high-level description:

```python
@dataclass
class Example:
    description: str
    profile: DatingProfile
```

We will use Outlines' prompt templating abilities to generate the prompt for us. This helps clearly separate the general prompting logic from what is specific to an example.

```python
from outlines import Template

dating_profile_prompt = Template.from_string(
    """
    You are a world-renowned matchmaker who understands the modern dating
    market. Your job is to generate dating app profiles for male clients
    interested in women based on a provided description. The profiles should be
    authentic, show off their strengths, and maximize their likelihood of
    getting matches on dating apps.  Here are some examples of past clients that
    you have successfully created profiles for:

    {% for example in examples %}
    Description:
    {{ example.description }}
    Profile:
    {{ example.profile }}
    {% endfor %}

    Here is the new client who you need to create a profile for:
    Description: {{ description }}
    Profile:
    """
)
```

We will provide the model with several few-shot examples:

```python
samples: list[Example] = [
    Example(
        description="I'm an author and former professional soccer player living in Seattle who publishes popular fiction books. A typical day for me starts by hanging out with my cat, drinking a coffee, and reading as much as I can in a few hours. Then, I'll prepare a quick smoothie before starting to write for a few hours, take a break with soccer or running a few miles, and finally meet friends for dinner at a new, hip restaurant in the evening. Sometimes we go axe-throwing afterwards, or play poker, or watch a comedy show, or visit a dive bar. On my vacations, I travel extensively to countries South America, Europe, and Asia, with the goal of visiting them all!",
        profile=DatingProfile(
            bio="Adventurer, dreamer, author, and soccer enthusiast. Life’s too short to waste time so I make the most of each day by exploring new places and playing with my friends on the pitch. What’s your favorite way to get out and have fun?",
            job="Famous Soccer Player -> Famous Author",
            interests=["Soccer", "Travel", "Friends", "Books", "Fluffy Animals"],
            qna1=QuestionAnswer(
                question=QuestionChoice.B, answer="swim in all seven oceans!"
            ),
            qna2=QuestionAnswer(
                question=QuestionChoice.E,
                answer="fun-loving, adventurous, and a little bit crazy",
            ),
        ),
    ),
    Example(
        description="I run my company and build houses for a living. I'm a big fan of the outdoors and love to go hiking, camping, and fishing. I don't like video games, but do like to watch movies. My love language is home-cooked food, and I'm looking for someone who isn't afraid to get their hands dirty.",
        profile=DatingProfile(
            bio="If you're looking for a Montana man who loves to get outdoors and hunt, and who's in-tune with his masculinity then I'm your guy!",
            job="House Construction Manager / Entrepreneur",
            interests=["Hunting", "Hiking", "The outdoors", "Home-cooked food"],
            qna1=QuestionAnswer(question=QuestionChoice.A, answer="food made at home"),
            qna2=QuestionAnswer(
                question=QuestionChoice.C,
                answer="having a man in your life who can fix anything",
            ),
        ),
    ),
    Example(
        description="I run my own Youtube channel with 10M subscribers. I love working with kids, and my audience skews pretty young too. In my free time, I play Fortnite and Roblox. I'm looking for someone who is also a gamer and likes to have fun. I'm learning Japanese in my free time as well as how to cook.",
        profile=DatingProfile(
            bio="Easy on the eyes (find me on Youtube!) and great with kids. What more do you need?",
            job="Youtuber 10M+ subscribers",
            interests=["Kids", "Gaming", "Japanese"],
            qna1=QuestionAnswer(question=QuestionChoice.D, answer="anime and gaming!"),
            qna2=QuestionAnswer(question=QuestionChoice.F, answer="Fortnite, gg ez"),
        ),
    ),
]
```

## Load the model

We will use Mosaic's MPT-7B model (requires 13GB of GPU memory) which can fit on a single GPU with a reasonable context window. We initialize it with Outlines:

```python
MODEL_NAME = "mosaicml/mpt-7b-8k-instruct"

config = transformers.AutoConfig.from_pretrained(
    MODEL_NAME, trust_remote_code=True
)
config.init_device = "meta"
model_kwargs = {
    "config": config,
    "trust_remote_code": True,
    "torch_dtype": torch.bfloat16,
    "device_map": "cuda",
}
tf_model = transformers.AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_kwargs)
tf_tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)
model = outlines.from_transformers(tf_model, tokenizer=tf_tokenizer)
```

## JSON-structured generation of profiles

We will now generate a dating profile from a textual description of oneself:

``` python
new_description = """I'm a laid-back lawyer who spends a lot of his free-time
gaming. I work in a corporate office, but ended up here after the start-up  I
cofounded got acquired, so still play ping pong with my cool coworkers every
day.  I have a bar at home where I make cocktails, which is great for
entertaining  friends. I secretly like to wear suits and get a new one tailored
every few  months. I also like weddings because I get to wear those suits, and
it's  a good excuse for a date. I watch the latest series because I'm paying,
with my hard-earned money, for every streaming service."""

prompt = dating_profile_prompt(description=new_description, examples=samples)
profile = model(prompt, DatingProfile)
# `model_validate_json` parses the raw JSON string itself, so the generated
# text is passed directly rather than a dict produced by `json.loads`.
parsed_profile = DatingProfile.model_validate_json(profile)
```

## Results

Here are a couple of results:

```json
{
    "bio": """I'm an ambitious lawyer with a casual and fashionable style. I love
    games and sports, but my true passion is preparing refreshing cocktails at
    home and dressing to the nines at weddings. I'm currently looking for a woman
    to show a good time to and get a kiss on the opulent suit I just had made.
    Send resume to this inbox.""",
    "job": "Lawyer",
    "interests":
    [
        "Stylish guys",
        "Gaming",
        "Ping pong",
        "Cocktails",
        "Weddings"
    ],
    "qna1":
    {
        "question": "The first item on my bucket list is",
        "answer": "be married and have a family."
    },
    "qna2":
    {
        "question": "People would describe me as",
        "answer": "charming, stylish, and funny."
    }
}
```

```json
{
    "bio": """I’m a sexy lawyer with time on my hands. I love to game and
    play ping pong, but the real reason you should swipe to the right
    is because I look great in a suit. Who doesn’t love a man in a
    suit? Just saying. Send me a message if you think it’s time to take
    your dating life to the next level.""",
    "job": "Lawyer",
    "interests":
    [
        "Gaming",
        "Ping Pong",
        "Tailored Suits",
        "Weddings",
        "Streaming Services"
    ],
    "qna1":
    {
        "question": "The first item on my bucket list is",
        "answer": "simulate space but stay alive for as long as possible"
    },
    "qna2":
    {
        "question": "People would describe me as",
        "answer": "easy-going, a little nerdy but with a mature essence"
    }
}
```


================================================
FILE: docs/examples/deploy-using-bentoml.md
================================================
# Run Outlines using BentoML

[BentoML](https://github.com/bentoml/BentoML) is an open-source model serving library for building performant and scalable AI applications with Python. It comes with tools that you need for serving optimization, model packaging, and production deployment.

In this guide, we will show you how to use BentoML to run programs written with Outlines on GPU locally and in [BentoCloud](https://www.bentoml.com/), an AI Inference Platform for enterprise AI teams. The example source code in this guide is also available in the [examples/bentoml/](https://github.com/dottxt-ai/outlines/blob/main/examples/bentoml/) directory.

## Import a model

First we need to download an LLM (Mistral-7B-v0.1 in this example and you can use any other LLM) and import the model into BentoML's [Model Store](https://docs.bentoml.com/en/latest/guides/model-store.html). Let's install BentoML and other dependencies from PyPi (preferably in a virtual environment):

```shell
pip install -r requirements.txt
```

Then save the code snippet below as `import_model.py` and run `python import_model.py`.

**Note**: You need to accept related conditions on [Hugging Face](https://huggingface.co/mistralai/Mistral-7B-v0.1) first to gain access to Mistral-7B-v0.1.

```python
import bentoml

MODEL_ID = "mistralai/Mistral-7B-v0.1"
BENTO_MODEL_TAG = MODEL_ID.lower().replace("/", "--")

def import_model(model_id, bento_model_tag):
    """Download a Hugging Face model and save it under a BentoML model tag.

    Args:
        model_id: Hugging Face identifier of the model and tokenizer to load.
        bento_model_tag: Tag passed to `bentoml.models.create` for the saved files.
    """
    # Imported lazily so that importing this module (e.g. for BENTO_MODEL_TAG)
    # does not require torch/transformers to be loaded.
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Use the `model_id` argument rather than the module-level constant so the
    # function works for any model the caller asks for.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
    )

    # Save tokenizer and weights side by side in the new model's directory.
    with bentoml.models.create(bento_model_tag) as bento_model_ref:
        tokenizer.save_pretrained(bento_model_ref.path)
        model.save_pretrained(bento_model_ref.path)


if __name__ == "__main__":
    import_model(MODEL_ID, BENTO_MODEL_TAG)
```

You can verify the download is successful by running:

```shell
$ bentoml models list

Tag                                          Module  Size        Creation Time
mistralai--mistral-7b-v0.1:m7lmf5ac2cmubnnz          13.49 GiB   2024-04-25 06:52:39
```

## Define a BentoML Service

As the model is ready, we can define a [BentoML Service](https://docs.bentoml.com/en/latest/guides/services.html) to wrap the capabilities of the model.

We will run the JSON-structured generation example [in the README](https://github.com/dottxt-ai/outlines?tab=readme-ov-file#efficient-json-generation-following-a-json-schema), with the following schema:

```python
DEFAULT_SCHEMA = """{
    "title": "Character",
    "type": "object",
    "properties": {
        "name": {
            "title": "Name",
            "maxLength": 10,
            "type": "string"
        },
        "age": {
            "title": "Age",
            "type": "integer"
        },
        "armor": {"$ref": "#/definitions/Armor"},
        "weapon": {"$ref": "#/definitions/Weapon"},
        "strength": {
            "title": "Strength",
            "type": "integer"
        }
    },
    "required": ["name", "age", "armor", "weapon", "strength"],
    "definitions": {
        "Armor": {
            "title": "Armor",
            "description": "An enumeration.",
            "enum": ["leather", "chainmail", "plate"],
            "type": "string"
        },
        "Weapon": {
            "title": "Weapon",
            "description": "An enumeration.",
            "enum": ["sword", "axe", "mace", "spear", "bow", "crossbow"],
            "type": "string"
        }
    }
}"""
```

First, we need to define a BentoML service by decorating an ordinary class (`Outlines` here) with `@bentoml.service` decorator. We pass to this decorator some configuration and GPU on which we want this service to run in BentoCloud (here an L4 with 24GB memory):

```python
import typing as t
import bentoml

from import_model import BENTO_MODEL_TAG

@bentoml.service(
    traffic={
        "timeout": 300,
    },
    resources={
        "gpu": 1,
        "gpu_type": "nvidia-l4",
    },
)
class Outlines:

    bento_model_ref = bentoml.models.get(BENTO_MODEL_TAG)

    def __init__(self) -> None:
        import outlines
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer

        # Load tokenizer and model from the BentoML model reference path
        hf_tokenizer = AutoTokenizer.from_pretrained(self.bento_model_ref.path)
        hf_model = AutoModelForCausalLM.from_pretrained(
            self.bento_model_ref.path,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            device_map="cuda"
        )

        # Then use the loaded model with Outlines
        self.model = outlines.from_transformers(hf_model, hf_tokenizer)

    ...
```

We then need to define an HTTP endpoint using `@bentoml.api` to decorate the method `generate` of `Outlines` class:

```python
    ...

    @bentoml.api
    async def generate(
        self,
        prompt: str = "Give me a character description.",
        json_schema: t.Optional[str] = DEFAULT_SCHEMA,
    ) -> t.Dict[str, t.Any]:
        import json
        import outlines
        from outlines.types import JsonSchema

        generator = outlines.Generator(self.model, JsonSchema(json_schema))
        character = generator(prompt)

        return json.loads(character)
```

Here `@bentoml.api` decorator defines `generate` as an HTTP endpoint that accepts a JSON request body with two fields: `prompt` and `json_schema` (optional, which allows HTTP clients to provide their own JSON schema). The type hints in the function signature will be used to validate incoming JSON requests. You can define as many HTTP endpoints as you want by using `@bentoml.api` to decorate other methods of `Outlines` class.

Now you can save the above code to `service.py` (or use [this implementation](https://github.com/dottxt-ai/outlines/blob/main/examples/bentoml/)), and run the code using the BentoML CLI.

## Run locally for testing and debugging

Then you can run a server locally by:

```shell
bentoml serve .
```

The server is now active at <http://localhost:3000>. You can interact with it using the Swagger UI or in other different ways:

<details>

<summary>CURL</summary>

```shell
curl -X 'POST' \
  'http://localhost:3000/generate' \
  -H 'accept: application/json' \
  -H 'Content-Type: application/json' \
  -d '{
  "prompt": "Give me a character description."
}'
```

</details>

<details>

<summary>Python client</summary>

```python
import bentoml

with bentoml.SyncHTTPClient("http://localhost:3000") as client:
    response = client.generate(
        prompt="Give me a character description"
    )
    print(response)
```

</details>

Expected output:

```shell
{
  "name": "Aura",
  "age": 15,
  "armor": "plate",
  "weapon": "sword",
  "strength": 20
}
```

## Deploy to BentoCloud

After the Service is ready, you can deploy it to [BentoCloud](https://docs.bentoml.com/en/latest/bentocloud/get-started.html) for better management and scalability. [Sign up](https://cloud.bentoml.com/signup) if you haven't got a BentoCloud account.

Make sure you have [logged in to BentoCloud](https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html), then run the following command to deploy it.

```shell
bentoml deploy .
```

Once the application is up and running on BentoCloud, you can access it via the exposed URL.

**Note**: For custom deployment in your own infrastructure, use [BentoML to generate an OCI-compliant image](https://docs.bentoml.com/en/latest/guides/containerization.html).


================================================
FILE: docs/examples/deploy-using-cerebrium.md
================================================
# Run Outlines using Cerebrium

[Cerebrium](https://www.cerebrium.ai/) is a serverless AI infrastructure platform that makes it easier for companies to build and deploy AI-based applications. They offer serverless GPUs with low cold-start times and over 12 varieties of GPU chips that auto-scale, and you only pay for the compute you use.

In this guide we will show you how you can use Cerebrium to run programs written with Outlines on GPUs in the cloud.

# Setup Cerebrium

First, we install Cerebrium and login to get authenticated.

```shell
pip install cerebrium
cerebrium login
```

Then let us create our first project

```shell
cerebrium init outlines-project
```

## Setup Environment and Hardware

You set up your environment and hardware in the cerebrium.toml file that was created using the init function above.

```toml
[cerebrium.deployment]
docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04"

[cerebrium.hardware]
cpu = 2
memory = 14.0
gpu = "AMPERE A10"
gpu_count = 1
provider = "aws"
region = "us-east-1"

[cerebrium.dependencies.pip]
# The PyPI package is named "outlines" (plural), matching `import outlines` in main.py.
outlines = "==1.0.0"
transformers = "==4.38.2"
datasets = "==2.18.0"
accelerate = "==0.27.2"
```

## Setup inference

Running code in Cerebrium is like writing normal python with no special syntax. In a `main.py` file specify the following:

```python
import outlines
import transformers
from outlines.types import JsonSchema


model = outlines.from_transformers(
    transformers.AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
    transformers.AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
)

schema = """{
    "title": "Character",
    "type": "object",
    "properties": {
        "name": {
            "title": "Name",
            "maxLength": 10,
            "type": "string"
        },
        "age": {
            "title": "Age",
            "type": "integer"
        },
        "armor": {"$ref": "#/definitions/Armor"},
        "weapon": {"$ref": "#/definitions/Weapon"},
        "strength": {
            "title": "Strength",
            "type": "integer"
        }
    },
    "required": ["name", "age", "armor", "weapon", "strength"],
    "definitions": {
        "Armor": {
            "title": "Armor",
            "description": "An enumeration.",
            "enum": ["leather", "chainmail", "plate"],
            "type": "string"
        },
        "Weapon": {
            "title": "Weapon",
            "description": "An enumeration.",
            "enum": ["sword", "axe", "mace", "spear", "bow", "crossbow"],
            "type": "string"
        }
    }
}"""

generator = outlines.Generator(model, JsonSchema(schema))
```

On first deploy, it will download the model and store it on disk; subsequent calls will therefore load the model from disk.

Every function in Cerebrium is callable through an API endpoint. Code at the topmost level (i.e., not in a function) is executed only when the container is spun up the first time, so subsequent calls simply run the code defined in the function you call.

To deploy an API that creates a new character when called with a prompt you can add the following code to `main.py`:

```python
def generate(
    prompt: str = "Amiri, a 53 year old warrior woman with a sword and leather armor.",
):
    """Return a character generated for `prompt` by the module-level `generator`."""
    # Wrap the description in the instruction tags used by this example.
    instruction = f"<s>[INST]Give me a character description. Describe {prompt}.[/INST]"
    return generator(instruction)
```


## Run on the cloud

```shell
cerebrium deploy
```

You will see your application deploy, install pip packages and download the model. Once completed it will output a CURL request you can use to call your endpoint. Just remember to end
the url with the function you would like to call - in this case /generate. You should see your response returned!


================================================
FILE: docs/examples/deploy-using-modal.md
================================================
# Run Outlines using Modal

[Modal](https://modal.com/) is a serverless platform that allows you to easily run code on the cloud, including GPUs. It can come in very handy for those of us who don't have a monster GPU at home and want to be able to quickly and easily provision, configure and orchestrate cloud infrastructure.

In this guide we will show you how you can use Modal to run programs written with Outlines on GPU in the cloud.

## Requirements

We recommend installing `modal` and `outlines` in a virtual environment. You can create one with:

```shell
python -m venv venv
source venv/bin/activate
```

Then install the required packages:

```shell
pip install modal outlines
```

## Build the image

First we need to define our container image. If you need to access a gated model, you will need to provide an [access token](https://huggingface.co/settings/tokens). See the `.env` call below for how to provide a HuggingFace token.

Setting a token is best done by setting an environment variable `HF_TOKEN` with your token. If you do not wish to do this, we provide a commented-out line in the code to set the token directly in the code.

```python
from modal import Image, App, gpu
import os

# This creates a modal App object. Here we set the name to "outlines-app".
# There are other optional parameters like modal secrets, schedules, etc.
# See the documentation here: https://modal.com/docs/reference/modal.App
app = App(name="outlines-app")

# Specify a language model to use.
# Another good model to use is "NousResearch/Hermes-2-Pro-Mistral-7B"
language_model = "mistral-community/Mistral-7B-v0.2"

# Please set an environment variable HF_TOKEN with your Hugging Face API token.
# The code below (the .env({...}) part) will copy the token from your local
# environment to the container.
# More info on Image here: https://modal.com/docs/reference/modal.Image
outlines_image = Image.debian_slim(python_version="3.11").pip_install(
    "outlines",
    "transformers",
    "datasets",
    "accelerate",
    "sentencepiece",
).env({
    # This will pull in your HF_TOKEN environment variable if you have one.
    'HF_TOKEN':os.environ['HF_TOKEN']

    # To set the token directly in the code, uncomment the line below and replace
    # 'YOUR_TOKEN' with the HuggingFace access token.
    # 'HF_TOKEN':'YOUR_TOKEN'
})
```

## Setting the container up

When running longer Modal apps, it's recommended to download your language model when the container starts, rather than when the function is called. This will cache the model for future runs.

```python
# This function imports the model from Hugging Face. The modal container
# will call this function when it starts up. This is useful for
# downloading models, setting up environment variables, etc.
def import_model():
    import outlines
    import transformers

    outlines.from_transformers(
        transformers.AutoModelForCausalLM.from_pretrained(language_model),
        transformers.AutoTokenizer.from_pretrained(language_model)
    )

# This line tells the container to run the import_model function when it starts.
outlines_image = outlines_image.run_function(import_model)
```

## Define a schema

We will run the JSON-structured generation example [in the README](https://github.com/dottxt-ai/outlines?tab=readme-ov-file#efficient-json-generation-following-a-json-schema), with the following schema:

```python
# Specify a schema for the character description. In this case,
# we want to generate a character with a name, age, armor, weapon, and strength.
schema = """{
    "title": "Character",
    "type": "object",
    "properties": {
        "name": {
            "title": "Name",
            "maxLength": 10,
            "type": "string"
        },
        "age": {
            "title": "Age",
            "type": "integer"
        },
        "armor": {"$ref": "#/definitions/Armor"},
        "weapon": {"$ref": "#/definitions/Weapon"},
        "strength": {
            "title": "Strength",
            "type": "integer"
        }
    },
    "required": ["name", "age", "armor", "weapon", "strength"],
    "definitions": {
        "Armor": {
            "title": "Armor",
            "description": "An enumeration.",
            "enum": ["leather", "chainmail", "plate"],
            "type": "string"
        },
        "Weapon": {
            "title": "Weapon",
            "description": "An enumeration.",
            "enum": ["sword", "axe", "mace", "spear", "bow", "crossbow"],
            "type": "string"
        }
    }
}"""
```

To make the inference work on Modal we need to wrap the corresponding function in a `@app.function` decorator. We pass to this decorator the image and GPU on which we want this function to run.

Let's choose an A100 with 80GB memory. Valid GPUs can be found [here](https://modal.com/docs/reference/modal.gpu).

```python
# Define a function that uses the image we chose, and specify the GPU
# and memory we want to use.
@app.function(image=outlines_image, gpu=gpu.A100(size='80GB'))
def generate(
    prompt: str = "Amiri, a 53 year old warrior woman with a sword and leather armor.",
):
    """Generate a character matching `schema` for `prompt` and print it."""
    # Remember, this function is being executed in the container,
    # so we need to import the necessary libraries here. You should
    # do this with any other libraries you might need.
    import outlines
    import transformers
    from outlines.types import JsonSchema

    # Load the model into memory. The import_model function above
    # should have already downloaded the model, so this call
    # only loads the model into GPU memory.
    # The result must be bound to `model`: it is used to build the generator below.
    model = outlines.from_transformers(
        transformers.AutoModelForCausalLM.from_pretrained(language_model, device_map="cuda"),
        transformers.AutoTokenizer.from_pretrained(language_model)
    )

    # Generate a character description based on the prompt.
    # We build an `outlines.Generator` from:
    # - model: the model we loaded above
    # - JsonSchema(schema): the JSON schema we defined above
    generator = outlines.Generator(model, JsonSchema(schema))

    # Make sure you wrap your prompt in instruction tags ([INST] and [/INST])
    # to indicate that the prompt is an instruction. Instruction tags can vary
    # by models, so make sure to check the model's documentation.
    character = generator(
        f"<s>[INST]Give me a character description. Describe {prompt}.[/INST]"
    )

    # Print out the generated character.
    print(character)
```

We then need to define a `local_entrypoint` to call our function `generate` remotely.

```python
@app.local_entrypoint()
def main(
    prompt: str = "Amiri, a 53 year old warrior woman with a sword and leather armor.",
):
    # Local entry point: forwards `prompt` unchanged to `generate`.
    # We use the "generate" function defined above -- note too that we are calling
    # .remote() on the function. This tells modal to run the function in our cloud
    # machine. If you want to run the function locally, you can call .local() instead,
    # though this will require additional setup.
    generate.remote(prompt)
```

Here the `@app.local_entrypoint()` decorator defines `main` as the function to start from locally when using the Modal CLI. You can save the above code to `example.py` (or use [this implementation](https://github.com/dottxt-ai/outlines/blob/main/examples/modal_example.py)). Let's now see how to run the code on the cloud using the Modal CLI.

## Run on the cloud

First install the Modal client from PyPI, if you have not already:

```shell
pip install modal
```

You then need to obtain a token from Modal. Run the following command:

```shell
modal setup
```

Once that is set you can run inference on the cloud using:

```shell
modal run example.py
```

You should see the Modal app initialize, and soon after see the result of the `print` function in your terminal. That's it!


================================================
FILE: docs/examples/earnings-reports.md
================================================
# Extracting financial data from earnings reports

A common task in finance is to extract financial data from earnings reports. Earnings reports are infamously poorly formatted, as the SEC does not have requirements for producing machine-readable documents.

Earnings reports are often provided as HTML documents, which can be difficult to parse. Investors often use complicated parsing systems or manual review to extract data. Entire companies are built around automating this task.

This cookbook is a proof of concept about how we can use LLMs to extract financial data directly into CSV. Comma-separated values are well-structured and can be defined by a regular expression, which Outlines can use to guide the LLM's output.

The example is a smaller subset of a full demo found [here](https://github.com/dottxt-ai/demos/tree/main/earnings-reports). The demo contains the full set of pre-processing steps needed to convert raw HTML into a structured CSV file, and tests the results across three companies' 10-K reports.

## Setup

Install outlines and required dependencies:

```shell
# Later versions of torch can have difficulty with certain CUDA drivers.
# We recommend using 2.4.0 for now, but you may wish to experiment with
# other versions.
pip install outlines pandas transformers torch==2.4.0 accelerate
```

## Load the model

Choose your language model. We'll use Phi-3 mini, which is small enough to run on reasonably small machines.

```python
import outlines
import torch
import transformers

model_name = 'microsoft/Phi-3-mini-4k-instruct'
tf_model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name, device_map="cuda", torch_dtype=torch.bfloat16
)
tf_tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = outlines.from_transformers(tf_model, tf_tokenizer)
```

## Set up the data

For brevity, we've attached the markdown version of Nvidia's 10k report. The [full demonstration](https://github.com/dottxt-ai/demos/tree/main/earnings-reports) processes the raw HTML version of the report to these markdown tables. Pages are filtered by whether they seem to contain income statements, and then compacted into the string you see below.

```python
income_statement = """
Table of ContentsNVIDIA Corporation and SubsidiariesConsolidated Statements of Income(In millions, except per share data)

|  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |  |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
|  | | | Year Ended | | | | | | | | | | | | | | |
|  | | | Jan 28, 2024 | | |  | | | Jan 29, 2023 | | |  | | | Jan 30, 2022 | | |
| Revenue | | | $ | 60,922 |  |  | | | $ | 26,974 |  |  | | | $ | 26,914 |  |
| Cost of revenue | | | 16,621 | |  |  | | | 11,618 | |  |  | | | 9,439 | |  |
| Gross profit | | | 44,301 | |  |  | | | 15,356 | |  |  | | | 17,475 | |  |
| Operating expenses | | |  | | |  | | |  | | |  | | |  | | |
| Research and development | | | 8,675 | |  |  | | | 7,339 | |  |  | | | 5,268 | |  |
| Sales, general and administrative | | | 2,654 | |  |  | | | 2,440 | |  |  | | | 2,166 | |  |
| Acquisition termination cost | | |  | |  |  | | | 1,353 | |  |  | | |  | |  |
| Total operating expenses | | | 11,329 | |  |  | | | 11,132 | |  |  | | | 7,434 | |  |
| Operating income | | | 32,972 | |  |  | | | 4,224 | |  |  | | | 10,041 | |  |
| Interest income | | | 866 | |  |  | | | 267 | |  |  | | | 29 | |  |
| Interest expense | | | (257) | |  |  | | | (262) | |  |  | | | (236) | |  |
| Other, net | | | 237 | |  |  | | | (48) | |  |  | | | 107 | |  |
| Other income (expense), net | | | 846 | |  |  | | | (43) | |  |  | | | (100) | |  |
| Income before income tax | | | 33,818 | |  |  | | | 4,181 | |  |  | | | 9,941 | |  |
| Income tax expense (benefit) | | | 4,058 | |  |  | | | (187) | |  |  | | | 189 | |  |
| Net income | | | $ | 29,760 |  |  | | | $ | 4,368 |  |  | | | $ | 9,752 |  |
|  | | |  | | |  | | |  | | |  | | |  | | |
| Net income per share: | | |  | | |  | | |  | | |  | | |  | | |
| Basic | | | $ | 12\.05 |  |  | | | $ | 1\.76 |  |  | | | $ | 3\.91 |  |
| Diluted | | | $ | 11\.93 |  |  | | | $ | 1\.74 |  |  | | | $ | 3\.85 |  |
|  | | |  | | |  | | |  | | |  | | |  | | |
| Weighted average shares used in per share computation: | | |  | | |  | | |  | | |  | | |  | | |
| Basic | | | 2,469 | |  |  | | | 2,487 | |  |  | | | 2,496 | |  |
| Diluted | | | 2,494 | |  |  | | | 2,507 | |  |  | | | 2,535 | |  |
"""
```

The markdown tables extracted from the earnings reports can vary widely in row names, column counts, data types, etc. The advantage of LLMs here is that we can define the data we want in terms of the data types, and the LLM will output the data in the desired format.

For comparison, here is how the income statement looks in the original HTML:

![Nvidia income statement](./images/nvidia-income.png)

## Define the data we want

Outlines is often used for JSON output, but it can also be used for CSV. We know the columns we want to extract, and we know the data types of the columns. Year for example is always a four-digit number, revenue is a number with commas, and so on.

We can define a regex pattern for each column type:

```python
# Define the column type regex patterns
column_types = {
    # Year is always a four-digit number
    "year": r"\d{4}",

    # Revenue, operating income, and net income are always numbers with commas.
    # This regex permits integers that may begin with a minus sign, and may have
    # commas separating the thousands, millions, etc.
    "integer_comma": r"((-?\d+),?\d+|(-?\d+))",
    # Number is currently not used, but it represents a number with up to two decimal places.
    "number": r"(-?\d+(?:\.\d{1,2})?)",
}
```

Next, let's choose the columns we want to extract. We want

- Year, always a four-digit number
- Revenue, a number with commas
- Operating income, a number with commas
- Net income, a number with commas

```python
# Define the columns to extract, and their data types.
columns_to_extract = {
    "year": "year",
    "revenue": "integer_comma",
    "operating_income": "integer_comma",
    "net_income": "integer_comma",
}
```

You can modify `column_types` to match the data types of the columns you want to extract. Adding a new financial metric to extract is as simple as adding a new key/value pair to `columns_to_extract`:

```python
columns_to_extract["diluted_earnings_per_share"] = "number"
```

Additional columns are not well tested for accuracy, so use with caution.

## Create the regex describing the data we want


```python
# Create the header line. This is the requested column names
# separated by commas, i.e. "year,revenue,..."
header = ",".join(columns_to_extract.keys())

# Create the data capture patterns. These are the regex patterns
# that will be used to capture the data in each column
data_patterns = [column_types[dtype] for dtype in columns_to_extract.values()]
data_line = ",".join(data_patterns)

# Our final regex pattern.
max_rows = 3 # We expect 3 rows of data, firms usually report 3 years of income statements
csv_regex = f"{header}(\n{data_line}){{,{max_rows}}}\n\n"

print(csv_regex)
```

which gives us

```
year,revenue,operating_income,net_income(
\d{4},((-?\d+),?\d+|(-?\d+)),((-?\d+),?\d+|(-?\d+)),((-?\d+),?\d+|(-?\d+))){,3}
```

Pretty hairy, right? Thankfully, we have a simple function to construct this regex for you. The regex defines a header line, followed by a data line that repeats for each row of data we want to extract. Passing the regex to `outlines.Generator` will produce a function that will __always__ produce a CSV string that is consistent with the regex.

## Prompting the model

Outlines does not add system or instruction tokens by default, so we need to use `transformers.AutoTokenizer` to add them for whatever model we're using.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)

def add_instruction(prompt):
    """Render ``prompt`` as a single user turn with the model's chat template.

    The result is returned as text (``tokenize=False``) and includes the
    generation prompt that opens the assistant's turn.
    """
    conversation = [{"role": "user", "content": prompt}]
    return tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

print(add_instruction("Howdy"))
```
```
<|user|>
Howdy<|end|>
<|assistant|>
```

Our prompt roughly describes the task we want the model to perform, and a few pieces of information it may need to know about income statements.

```python
def extract_financial_data_prompt(columns_to_extract, income_statement):
    """Build the chat-formatted extraction prompt for one income statement.

    Parameters
    ----------
    columns_to_extract
        Mapping whose keys are the CSV column names and whose values are the
        names of the column types; both are joined into the prompt text.
    income_statement
        Text of the pages to extract from; inserted verbatim under the
        "Relevant pages" heading.

    Returns
    -------
    The value returned by ``add_instruction`` for the assembled user prompt.
    """
    user_prompt = f"""
    Extract annual financial data from this set of pages. Pages
    are from a 10k filing and were chosen because they may contain
    a comprehensive income statement. Note that selected pages may
    be incorrectly extracted, so you should verify that you are extracting
    from the comprehensive income statement and not some other financial
    statement.

    Create a row for each year available in the income statement with the
    following columns: {', '.join(columns_to_extract.keys())}. Firms typically report the
    most recent 3 years of data, but this can vary.

    Each column has types: {', '.join(columns_to_extract.values())}.

    # Relevant pages:

    {income_statement}

    # Key instructions:

    1. Look ONLY at the "Consolidated Statements of Income" table
    2. For operating income, look for "Income from operations" or "Operating income"
    3. For net income, use the TOTAL net income figure, not amounts allocated to specific share classes
    4. Use NULL for missing values
    5. Operating income must be less than revenue
    6. Net income must be less than operating income
    7. Ignore segment breakdowns, quarterly data, or per-share amounts

    # Output format:

    - CSV format with headers: {','.join(columns_to_extract.keys())}
    - Use NULL for missing values
    - If no data are found, do not create a row.
    - Enter two newline characters to terminate the CSV when no more data are found.

    # Definitions:
    - Revenue: Total sales of goods and services. Usually this is at the top of the
    income statement.
    - Operating income: Revenue minus operating expenses for the entire company. This is revenue
    minus costs. Operating income is also called operating profit, EBIT, or income from
    operations.
    - Net income: Operating income minus taxes. This is the bottom line of the
    income statement.
    """

    # Wrap the raw text in the model's chat template before returning it.
    return add_instruction(user_prompt)
```

## Running the model

Now that we have our prompt and regular expression, we can run the model.

Construct our regex extractor function.

```python
from outlines.types import Regex

csv_extractor = outlines.Generator(model, Regex(csv_regex))
```

Provide the prompt to the model and run it:

```python
csv_data = csv_extractor(
    extract_financial_data_prompt(columns_to_extract, income_statement),
    max_new_tokens=1024,
)

print(csv_data)
```
```
year,revenue,operating_income,net_income
2024,60922,32972,29760
2023,26974,4224,4368
2022,26914,10041,9752
```

Voila! We've extracted the financial data from the income statement, and it's correct upon inspection.

You can even load this into a `pandas` DataFrame for further analysis:

```python
import pandas as pd
from io import StringIO

df = pd.read_csv(StringIO(csv_data))
print(df)
```
```
   year  revenue  operating_income  net_income
0  2024    60922             32972       29760
1  2023    26974              4224        4368
2  2022    26914             10041        9752
```


================================================
FILE: docs/examples/extract_event_details.md
================================================
This recipe demonstrates how to use the `outlines` library to extract structured event details from a text message.
We will extract the title, location, and start date and time from messages like the following:

```plaintext
Hello Kitty, my grandmother will be here, I think it's better to postpone
our appointment to review math lessons to next Friday at 2pm at the same
place, 3 avenue des tanneurs, one hour will be enough see you 😘
```

Let's see how to extract the event details from the message with the MLX
library dedicated to Apple Silicon processors (M series).

```python
--8<-- "docs/examples/extract_event_details.py"
```

The output will be:

```plaintext
Today: Saturday 16 November 2024 and it's 10:55
```

and the extracted event information will be:

```json
{
  "title":"Math Review",
  "location":"3 avenue des tanneurs",
  "start":"2024-11-22T14:00:00Z"
}
```


To find out more about this use case, we recommend the [ICS Generator](https://github.com/jrudoler/ics-generator) project developed by [Joseph Rudoler](https://x.com/JRudoler).


================================================
FILE: docs/examples/extract_event_details.py
================================================
from datetime import datetime

from mlx_lm import load
from pydantic import BaseModel, Field

import outlines
from outlines import Generator, Template


# Load the model: whatever `mlx_lm.load` returns for this checkpoint is
# unpacked as positional arguments into `outlines.from_mlxlm`
# (presumably the MLX model and its tokenizer -- confirm against mlx_lm).
model = outlines.from_mlxlm(*load("mlx-community/Hermes-3-Llama-3.1-8B-8bit"))


# Define the event schema using Pydantic. The class is passed to `Generator`
# further down as the output type.
# (Comments are used instead of a class docstring on purpose: a docstring
# would be added to the model's JSON schema as a description.)
class Event(BaseModel):
    # Title of the event (required).
    title: str = Field(description="title of the event")
    # Location of the event (required, no description attached).
    location: str
    # Start date and time of the event.
    # NOTE(review): annotated as `datetime` but defaults to None;
    # `Optional[datetime]` would describe this more accurately, though it
    # would also change the generated JSON schema -- confirm before changing.
    start: datetime = Field(
        default=None, description="date of the event if available in iso format"
    )

# Inputs for the prompt: the current date and time (so that a relative date
# such as "next Friday" can be resolved) and the message to analyse.
now = datetime.now().strftime("%A %d %B %Y and it's %H:%M")
message = """Hello Kitty, my grandmother will be here, I think it's better to postpone our
appointment to review math lessons to next Friday at 2pm at the same place, 3 avenue des tanneurs, I think that one hour will be enough
see you 😘 """

# Prompt template with two placeholders, `now` and `message`, rendered
# immediately with the inputs defined above.
prompt_template = Template.from_string(
    """
    Today's date and time are {{ now }}
    Given a user message, extract information of the event like date and time in iso format, location and title.
    If the given date is relative, think step by step to find the right date.
    Here is the message:
    {{ message }}
    """
)
prompt = prompt_template(now=now, message=message)

# Generator using the `Event` schema as output type, followed by the
# extraction itself.
generator = Generator(model, Event)
event = generator(prompt)

# Show the reference date first, then the extracted event information.
print(f"Today: {now}")
print(event)


================================================
FILE: docs/examples/extraction.md
================================================
# Named entity extraction

Named Entity Extraction is a fundamental problem in NLP. It involves identifying and categorizing named entities within a document: people, organizations, dates, places, etc. It is usually the first step in a more complex NLP workflow. Here we will use the example of a pizza restaurant that receives orders via their website and needs to identify the number and types of pizzas that are being ordered.

Getting LLMs to output the extracted entities in a structured format can be challenging. In this tutorial we will see how we can use Outlines' JSON-structured generation to extract entities from a document and return them in a valid JSON data structure 100% of the time.

As always, we start with initializing the model. We will be using Phi-3 Mini, a small model (we're GPU poor):

```python
import transformers
import outlines

model_name = "microsoft/Phi-3-mini-4k-instruct"
model = outlines.from_transformers(
    transformers.AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda"),
    transformers.AutoTokenizer.from_pretrained(model_name),
)
```

And we will be using the following prompt template:

```python
from outlines import Template

take_order = Template.from_string(
    """You are the owner of a pizza parlor. Customers \
    send you orders from which you need to extract:

    1. The pizza that is ordered
    2. The number of pizzas

    # EXAMPLE

    ORDER: I would like one Margherita pizza
    RESULT: {"pizza": "Margherita", "number": 1}

    # OUTPUT INSTRUCTIONS

    Answer in valid JSON. Here are the different objects relevant for the output:

    Order:
        pizza (str): name of the pizza
        number (int): number of pizzas

    Return a valid JSON of type "Order"

    # OUTPUT

    ORDER: {{ order }}
    RESULT: """
)
```

We now define our data model using Pydantic:

```python
from enum import Enum
from pydantic import BaseModel

class Pizza(str, Enum):
    margherita = "Margherita"
    pepperonni = "Pepperoni"
    calzone = "Calzone"

class Order(BaseModel):
    pizza: Pizza
    number: int
```

We can now define our generator and call it on several incoming orders:

```python
orders = [
    "Hi! I would like to order two pepperonni pizzas and would like them in 30mins.",
    "Is it possible to get 12 margheritas?"
]
prompts = [take_order(order=order) for order in orders]

generator = outlines.Generator(model, Order)

results = generator(prompts)
print(results)
# ['{"pizza": "Pepperoni", "number": 2}',
# '{"pizza": "Margherita", "number": 12}']
```

There are several ways you could improve this example:

- Clients may order several types of pizzas.
- Clients may order drinks as well.
- If the pizza place has a delivery service we need to extract the client's address and phone number
- Clients may specify the time for which they want the pizza. We could then check against a queuing system and reply to them with the estimated delivery time.

How would you change the Pydantic model to account for these use cases?


================================================
FILE: docs/examples/index.md
================================================
# Examples

This part of the documentation provides a few cookbooks that you can browse to get acquainted with the library and get some inspiration about what you could do with structured generation. Remember that you can easily change the model that is being used!

- [Classification](classification.md): Classify customer requests.
- [Named Entity Extraction](extraction.md): Extract information from pizza orders.
- [Dating Profiles](dating_profiles.md): Build dating profiles from descriptions using prompt templating and JSON-structured generation.
- [Chain Of Density](chain_of_density.md): Summarize documents using chain of density prompting and JSON-structured generation.
- [Playing Chess](models_playing_chess.md): Make Phi-3 Mini play chess against itself using regex-structured generation.
- [SimToM](simtom.md): Improve LLMs' Theory of Mind capabilities with perspective-taking prompting and JSON-structured generation.
- [Q&A with Citations](qa-with-citations.md): Answer questions and provide citations using JSON-structured generation.
- [Knowledge Graph Generation](knowledge_graph_extraction.md): Generate a Knowledge Graph from unstructured text using JSON-structured generation.
- [Structured Generation Workflow](structured_generation_workflow.md):
- [Chain Of Thought (CoT)](chain_of_thought.md): Generate a series of intermediate reasoning steps using regex-structured generation.
- [ReAct Agent](react_agent.md): Build an agent with open weights models using regex-structured generation.
- [Structured Generation from PDFs](read-pdfs.md): Use Outlines with vision-language models to read PDFs and produce structured output.
- [Earnings reports to CSV](earnings-reports.md): Extract data from earnings reports to CSV using regex-structured generation.
- [Receipt Digitization](receipt-digitization.md): Extract information from a picture of a receipt using structured generation.
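- [Extract Event Details](extract_event_details.md): Extract the title, location, and start date and time of an event from a text message using JSON-structured generation.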

Run Outlines on the cloud:

- [BentoML](deploy-using-bentoml.md)
- [Cerebrium](deploy-using-cerebrium.md)
- [Modal](deploy-using-modal.md)


================================================
FILE: docs/examples/knowledge_graph_extraction.md
================================================
# Knowledge Graph Extraction

In this guide, we use [outlines](https://dottxt-ai.github.io/outlines/) to extract a knowledge graph from unstructured text.

We will use [llama.cpp](https://github.com/ggerganov/llama.cpp) using the [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) library. Outlines supports llama-cpp-python, but we need to install it ourselves:

```shell
pip install llama-cpp-python
```

To create an outlines `LlamaCpp` model, you first need to create a `Llama` object from the `llama-cpp-python` library. Then you can create the outlines model by calling `outlines.from_llamacpp` with the `Llama` object instance as argument. To create the `Llama` object, you need to provide the model weights by passing the name of the repository on the HuggingFace Hub, and the filenames or glob pattern (it will automatically download the weights from the hub):

```python
import llama_cpp
import outlines

# `Llama.from_pretrained` downloads the GGUF file matching `filename` from the
# Hub repository `repo_id` and loads it (this requires the `huggingface-hub`
# package). The plain `Llama(...)` constructor expects the path of a local
# model file instead, so it cannot be given a repository name.
llm = llama_cpp.Llama.from_pretrained(
    repo_id="NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF",
    filename="Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf",
    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
        "NousResearch/Hermes-2-Pro-Llama-3-8B"
    ),
    n_gpu_layers=-1,
    flash_attn=True,
    n_ctx=8192,
    verbose=False,
)
# Wrap the llama.cpp model so it can be used with Outlines generators.
model = outlines.from_llamacpp(llm)
```

??? note "(Optional) Store the model weights in a custom folder"

    By default the model weights are downloaded to the hub cache but if we want to store the weights in a custom folder, we pull a quantized GGUF model [Hermes-2-Pro-Llama-3-8B](https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF) by [NousResearch](https://nousresearch.com/) from [HuggingFace](https://huggingface.co/):

    ```shell
    wget https://hf.co/NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/resolve/main/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
    ```

    We initialize the model:

    ```python
    from llama_cpp import Llama

    llm = Llama("/path/to/model/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf", ...)
    ```

## Knowledge Graph Extraction

We first need to define our Pydantic class for each node and each edge of the knowledge graph:

```python
from pydantic import BaseModel, Field

class Node(BaseModel):
    """Node of the Knowledge Graph"""

    id: int = Field(..., description="Unique identifier of the node")
    label: str = Field(..., description="Label of the node")
    property: str = Field(..., description="Property of the node")


class Edge(BaseModel):
    """Edge of the Knowledge Graph"""

    source: int = Field(..., description="Unique source of the edge")
    target: int = Field(..., description="Unique target of the edge")
    label: str = Field(..., description="Label of the edge")
    property: str = Field(..., description="Property of the edge")
```

We then define the Pydantic class for the knowledge graph and get its JSON schema:

```python
from typing import List

class KnowledgeGraph(BaseModel):
    """Generated Knowledge Graph"""

    nodes: List[Node] = Field(..., description="List of nodes of the knowledge graph")
    edges: List[Edge] = Field(..., description="List of edges of the knowledge graph")

schema = KnowledgeGraph.model_json_schema()
```

We then need to adapt our prompt to the [Hermes prompt format for JSON schema](https://github.com/NousResearch/Hermes-Function-Calling?tab=readme-ov-file#prompt-format-for-json-mode--structured-outputs):

```python
from outlines import Template

generate_hermes_prompt = Template.from_string(
    """
    <|im_start|>system
    You are a world class AI model who answers questions in JSON
    Here's the json schema you must adhere to:
    <schema>
    {{ schema }}
    </schema>
    <|im_end|>
    <|im_start|>user
    {{ user_prompt }}
    <|im_end|>
    <|im_start|>assistant
    <schema>
    """
)
```

For a given user prompt, for example:

```python
user_prompt = "Alice loves Bob and she hates Charlie."
```

We can use `outlines.Generator` by passing the Pydantic class we previously defined, and call the generator with the Hermes prompt:

```python
from outlines import Generator

generator = Generator(model, KnowledgeGraph)
prompt = generate_hermes_prompt(schema=schema, user_prompt=user_prompt)
response = generator(prompt, max_tokens=1024, temperature=0, seed=42)
```

We obtain the nodes and edges of the knowledge graph:

```python
print(response)
# {"nodes":[{"id":1,"label":"Alice","property":"loves,hates"},
# {"id":2,"label":"Bob","property":"loved_by"},
# {"id":3,"label":"Charlie","property":"hated_by"}],
# "edges":[{"source":1,"target":2,"label":"loves","property":"love"},
# {"source":1,"target":3,"label":"hates","property":"hate"}]}

```

## (Optional) Visualizing the Knowledge Graph

We can use the [Graphviz library](https://graphviz.readthedocs.io/en/stable/) to visualize the generated knowledge graph. For detailed installation instructions, see [here](https://graphviz.readthedocs.io/en/stable/#installation).

```python
import json

from graphviz import Digraph

# The generator returns the knowledge graph as a JSON string, so it has to be
# parsed into a dictionary before the nodes and edges can be accessed.
graph = json.loads(response) if isinstance(response, str) else response

dot = Digraph()
# One circular graph node per knowledge-graph node, keyed by its id.
for node in graph["nodes"]:
    dot.node(str(node["id"]), node["label"], shape='circle', width='1', height='1')
# One labelled arrow per knowledge-graph edge, from source id to target id.
for edge in graph["edges"]:
    dot.edge(str(edge["source"]), str(edge["target"]), label=edge["label"])

# Write the graph description to disk, render it and open the result.
dot.render('knowledge-graph.gv', view=True)
```

![Image of the Extracted Knowledge Graph](./images/knowledge-graph-extraction.png)

This example was originally contributed by [Alonso Silva](https://github.com/alonsosilvaallende).


================================================
FILE: docs/examples/models_playing_chess.md
================================================
# Large language models playing chess

In this example we will make a Phi-3 model play chess against itself. On its own the model easily generates invalid moves, so we will give it a little help. At each step we will generate a regex that only matches valid moves, and use it to help the model generate only valid moves.

## The chessboard

The game will be played on a standard chessboard. We will use the `chess` [library](https://github.com/niklasf/python-chess) to track the opponents' moves, and check that the moves are valid.

```python
%pip install outlines -q
%pip install chess -q
%pip install transformers accelerate einops -q

import chess

board = chess.Board("rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1")
```

## The opponents

Phi-3 will be playing against itself:

```python
import transformers
import outlines

model_name = "microsoft/Phi-3-mini-4k-instruct"
model = outlines.from_transformers(
    transformers.AutoModelForCausalLM.from_pretrained(model_name),
    transformers.AutoTokenizer.from_pretrained(model_name),
)
```

## A little help for the language model

To make sure Phi-3 generates valid chess moves we will use Outlines' regex-structured generation. We define a function that takes the current state of the board and returns a regex that matches all possible legal moves:

```python
import re
from outlines.types.dsl import either, String

def legal_moves_regex(board):
    """Return a pattern that matches exactly the legal moves on ``board``.

    Every legal move is written in SAN with any "+" or "#" character
    removed, and the resulting strings are combined as alternatives.
    """
    # Materialise the legal moves before converting them to SAN.
    moves = list(board.legal_moves)
    san_moves = [re.sub(r"[+#]", "", board.san(candidate)) for candidate in moves]
    return either(*(String(san) for san in san_moves))
```

## Prompting the language model

The prompt corresponds to the current state of the board, so we start with:

```python
prompt = "Let's play Chess. Moves: "

```

We update the prompt at each step so it reflects the state of the board after the previous move.

## Let's play

```python
board_state = " "
turn_number = 0
while not board.is_game_over():
    regex_pattern = legal_moves_regex(board)
    structured = model(prompt + board_state, regex_pattern)
    move = board.parse_san(structured)

    if turn_number % 2 == 0 :  # It's White's turn
        board_state += board.san(move) + " "
    else:
        board_state += board.san(move) + " " + str(turn_number) + "."

    turn_number += 1

    board.push(move)

    print(board_state)
```

Interestingly enough, Phi-3 hates capturing.

```pgn
 e4 e5 1.Nf3 Ne7 3.b4 Nf5 5.Nc3 Ne7 7.Bb5 a6 9.Na4 b6 11.c3 Nec6 13.c4 a5 15.d4 Qg5 17.Nd2 Bb7 19.dxe5
```

*This example was originally authored by [@903124S](https://x.com/903124S) in [this gist](https://gist.github.com/903124/cfbefa24da95e2316e0d5e8ef8ed360d).*


================================================
FILE: docs/examples/prompt_templates/chain_of_density.txt
================================================
Article: {{ article }}

You will generate increasingly concise, entity-dense summaries of the above Article.

Repeat the following 2 steps 5 times.

Step 1. Identify 1-3 informative Entities ("; " delimited) from the Article which are missing from the previously generated summary.
Step 2. Write a new, denser summary of identical length which covers every entity and detail from the previous summary plus the Missing Entities.

A Missing Entity is:
- Relevant: to the main story.
- Specific: descriptive yet concise (5 words or fewer).
- Novel: not in the previous summary.
- Faithful: present in the Article.
- Anywhere: located anywhere in the Article.

Guidelines:
- The first summary should be long (4-5 sentences, ~80 words) yet highly non-specific, containing little information beyond the entities marked as missing. Use overly verbose language and fillers (e.g., "this article discusses") to reach ~80 words.
- Make every word count: rewrite the previous summary to improve flow and make space for additional entities.
- Make space with fusion, compression, and removal of uninformative phrases like "the article discusses".
- The summaries should become highly dense and concise yet self-contained, e.g., easily understood without the Article.
- Missing entities can appear anywhere in the new summary.
- Never drop entities from the previous summary. If space cannot be made, add fewer new entities.

Remember, use the exact same number of words for each summary.

Answer in JSON. The JSON should be a dictionary with key "summaries" that contains a list (length 5) of dictionaries whose keys are "Missing_Entities" and "Denser_Summary".


================================================
FILE: docs/examples/prompt_templates/classification.txt
================================================
You are an experienced customer success manager.

Given a request from a client, you need to determine when the
request is urgent using the label "URGENT" or when it can wait
a little with the label "STANDARD".

# Examples

Request: "How are you?"
Label: STANDARD

Request: "I need this fixed immediately!"
Label: URGENT

# TASK

Request: {{ request }}
Label:


================================================
FILE: docs/examples/prompt_templates/react_agent.txt
================================================
<|im_start|>system
You are a world class AI model who answers questions in JSON with correct Pydantic schema.
Here's the json schema you must adhere to:
<schema>
{{ schema }}
</schema>
Today is {{ today }}
You run in a loop of Scratchpad, Thought, Action, Action Input, PAUSE, Observation.
At the end of the loop you output a Final Answer.
Use Scratchpad to store the information from the Observation useful to answer the question
Use Thought to describe your thoughts about the question you have been asked and reflect carefully about the Observation if it exists.
Use Action to run one of the actions available to you.
Use Action Input to input the arguments of the selected action - then return PAUSE.
Observation will be the result of running those actions.
Your available actions are:
calculate:
e.g. calculate: 4**2 / 3
Runs a calculation and returns the number - uses Python so be sure to use floating point syntax if necessary
wikipedia:
e.g. wikipedia: Django
Returns a summary from searching Wikipedia
DO NOT TRY TO GUESS THE ANSWER. Begin!
<|im_end|>
<|im_start|>user
{{ question }}
<|im_end|>
<|im_start|>assistant


================================================
FILE: docs/examples/prompt_templates/simtom_prospective_taking.txt
================================================
<s>[INST] The following is a sequence of events about some characters, that takes place in multiple locations.
Your job is to output only the events that the specified character, {{character}}, knows about.

Here are a few rules:
1. A character knows about all events that they do.
2. If a character is in a certain room/location, that character knows about all other events that happens in the room. This includes other characters leaving or exiting the location, the locations of objects in that location, and whether somebody moves an object to another place.
3. If a character leaves a location, and is NOT in that location, they no longer know about any events that happen within that location. However, they can re-enter the location.

Story: {{story}}
What events does {{character}} know about? Only output the events according to the above rules, do not provide an explanation. [/INST]


================================================
FILE: docs/examples/prompt_templates/simtom_simulation.txt
================================================
<s>[INST] {% for event in events %}
{{event}}
{% endfor %}
You are {{name}}.
Based on the above information, answer the following question:
{{question}}
You must choose one of the above choices, do not say there is not enough information. Answer with a single word, do not output anything else. [/INST]


================================================
FILE: docs/examples/qa-with-citations.md
================================================
# Generate Synthetic Data and Q&A with Citations

This tutorial is adapted from the [instructor-ollama notebook](https://github.com/alonsosilvaallende/Hermes-Function-Calling/blob/main/examples/instructor_ollama.ipynb). We start with a simple example to generate synthetic data and then we approach the problem of question answering by providing citations.

We will use [llama.cpp](https://github.com/ggerganov/llama.cpp) using the [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) library. Outlines supports llama-cpp-python, but we need to install it ourselves:

```shell
pip install llama-cpp-python
```

We download the model weights by passing the name of the repository on the HuggingFace Hub, and the filenames (or glob pattern):
```python
import llama_cpp
import outlines

# `Llama.from_pretrained` fetches the GGUF file matching `filename` from the
# Hub repository `repo_id`; calling `Llama(...)` directly expects a local path.
llm = llama_cpp.Llama.from_pretrained(
    repo_id="NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF",
    filename="Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf",
    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
        "NousResearch/Hermes-2-Pro-Llama-3-8B"
    ),
    n_gpu_layers=-1,
    flash_attn=True,
    n_ctx=8192,
    verbose=False,
)
model = outlines.from_llamacpp(llm)
```

??? note "(Optional) Store the model weights in a custom folder"

    By default the model weights are downloaded to the hub cache, but if we want to store the weights in a custom folder, we pull a quantized GGUF model [Hermes-2-Pro-Llama-3-8B](https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF) by [NousResearch](https://nousresearch.com/) from [HuggingFace](https://huggingface.co/):

    ```shell
    wget https://hf.co/NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/resolve/main/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
    ```

    We initialize the model:

    ```python
    from llama_cpp import Llama

    llm = Llama("/path/to/model/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf", ...)
    ```

## Generate Synthetic Data

We first need to define our Pydantic class for a user:

```python
from pydantic import BaseModel, Field

# Schema of a single generated user record. Documented with `#` comments
# rather than a docstring so the generated JSON schema is left untouched.
class UserDetail(BaseModel):
    id: int = Field(..., description="Unique identifier") # so the model keeps track of the number of users
    first_name: str
    last_name: str
    age: int
```

We then define a Pydantic class for a list of users:

```python
from typing import List

# Top-level output schema: an object whose `users` key holds a list of
# `UserDetail` records.
class Users(BaseModel):
    users: List[UserDetail]
```

We can use an `outlines.Generator` by passing this Pydantic class we just defined, and call the generator:

```python
import json

generator = outlines.Generator(model, Users)
response = generator("Create 5 fake users", max_tokens=1024, temperature=0, seed=42)
response = json.loads(response)
print(response['users'])
# [{'id': 1, 'first_name': 'John', 'last_name': 'Doe', 'age': 25},
# {'id': 2, 'first_name': 'Jane', 'last_name': 'Doe', 'age': 30},
# {'id': 3, 'first_name': 'Bob', 'last_name': 'Smith', 'age': 40},
# {'id': 4, 'first_name': 'Alice', 'last_name': 'Smith', 'age': 35},
# {'id': 5, 'first_name': 'John', 'last_name': 'Smith', 'age': 20}]
```

```python
for user in response['users']:
    print(user['first_name'])
    print(user['last_name'])
    print(user['age'])
    print("#####")
# John
# Doe
# 25
# #####
# Jane
# Doe
# 30
# #####
# Bob
# Smith
# 40
# #####
# Alice
# Smith
# 35
# #####
# John
# Smith
# 20
# #####
```

## QA with Citations

We first need to define our Pydantic class for QA with citations:

```python
from typing import List
from pydantic import BaseModel

# Output schema for one answer: the question, the answer text, and the
# supporting citations as a list of strings.
class QuestionAnswer(BaseModel):
    question: str
    answer: str
    citations: List[str]

schema = QuestionAnswer.model_json_schema()
```

We then need to adapt our prompt to the [Hermes prompt format for JSON schema](https://github.com/NousResearch/Hermes-Function-Calling?tab=readme-ov-file#prompt-format-for-json-mode--structured-outputs):

```python
from outlines import Template

hermes_prompt = Template.from_string(
    """
    <|im_start|>system
    You are a world class AI model who answers questions in JSON with correct and exact citations
    extracted from the `Context`.
    Here's the json schema you must adhere to:
    <schema>
    {{ schema }}
    </schema>
    <|im_end|>
    <|im_start|>user
    `Context`:
    {{ context }}
    `Question`:
    {{ question }}
    <|im_end|>
    <|im_start|>assistant
    """
)
```

We can use `outlines.Generator` by passing the Pydantic class we previously defined, and call the generator with Hermes prompt:

```python
question = "What did the author do during college?"
context = """
My name is Jason Liu, and I grew up in Toronto Canada but I was born in China.
I went to an arts high school but in university I studied Computational Mathematics and physics.
As part of coop I worked at many companies including Stitchfix, Facebook.
I also started the Data Science club at the University of Waterloo and I was the president of the club for 2 years.
"""
generator = outlines.Generator(model, QuestionAnswer)
prompt = hermes_prompt(question=question, context=context, schema=schema)
response = generator(prompt, max_tokens=1024, temperature=0, seed=42)
print(response)
# {"question": "What did the author do during college?", "answer": "The author studied Computational Mathematics and physics in university and was also involved in starting the Data Science club, serving as its president for 2 years.", "citations": ["I went to an arts high school but in university I studied Computational Mathematics and physics.", "I also started the Data Science club at the University of Waterloo and I was the president of the club for 2 years."]}
```

We can do the same for a list of question-context pairs:

```python
question1 = "Where was John born?"
context1 = """
John Doe is a software engineer who was born in New York, USA.
He studied Computer Science at the Massachusetts Institute of Technology.
During his studies, he interned at Google and Microsoft.
He also founded the Artificial Intelligence club at his university and served as its president for three years.
"""

question2 = "What did Emily study in university?"
context2 = """
Emily Smith is a data scientist from London, England.
She attended the University of Cambridge where she studied Statistics and Machine Learning.
She interned at IBM and Amazon during her summer breaks.
Emily was also the head of the Women in Tech society at her university.
"""

question3 = "Which companies did Robert intern at?"
context3 = """
Robert Johnson, originally from Sydney, Australia, is a renowned cybersecurity expert.
He studied Information Systems at the University of Melbourne.
Robert interned at several cybersecurity firms including NortonLifeLock and McAfee.
He was also the leader of the Cybersecurity club at his university.
"""

question4 = "What club did Alice start at her university?"
context4 = """
Alice Williams, a native of Dublin, Ireland, is a successful web developer.
She studied Software Engineering at Trinity College Dublin.
Alice interned at several tech companies including Shopify and Squarespace.
She started the Web Development club at her university and was its president for two years.
"""

question5 = "What did Michael study in high school?"
context5 = """
Michael Brown is a game developer from Tokyo, Japan.
He attended a specialized high school where he studied Game Design.
He later attended the University of Tokyo where he studied Computer Science.
Michael interned at Sony and Nintendo during his university years.
He also started the Game Developers club at his university.
"""

# Build the generator once: it depends only on the model and the output type,
# not on the question or context, so there is no need to recreate it per pair.
generator = outlines.Generator(model, QuestionAnswer)

for question, context in [
    (question1, context1),
    (question2, context2),
    (question3, context3),
    (question4, context4),
    (question5, context5),
]:
    prompt = hermes_prompt(question=question, context=context, schema=schema)
    response = generator(prompt, max_tokens=1024, temperature=0, seed=42)
    # The generator returns a JSON string; parse it to access the fields.
    response = json.loads(response)
    print(question)
    print(response['answer'])
    print(response['citations'])
    print("\n\n")

# 'Where was John born?'
# 'John Doe was born in New York, USA.'
# ['John Doe is a software engineer who was born in New York, USA.']
#
#
# 'What did Emily study in university?'
# 'Emily studied Statistics and Machine Learning in university.'
# ['She attended the University of Cambridge where she studied Statistics and Machine Learning.']
#
#
# 'Which companies did Robert intern at?'
# 'Robert interned at NortonLifeLock and McAfee.'
# ['Robert Johnson, originally from Sydney, Australia, is a renowned cybersecurity expert. He interned at several cybersecurity firms including NortonLifeLock and McAfee.']
#
#
# 'What club did Alice start at her university?'
# 'Alice started the Web Development club at her university.'
# ['Alice Williams, a native of Dublin, Ireland, is a successful web developer. She started the Web Development club at her university and was its president for two years.']
#
#
# 'What did Michael study in high school?'
# 'Michael studied Game Design in high school.'
# ['Michael Brown is a game developer from Tokyo, Japan. He attended a specialized high school where he studied Game Design.']
```

This example was originally contributed by [Alonso Silva](https://github.com/alonsosilvaallende).


================================================
FILE: docs/examples/react_agent.md
================================================
# ReAct Agent

This example shows how to use [outlines](https://dottxt-ai.github.io/outlines/) to build your own agent with open weights local models and structured outputs. It is inspired by the blog post [A simple Python implementation of the ReAct pattern for LLMs](https://til.simonwillison.net/llms/python-react-pattern) by [Simon Willison](https://simonwillison.net/).

The ReAct pattern (for Reason+Act) is described in the paper [ReAct: Synergizing Reasoning and Acting in Language Models](https://arxiv.org/abs/2210.03629). It's a pattern where you implement additional actions that an LLM can take - searching Wikipedia or running calculations for example - and then teach it how to request the execution of those actions, and then feed their results back into the LLM.

Additionally, we give the LLM the possibility of using a scratchpad described in the paper [Show Your Work: Scratchpads for Intermediate Computation with Language Models](https://arxiv.org/abs/2112.00114) which improves the ability of LLMs to perform multi-step computations.

We use [llama.cpp](https://github.com/ggerganov/llama.cpp) using the [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) library. Outlines supports llama-cpp-python, but we need to install it ourselves:

```shell
pip install llama-cpp-python
```

We download the model weights by passing the name of the repository on the HuggingFace Hub, and the filenames (or glob pattern):
```python
import llama_cpp
import outlines

# `Llama.from_pretrained` fetches the GGUF file matching `filename` from the
# Hub repository `repo_id`; calling `Llama(...)` directly expects a local path.
llm = llama_cpp.Llama.from_pretrained(
    repo_id="NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF",
    filename="Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf",
    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained(
        "NousResearch/Hermes-2-Pro-Llama-3-8B"
    ),
    n_gpu_layers=-1,
    flash_attn=True,
    n_ctx=8192,
    verbose=False,
)
model = outlines.from_llamacpp(llm)
```

??? note "(Optional) Store the model weights in a custom folder"

    By default the model weights are downloaded to the hub cache, but if we want to store the weights in a custom folder, we pull a quantized GGUF model [Hermes-2-Pro-Llama-3-8B](https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF) by [NousResearch](https://nousresearch.com/) from [HuggingFace](https://huggingface.co/):

    ```shell
    wget https://hf.co/NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/resolve/main/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf
    ```

    We initialize the model:

    ```python
    from llama_cpp import Llama

    llm = Llama("/path/to/model/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf", ...)
    ```

## Build a ReAct agent

In this example, we use two tools:

- wikipedia: \<search term\> - searches Wikipedia and returns the snippet of the first result
- calculate: \<expression\> - evaluates an expression using Python's eval() function

```python
import httpx

def wikipedia(q):
    """Return the snippet of the first Wikipedia search result for `q`.

    Sends a search query to the English Wikipedia API and indexes straight
    into the JSON reply, so a reply without any search hit raises.
    """
    search_params = {
        "action": "query",
        "list": "search",
        "srsearch": q,
        "format": "json"
    }
    reply = httpx.get("https://en.wikipedia.org/w/api.php", params=search_params)
    hits = reply.json()["query"]["search"]
    return hits[0]["snippet"]


def calculate(numexp):
    """Evaluate the Python expression held in `numexp` and return its value.

    NOTE: this relies on `eval`, which executes arbitrary code; only feed it
    input you trust.
    """
    value = eval(numexp)
    return value
```

We define the logic of the agent through a Pydantic class. First, we want the LLM to decide only between the two previously defined tools:

```python
from enum import Enum

# Closed set of tool names the model may choose from. Mixing in `str` makes
# each member compare equal to its plain string value.
class Action(str, Enum):
    wikipedia = "wikipedia"
    calculate = "calculate"
```

Our agent will loop through Thought and Action. We explicitly give the Action Input field so it doesn't forget to add the arguments of the Action. We also add a scratchpad (optional).

```python
from pydantic import BaseModel, Field

# One intermediate step of the loop: notes, a thought, the tool to run
# (restricted to the `Action` enum) and the argument to pass to that tool.
class Reason_and_Act(BaseModel):
    Scratchpad: str = Field(..., description="Information from the Observation useful to answer the question")
    Thought: str = Field(..., description="It describes your thoughts about the question you have been asked")
    Action: Action
    Action_Input: str = Field(..., description="The arguments of the Action.")
```

Our agent will reach a Final Answer. We also add a scratchpad (optional).

```python
# Terminal step of the loop: notes plus the answer itself. The presence of
# the `Final_Answer` key is what distinguishes it from `Reason_and_Act`.
class Final_Answer(BaseModel):
    Scratchpad: str = Field(..., description="Information from the Observation useful to answer the question")
    Final_Answer: str = Field(..., description="Answer to the question grounded on the Observation")
```

Our agent will decide when it has reached a Final Answer and therefore to stop the loop of Thought and Action.

```python
from typing import Union

# Wrapper the generator is constrained to: each turn is either another
# reasoning/tool step or the final answer.
class Decision(BaseModel):
    Decision: Union[Reason_and_Act, Final_Answer]

json_schema = Decision.model_json_schema()
```

We then need to adapt our prompt to the [Hermes prompt format for JSON schema](https://github.com/NousResearch/Hermes-Function-Calling?tab=readme-ov-file#prompt-format-for-json-mode--structured-outputs) and explain the agent logic. We can load a template from a file for that:

```python
from outlines import Template

hermes_prompt = Template.from_file("prompt_templates/react_agent.txt")
```

We define a ChatBot class:

```python
class ChatBot:
    """Holds a growing prompt and runs structured generation over it."""

    def __init__(self, prompt=""):
        # Text accumulated so far; every call appends to it.
        self.prompt = prompt

    def __call__(self, user_prompt):
        """Append `user_prompt` to the stored prompt and return the generation."""
        self.prompt += user_prompt
        return self.execute()

    def execute(self):
        """Generate a `Decision`-constrained completion for the current prompt."""
        decide = outlines.Generator(model, Decision)
        return decide(self.prompt, max_tokens=1024, temperature=0, seed=42)
```

We define a query function:

```python
import json

def query(question, max_turns=5):
    """Answer `question` by looping over Thought / Action / Observation steps.

    Each turn asks the model for a `Decision`. When the decision carries a
    `Final_Answer` it is printed and returned. Otherwise the requested tool is
    run, and the step together with its observation is appended to the
    transcript that is sent on the next turn.

    Args:
        question: The user question.
        max_turns: Maximum number of model calls before giving up.

    Returns:
        The final answer produced by the model, or "No answer found" when no
        final answer was reached within `max_turns` turns.
    """
    # Local import: the snippet only imports `json` at the top.
    import datetime

    # The rendered template does not change between turns, so build it once.
    prompt = hermes_prompt(
        question=question,
        schema=Decision.model_json_schema(),
        today=datetime.datetime.today().strftime('%Y-%m-%d')
    )
    next_prompt = (
        "\n<|im_start|>user\n" + question + "<|im_end|>"
        "\n<|im_start|>assistant\n"
    )
    previous_actions = []
    for i in range(1, max_turns + 1):
        # A fresh bot per turn: `next_prompt` already holds the full transcript.
        bot = ChatBot(prompt=prompt)
        result = bot(next_prompt)
        json_result = json.loads(result)['Decision']

        if "Final_Answer" in json_result:
            scratchpad = json_result["Scratchpad"]
            final_answer = json_result["Final_Answer"]
            print(f"\x1b[34m Scratchpad: {scratchpad} \x1b[0m")
            print(f"\x1b[34m Final Answer: {final_answer} \x1b[0m")
            return final_answer

        # NOTE(review): `i` is at least 1 here, so this always yields "" --
        # confirm whether `i > 1` (keep the scratchpad once an observation
        # exists) was intended.
        scratchpad = json_result['Scratchpad'] if i == 0 else ""
        thought = json_result['Thought']
        action = json_result['Action']
        action_input = str(json_result['Action_Input'])
        print(f"\x1b[34m Scratchpad: {scratchpad} \x1b[0m")
        print(f"\x1b[34m Thought: {thought} \x1b[0m")
        print(f"\x1b[36m  -- running {action}: {action_input}\x1b[0m")

        action_key = action + ": " + action_input
        if action_key in previous_actions:
            observation = "You already run that action. **TRY A DIFFERENT ACTION INPUT.**"
        else:
            try:
                if action == "calculate":
                    # SECURITY: `calculate` uses `eval` on model output; only
                    # run this example in an environment you can afford to break.
                    observation = calculate(action_input)
                elif action == "wikipedia":
                    observation = wikipedia(action_input)
                else:
                    # Not reachable while `Action` constrains the output, but
                    # keeps `observation` defined if the schema ever changes.
                    observation = f"Unknown action: {action}"
            except Exception as e:
                # Best effort: feed the error text back to the model as the
                # observation instead of aborting the loop.
                observation = f"{e}"
        print()
        print(f"\x1b[33m Observation: {observation} \x1b[0m")
        print()
        previous_actions.append(action_key)
        next_prompt += (
            "\nScratchpad: " + scratchpad +
            "\nThought: " + thought +
            "\nAction: " + action +
            "\nAction Input: " + action_input +
            "\nObservation: " + str(observation)
        )
    print("\nFinal Answer: I am sorry, but I am unable to answer your question. Please provide more information or a different question.")
    return "No answer found"
```

We can now test our ReAct agent:

```python
print(query("What's 2 to the power of 10?"))
# Scratchpad:
# Thought: I need to perform a mathematical calculation to find the result of 2 to the power of 10.
#  -- running calculate: 2**10
#
# Observation: 1024
#
# Scratchpad: 2 to the power of 10 is 1024.
# Final Answer: 2 to the power of 10 is 1024.
# 2 to the power of 10 is 1024.
```

```python
print(query("What does England share borders with?"))
# Scratchpad:
# Thought: To answer this question, I will use the 'wikipedia' action to gather information about England's geographical location and its borders.
#  -- running wikipedia: England borders
#
# Observation: Anglo-Scottish <span class="searchmatch">border</span> (Scottish Gaelic: Crìochan Anglo-Albannach) is an internal <span class="searchmatch">border</span> of the United Kingdom separating Scotland and <span class="searchmatch">England</span> which runs for
#
# Scratchpad: Anglo-Scottish border (Scottish Gaelic: Crìochan Anglo-Albannach) is an internal border of the United Kingdom separating Scotland and England which runs for
# Final Answer: England shares a border with Scotland.
# England shares a border with Scotland.
```

As mentioned in Simon's blog post, this is not a very robust implementation at all and there's a ton of room for improvement. But it is lovely how simple it is with a few lines of Python to make these extra capabilities available to the LLM. And now you can run it locally with an open weights LLM.

This example was originally contributed by [Alonso Silva](https://github.com/alonsosilvaallende).


================================================
FILE: docs/examples/read-pdfs.md
================================================
# PDF to structured output with vision language models

A common task with language models is to ask language models questions about a PDF file.

Typically, the output is unstructured text, i.e. "talking" to your PDF.

In some cases, you may wish to extract structured information from the PDF, like tables, lists, citations, etc.

PDFs are difficult to machine read. However, you can simply convert the PDF to images, and then use a vision language model to extract structured information from the images.

This cookbook demonstrates how to

1. Convert a PDF to a list of images
2. Use a vision language model to extract structured information from the images

## Dependencies

You'll need to install these dependencies:

```shell
pip install outlines pillow transformers torch==2.4.0 pdf2image

# Optional, but makes the output look nicer
pip install rich
```

## Import the necessary libraries

```python
from PIL import Image
import outlines
import torch
from transformers import AutoProcessor
from pydantic import BaseModel
from typing import List, Optional
from pdf2image import convert_from_path
import os
from rich import print
import requests
```

## Choose a model

We've tested this example with [Pixtral 12b](https://huggingface.co/mistral-community/pixtral-12b) and [Qwen2-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct).

To use Pixtral:

```python
from transformers import LlavaForConditionalGeneration, LlavaProcessor
model_name="mistral-community/pixtral-12b"
model_class=LlavaForConditionalGeneration
processor_class = LlavaProcessor
```

To use Qwen-2-VL:

```python
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
model_name = "Qwen/Qwen2-VL-7B-Instruct"
model_class = Qwen2VLForConditionalGeneration
processor_class = AutoProcessor
```

You can load your model into memory with:

```python
# This loads the model into memory. On your first run,
# it will have to download the model, which might take a while.
model_kwargs={"device_map": "auto", "torch_dtype": torch.bfloat16}
processor_kwargs={"device_map": "cpu"}
tf_model = model_class.from_pretrained(model_name, **model_kwargs)
tf_processor = processor_class.from_pretrained(model_name, **processor_kwargs)

model = outlines.from_transformers(tf_model, tf_processor)
```

## Convert the PDF to images

We'll use the `pdf2image` library to convert each page of the PDF to an image.

`convert_pdf_to_images` is a convenience function that converts each page of the PDF to an image, and optionally saves the images to disk when `output_dir` is provided.

Note: the `dpi` argument is important. It controls the resolution of the images. High DPI images are higher quality and may yield better results,
but they are also larger, slower to process, and require more memory.

```python
from pdf2image import convert_from_path
from PIL import Image
import os
from typing import List, Optional

def convert_pdf_to_images(
    pdf_path: str,
    output_dir: Optional[str] = None,
    dpi: int = 120,
    fmt: str = 'PNG'
) -> List[Image.Image]:
    """
    Render every page of a PDF as a PIL image.

    Args:
        pdf_path: Path to the PDF file
        output_dir: If given, each page is also written to this directory
            as `page_<n>.<fmt>` (numbering starts at 1)
        dpi: Resolution for the conversion. High DPI is high quality, but also slow and memory intensive.
        fmt: Output format (PNG recommended for quality)

    Returns:
        List of PIL Image objects, one per page
    """
    pages = convert_from_path(pdf_path, dpi=dpi, fmt=fmt)

    # Nothing to write to disk: hand the pages straight back.
    if not output_dir:
        return pages

    os.makedirs(output_dir, exist_ok=True)
    for page_number, page in enumerate(pages, start=1):
        target = os.path.join(output_dir, f'page_{page_number}.{fmt.lower()}')
        page.save(target)

    return pages
```

We're going to use the [Louf & Willard paper](https://arxiv.org/pdf/2307.09702) that described the method that Outlines uses for structured generation.

To download the PDF, run:

```python
# Download the PDF file
pdf_url = "https://arxiv.org/pdf/2307.09702"
response = requests.get(pdf_url)

# Save the PDF locally
with open("louf-willard.pdf", "wb") as f:
    f.write(response.content)
```

Now, we can convert the PDF to a list of images:

```python
# Load the pdf
images = convert_pdf_to_images(
    "louf-willard.pdf",
    dpi=120,
    output_dir="output_images"
)
```

## Extract structured information from the images

The structured output you can extract is exactly the same as everywhere else in Outlines -- you can use regular expressions, JSON schemas, selecting from a list of options, etc.

### Extracting data into JSON

Suppose you wished to go through each page of the PDF, and extract the page description, key takeaways, and page number.

You can do this by defining a JSON schema, and then using `outlines.Generator` to extract the data.

First, define the structure you want to extract:

```python
# Output schema for one PDF page: a free-text description, a list of key
# takeaways, and the page number.
class PageSummary(BaseModel):
    description: str
    key_takeaways: List[str]
    page_number: int
```

Second, we need to set up the prompt. Adding special tokens can be tricky, so we use the transformers processor to apply the special tokens for us. To do so, we specify a list of messages, where each message is a dictionary with a `role` and `content` key.

Images are denoted with `type: "image"`, and text is denoted with `type: "text"`.

```python
messages = [
    {
        "role": "user",
        "content": [
            # The text you're passing to the model --
            # this is where you do your standard prompting.
            {"type": "text", "text": f"""
                Describe the page in a way that is easy for a PhD student to understand.

                Return the information in the following JSON schema:
                {PageSummary.model_json_schema()}

                Here is the page:
                """
            },

            # This is a placeholder, the actual image is passed in when
            # we call the generator function down below.
            {"type": "image", "image": ""},
        ],
    }
]

# Convert the messages to the final prompt
prompt = tf_processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
```

Now we iterate through each image, and extract the structured information:

```python
# Page summarizer function
page_summary_generator = outlines.Generator(model, PageSummary)

for image in images:
    result = page_summary_generator({"text": prompt, "images": image})
    print(result)
```

### Regular expressions to extract the arxiv paper identifier

The [arXiv paper identifier](https://info.arxiv.org/help/arxiv_identifier.html) is a unique identifier for each paper. These identifiers have the format `arXiv:YYMM.NNNNN` (five end digits) or `arXiv:YYMM.NNNN` (four end digits). arXiv identifiers are typically watermarked on papers uploaded to arXiv.

arXiv identifiers are optionally followed by a version number, i.e. `arXiv:YYMM.NNNNNvX`.

We can use a regular expression to define this pattern:

```python
from outlines.types import Regex

paper_regex = Regex(r'arXiv:\d{2}[01]\d\.\d{4,5}(v\d)?')
```

We can build an extractor function from the regex:

```python
id_extractor = outlines.Generator(model, paper_regex)
```

Now, we can extract the arxiv paper identifier from the first image:

```python
arxiv_instruction = tf_processor.apply_chat_template(
    [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": f"""
                Extract the arxiv paper identifier from the page.

                Here is the page:
                """},
                {"type": "image", "image": ""},
            ],
        }
    ],
    tokenize=False,
    add_generation_prompt=True
)

# Extract the arxiv paper identifier
paper_id = id_extractor({"text": arxiv_instruction, "images": images[0]})
```

As of the time of this writing, the arxiv paper identifier is

```
arXiv:2307.09702v4
```

Your version number may be different, but the part before `vX` should match.

### Categorize the paper into one of several categories

`outlines.Generator` also allows the model to select one of several options by providing a Literal type hint with the categories.

Suppose we wanted to categorize the paper into being about "language models", "cell biology", or "other". We would then define the output type as `Literal["llms", "cell biology", "other"]`.

Let's define a few categories we might be interested in:

```python
categories = [
    "llms",
    "cell biology",
    "other"
]
```

Now we can construct the prompt:

```python
categorization_instruction = tf_processor.apply_chat_template(
    [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": f"""
                Please choose one of the following categories
                that best describes the paper.

                {categories}

                Here is the paper:
                """},

                {"type": "image", "image": ""},
            ],
        }
    ],
    tokenize=False,
    add_generation_prompt=True
)
```

Now we can show the model the first page and extract the category:

```python
from typing import Literal

# Build the choice extractor
categorizer = outlines.Generator(model, Literal["llms", "cell biology", "other"])

# Categorize the paper
category = categorizer({"text": categorization_instruction, "images": images[0]})
print(category)
```

Which should return:

```
llms
```

## Additional notes

You can provide multiple images to the model by

1. Adding additional image messages
2. Providing a list of images to the generator

For example, to have two images, you can do:

```python
two_image_prompt = tf_processor.apply_chat_template(
    [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "are both of these images of hot dogs?"},

                # Tell the model there are two images
                {"type": "image", "image": ""},
                {"type": "image", "image": ""},
            ],
        }
    ],
    tokenize=False,
    add_generation_prompt=True
)

# Pass two images to the model
generator = outlines.Generator(model, Literal["hot dog", "not hot dog"])

result = generator({"text": two_image_prompt, "images": [images[0], images[1]]})
print(result)
```

Using the first two pages of the paper (they are not images of hot dogs), we should get

```
not hot dog
```


================================================
FILE: docs/examples/receipt-digitization.md
================================================
# Receipt Data Extraction with VLMs

## Setup

You'll need to install the dependencies:

```shell
pip install outlines torch==2.4.0 transformers accelerate pillow rich
```

## Import libraries

Load all the necessary libraries:

```python
# LLM stuff
import outlines
import torch
from transformers import AutoProcessor
from pydantic import BaseModel, Field
from typing import Literal, Optional, List

# Image stuff
from PIL import Image
import requests

# Rich for pretty printing
from rich import print
```

## Choose a model

This example has been tested with `mistral-community/pixtral-12b` ([HF link](https://huggingface.co/mistral-community/pixtral-12b)) and `Qwen/Qwen2-VL-7B-Instruct` ([HF link](https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct)).

We recommend Qwen-2-VL as we have found it to be more accurate than Pixtral.

If you want to use Qwen-2-VL, you can do the following:

```python
# To use Qwen-2-VL:
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
model_name = "Qwen/Qwen2-VL-7B-Instruct"
model_class = Qwen2VLForConditionalGeneration
processor_class = AutoProcessor
```

If you want to use Pixtral, you can do the following:

```python
# To use Pixtral:
from transformers import LlavaForConditionalGeneration, LlavaProcessor
model_name="mistral-community/pixtral-12b"
model_class=LlavaForConditionalGeneration
processor_class = LlavaProcessor
```

## Load the model

Load the model into memory:

```python
model_kwargs={"device_map": "auto", "torch_dtype": torch.bfloat16}
processor_kwargs={"device_map": "cuda"}
tf_model = model_class.from_pretrained(model_name, **model_kwargs)
tf_processor = processor_class.from_pretrained(model_name, **processor_kwargs)

model = outlines.from_transformers(tf_model, tf_processor)
```

## Image processing

Images can be quite large. In GPU-poor environments, you may need to resize the image to a smaller size.

Here's a helper function to do that:

```python
def load_and_resize_image(image_path, max_size=1024):
    """
    Open an image and shrink it, if needed, so neither side exceeds `max_size`.

    The aspect ratio is preserved and images that already fit are returned
    unchanged (no upscaling).

    Args:
        image_path: Path to the image file
        max_size: Maximum dimension (width or height) of the output image

    Returns:
        PIL Image: The loaded image, resized when it was larger than `max_size`
    """
    img = Image.open(image_path)
    w, h = img.size

    # Ratio that brings the longer side down to `max_size`.
    ratio = min(max_size / w, max_size / h)

    # Already small enough: return as loaded.
    if ratio >= 1:
        return img

    target_size = (int(w * ratio), int(h * ratio))
    return img.resize(target_size, Image.Resampling.LANCZOS)
```

You can change the resolution of the image by changing the `max_size` argument. Small max sizes will make the image more blurry, but processing will be faster and require less memory.

## Load an image

Load an image and resize it. We've provided a sample image of a Trader Joe's receipt, but you can use any image you'd like.

Here's what the image looks like:

![Trader Joe's receipt](./images/trader-joes-receipt.jpg)

```python
# Path to the image
image_path = "https://raw.githubusercontent.com/dottxt-ai/outlines/refs/heads/main/docs/cookbook/images/trader-joes-receipt.jpg"

# Download the image
response = requests.get(image_path)
with open("receipt.png", "wb") as f:
    f.write(response.content)

# Load + resize the image
image = load_and_resize_image("receipt.png")
```

## Define the output structure

We'll define a Pydantic model to describe the data we want to extract from the image.

In our case, we want to extract the following information:

- The store name
- The store address
- The store number
- A list of items, including the name, quantity, price per unit, and total price
- The tax
- The total
- The date
- The payment method

Most fields are optional, as not all receipts contain all information.

```python
# A single line item printed on the receipt.
# Fields typed Optional[...] accept None, for receipts that do not show them.
class Item(BaseModel):
    name: str
    quantity: Optional[int]
    price_per_unit: Optional[float]
    total_price: Optional[float]

# Top-level structure describing everything we extract from one receipt.
class ReceiptSummary(BaseModel):
    store_name: str
    store_address: str
    store_number: Optional[int]
    # One entry per line item on the receipt
    items: List[Item]
    tax: Optional[float]
    total: Optional[float]
    # Date is in the format YYYY-MM-DD. We can apply a regex pattern to ensure it's formatted correctly.
    date: Optional[str] = Field(pattern=r'\d{4}-\d{2}-\d{2}', description="Date in the format YYYY-MM-DD")
    # Restricted to a fixed set of choices
    payment_method: Literal["cash", "credit", "debit", "check", "other"]
```

## Prepare the prompt

We'll use the `tf_processor` to convert the image and the text prompt into a format that the model can understand. Practically,
this is the code that adds user, system, assistant, and image tokens to the prompt.

```python
# Set up the content you want to send to the model
messages = [
    {
        "role": "user",
        "content": [
            {
                # The image is provided as a PIL Image object
                "type": "image",
                "image": image,
            },
            {
                "type": "text",
                "text": f"""You are an expert at extracting information from receipts.
                Please extract the information from the receipt. Be as detailed as possible --
                missing or misreporting information is a crime.

                Return the information in the following JSON schema:
                {ReceiptSummary.model_json_schema()}
            """},
        ],
    }
]

# Convert the messages to the final prompt
prompt = tf_processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
```

If you are curious, the final prompt that is sent to the model looks (roughly) like this:

```
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>
You are an expert at extracting information from receipts.
Please extract the information from the receipt. Be as detailed as
possible -- missing or misreporting information is a crime.

Return the information in the following JSON schema:

<JSON SCHEMA OMITTED>
<|im_end|>
<|im_start|>assistant
```

## Run the model

```python
# Prepare a function to process receipts
receipt_summary_generator = outlines.Generator(model, ReceiptSummary)

# Generate the receipt summary
result = receipt_summary_generator(
    {"text": prompt, "images": image},
    max_new_tokens=1024
)
print(result)
```

## Output

The output should look like this:

```
{
  "store_name": "Trader Joe's",
  "store_address": "401 Bay Street, San Francisco, CA 94133",
  "store_number": 0,
  "items": [
    {"name": "BANANA EACH", "quantity": 7, "price_per_unit": 0.23, "total_price": 1.61},
    {"name": "BAREBELLS CHOCOLATE DOUG", "quantity": 1, "price_per_unit": 2.29, "total_price": 2.29},
    {"name": "BAREBELLS CREAMY CRISP", "quantity": 1, "price_per_unit": 2.29, "total_price": 2.29},
    {"name": "BAREBELLS CHOCOLATE DOUG", "quantity": 1, "price_per_unit": 2.29, "total_price": 2.29},
    {"name": "BAREBELLS CARAMEL CASHEW", "quantity": 2, "price_per_unit": 2.29, "total_price": 4.58},
    {"name": "BAREBELLS CREAMY CRISP", "quantity": 1, "price_per_unit": 2.29, "total_price": 2.29},
    {"name": "SPINDRIFT ORANGE MANGO 8", "quantity": 1, "price_per_unit": 7.49, "total_price": 7.49},
    {"name": "Bottle Deposit", "quantity": 8, "price_per_unit": 0.05, "total_price": 0.4},
    {"name": "MILK ORGANIC GALLON WHOL", "quantity": 1,"price_per_unit": 6.79,"total_price": 6.79},
    {"name": "CLASSIC GREEK SALAD", "quantity": 1, "price_per_unit": 3.49, "total_price": 3.49},
    {"name": "COBB SALAD", "quantity": 1, "price_per_unit": 5.99, "total_price": 5.99},
    {"name": "PEPPER BELL RED XL EACH", "quantity": 1, "price_per_unit": 1.29, "total_price": 1.29},
    {"name": "BAG FEE.", "quantity": 1, "price_per_unit": 0.25, "total_price": 0.25},
    {"name": "BAG FEE.", "quantity": 1, "price_per_unit": 0.25, "total_price": 0.25},
  ],
  "tax": 0.68,
  "total": 41.98,
  "date": "2023-11-04",
  "payment_method": "debit"
}
```

Voila! You've successfully extracted information from a receipt using an LLM.

## Bonus: roasting the user for their receipt

You can roast the user for their receipt by adding a `roast` field to the end of the `ReceiptSummary` model.

```python
class ReceiptSummary(BaseModel):
    ...
    roast: str
```

which gives you a result like

```
{
    ...
    "roast": "You must be a fan of Trader Joe's because you bought enough
    items to fill a small grocery bag and still had to pay for a bag fee.
    Maybe you should start using reusable bags to save some money and the
    environment."
}
```

Qwen is not particularly funny, but worth a shot.


================================================
FILE: docs/examples/simtom.md
================================================
# Build perspective-taking agents with SimToM

Prompting strategies like Chain-of-Thought (CoT) can improve LLMs' reasoning capabilities. However, they underwhelm in tasks that require keeping track of inconsistent world states. [SimToM](https://arxiv.org/abs/2311.10227) proposes a simple, two-stage prompting framework for LLMs inspired by Simulation Theory. The authors showed that this approach outperforms zero-shot prompting and CoT on ToMI and BigToM, two benchmarks with Theory of Mind questions.

In this example, we will implement SimToM with a few lines of code using Outlines' prompt templating and structured generation capabilities.

## How SimToM works

SimToM calls an LLM with two consecutive prompts:

1. **Perspective-taking**: The first prompt receives a `story` and a `character`. The goal is to understand the situation based on the character's point of view and filter out the rest of the story.
2. **Question-Answering**: The second prompt receives the character's point of view from the previous step and tasks the LLM to answer a question using that context.

![Figure 2 in the paper](./images/simtom.png)

## Outlines implementation

To implement SimToM with Outlines, we will need to:

1. Write the prompts with [prompt templates](https://dottxt-ai.github.io/outlines/latest/reference/prompting/).
2. Define the JSON object each prompt will return using Pydantic.
3. Generate responses with a Mistral model using the [transformers integration](https://dottxt-ai.github.io/outlines/latest/reference/models/transformers/).

Let's dive into it!

### Using Prompt Templates

The authors have shared their code, prompts and data in [this GitHub repository](https://github.com/shawnsihyunlee/simulatedtom). Below, we define in Outlines the prompts they used for the ToMI dataset:

```python
from outlines import Template

perspective_taking = Template.from_file("prompt_templates/simtom_prospective_taking.txt")
simulation = Template.from_file("prompt_templates/simtom_simulation.txt")
```

### JSON Structured Generation

Outlines guarantees that the LLM will return a valid JSON object, which we can specify as a Pydantic model.

We will need two Pydantic models for SimToM, one for each prompt:

```python
from pydantic import BaseModel, Field
from typing import List

class PerspectiveTaking(BaseModel):
    """This is for the first prompt."""
    character: str = Field(description="The character we extract the events for.")
    events: List[str] = Field(description="All events that the character knows about.")

class Simulation(BaseModel):
    """This is for the second prompt."""
    answer: str
```

### Calling an LLM

Let's try SimToM with an example from the ToMI dataset:

```python
story = """
1 Aria entered the front_yard.
2 Aiden entered the front_yard.
3 The grapefruit is in the green_bucket.
4 Aria moved the grapefruit to the blue_container.
5 Aiden exited the front_yard.
6 Noah entered the playroom.
"""
question = "7 Where was the grapefruit at the beginning?"
character = "Aria"
```

We load `Mistral-7B-Instruct-v0.3`, create the prompt using the template we defined earlier, and generate a structured response. As a reminder, the goal of the first call is to get all the events a character, `Aria`, knows about.

```python
import transformers
import outlines
# Load an LLM from Hugging Face
MODEL_NAME = "mistral-community/Mistral-7B-Instruct-v0.3"
model = outlines.from_transformers(
    transformers.AutoModelForCausalLM.from_pretrained(MODEL_NAME),
    transformers.AutoTokenizer.from_pretrained(MODEL_NAME),
)

perspective_prompt = perspective_taking(story=story, character=character)

# Call Mistral 7B with the first prompt
generator = outlines.Generator(model, PerspectiveTaking)
perspective = generator(perspective_prompt, max_new_tokens=1024)

print(perspective)
# {'character': 'Aria', 'events': ['1 Aria entered the front_yard.', '3 The grapefruit is in the green_bucket.', '4 Aria moved the grapefruit to the blue_container.']}
```

Not bad! We will now generate the second prompt with those events.

```python
import json

sim_prompt = simulation(events=json.loads(perspective)["events"], name=character, question=question)

# Call Mistral 7B with the second prompt
generator = outlines.Generator(model, Simulation)
result = generator(sim_prompt, max_new_tokens=1024)

print(result)
# {'answer': 'green_bucket'}
```

And this is it! SimToM could be useful in agentic workflows, where agents must act based on what they know, not all available information. One caveat of SimToM is that the perspective-taking step may remove important information, leading to wrong results. As the authors note in their paper, it can feature as a simple and effective baseline for evaluating LLMs on Theory of Mind reasoning tasks.


================================================
FILE: docs/examples/structured_generation_workflow.md
================================================
# Structured Generation Workflow: Generating Synthetic Phone Numbers

This is a condensed version of [Coding for Structured Generation with LLMs](https://blog.dottxt.co/coding-for-structured-generation.html).

For this example we're going to be building an LLM program to generate **synthetic data** in the form of realistic looking phone numbers for Washington State. Using an LLM for this task *is a bit overkill* since we could just as easily accomplish this with a tool like [Faker](https://fakerjs.dev/), but this example still serves as a useful way to demonstrate a workflow for using structured generation.

## Unstructured approach

Before diving into how to use structured generation for this task, let's start with an unstructured example. We begin by loading our model:

```python
import outlines
import transformers

model_name = 'microsoft/Phi-3-mini-4k-instruct'
model = outlines.from_transformers(
    transformers.AutoModelForCausalLM.from_pretrained(model_name),
    transformers.AutoTokenizer.from_pretrained(model_name)
)
```

Next we need a prompt for this model. Since we're focusing on structured generation, we won't be engaging in any form of "prompt hacking" and will be leaving this prompt untouched for the rest of this example.

```python
prompt_phone = """
    Please generate a realistic phone number for Washington State in the following format
    (555) 555-5555
"""
```

With our prompt ready we can now generate a few example phone numbers (the snippet below loops 3 times; the sample output shown comes from a run of 10):

```python
phone_generator_unstruct = outlines.Generator(model)
for _ in range(3):
    print(phone_generator_unstruct(prompt_phone, max_new_tokens=12))
```

> I'd be happy to help you generate a realistic phone\
I cannot generate a real phone number as I'm just\
I'm an AI and don't have the ability\
Sure! Here is a randomly generated phone number in the format\
Here's a phone number that fits the format for a\
In Washington State, phone numbers typically have a three-dig\
Here are a few examples of phone numbers that could be considered\
I'd be happy to help generate a realistic phone number\
I'd be happy to help you generate a random phone\
Based on the format you provided, a realistic phone number for\

As we can see, none of these outputs are even phone numbers!

Let's see if we can improve this using structured generation.

## The Structured Generation Workflow

In order to solve this problem we're going to introduce a *Structured Generation Workflow* outlined in this image:

!["Visual of Structured Generation Workflow"](./images/coding_structure_diagram.png)

Let's step through this:

### Real example

We start with a real example phone number, in this case for the Seattle Public Library, that we can use to verify the structure we are creating.

```python
phone_number = "(206) 386-4636"
```

For a simple example like this, we'll just be using a single phone number. For more complex tasks, it can be helpful to have more examples.

### Draft Structure

The next step in the process is for us to define a simple regex that we feel correctly models our real data.

```python
from outlines.types import Regex

phone_regex_1 = Regex(r'\([0-9]{3}\) [0-9]{3}-[0-9]{4}')
```

Next we need to validate this regex against our real data.

### Validate by matching examples

Whenever writing non-trivial code with structured generation it is *essential* that you first validate the code against your real data example(s).

We'll start with a simple method of validation: just checking that our regex matches the data.

```python
import re

re.match(phone_regex_1.pattern, phone_number)
# <re.Match object; span=(0, 14), match='(206) 386-4636'>

```

Now that we have a match, we can move on to generating structured output!

### Generate Structure

We're ready to see if structured generation can make an improvement over our initial unstructured approach:

```python
phone_generator_v1 = outlines.Generator(model, phone_regex_1)

for _ in range(3):
    print(phone_generator_v1(prompt_phone))
```
> (206) 555-1234\
(206) 555-1234\
(206) 555-1234\
(206) 555-1234\
(206) 555-1234\
(206) 555-1234\
(206) 123-4567\
(206) 555-1234\
(206) 555-1234\
(206) 555-1234

At least we have phone numbers! But I think we can do better!

### Inspect output

In this case the model *did* create phone numbers and, impressively, got the area code correct. So using structured generation did improve things. However these numbers are pretty boring. Let's improve that structure!

## Iteration

We've walked through the loop once, so we can go quickly now through each iteration.

We start by improving our structure:

```python
phone_regex_2 = Regex(r'\([0-9]{3}\) [2-46-9]{3}-[02-9]{4}')
```

Before rushing to another round of generation, let's validate this new regex. We'll add just a bit more sophistication over our last check:

```python
re.match(phone_regex_2.pattern, phone_number)[0] == phone_number
# True
```
Now that we've validated, let's generate with this new regex!

```python
phone_generator_v2 = outlines.Generator(model, phone_regex_2)

for _ in range(3):
    print(phone_generator_v2(prompt_phone))
```

> (206) 867-5309\
(206) 666-7777\
(206) 444-3333\
(206) 444-3333\
(206) 943-2222\
(206) 323-6789\
(206) 444-3333\
(206) 867-5309\
(206) 466-2255\
(206) 222-3333

Better, but I don't like those repeated sequences. Like good software developers, let's iterate again!

## Reiteration - with debugging

Here's a fancier regex that should give us more interesting results:

```python
phone_regex_3_error = r'\([0-9]{3}\) [2-4][7-9][4-6]-[3-6][2-8][1-4]'
```

This looks good to me, but there's a subtle bug, which is why we *always* need to validate our structure against real data. This time we'll make our validator do a bit more work to verify the correct string is matched:

```python
# Run the regex once, then compare the matched text with the full example:
# a successful match alone does not prove the whole string was matched.
match_result = re.match(phone_regex_3_error, phone_number)
if not match_result:
    print("Regex fails match")
else:
    matched_string = match_result[0]
    if matched_string == phone_number:
        print("Successful match")
    else:
        print(f"Error {matched_string} != {phone_number}")
```
This prints out:
>  Error (206) 386-463 != (206) 386-4636

Ah! We were missing the last digit, let's fix that and regenerate:

```python
phone_regex_3_fixed = Regex(r'\([0-9]{3}\) [2-4][7-9][4-6]-[3-6][2-8][1-4][6-9]')
phone_generator_v3 = outlines.Generator(model, phone_regex_3_fixed)

for _ in range(3):
    print(phone_generator_v3(prompt_phone))
```

>(206) 494-3216\
(206) 374-6218\
(206) 494-3337\
(206) 476-3216\
(206) 484-3548\
(206) 495-3218\
(206) 494-5517\
(206) 375-4636\
(206) 384-6216\
(206) 385-6218

Much better!

Now you've seen a quick example of the structured generation workflow, which can be used as the basis for building and iterating on much larger structured generation tasks!


================================================
FILE: docs/features/advanced/backends.md
================================================
---
title: Structured Generation Backends
---

# Structured Generation Backends

Outlines relies on a structured generation backend to control text generation for steerable models such that they conform to the output type provided. One of those backends is of course `outlines-core`, but you also have access to two other libraries that fulfill the same purpose: `llguidance` and `xgrammar`.

## Overview

To select the backend to use for your generation, provide a value for the `backend` argument when calling a model or a generator.

For instance:

```python
from typing import Literal
import outlines
from transformers import AutoModelForCausalLM, AutoTokenizer

output_type = Literal["Paris", "London", "Rome", "Berlin"]

model = outlines.from_transformers(
    AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
    AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
)

result = model("What is the capital of France?", output_type, backend="llguidance")
print(result) # 'Paris'

# Same selection, this time through a reusable generator
generator = outlines.Generator(model, output_type)
result = generator("What is the capital of France?", backend="xgrammar")
print(result) # 'Paris'
```

If you do not provide a value for the `backend` argument, the default value will be used. The default value depends on the output type:

- JSON schema: `outlines_core`
- Regex: `outlines_core`
- Context-free grammar: `llguidance`

## Features matrix

As mentioned previously, selecting the structured generation backend is only applicable to steerable models, so `Transformers`, `LlamaCpp` and `MLXLM`. Additionally, some backends do not support some of those models or some output types.

| | outlines_core | llguidance | xgrammar |
|---|---|---|---|
| **Models** | | | |
| Transformers | ✅ | ✅ | ✅ |
| LlamaCpp | ✅ | ✅ | ❌ |
| MLXLM | ✅ | ✅ | ✅ |
| **Output Types** | | | |
| JSON Schema | ✅ | ✅ | ✅ |
| Regex | ✅ | ✅ | ✅ |
| Grammar | ❌ | ✅ | ✅ |


================================================
FILE: docs/features/advanced/logits_processors.md
================================================
---
title: Logits Processors
---

# Logits Processors

Logits processors are objects that control text generation by modifying the probability distribution of possible next tokens. They do this by adjusting the logits (raw model outputs) at each generation step, effectively biasing the model's token selection.

Processors can be used to:

1. Generate structured output (e.g., JSON that follows a specific schema)
2. Prevent the model from generating specific words or tokens
3. Implement custom token sampling strategies

## Overview

Outlines uses logits processors with steerable models — models that run locally and allow fine-grained control over the generation process. When using such models in Outlines, the output type provided is turned into a logits processor that is then passed to the inference engine.

There are three models that support logits processors:

- LlamaCpp
- MLXLM
- Transformers

Instead of providing an output type that will be turned into a logits processor, it is possible to directly provide a logits processor. To do so, you must create a `Generator` instance using the `processor` keyword argument. You cannot directly call the model with a logits processor.

For instance:

```python
import transformers
from outlines import Generator, from_transformers
from outlines.processors import RegexLogitsProcessor

# Create a model
model = from_transformers(
    transformers.AutoModelForCausalLM.from_pretrained("NousResearch/Hermes-2-Pro-Llama-3-8B"),
    transformers.AutoTokenizer.from_pretrained("NousResearch/Hermes-2-Pro-Llama-3-8B")
)

# Create a regex logits processor that only returns hex unicode notations
logits_processor = RegexLogitsProcessor(r"U\+[0-9A-Fa-f]{4,6}", model.tokenizer, model.tensor_library_name)

# Create a generator with the logits processor and use it to generate text
generator = Generator(model, processor=logits_processor)
response = generator("What's the unicode for the hugging face emoji")

print(response) # U+1F917
```

## Creating Custom Logits Processors

You can create your own logits processor by subclassing the `OutlinesLogitsProcessor` class. This allows you to implement specific logic to modify logits as needed.
Your logits processor needs to implement the `process_logits` method to modify the logits.
`process_logits` accepts:

- `input_ids`: the ids of the tokens of the existing sequences in a 2D tensor.
- `logits`: the logits for the current generation step in a 2D tensor.

In the example below, we create a custom logits processor to force the model to provide a response using only binary representation (so only the tokens for 0 and 1 are allowed):

```python
from outlines.processors.base_logits_processor import OutlinesLogitsProcessor, TensorType
from outlines import Generator, from_transformers
import transformers

ALLOWED_TOKENS = [15, 16]  # token IDs corresponding to '0' and '1' in the model's vocabulary

# Subclass OutlinesLogitsProcessor
class BinaryLogitsProcessor(OutlinesLogitsProcessor):
    """Logits processor that only lets the tokens listed in ALLOWED_TOKENS be selected."""

    def process_logits(self, input_ids: TensorType, logits: TensorType) -> TensorType:
        """Set the logits of every token outside ALLOWED_TOKENS to -inf.

        `input_ids` is not used by this processor. `logits` is modified in
        place and returned.
        """
        # Create a mask for all tokens
        mask = self.tensor_adapter.boolean_ones_like(logits)
        # Set mask to False for the allowed tokens
        for token_id in ALLOWED_TOKENS:
            mask[:, token_id] = False
        # Set non-allowed tokens to -inf so they are not selected
        logits[mask] = float("-inf")
        return logits

# Create a regular model
tf_tokenizer = transformers.AutoTokenizer.from_pretrained("NousResearch/Hermes-2-Pro-Llama-3-8B")
tf_model = transformers.AutoModelForCausalLM.from_pretrained("NousResearch/Hermes-2-Pro-Llama-3-8B")
model = from_transformers(tf_model, tf_tokenizer)

# Instantiate your custom logits processor
logits_processor = BinaryLogitsProcessor(model.tensor_library_name)

prompt = "Write the number 47 in binary. For example, 1010 is the binary representation of 10. Answer just with the binary number composed of 0s and 1s."
formatted_prompt = tf_tokenizer.apply_chat_template(
    [{"role": "user", "content": prompt}],
    tokenize=False
)

# Create a generator with the custom logits processor instance and use it to generate text
generator = Generator(model, processor=logits_processor)
response = generator(formatted_prompt)

print(response) # "101111"
```


================================================
FILE: docs/features/core/generator.md
================================================
---
title: Generator API
---

# Generator


The `Generator` class is the core component of Outlines v1. `Generator` accepts a [model](../models/index.md) and an optional [output type](../core/output_types.md). If no output type is provided, the `Generator` will return unstructured text.

!!! note

    `Generator` is new as of Outlines v1, and replaces previous generator constructors:

    - `generate.cfg`
    - `generate.choice`
    - `generate.format`
    - `generate.fsm`
    - `generate.json`
    - `generate.regex`
    - `generate.text`

## Methods

Generators implement the same methods as models:

- `__call__`
- `batch`
- `stream`

All of them take a single positional argument: the [model input](../core/inputs.md) from which text is generated. Unlike the equivalent methods of models, you do not need to provide an output type as it has already been defined when initializing the generator.

## Basic Usage

```python
from outlines import Generator, from_transformers
import transformers

# Initialize a model
model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"
model = from_transformers(
    transformers.AutoModelForCausalLM.from_pretrained(model_name),
    transformers.AutoTokenizer.from_pretrained(model_name),
)

# Create a generator for plain text
generator = Generator(model)
result = generator("Write a short poem about AI.")

# Print the result
print(result)
```

## Structured Generation

```python
from pydantic import BaseModel
from outlines import Generator, from_transformers
import transformers

# Define a Pydantic model for structured output
class BookRecommendation(BaseModel):
    title: str
    author: str
    year: int

# Initialize a model
model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"
model = from_transformers(
    transformers.AutoModelForCausalLM.from_pretrained(model_name),
    transformers.AutoTokenizer.from_pretrained(model_name),
)

# Create a generator for JSON output
generator = Generator(model, BookRecommendation)

# Generate a book recommendation
result = generator("Recommend a science fiction book.")

# Parse the JSON result into a Pydantic model
book = BookRecommendation.model_validate_json(result)
print(f"{book.title} by {book.author} ({book.year})")
```

## Parameters

- `model`: The language model to use for generation
- `output_type`: Optional. The type of output to generate

## Generation Parameters

When calling the generator, you can pass additional parameters to control the generation process. These parameters are passed through to the underlying model, so they depend on the specific model being used.

Common parameters for most models include:

- `max_new_tokens`: Maximum number of tokens to generate
- `temperature`: Controls randomness (higher values = more random)
- `top_p`: Controls diversity via nucleus sampling
- `stop_strings`: String or list of strings at which to stop generation

Example:
```python
result = generator(
    "Write a short story.",
    max_new_tokens=200,
    temperature=0.7,
    top_p=0.9,
    stop_strings=["THE END", "###"]
)
```

## Return Value

The generator always returns a raw string containing the generated text. When generating structured outputs, you need to parse this string into the desired format.

Unlike in Outlines v0, where the return type could be a parsed object, in v1 you are responsible for parsing the output when needed:

```python
# Outlines v1 approach
from pydantic import BaseModel
from outlines import Generator

class Person(BaseModel):
    name: str
    age: int

generator = Generator(model, Person)
result = generator("Generate a person:")

# Parse the result yourself
person = Person.model_validate_json(result)
```

::: outlines.generator.Generator


================================================
FILE: docs/features/core/inputs.md
================================================
---
title: Model Inputs
---

# Model Inputs

Outlines models accept various types of inputs to generate text. The input format depends on the capabilities of the underlying model and the type of task you want to perform. The most basic type of input is a single string prompt, which is accepted by all models.

## Overview

The model input is the first argument of the `__call__`, `stream` and `batch` methods of both models and generators.

There are 3 types of model inputs:

- **Text prompts** - Simple strings
- **Multimodal inputs** - List containing a string prompt along with assets
- **Chat inputs** - `Chat` instances containing messages

## Text Prompts

The simplest form of input is a plain text string. This works with all models and is suitable for standard text generation tasks.

```python
import outlines
from transformers import AutoModelForCausalLM, AutoTokenizer

# Create a model
model = outlines.from_transformers(
    AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
    AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
)

# Simple text prompt
response = model("What's the capital of France?", max_new_tokens=20)
print(response)  # 'Paris'
```

## Multimodal Inputs (Vision)

For models that support them, you can provide a list containing a text prompt and one or more assets.

There are 3 types of assets defined in Outlines:

- `Image`: contains a PIL Image
- `Video`: contains any object (you must choose a format that is supported by your model)
- `Audio`: contains any object (you must choose a format that is supported by your model)

Among those, `Image` is by far the most important as multiple models support vision inputs.

For instance with vision input:

```python
import io
import requests
import PIL
import outlines
import openai
from outlines.inputs import Image

# Create the model
model = outlines.from_openai(
    openai.OpenAI(),
    "gpt-4o"
)

# Function to get an image
def get_image(url):
    r = requests.get(url)
    return PIL.Image.open(io.BytesIO(r.content))

# Create the prompt containing the text and the image
prompt = [
    "Describe the image",
    Image(get_image("https://picsum.photos/id/237/400/300"))
]

# Call the model to generate a response
response = model(prompt, max_tokens=50)
print(response) # 'This is a picture of a black dog.'
```

## Chat Inputs

For conversational models, you can use the `Chat` class to provide a conversation history with multiple messages.

A `Chat` instance is instantiated with an optional list of messages. Each message must be a dictionary containing two mandatory keys:

- `role`: must be one of `system`, `assistant` or `user`
- `content`: must be either a string or a multimodal input (if the model supports it)

For instance:

```python
import io
import requests
import PIL
from outlines.inputs import Chat, Image

# Function to get an image
def get_image(url):
    r = requests.get(url)
    return PIL.Image.open(io.BytesIO(r.content))

# Create the chat input
prompt = Chat([
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": ["Describe the image", Image(get_image("https://picsum.photos/id/237/400/300"))]
    },
])
print(prompt)
# {'role': 'system', 'content': 'You are a helpful assistant.'}
# {'role': 'user', 'content': ['Describe the image', Image(image=<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=400x300 at 0x7FFA526CCC40>)]}
```

After having created a `Chat` instance, you can add one or several messages thanks to the `append` and `extend` methods. You can also remove the last message of the Chat with the `pop` method.

For instance:

```python
from outlines.inputs import Chat

# Create the chat input
prompt = Chat([
    {"role": "system", "content": "You are a helpful assistant."},
])

# Add a message
prompt.append({"role": "user", "content": "How are you doing today?"})
print(prompt)
# {'role': 'system', 'content': 'You are a helpful assistant.'}
# {'role': 'user', 'content': 'How are you doing today?'}

# Remove the last message
last_message = prompt.pop()
print(last_message)
# {'role': 'user', 'content': 'How are you doing today?'}
print(prompt)
# {'role': 'system', 'content': 'You are a helpful assistant.'}

# Add several messages
prompt.extend([
    {"role": "user", "content": "How are you doing today?"},
    {"role": "assistant", "content": "Excellent, thanks!"}
])
print(prompt)
# {'role': 'system', 'content': 'You are a helpful assistant.'}
# {'role': 'user', 'content': 'How are you doing today?'}
# {'role': 'assistant', 'content': 'Excellent, thanks!'}
```

Finally, there are three convenience methods to easily add a message:

- add_system_message
- add_user_message
- add_assistant_message

As the role is already set, you only need to provide the content.

For instance:

```python
from outlines.inputs import Chat

# Create the chat input
prompt = Chat()

prompt.add_system_message("You are a helpful assistant.")
prompt.add_user_message("How are you doing today?")
prompt.add_assistant_message("Excellent, thanks!")

print(prompt)
# {'role': 'system', 'content': 'You are a helpful assistant.'}
# {'role': 'user', 'content': 'How are you doing today?'}
# {'role': 'assistant', 'content': 'Excellent, thanks!'}
```

## Batching

In the case of batching, for models that support it, you just have to provide several instances of the model inputs described above in a list.

For instance:

```python
import outlines
from transformers import AutoModelForCausalLM, AutoTokenizer

# Create model
model = outlines.from_transformers(
    AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
    AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
)

# Create a list of prompts that will be used in a single batch
prompts = [
    "What's the capital of Lithuania?",
    "What's the capital of Latvia?",
    "What's the capital of Estonia?"
]

# Call it to generate text
result = model.batch(prompts, max_new_tokens=20)
print(result) # ['Vilnius', 'Riga', 'Tallinn']
```


================================================
FILE: docs/features/core/output_types.md
================================================
---
title: Output Types
---

# Output Types

Outlines provides a simple and intuitive way of defining the output structure of text generation. Possible output formats include basic Python types, multiple-choices, JSON schemas, regular expressions and context-free grammars.

## Overview

Outlines models accept a __prompt__ and an __output type__ when they are invoked, as well as additional inference keyword arguments that are forwarded on to the underlying model.

Output types can be from the general Python ecosystem, including:

- Most native Python types, such as `int` or `str`
- Types from the `typing` and `enum` modules, such as `Literal`, `List`, `Dict`, `Enum`, etc.
- Types from popular third-party libraries such as Pydantic or GenSON.

Outlines also provides special classes for certain output structures (more details below):

- Multiple choices with `Choice`
- JSON schemas with `JsonSchema`
- Regular expressions with `Regex`
- Context-free grammars with `CFG`

The general idea is that you should provide as an output type what you would give as the type hint of the return type of a function.

Consider the following functions for instance:

```python
from datetime import date
from typing import Dict, List, Literal, Union
from pydantic import BaseModel

class Character(BaseModel):
    name: str
    birth_date: date
    skills: Union[Dict, List[str]]

def give_int() -> int:
    ...

def pizza_or_burger() -> Literal["pizza", "burger"]:
    ...

def create_character() -> Character:
    ...
```

With an Outlines model, you can generate text that respects the type hints above by providing those as the output type:

```python
model("How many minutes are there in one hour", int) # "60"
model("Pizza or burger", Literal["pizza", "burger"]) # "pizza"
model("Create a character", Character, max_new_tokens=100) # '{"name": "James", "birth_date": "1980-05-10", "skills": ["archery", "negotiation"]}'
```

An important difference with function type hints though is that an Outlines generator always returns a string.
You have to cast the response into the type you want yourself.

For instance:

```python
result = model("Create a character", Character, max_new_tokens=100)
casted_result = Character.model_validate_json(result)
print(result) # '{"name": "Aurora", "birth_date": "1990-06-15", "skills": ["Stealth", "Diplomacy"]}'
print(casted_result) # name=Aurora birth_date=datetime.date(1990, 6, 15) skills=['Stealth', 'Diplomacy']
```

## Output Type Categories

We can group possible output types in several categories based on the use case they correspond to. While most of those types are native Python types or types coming from well-known third-party libraries, there are four Outlines-specific types: `Choice`, `JsonSchema`, `Regex` and `CFG`. Their use is explained below.

### Basic Python Types

The most straightforward form of structured generation is to return an answer that conforms to a given basic type such as an int or a python list. You can use the basic Python types and the types from the `typing` library. For instance:

```python
from typing import Dict

output_type = float # example of valid value: "0.05"
output_type = bool # example of valid value: "True"
output_type = Dict[int, str] # example of valid value: "{1: 'hello', 2: 'there'}"
```

You can combine types to create more complex response formats by relying on collection types and types such as `Union` and `Optional`. Let's consider for instance the output type below used to represent semi-structured data:

```python
from typing import Dict, List, Optional, Tuple, Union

output_type = Dict[str, Union[int, str, List[Tuple[str, Optional[float]]]]]
```

Values created with this output type would be dictionaries with strings as keys and values made of either an integer, a string or a list of two-element tuples, each containing a string and either a float or `None`. Example of a valid response for text generated with this output type (it would be contained in a string):

```python
{
    "name": "Alice",
    "age": 30,
    "metrics": [("engagement", 0.85), ("satisfaction", None)]
}
```

### Multiple Choices

Outlines supports multiple choice classification by using the `Literal` or `Enum` output types. For instance:

```python
from enum import Enum
from typing import Literal

class PizzaOrBurger(Enum):
    pizza = "pizza"
    burger = "burger"

# Equivalent multiple-choice output types
output_type = Literal["pizza", "burger"]
output_type = PizzaOrBurger
```

Additionally, you can use the Outlines-specific type `Choice` that takes a `list` as an argument. This type is useful in situations in which the list of choices is dynamic.

For instance:

```python
from outlines.types import Choice

def get_multiple_choices() -> list:
    # we could have something complex here
    return ["pizza", "burger"]

output_type = Choice(get_multiple_choices())
```

### JSON Schemas

Multiple different common Python types are often used to store information equivalent to a JSON schema. The following can be used in Outlines to generate text that respects a JSON schema:

- A Pydantic class
- A Dataclass
- A TypedDict
- A [GenSON](https://github.com/wolverdude/GenSON) `SchemaBuilder`
- A Callable (the parameters are turned into the keys and the type hinting is used to define the types of the values)

For instance:

```python
from dataclasses import dataclass

@dataclass
class Character:
    name: str
    age: int

output_type = Character

def character(name: str, age: int):
    return None

output_type = character
```

There are two other JSON schema formats that require Outlines-specific classes: JSON schema strings and dictionaries.

As those are contained in regular Python strings or dictionaries, the associated output format would be ambiguous if they were to be provided directly. As a result, Outlines requires them to be wrapped in an `outlines.types.JsonSchema` object. For instance:

```python
from outlines.types import JsonSchema

schema_string = '{"type": "object", "properties": {"answer": {"type": "number"}}}'
output_type = JsonSchema(schema_string)

schema_dict = {
    "type": "object",
    "properties": {
        "answer": {"type": "number"}
    }
}
output_type = JsonSchema(schema_dict)
```

`JsonSchema` accepts two optional parameters:

- `whitespace_pattern` (defaults to `None`): specifies the pattern to use for JSON syntactic whitespace. If none is provided, the default permissive JSON whitespace rules are used.
- `ensure_ascii` (defaults to `True`): defines the value to use for the argument `ensure_ascii` of the `json.dumps` method. If `True`, non-ASCII characters are escaped as `\uXXXX` sequences; if `False`, they are kept as-is.

### Regex Patterns

Outlines provides support for text generation constrained by regular expressions. Since regular expressions are expressed as simple raw string literals, regex strings must be wrapped in an `outlines.types.Regex` object.

```python
from outlines.types import Regex

regex = r"[0-9]{3}"
output_type = Regex(regex)
```

The `outlines.types` module contains a few common regex patterns stored in variables you can import and directly use as output types. Common patterns include a sentence, an email address and an [ISBN reference](https://en.wikipedia.org/wiki/ISBN). For instance:

```python
from outlines.types import sentence

print(type(sentence)) # outlines.types.dsl.Regex
print(sentence.pattern) # [A-Z].*\s*[.!?]
```

To help you create complex regex patterns yourself, you can use the Outlines [regex DSL](../../utility/regex_dsl).

### Context-Free Grammars

Outlines allows you to generate text that respects the syntax of a context-free grammar. Context-free grammars are defined using [Lark](https://lark-parser.readthedocs.io/en/latest/index.html), a grammar language. Since grammars are expressed as strings, Lark CFG strings must be wrapped in an `outlines.types.CFG` object. For instance:

```python
from outlines.types import CFG

grammar_string = """
    start: expr
    expr: "{" expr "}" | "[" expr "]" |
"""
output_type = CFG(grammar_string)
```

You can find a few Lark grammar examples in the [grammars module](../../api_reference/grammars.md).

## Output type availability

The output types presented above are not available for all models as some have only limited support for structured outputs. Please refer to the documentation of the specific model you wish to use to know what output types it supports.


================================================
FILE: docs/features/index.md
================================================
# Features

This section presents in detail the different features of Outlines.

## Core Concepts

- [Models](./models/index.md)
- [Model Inputs](./core/inputs.md)
- [Output Types](./core/output_types.md)
- [Generators](./core/generator.md)

## Utilities

- [Applications](./utility/application.md)
- [Templates](./utility/templates.md)
- [Regex DSL](./utility/regex_dsl.md)

## Advanced

- [Logits Processors](./advanced/logits_processors.md)


================================================
FILE: docs/features/models/anthropic.md
================================================
---
title: Anthropic
---

# Anthropic

!!! Installation

    You need to install the `anthropic` library to be able to use the Anthropic API in Outlines. Install all optional dependencies of the `Anthropic` model with: `pip install "outlines[anthropic]"`.

    You also need to have an Anthropic API key. This API key must either be set as an environment variable called `ANTHROPIC_API_KEY` or be provided to the `anthropic.Anthropic` class when instantiating it.

## Model Initialization

To create an Anthropic model instance, you can use the `from_anthropic` function. It takes 2 arguments:

- `client`: an `anthropic.Anthropic` instance
- `model_name`: the name of the model you want to use in subsequent model calls (optional)

For instance:

```python
from anthropic import Anthropic
import outlines

# Create the Anthropic client
client = Anthropic()

# Create the model
model = outlines.from_anthropic(
    client,
    "claude-3-5-sonnet-latest"
)
```

Check the [Anthropic documentation](https://docs.anthropic.com/en/docs/about-claude/models) for an up-to-date list of available models.

## Text Generation

Once you've created your Outlines `Anthropic` model instance, you're all set to generate text with this provider. You can simply call the model with a text prompt.

For instance:

```python
from anthropic import Anthropic
import outlines

# Create the model
model = outlines.from_anthropic(
    Anthropic(),
    "claude-3-5-sonnet-latest"
)

# Call it to generate text
response = model("What's the capital of Latvia?", max_tokens=20)
print(response) # 'Riga'
```

#### Vision

Some Anthropic models support vision input. To use this feature, provide a list containing a text prompt and `Image` instances.

For instance:

```python
import io
import requests
import PIL
from anthropic import Anthropic
from outlines import from_anthropic
from outlines.inputs import Image

# Create the model
model = from_anthropic(
    Anthropic(),
    "claude-3-5-sonnet-latest"
)

# Function to get an image
def get_image(url):
    r = requests.get(url)
    return PIL.Image.open(io.BytesIO(r.content))

# Create the prompt containing the text and the image
prompt = [
    "Describe the image",
    Image(get_image("https://picsum.photos/id/237/400/300"))
]

# Call the model to generate a response
response = model(prompt, max_tokens=50)
print(response) # 'This is a picture of a black dog.'
```

#### Chat

You can also use chat inputs with the `Anthropic` model. To do so, call the model with a `Chat` instance. The content of messages within the chat can be vision inputs as described above.

For instance:

```python
import io
import requests
import PIL
from anthropic import Anthropic
from outlines import from_anthropic
from outlines.inputs import Chat, Image

# Create the model
model = from_anthropic(
    Anthropic(),
    "claude-3-5-sonnet-latest"
)

# Function to get an image
def get_image(url):
    r = requests.get(url)
    return PIL.Image.open(io.BytesIO(r.content))

# Create the chat input
prompt = Chat([
    {"role": "user", "content": "You are a helpful assistant that helps me describe pictures."},
    {"role": "assistant", "content": "I'd be happy to help you describe pictures! Please go ahead and share an image"},
    {
        "role": "user",
        "content": ["Describe the image", Image(get_image("https://picsum.photos/id/237/400/300"))]
    },
])

# Call the model to generate a response
response = model(prompt, max_tokens=50)
print(response) # 'This is a picture of a black dog.'
```

#### Streaming

Finally, the `Anthropic` model supports streaming through the `stream` method.

For instance:

```python
from anthropic import Anthropic
import outlines

# Create the model
model = outlines.from_anthropic(
    Anthropic(),
    "claude-3-5-sonnet-latest"
)

# Stream the response
for chunk in model.stream("Tell me a short story about a cat.", max_tokens=50):
    print(chunk) # 'Once...'
```

## Inference arguments

When calling the model or streaming, you can provide keyword arguments that will be passed down to the Anthropic client. Make sure to include all the arguments you need to configure the client's behavior to your expected behavior. Some of the most common arguments include `max_tokens`, `temperature`, `stop_sequences` and `top_k`.

See the [Anthropic API documentation](https://docs.anthropic.com/en/api/messages) for the full list of available arguments.

!!! Warning

    You must set a value for `max_tokens` with Anthropic models.


================================================
FILE: docs/features/models/dottxt.md
================================================
---
title: Dottxt
---

# Dottxt

!!! Installation

    You need to install the `dottxt` python sdk to be able to use the Dottxt API in Outlines. Install all optional dependencies of the `Dottxt` model with: `pip install "outlines[dottxt]"`.

    You also need to have a Dottxt API key. This API key must either be set as an environment variable called `DOTTXT_API_KEY` or be provided to the `dottxt.client.Dottxt` class when instantiating it.

## Model Initialization

To create a Dottxt model instance, you can use the `from_dottxt` function. It takes 3 arguments:

- `client`: a `dottxt.client.Dottxt` instance
- `model_name`: the name of the model you want to use in subsequent model calls (optional)
- `model_revision`: the name of the revision to use for the model selected (optional)

For instance:

```python
from dottxt.client import Dottxt
import outlines

# Create client
client = Dottxt(api_key="...")

# Create the model
model = outlines.from_dottxt(
    client,
    "meta-llama/Llama-3.1-8B",
    "d04e592bb4f6aa9cfee91e2e20afa771667e1d4b"
)
```

Use the `list_models` method of the Dottxt client to get a list of available model names and revisions for your account.

## Text Generation

Dottxt only supports constrained generation with JSON schema output types. You must always provide a value for the `output_type` parameter as unconstrained generation is not available.

For instance:

```python
from typing import List
from pydantic import BaseModel
from dottxt.client import Dottxt
import outlines

class Character(BaseModel):
    name: str
    age: int
    skills: List[str]

# Create the model
model = outlines.from_dottxt(
    Dottxt(),
    "meta-llama/Llama-3.1-8B",
    "d04e592bb4f6aa9cfee91e2e20afa771667e1d4b"
)

# Generate structured text
result = model("Create a character", Character)
print(result) # '{"name": "Evelyn", "age": 34, "skills": ["archery", "stealth", "alchemy"]}'
print(Character.model_validate_json(result)) # name=Evelyn, age=34, skills=['archery', 'stealth', 'alchemy']
```

## Inference arguments

You can provide the same optional parameters you would pass to the `dottxt` sdk's client both during the initialization of the `Dottxt` class and when generating text. Some of the common inference arguments include `max_tokens`, `frequency_penalty`, `presence_penalty` and `temperature`.

Consult the [dottxt python sdk GitHub repository](https://github.com/dottxt-ai/dottxt-python) for the full list of parameters.


================================================
FILE: docs/features/models/gemini.md
================================================
# Gemini

!!! Installation

    You need to install the `google.genai` library to be able to use the Gemini API in Outlines. Install all optional dependencies of the `Gemini` model with: `pip install "outlines[gemini]"`.

    You also need to have a Gemini API key. This API key must either be set as an environment variable called `GEMINI_API_KEY` or be provided to the `google.genai.Client` class when instantiating it.

## Model Initialization

To create a Gemini model instance, you can use the `from_gemini` function. It takes 2 arguments:

- `client`: a `google.genai.Client` instance
- `model_name`: the name of the model you want to use in subsequent model calls (optional)

For instance:

```python
import outlines
from google import genai

# Create the client
client = genai.Client()

# Create the model
model = outlines.from_gemini(
    client,
    "gemini-1.5-flash-latest"
)
```

Check the [Gemini documentation](https://github.com/googleapis/python-genai) for an up-to-date list of available models.

## Text Generation

Once you've created your Outlines `Gemini` model instance, you're all set to generate text with this provider. You can simply call the model with a prompt.

For instance:

```python
import outlines
from google.genai import Client

# Create the model
model = outlines.from_gemini(
    Client(),
    "gemini-1.5-flash-latest"
)

# Call it to generate text
result = model("What's the capital of Latvia?", max_output_tokens=20)
print(result) # 'Riga'
```

#### Vision

Some Gemini models support vision input. To use this feature, provide a list containing a text prompt and `Image` instances.

For instance:

```python
import io
import requests
import PIL
import outlines
from google.genai import Client
from outlines.inputs import Image

# Create the model
model = outlines.from_gemini(
    Client(),
    "gemini-1.5-flash-latest"
)

# Function to get an image
def get_image(url):
    r = requests.get(url)
    return PIL.Image.open(io.BytesIO(r.content))

# Create the prompt containing the text and the image
prompt = [
    "Describe the image",
    Image(get_image("https://picsum.photos/id/237/400/300"))
]

# Call the model to generate a response
response = model(prompt, max_output_tokens=50)
print(response) # 'This is a picture of a black dog.'
```

#### Chat

You can also use chat inputs with the `Gemini` model. To do so, call the model with a `Chat` instance. The content of messages within the chat can be vision inputs as described above.

For instance:

```python
import io
import requests
import PIL
import outlines
from google.genai import Client
from outlines.inputs import Chat, Image

# Create the model
model = outlines.from_gemini(
    Client(),
    "gemini-1.5-flash-latest"
)

# Function to get an image
def get_image(url):
    r = requests.get(url)
    return PIL.Image.open(io.BytesIO(r.content))

# Create the chat input
prompt = Chat([
    {"role": "user", "content": "You are a helpful assistant that helps me describe pictures."},
    {"role": "assistant", "content": "I'd be happy to help you describe pictures! Please go ahead and share an image"},
    {
        "role": "user",
        "content": ["Describe the image", Image(get_image("https://picsum.photos/id/237/400/300"))]
    },
])

# Call the model to generate a response
response = model(prompt, max_output_tokens=50)
print(response) # 'This is a picture of a black dog.'
```

#### Streaming

Finally, the `Gemini` model supports streaming through the `stream` method.

For instance:

```python
import outlines
from google.genai import Client

# Create the model
model = outlines.from_gemini(
    Client(),
    "gemini-1.5-flash-latest"
)

# Stream text
for chunk in model.stream("Write a short story about a cat.", max_output_tokens=20):
    print(chunk) # 'In...'
```

## Structured Generation

Gemini provides support for some forms of structured output: multiple choice, JSON schema (with caveats) and lists of structured objects. To use it, call the model with an `output_type` on top of your prompt.

#### Multiple Choice

```python
import outlines
from google import genai
from enum import Enum

class PizzaOrBurger(Enum):
    pizza = "pizza"
    burger = "burger"

# Create the model
model = outlines.from_gemini(genai.Client(), "gemini-1.5-flash-latest")

# Call it with the output type to generate structured text
result = model("Pizza or burger?", PizzaOrBurger, max_output_tokens=20)
print(result) # 'pizza'
```

#### JSON Schema

Gemini supports only three types of objects used to define a JSON Schema:

- Pydantic classes
- Dataclasses
- TypedDicts

```python
from typing import List
from pydantic import BaseModel
from google import genai
import outlines

class Character(BaseModel):
    name: str
    age: int
    skills: List[str]

# Create the model
model = outlines.from_gemini(genai.Client(), "gemini-1.5-flash-latest")

# Call it with the output type to generate structured text
result = model("Create a character", Character)
print(result) # '{"name": "Evelyn", "age": 34, "skills": ["archery", "stealth", "alchemy"]}'
print(Character.model_validate_json(result)) # name=Evelyn, age=34, skills=['archery', 'stealth', 'alchemy']
```

#### Lists of Structured Objects

A specificity of Gemini is that, despite not supporting regex, it does support a list of structured objects as an output type. To use it, put any of the three available types described above in a built-in `list` type hint, for instance `list[Character]`:

```python
from dataclasses import dataclass
from typing import List
from google import genai
import outlines

@dataclass
class Character:
    name: str
    age: int
    skills: List[str]

# Create the model
model = outlines.from_gemini(genai.Client(), "gemini-1.5-flash-latest")

# Call it with the output type to generate structured text
result = model("Create a character", list[Character])
print(result) # '[{"name": "Evelyn", "age": 34, "skills": ["archery", "stealth", "alchemy"]}, {"name":...'
```

!!! Attention

    The structured objects must be in a built-in `list`, not a `List` from the `typing` library

## Inference arguments

You can provide the same optional parameters you would pass to the `google.genai.Client` client both during the initialization of the Gemini model and when generating text. Some of the common inference arguments include `max_output_tokens`, `temperature`, and other generation parameters.

Consult the [Google Generative AI documentation](https://github.com/googleapis/python-genai) for the full list of parameters.


================================================
FILE: docs/features/models/index.md
================================================
---
title: Models
---

# Models

## Overview

Outlines models are objects that wrap an inference client or engine. Models provide a standardized interface to generate structured text.

All Outlines model classes have an associated loader function to facilitate initializing a model instance. The name of this function is `from_` plus the name of the model in lower-case letters. For instance, Outlines has a `Transformers` model and an associated `from_transformers` loader function. The parameters to load a model are specific to each provider, please consult the documentation of the model you want to use for more information.

After having created a model instance, you can either directly call it to generate text or first create a reusable generator that you would then call.

The input you must provide to a model to generate text can be a simple text prompt or a vision or chat input for models that support them. See the [model inputs section](../core/inputs.md) for more information on model inputs formats.

In all cases, you can provide an `output_type` to constrain the format of the generation output. See the [output types section](../core/output_types.md) for more information on constrained generation.

For instance:

```python
from outlines import from_transformers, Generator
import transformers

# Create a model
model = from_transformers(
    transformers.AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
    transformers.AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
)

# Call it directly
response = model("How many countries are there in the world", max_new_tokens=20)
print(response) # 'There are 200 countries in the world.'

# Call it directly with an output_type
response = model("How many countries are there in the world", int, max_new_tokens=20)
print(response) # '200'

# Create a generator first and then call it
generator = Generator(model, int)
response = generator("How many countries are there in the world")
print(response) # '200'
```

Some models support streaming through a `stream` method. It takes the same arguments as the `__call__` method, but returns an iterator instead of a string.

For instance:

```python
from outlines import from_openai, Generator
import openai

# Create the model
model = from_openai(
    openai.OpenAI(),
    "gpt-4o"
)

# Stream the response
for chunk in model.stream("Tell a short story about a cat.", max_tokens=50):
    print(chunk) # 'This...'
```

Additionally, some models support batch processing through a `batch` method. It's similar to the `__call__` method, but takes a list of prompts instead of a single prompt and returns a list of strings.

For instance:

```python
from outlines import from_transformers, Generator
import transformers

# Create a model
model = from_transformers(
    transformers.AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
    transformers.AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
)

# Call it directly
response = model.batch(["What's the capital of Latvia?", "What's the capital of Estonia?"], max_new_tokens=20)
print(response) # ['Riga', 'Tallinn']
```

## Features Matrix

In alphabetical order:

| | [Anthropic](../../models/anthropic) | [Dottxt](../../models/dottxt) | [Gemini](../../models/gemini) | [LlamaCpp](../../models/llamacpp) | [MLXLM](../../models/mlxlm) | [Mistral](../../models/mistral) | [Ollama](../../models/ollama) | [OpenAI](../../models/openai) | [SGLang](../../models/sglang) | [TGI](../../models/tgi) | [Transformers](../../models/transformers) | [Transformers MultiModal](../../models/transformers_multimodal) | [VLLM](../../models/vllm) | [VLLMOffline](../../models/vllm_offline) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| **Output Types** | | | | | | | | | | | | | | |
| Simple Types | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| JSON Schema | ❌ | ✅ | 🟠 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Multiple Choice | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Regex | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Grammar | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | 🟠 | ❌ | ✅ | ✅ | ✅ | ✅ |
| **Generation Features** | | | | | | | | | | | | | | |
| Async | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
| Streaming | ✅ | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ |
| Vision | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
| Batching | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ✅ | ❌ | ✅ |

## Model Types

Models can be divided into two categories: local models and server-based models.

In the case of local models, the text generation happens within the inference library object used to instantiate the model. This gives Outlines direct access to the generation process (through a logits processor) and means all structured generation output types are available.

The local models available are the following:

- LlamaCpp
- MLXLM
- Transformers
- TransformersMultiModal
- VLLMOffline

In the case of server-based models, the model is initialized with a client that sends a request to a server that is in charge of the actual text generation. As a result, we have limited control over text generation and some output types are not supported. The server on which the text generation happens can either be remote (with OpenAI or Anthropic for instance) or local (with SGLang for instance).

The server-based models available are the following:

- Anthropic
- Dottxt
- Gemini
- Mistral
- Ollama
- OpenAI
- SGLang
- TGI
- VLLM

Some models have an async version. To use them, just pass the async version of the provider object to their loading function. It will then return an `Async<ModelName>` instance with the same methods and features as the regular sync instance.

For instance:

```python
from outlines import from_tgi
from huggingface_hub import AsyncInferenceClient

model = from_tgi(
    AsyncInferenceClient("http://localhost:8000/v1")
)
print(type(model)) # outlines.models.tgi.AsyncTGI
```

The models that have an async version are the following:

- Mistral
- Ollama
- OpenAI
- SGLang
- TGI
- VLLM


================================================
FILE: docs/features/models/llamacpp.md
================================================
---
title: llama.cpp
---

# llama.cpp

Outlines provides an integration with [Llama.cpp](https://github.com/ggerganov/llama.cpp) using the [llama-cpp-python library](https://github.com/abetlen/llama-cpp-python). llama.cpp makes it possible to run quantized models on machines with limited compute.

!!! Installation

    You need to install the `llama-cpp-python` library to use the llama.cpp integration. Install all optional dependencies of the `LlamaCpp` model with: `pip install "outlines[llamacpp]"`.

    See the [llama-cpp-python Github page](https://github.com/abetlen/llama-cpp-python?tab=readme-ov-file#supported-backends) for instructions on installing with CUDA, Metal, ROCm and other backends.


## Model Initialization

To load the model, you can use the `from_llamacpp` function. The first argument of the function is a `Llama` model instance from the `llama_cpp` library. Consult the [Llama class API reference](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama) for detailed information on how to create a model instance and on the various available parameters.

You can also pass a `chat_mode` argument to `from_llamacpp`. If `True` (default), the model will regard all `str` inputs as user messages in a chat conversation. If `False`, the model will regard all `str` inputs as plain text prompts.

For instance:

```python
import outlines
from llama_cpp import Llama

model = outlines.from_llamacpp(
    Llama.from_pretrained(
        repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
        filename="mistral-7b-instruct-v0.2.Q5_K_M.gguf",
    )
)
```

You can also disable chat mode:

```python
import outlines
from llama_cpp import Llama

model = outlines.from_llamacpp(
    Llama.from_pretrained(
        repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
        filename="mistral-7b-instruct-v0.2.Q5_K_M.gguf",
    ),
    chat_mode=False,
)
```

## Text Generation

To generate text, you can simply call the model with a prompt.

For instance:

```python
import outlines
from llama_cpp import Llama

# Create the model
model = outlines.from_llamacpp(
    Llama.from_pretrained(
        repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
        filename="mistral-7b-instruct-v0.2.Q5_K_M.gguf",
    )
)

# Call it to generate text
result = model("What's the capital of Latvia?", max_tokens=20)
print(result) # 'Riga'
```

#### Chat

You can also use chat inputs with the `LlamaCpp` model. To do so, call the model with a `Chat` instance.

For instance:

```python
import outlines
from llama_cpp import Llama
from outlines.inputs import Chat

# Create the model
model = outlines.from_llamacpp(
    Llama.from_pretrained(
        repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
        filename="mistral-7b-instruct-v0.2.Q5_K_M.gguf",
    )
)

# Create the chat input
prompt = Chat([
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What's the capital of Latvia?"},
])

# Call the model to generate a response
response = model(prompt, max_tokens=50)
print(response) # 'Riga.'
```

#### Streaming

The `LlamaCpp` model also supports streaming.

For instance:

```python
import outlines
from llama_cpp import Llama

# Create the model
model = outlines.from_llamacpp(
    Llama.from_pretrained(
        repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
        filename="mistral-7b-instruct-v0.2.Q5_K_M.gguf",
    )
)

# Stream text
for chunk in model.stream("Write a short story about a cat.", max_tokens=100):
    print(chunk) # 'In...'
```

## Structured Generation

The `LlamaCpp` model supports all output types available in Outlines. Simply provide an `output_type` after the prompt when calling the model.

### Basic Type

```python
import outlines
from llama_cpp import Llama

output_type = int

model = outlines.from_llamacpp(
    Llama.from_pretrained(
        repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
        filename="mistral-7b-instruct-v0.2.Q5_K_M.gguf",
    )
)

result = model("How many countries are there in the world?", output_type)
print(result) # '200'
```

### JSON Schema

```python
from typing import List
from pydantic import BaseModel
import outlines
from llama_cpp import Llama

class Character(BaseModel):
    name: str
    age: int
    skills: List[str]

model = outlines.from_llamacpp(
    Llama.from_pretrained(
        repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
        filename="mistral-7b-instruct-v0.2.Q5_K_M.gguf",
    )
)

result = model("Create a character.", output_type=Character, max_tokens=200)
print(result) # '{"name": "Evelyn", "age": 34, "skills": ["archery", "stealth", "alchemy"]}'
print(Character.model_validate_json(result)) # name=Evelyn, age=34, skills=['archery', 'stealth', 'alchemy']
```

### Multiple Choice

```python
from typing import Literal
import outlines
from llama_cpp import Llama

output_type = Literal["Paris", "London", "Rome", "Berlin"]

model = outlines.from_llamacpp(
    Llama.from_pretrained(
        repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
        filename="mistral-7b-instruct-v0.2.Q5_K_M.gguf",
    )
)

result = model("What is the capital of France?", output_type)
print(result) # 'Paris'
```

### Regex

```python
from outlines.types import Regex
import outlines
from llama_cpp import Llama

output_type = Regex(r"\d{3}-\d{2}-\d{4}")

model = outlines.from_llamacpp(
    Llama.from_pretrained(
        repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
        filename="mistral-7b-instruct-v0.2.Q5_K_M.gguf",
    )
)

result = model("Generate a fake social security number.", output_type)
print(result) # '782-32-3789'
```

### Context-free grammar

```python
from outlines.types import CFG
import outlines
from llama_cpp import Llama

output_type = CFG("""
root ::= answer
answer ::= "yes" | "no"
""")

model = outlines.from_llamacpp(
    Llama.from_pretrained(
        repo_id="TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
        filename="mistral-7b-instruct-v0.2.Q5_K_M.gguf",
    )
)

result = model("Are you feeling good today?", output_type)
print(result) # 'yes'
```

## Inference Arguments

When calling the model, you can provide optional inference parameters on top of the prompt and the output type. These parameters will be passed on to the `__call__` method of the `llama_cpp.Llama` model. Some common inference arguments include `max_tokens`, `temperature`, `frequency_penalty` and `top_p`.

See the [llama-cpp-python documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__call__) for more information on inference parameters.


================================================
FILE: docs/features/models/mistral.md
================================================
# Mistral

!!! Installation

    You need to install the `mistralai` library to be able to use the Mistral API in Outlines. Install all optional dependencies of the `Mistral` model with: `pip install "outlines[mistral]"`.

    You also need to have a Mistral API key. This API key must either be set as an environment variable called `MISTRAL_API_KEY` or be provided to the `mistralai.Mistral` class when instantiating it.

## Model Initialization

To create a `Mistral` or `AsyncMistral` model instance, you can use the `from_mistral` function. It takes 3 arguments:

- `client`: a `mistralai.Mistral` instance
- `model_name` (optional): the name of the model you want to use
- `async_client` (optional): whether it should create a sync or an async model

As the `mistralai` library uses a single class to handle both sync and async requests, you must set the `async_client` argument to True to get an `AsyncMistral` model.

For instance:

```python
import mistralai
import outlines

# Create the Mistral client
client = mistralai.Mistral()

# Create a sync model
model = outlines.from_mistral(
    client,
    "mistral-large-latest"
)

# Create an async model
model = outlines.from_mistral(
    client,
    "mistral-large-latest",
    True
)
```

The mistralai python SDK provides methods to query the API for a list of [all available models](https://docs.mistral.ai/getting-started/models/models_overview/#api-versioning),
including paid endpoints for [premium models](https://docs.mistral.ai/getting-started/models/models_overview/) in addition to open weights.

## Text Generation

Once you've created your Outlines `Mistral` model instance, you're all set to generate text with this provider. You can simply call the model with a prompt.

For instance:

```python
import mistralai
import outlines

# Create the model
model = outlines.from_mistral(
    mistralai.Mistral(),
    "mistral-large-latest"
)

# Call it to generate text
response = model("What's the capital of Latvia?", max_tokens=20)
print(response) # 'Riga'
```

#### Vision

Some Mistral models support vision input. To use this feature, provide a list containing a text prompt and `Image` instances.

For instance:

```python
import io
import requests
import PIL
import outlines
import mistralai
from outlines.inputs import Image

# Create the model
model = outlines.from_mistral(
    mistralai.Mistral(),
    "mistral-large-latest"
)

# Function to get an image
def get_image(url):
    r = requests.get(url)
    return PIL.Image.open(io.BytesIO(r.content))

# Create the prompt containing the text and the image
prompt = [
    "Describe the image",
    Image(get_image("https://picsum.photos/id/237/400/300"))
]

# Call the model to generate a response
response = model(prompt, max_tokens=50)
print(response) # 'This is a picture of a black dog.'
```

#### Chat

You can also use chat inputs with the `Mistral` model. To do so, call the model with a `Chat` instance. The content of a message within the chat can include vision inputs as described above.

For instance:

```python
import io
import requests
import PIL
import mistralai
import outlines
from outlines.inputs import Chat, Image

# Create the model
model = outlines.from_mistral(
    mistralai.Mistral(),
    "mistral-large-latest"
)

# Function to get an image
def get_image(url):
    r = requests.get(url)
    return PIL.Image.open(io.BytesIO(r.content))

# Create the chat input
prompt = Chat([
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": ["Describe the image", Image(get_image("https://picsum.photos/id/237/400/300"))]
    },
])

# Call the model to generate a response
response = model(prompt, max_tokens=50)
print(response) # 'This is a picture of a black dog.'
```

#### Streaming

Finally, the `Mistral` model supports streaming through the `stream` method.

For instance:

```python
import mistralai
import outlines

# Create the model
model = outlines.from_mistral(
    mistralai.Mistral(),
    "mistral-large-latest"
)

# Stream the response
for chunk in model.stream("Tell me a short story about a cat.", max_tokens=50):
    print(chunk) # 'Once...'
```

## Structured Generation

Mistral provides support for some forms of structured output: JSON schemas and JSON syntax. To use it, call the model with an `output_type` on top of your prompt.

#### JSON Schema

```python
from typing import List
from pydantic import BaseModel
import mistralai
import outlines

class Character(BaseModel):
    name: str
    age: int
    skills: List[str]

# Create the model
model = outlines.from_mistral(
    mistralai.Mistral(),
    "mistral-large-latest"
)

# Call it with the output type to generate structured text
result = model("Create a character, use the json format.", Character, top_p=0.1)
print(result) # '{"name": "Evelyn", "age": 34, "skills": ["archery", "stealth", "alchemy"]}'
print(Character.model_validate_json(result)) # name=Evelyn, age=34, skills=['archery', 'stealth', 'alchemy']
```

#### JSON Syntax

What we mean by JSON syntax is what is sometimes called JSON mode, meaning that the model will return a valid JSON, but you do not get to specify its structure. To use this JSON mode, provide the `dict` type as an output type.

```python
import mistralai
import outlines

# Create the model
model = outlines.from_mistral(
    mistralai.Mistral(),
    "mistral-large-latest"
)


# Call it with the output type to generate structured text
result = model("Create a character, use the json format.", dict, temperature=0.5)
print(result) # '{"first_name": "Henri", "last_name": "Smith", "height": "170"}'
```

## Asynchronous Calls

All features presented above for the sync model are also available for the async model.

For instance:

```python
import asyncio
import mistralai
import outlines
from pydantic import BaseModel
from typing import List

class Character(BaseModel):
    name: str
    age: int
    skills: List[str]

# Create the model
model = outlines.from_mistral(
    mistralai.Mistral(),
    "mistral-large-latest",
    True
)

async def text_generation():
    # Regular generation
    response = await model("What's the capital of Latvia?", max_tokens=20)
    print(response) # 'Riga'

    # Streaming
    async for chunk in  model.stream("Tell me a short story about a cat.", max_tokens=50):
        print(chunk, end="") # 'Once...'

    # Structured generation
    result = await model("Create a character, use the json format.", Character, top_p=0.1)
    print(result) # '{"name": "Evelyn", "age": 34, "skills": ["archery", "stealth", "alchemy"]}'
    print(Character.model_validate_json(result)) # name=Evelyn, age=34, skills=['archery', 'stealth', 'alchemy']

asyncio.run(text_generation())
```

## Inference arguments

When calling the model, you can provide keyword arguments that will be passed down to the `chat.complete` method of the Mistral client and its async and streaming equivalents. Some of the most common arguments include `max_tokens`, `temperature`, `stop` and `top_p`.

Another keyword argument of interest is `n`. If set to an integer value greater than 1, Mistral will generate several sample responses and you will receive a list of strings as a response to your model call.

See the [Mistral API documentation](https://docs.mistral.ai/api/#tag/chat) for the full list of available arguments.


## Troubleshooting

- **ImportError: No module named 'mistralai'**
  → Run `pip install mistralai`.

- **Authentication Error**
  → Verify `MISTRAL_API_KEY` is set and valid. Test with the [Mistral Playground](https://chat.mistral.ai).

- **Schema Error (e.g., "Mistral does not support your schema")**
  → Ensure no `pattern` fields in Pydantic (Outlines sets `additionalProperties: false`); try a simpler schema or a different Outlines model (local models in particular).

- **Model Not Found Error**
  → Confirm the model name (e.g., `"mistral-small-latest"`) and your subscription tier. Check [docs](https://docs.mistral.ai/getting-started/models/).

- **Rate Limits or Quotas**
  → Monitor usage in the Mistral console; upgrade your plan for higher limits.

- **Input Validation Errors**
  → Ensure Chat messages use valid roles (`system`, `user`, `assistant`); list inputs start with strings.


*Last updated: October 2, 2025*


================================================
FILE: docs/features/models/mlxlm.md
================================================
---
title: mlx-lm
---

# mlx-lm

Outlines provides an integration with [mlx-lm](https://github.com/ml-explore/mlx-examples/tree/main/llms), allowing models to be run quickly on Apple Silicon via the [mlx](https://ml-explore.github.io/mlx/build/html/index.html) library.

!!! Note "Installation"

    You need a device that [supports Metal](https://support.apple.com/en-us/102894) to use the mlx-lm integration.

    You need to install the `mlx` and `mlx-lm` libraries to be able to use mlx in Outlines. Install all optional dependencies of the `MLXLM` model with: `pip install "outlines[mlxlm]"`.

## Model Initialization

To create an MLXLM model instance, you can use the `from_mlxlm` function. It takes 2 arguments:

- `model`: an `mlx.nn.Module` instance
- `tokenizer`: a `transformers.PreTrainedTokenizer` instance

However, we recommend you simply pass on the output of the `mlx_lm.load` function (it takes a model name as an argument).

For instance:

```python
import outlines
import mlx_lm

# Create the model
model = outlines.from_mlxlm(
    *mlx_lm.load("mlx-community/TinyLlama-1.1B-Chat-v1.0-4bit")
)
```

## Text Generation

To generate text, you can simply call the model with a prompt.

For instance:

```python
import outlines
import mlx_lm

# Load the model
model = outlines.from_mlxlm(
    *mlx_lm.load("mlx-community/TinyLlama-1.1B-Chat-v1.0-4bit")
)

# Call it to generate text
result = model("What's the capital of Latvia?", max_tokens=20)
print(result) # 'Riga'
```

#### Chat

You can use chat inputs with the `MLXLM` model. To do so, call the model with a `Chat` instance.

For instance:

```python
import outlines
import mlx_lm
from outlines.inputs import Chat

# Load the model
model = outlines.from_mlxlm(
    *mlx_lm.load("mlx-community/TinyLlama-1.1B-Chat-v1.0-4bit")
)

# Create the chat input
prompt = Chat([
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What's the capital of Latvia?"},
])

# Call the model to generate a response
response = model(prompt, max_tokens=50)
print(response) # 'Riga.'
```

#### Streaming

The `MLXLM` model also supports streaming. For instance:

```python
import outlines
import mlx_lm

# Load the model
model = outlines.from_mlxlm(
    *mlx_lm.load("mlx-community/TinyLlama-1.1B-Chat-v1.0-4bit")
)

# Stream text
for chunk in model.stream("Write a short story about a cat.", max_tokens=100):
    print(chunk) # 'In...'
```

#### Batch Generation

The `MLXLM` model supports generating text in batches. To do so, use the `batch` method and provide a list of strings as a model input. However, constrained generation is not supported with batching, so you cannot provide an `output_type`. For instance:

```python
import outlines
import mlx_lm

# Load the model
model = outlines.from_mlxlm(
    *mlx_lm.load("mlx-community/TinyLlama-1.1B-Chat-v1.0-4bit")
)

# Generate text in batches
result = model.batch(["What's the capital of Lithuania?", "What's the capital of Latvia?"], max_tokens=20)
print(result) # ['Vilnius', 'Riga']
```

## Structured Generation

As a local model, `MLXLM` supports all forms of structured generation available in Outlines.

#### Basic Type

```python
import outlines
import mlx_lm

output_type = int

model = outlines.from_mlxlm(
    *mlx_lm.load("mlx-community/TinyLlama-1.1B-Chat-v1.0-4bit")
)

result = model("How many countries are there in the world?", output_type)
print(result) # '200'
```

#### JSON Schema

```python
from pydantic import BaseModel
from typing import List
import outlines
import mlx_lm

class Character(BaseModel):
    name: str
    age: int
    skills: List[str]

model = outlines.from_mlxlm(
    *mlx_lm.load("mlx-community/TinyLlama-1.1B-Chat-v1.0-4bit")
)

result = model("Create a character.", output_type=Character)
print(result) # '{"name": "Evelyn", "age": 34, "skills": ["archery", "stealth", "alchemy"]}'
print(Character.model_validate_json(result)) # name=Evelyn, age=34, skills=['archery', 'stealth', 'alchemy']
```

#### Multiple Choice

```python
from typing import Literal
import outlines
import mlx_lm

output_type = Literal["Paris", "London", "Rome", "Berlin"]

model = outlines.from_mlxlm(
    *mlx_lm.load("mlx-community/TinyLlama-1.1B-Chat-v1.0-4bit")
)

result = model("What is the capital of France?", output_type)
print(result) # 'Paris'
```

#### Regex

```python
from outlines.types import Regex
import outlines
import mlx_lm

output_type = Regex(r"\d{3}-\d{2}-\d{4}")

model = outlines.from_mlxlm(
    *mlx_lm.load("mlx-community/TinyLlama-1.1B-Chat-v1.0-4bit")
)

result = model("Generate a fake social security number.", output_type)
print(result) # '782-32-3789'
```

#### Context-Free Grammar

```python
from outlines.types import CFG
import outlines
import mlx_lm

arithmetic_grammar = """
?start: sum

?sum: product
| sum "+" product   -> add
| sum "-" product   -> sub

?product: atom
| product "*" atom  -> mul
| product "/" atom  -> div

?atom: NUMBER           -> number
| "-" atom         -> neg
| "(" sum ")"

%import common.NUMBER
%import common.WS_INLINE

%ignore WS_INLINE
"""
output_type = CFG(arithmetic_grammar)

model = outlines.from_mlxlm(
    *mlx_lm.load("mlx-community/TinyLlama-1.1B-Chat-v1.0-4bit")
)

result = model("Write an addition.", output_type, max_tokens=20)
print(result) # '23 + 48'
```

## Inference Arguments

When calling the model, you can provide optional inference parameters on top of the prompt and the output type. These parameters will be passed on to the `mlx_lm.generate` function used to generate text.

See the [MLXLM documentation](https://github.com/ml-explore/mlx-lm) for more information on inference parameters.


================================================
FILE: docs/features/models/ollama.md
================================================
---
title: Ollama
---

# Ollama

!!! Installation

    To be able to use Ollama in Outlines, you must install both Ollama and the optional dependency libraries of the model.

    - To download Ollama: https://ollama.com/download
    - To install the ollama python sdk: `pip install "outlines[ollama]"`

    Consult the [`ollama` documentation](https://github.com/ollama/ollama-python) for detailed information on installation and client initialization.

## Model Initialization

To create an Ollama model instance, you can use the `from_ollama` function. It takes 2 arguments:

- `client`: an `ollama.Client` or `ollama.AsyncClient` instance
- `model_name`: the name of the model you want to use

Based on whether the inference client instance is synchronous or asynchronous, you will receive an `Ollama` or an `AsyncOllama` model instance.

For instance:

```python
import ollama
import outlines

# Create the client or async client
client = ollama.Client()
async_client = ollama.AsyncClient()

# Create a sync model
model = outlines.from_ollama(
    client,
    "qwen2.5vl:3b",
)

# Create an async model
model = outlines.from_ollama(
    async_client,
    "qwen2.5vl:3b",
)
```

You can find the list of available models on the [Ollama library](https://ollama.com/library).

## Text Generation

Once you've created your Outlines `Ollama` model instance, you're all set to generate text with this provider. You can simply call the model with a prompt.

For instance:

```python
import ollama
import outlines

# Create the model
model = outlines.from_ollama(ollama.Client(), "qwen2.5vl:3b")

# Call it to generate text
response = model("What's the capital of Latvia?")
print(response) # 'Riga'
```

#### Vision

Some Ollama models support vision input. To use this feature, provide a list containing a text prompt and `Image` instances.

```python
import io
import requests
import PIL
import ollama
import outlines
from outlines.inputs import Image

# Create the model
model = outlines.from_ollama(
    ollama.Client(),
    "qwen2.5vl:3b"
)

# Function to get an image
def get_image(url):
    r = requests.get(url)
    return PIL.Image.open(io.BytesIO(r.content))

# Create the prompt
prompt = [
    "Describe the image",
    Image(get_image("https://picsum.photos/id/237/400/300"))
]

# Generate text
response = model(prompt)
print(response) # The image shows a black puppy with a curious and attentive expression.
```

#### Chat

You can also use chat inputs with the `Ollama` model. To do so, call the model with a `Chat` instance. The content of a message within the chat can include vision inputs as described above.

For instance:

```python
import io
import requests
import PIL
import ollama
import outlines
from outlines.inputs import Chat, Image

# Create the model
model = outlines.from_ollama(
    ollama.Client(),
    "qwen2.5vl:3b"
)

# Function to get an image
def get_image(url):
    r = requests.get(url)
    return PIL.Image.open(io.BytesIO(r.content))

# Create the chat input
prompt = Chat([
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": ["Describe the image", Image(get_image("https://picsum.photos/id/237/400/300"))]
    },
])

# Call the model to generate a response
response = model(prompt)
print(response) # 'This is a picture of a black dog.'
```

#### Streaming

Finally, the `Ollama` model supports streaming through the `stream` method.

```python
import ollama
import outlines

# Create the model
model = outlines.from_ollama(ollama.Client(), "qwen2.5vl:3b")

# Stream text
for chunk in model.stream("Write a short story about a cat"):
    print(chunk) # 'In...'
```

## Asynchronous Calls

Ollama supports asynchronous operations by passing an `AsyncClient` instead of a regular `Client`. This returns an `AsyncOllama` model instance that supports async/await patterns.

### Basic Async Generation

```python
import asyncio
import outlines
import ollama

async def generate_text():
    # Create an async model
    async_client = ollama.AsyncClient()
    async_model = outlines.from_ollama(async_client, "qwen2.5vl:3b")

    result = await async_model("Write a haiku about Python.")
    print(result)

asyncio.run(generate_text())
```

### Async Streaming

The async model also supports streaming with async iteration:

```python
import asyncio
import outlines
import ollama

async def stream_text():
    async_client = ollama.AsyncClient()
    async_model = outlines.from_ollama(async_client, "qwen2.5vl:3b")

    async for chunk in async_model.stream("Tell me a story about a robot."):
        print(chunk, end="")

asyncio.run(stream_text())
```

### Concurrent Async Requests

One of the main benefits of async calls is the ability to make multiple concurrent requests:

```python
import asyncio
import outlines
import ollama

async def generate_multiple():
    async_client = ollama.AsyncClient()
    async_model = outlines.from_ollama(async_client, "qwen2.5vl:3b")

    # Define multiple prompts
    prompts = [
        "Write a tagline for a coffee shop.",
        "Write a tagline for a bookstore.",
        "Write a tagline for a gym."
    ]

    tasks = [async_model(prompt) for prompt in prompts]
    results = await asyncio.gather(*tasks)

    for prompt, result in zip(prompts, results):
        print(f"{prompt}\n{result}\n")

asyncio.run(generate_multiple())
```

## Structured Generation

Ollama only provides support for structured generation based on a JSON schema. To use it, call the model with a JSON schema object as an `output_type` on top of your prompt.

For instance:

```python
from typing import List
from pydantic import BaseModel
import ollama
import outlines

class Character(BaseModel):
    name: str
    age: int
    skills: List[str]

# Create the model
model = outlines.from_ollama(ollama.Client(), "tinyllama")

# Call it with the output type to generate structured text
result = model("Create a character", Character)
print(result) # '{"name": "Evelyn", "age": 34, "skills": ["archery", "stealth", "alchemy"]}'
print(Character.model_validate_json(result)) # name=Evelyn, age=34, skills=['archery', 'stealth', 'alchemy']
```

## Inference arguments

When calling the model, you can provide keyword arguments that will be passed down to the `generate` method of the Ollama client.

Consult the [Ollama REST API documentation](https://github.com/ollama/ollama/blob/main/docs/api#generate-a-completion) for the full list of inference parameters.


================================================
FILE: docs/features/models/openai.md
================================================
# OpenAI

!!! Installation

    You need to install the `openai` library to be able to use the OpenAI API in Outlines. Install all optional dependencies of the `OpenAI` model with: `pip install "outlines[openai]"`.

    You also need to have an OpenAI API key. This API key must either be set as an environment variable called `OPENAI_API_KEY` or be provided to the `openai.OpenAI` class when instantiating it.

## Model Initialization

To create an OpenAI model instance, you can use the `from_openai` function. It takes 2 arguments:

- `client`: an `openai.OpenAI`, `openai.AzureOpenAI`, `openai.AsyncOpenAI` or `openai.AsyncAzureOpenAI` instance
- `model_name`: the name of the model you want to use

Based on whether the inference client instance is synchronous or asynchronous, you will receive an `OpenAI` or an `AsyncOpenAI` model instance.

For instance:

```python
import outlines
import openai

# Create the client or async client
client = openai.OpenAI()
async_client = openai.AsyncOpenAI()

# Create a sync model
model = outlines.from_openai(
    client,
    "gpt-4o"
)

# Create an async model
model = outlines.from_openai(
    async_client,
    "gpt-4o"
)
```

Check the [OpenAI documentation](https://platform.openai.com/docs/models) for an up-to-date list of available models. As shown above, you can use Azure OpenAI in Outlines the same way you would use OpenAI, just provide an `openai.AzureOpenAI` instance to the Outlines model class.

## Text Generation

Once you've created your Outlines `OpenAI` model instance, you're all set to generate text with this provider. You can simply call the model with a prompt.

For instance:

```python
import openai
import outlines

# Create the model
model = outlines.from_openai(
    openai.OpenAI(),
    "gpt-4o"
)

# Call it to generate text
response = model("What's the capital of Latvia?", max_tokens=20)
print(response) # 'Riga'
```

#### Vision

Some OpenAI models support vision input. To use this feature, provide a list containing a text prompt and `Image` instances.

For instance:

```python
import io
import requests
import PIL
import outlines
import openai
from outlines.inputs import Image

# Create the model
model = outlines.from_openai(
    openai.OpenAI(),
    "gpt-4o"
)

# Function to get an image
def get_image(url):
    r = requests.get(url)
    return PIL.Image.open(io.BytesIO(r.content))

# Create the prompt containing the text and the image
prompt = [
    "Describe the image",
    Image(get_image("https://picsum.photos/id/237/400/300"))
]

# Call the model to generate a response
response = model(prompt, max_tokens=50)
print(response) # 'This is a picture of a black dog.'
```

#### Chat

You can also use chat inputs with the `OpenAI` model. To do so, call the model with a `Chat` instance. The content of a message within the chat can include vision inputs as described above.

For instance:

```python
import io
import requests
import PIL
import openai
import outlines
from outlines.inputs import Chat, Image

# Create the model
model = outlines.from_openai(
    openai.OpenAI(),
    "gpt-4o"
)

# Function to get an image
def get_image(url):
    r = requests.get(url)
    return PIL.Image.open(io.BytesIO(r.content))

# Create the chat input
prompt = Chat([
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": ["Describe the image", Image(get_image("https://picsum.photos/id/237/400/300"))]
    },
])

# Call the model to generate a response
response = model(prompt, max_tokens=50)
print(response) # 'This is a picture of a black dog.'
```

#### Streaming

Finally, the `OpenAI` model supports streaming through the `stream` method.

For instance:

```python
import openai
import outlines

# Create the model
model = outlines.from_openai(
    openai.OpenAI(),
    "gpt-4o"
)

# Stream the response
for chunk in model.stream("Tell me a short story about a cat.", max_tokens=50):
    print(chunk) # 'Once...'
```

## Structured Generation

OpenAI provides support for some forms of structured output: JSON schemas and JSON syntax. To use it, call the model with an `output_type` on top of your prompt.

#### JSON Schema

```python
from typing import List
from pydantic import BaseModel
import openai
import outlines

class Character(BaseModel):
    name: str
    age: int
    skills: List[str]

# Create the model
model = outlines.from_openai(openai.OpenAI(), "gpt-4o")

# Call it with the output type to generate structured text
result = model("Create a character, use the json format.", Character, top_p=0.1)
print(result) # '{"name": "Evelyn", "age": 34, "skills": ["archery", "stealth", "alchemy"]}'
print(Character.model_validate_json(result)) # name=Evelyn, age=34, skills=['archery', 'stealth', 'alchemy']
```

#### JSON Syntax

What we mean by JSON syntax is what is sometimes called JSON mode, meaning that the model will return a valid JSON, but you do not get to specify its structure. To use this JSON mode, provide the `dict` type as an output type.

```python
import openai
import outlines

# Create the model
model = outlines.from_openai(openai.OpenAI(), "gpt-4o")

# Call it with the output type to generate structured text
result = model("Create a character, use the json format.", dict, temperature=0.5)
print(result) # '{"first_name": "Henri", "last_name": "Smith", "height": "170"}'
```

## Asynchronous Calls

All features presented above for the sync model are also available for the async model.

For instance:

```python
import asyncio
import openai
import outlines
from pydantic import BaseModel
from typing import List

class Character(BaseModel):
    name: str
    age: int
    skills: List[str]

# Create the model
model = outlines.from_openai(
    openai.AsyncOpenAI(),
    "gpt-4o"
)

async def text_generation():
    # Regular generation
    response = await model("What's the capital of Latvia?", max_tokens=20)
    print(response) # 'Riga'

    # Streaming
    async for chunk in  model.stream("Tell me a short story about a cat.", max_tokens=50):
        print(chunk, end="") # 'Once...'

    # Structured generation
    result = await model("Create a character, use the json format.", Character, top_p=0.1)
    print(result) # '{"name": "Evelyn", "age": 34, "skills": ["archery", "stealth", "alchemy"]}'
    print(Character.model_validate_json(result)) # name=Evelyn, age=34, skills=['archery', 'stealth', 'alchemy']

asyncio.run(text_generation())
```

## Inference arguments

When calling the model, you can provide keyword arguments that will be passed down to the `chat.completions.create` method of the OpenAI client. Some of the most common arguments include `max_tokens`, `temperature`, `stop` and `top_p`.

Another keyword argument of interest is `n`. If set to an integer value greater than 1, OpenAI will generate several sample responses and you will receive a list of strings as a response to your model call.

See the [OpenAI API documentation](http://platform.openai.com/docs/api-reference/chat/create) for the full list of available arguments.


================================================
FILE: docs/features/models/openai_compatible.md
================================================
# OpenAI-Compatible APIs

Many inference providers offer OpenAI-compatible APIs, allowing you to use the familiar OpenAI SDK while connecting to different backends. Outlines allows you to leverage various providers while maintaining consistent code.

## What are OpenAI-Compatible APIs?

OpenAI-compatible APIs implement the same REST endpoints and request/response formats as OpenAI's API, but serve different models or run on different infrastructure. This allows you to use the `openai` Python library with any compatible provider by simply changing the `base_url`.

!!! Installation

    You need to install the `openai` library to be able to use the OpenAI-compatible APIs in Outlines. Install all optional dependencies of the `OpenAI` model with: `pip install "outlines[openai]"`.

## General Usage Pattern

The standard approach is to use the OpenAI SDK with a custom base URL:

```python
import openai
import outlines

# Point to your OpenAI-compatible endpoint
client = openai.OpenAI(
    base_url="https://your-provider.com/v1",  # Custom endpoint
    api_key="your-api-key"
)

# Use with Outlines
model = outlines.from_openai(client, "model-name")
```

## Important: Provider-Specific Parameters

!!! Warning "API-Specific Parameters"

    Some providers require additional parameters in the API request for structured generation to work properly. These are typically passed as extra arguments when calling the model.

    For example, some providers may need special parameters in the request body to enable guided generation or specify constraints. Always consult your provider's documentation for structured generation requirements.

## Popular OpenAI-Compatible Providers

Many providers offer OpenAI-compatible endpoints:

- **Groq**
- **Together AI**
- **Anyscale**
- **Fireworks AI**
- **Perplexity**
- **Local servers** (LocalAI, etc.)

## Configuration Examples

### Basic Setup
```python
import openai
import outlines

# Generic OpenAI-compatible setup
client = openai.OpenAI(
    base_url="https://api.your-provider.com/v1",
    api_key="your-api-key"
)

model = outlines.from_openai(client, "provider-model-name")
```

### With Authentication Headers
```python
import openai
import outlines

# Some providers need custom headers
client = openai.OpenAI(
    base_url="https://api.your-provider.com/v1",
    api_key="your-api-key",
    default_headers={"Custom-Header": "value"}
)

model = outlines.from_openai(client, "provider-model-name")
```

## Related Documentation

For specific implementations that use OpenAI-compatible APIs:

- [SGLang](sglang.md): Local inference server with OpenAI-compatible endpoints
- [vLLM](vllm.md): High-performance inference with OpenAI-compatible API
- [OpenAI](openai.md): The original OpenAI API implementation


================================================
FILE: docs/features/models/openrouter.md
================================================
# OpenRouter

!!! Installation

    [OpenRouter](https://openrouter.ai/docs/api-reference/overview) uses the same API as OpenAI, so both services are [interoperable](./openai_compatible.md) using the `openai` library. Install all optional dependencies of the `OpenAI` model with: `pip install "outlines[openai]"`.

    You also need to have an OpenRouter API key. This API key must either be set as an environment variable called `OPENAI_API_KEY` or be provided to the `openai.OpenAI` class when instantiating it.

## Model Initialization

To create a model instance, you can use the `from_openai` function. It takes 2 arguments:

- `client`: an `openai.OpenAI` instance
- `model_name`: the name of the model you want to use, defined as `provider/model`

For instance:

```python
import outlines
import openai

# Create the client
client = openai.OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key="OPENAI_API_KEY",
)

# Create the model
model = outlines.from_openai(
    client,
    "x-ai/grok-4"
)
```

Leaving an empty string in the model name field will lead OpenRouter to use your default model defined in [settings](https://openrouter.ai/settings/preferences).

The [OpenRouter](https://openrouter.ai/models) website lists available models. Keep in mind that some models do not support `json_schema` response formats and may return a 400 error code as a result.

## Related Documentation

For specific implementations that use OpenAI-compatible APIs:

- [OpenAI](./openai.md): The original OpenAI API implementation
- [OpenAI compatible API](./openai_compatible.md): Details on how to use OpenAI-compatible APIs


================================================
FILE: docs/features/models/sglang.md
================================================
---
title: SGLang
---

# SGLang

## Prerequisites

The Outlines `SGLang` model is intended to be used along with an SGLang instance running on a separate server (can be local or remote). Make sure you have an SGLang server running and accessible before using the `SGLang` model, for instance by running:

```shell
pip install "sglang[all]"

python -m sglang.launch_server \
  --model-path NousResearch/Meta-Llama-3-8B-Instruct \
  --host 0.0.0.0 \
  --port 30000
```

Follow the [Installation instructions](https://docs.sglang.ai/start/install.html) for more information on how to set up a SGLang server for your particular setup.

As the SGLang client relies on the `openai` python sdk, you need to have the `openai` package installed. Install all optional dependencies of the `SGLang` model with: `pip install "outlines[sglang]"`.

When launching your SGLang server, you can specify the backend engine to use for structured generation through the `grammar-backend` cli argument. Add `--grammar-backend outlines` to your command to use Outlines instead of the default engine.

## Model Initialization

To load the model, you can use the `from_sglang` function. The argument of the function is either an `OpenAI` or `AsyncOpenAI` instance from the `openai` library. Make sure the value of the `base_url` argument of the `OpenAI` client points to your running SGLang server. Consult the [SGLang documentation](https://docs.sglang.ai/backend/send_request.html) on using an OpenAI client with an SGLang server for more information.

Based on whether the `openai` client instance is synchronous or asynchronous, you will receive a `SGLang` or `AsyncSGLang` model instance.

For instance:

```python
import openai
import outlines

# Create the OpenAI client
sync_openai_client = openai.OpenAI(base_url="http://localhost:11434")
async_openai_client = openai.AsyncOpenAI(base_url="http://localhost:11434")

# Create a sync model
sync_model = outlines.from_sglang(sync_openai_client)
print(type(sync_model)) # <class 'outlines.models.sglang.SGLang'>

# Create an async model
async_model = outlines.from_sglang(async_openai_client)
print(type(async_model)) # <class 'outlines.models.sglang.AsyncSGLang'>
```

## Text Generation

To generate text, you can simply call the model with a prompt.

For instance:

```python
import openai
import outlines

# Create the model
model = outlines.from_sglang(openai.OpenAI(base_url="http://localhost:11434"))

# Call it to generate text
response = model("What's the capital of Latvia?", max_tokens=20)
print(response) # 'Riga'
```

#### Vision

Some models you can run with SGLang support vision input. To use this feature, provide a list containing a text prompt and `Image` instances.

For instance:

```python
import io
import requests
import PIL
import outlines
import openai
from outlines.inputs import Image

# Create the model
model = outlines.from_sglang(openai.OpenAI(base_url="http://localhost:11434"))

# Function to get an image
def get_image(url):
    r = requests.get(url)
    return PIL.Image.open(io.BytesIO(r.content))

# Create the prompt containing the text and the image
prompt = [
    "Describe the image",
    Image(get_image("https://picsum.photos/id/237/400/300"))
]

# Call the model to generate a response
response = model(prompt, max_tokens=50)
print(response) # 'This is a picture of a black dog.'
```

#### Chat

You can also use chat inputs with the `SGLang` model. To do so, call the model with a `Chat` instance. The content of messages within the chat can be vision inputs as described above.

For instance:

```python
import io
import requests
import PIL
import openai
import outlines
from outlines.inputs import Chat, Image

# Create the model
model = outlines.from_sglang(openai.OpenAI(base_url="http://localhost:11434"))

# Function to get an image
def get_image(url):
    r = requests.get(url)
    return PIL.Image.open(io.BytesIO(r.content))

# Create the chat input
prompt = Chat([
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": ["Describe the image", Image(get_image("https://picsum.photos/id/237/400/300"))]
    },
])

# Call the model to generate a response
response = model(prompt, max_tokens=50)
print(response) # 'This is a picture of a black dog.'
```

#### Streaming

Finally, the `SGLang` model supports streaming through the `stream` method.

For instance:

```python
import openai
import outlines

# Create the model
model = outlines.from_sglang(openai.OpenAI(base_url="http://localhost:11434"))

# Stream the response
for chunk in model.stream("Tell me a short story about a cat.", max_tokens=50):
    print(chunk) # 'Once...'
```

## Structured Generation

SGLang supports all output types available in Outlines (context-free grammars with caveats though, see the subsection below for more details). Simply provide an `output_type` after the prompt when calling the model. All structured generation features work with both synchronous and asynchronous models.

### Simple Type

```python
import openai
import outlines

output_type = int

openai_client = openai.OpenAI(base_url="http://localhost:11434")
model = outlines.from_sglang(openai_client)

result = model("How many countries are there in the world?", output_type)
print(result) # '200'
```

### JSON Schema

```python
import openai
import outlines
from typing import List
from pydantic import BaseModel

class Character(BaseModel):
    name: str
    age: int
    skills: List[str]

openai_client = openai.OpenAI(base_url="http://localhost:11434")
model = outlines.from_sglang(openai_client)

result = model("Create a character.", Character, frequency_penalty=1.5)
print(result) # '{"name": "Evelyn", "age": 34, "skills": ["archery", "stealth", "alchemy"]}'
print(Character.model_validate_json(result)) # name=Evelyn, age=34, skills=['archery', 'stealth', 'alchemy']
```

### Multiple Choice

```python
from typing import Literal
import openai
import outlines

output_type = Literal["Paris", "London", "Rome", "Berlin"]

openai_client = openai.OpenAI(base_url="http://localhost:11434")
model = outlines.from_sglang(openai_client)

result = model("What is the capital of France?", output_type, temperature=0)
print(result) # 'Paris'
```

### Regex

```python
import openai
import outlines
from outlines.types import Regex

output_type = Regex(r"\d{3}-\d{2}-\d{4}")

openai_client = openai.OpenAI(base_url="http://localhost:11434")
model = outlines.from_sglang(openai_client)

result = model("Generate a fake social security number.", output_type, top_p=0.1)
print(result) # '782-32-3789'
```

### Context-Free Grammar

SGLang supports grammars, but expects an EBNF format instead of the Lark format Outlines uses. Thus, to use a context-free grammar with SGLang, provide a string using the EBNF syntax to the Outlines `CFG` object.

```python
import openai
import outlines
from outlines.types import CFG

ebnf_grammar = """
root ::= answer
answer ::= "yes" | "no"
"""
output_type = CFG(ebnf_grammar)

openai_client = openai.OpenAI(base_url="http://localhost:11434")
model = outlines.from_sglang(openai_client)

result = model("Is the weather good today?", output_type)
print(result) # 'yes'
```

### Async Structured Generation

All structured generation features work seamlessly with async models:

```python
import asyncio
import openai
import outlines
from typing import List
from pydantic import BaseModel

class User(BaseModel):
    name: str
    email: str
    age: int

async def generate_user():
    async_client = openai.AsyncOpenAI(base_url="http://localhost:11434")
    async_model = outlines.from_sglang(async_client)

    result = await async_model("Generate a random user profile.", output_type=User)
    user = User.model_validate_json(result)
    print(f"Name: {user.name}, Email: {user.email}, Age: {user.age}")

asyncio.run(generate_user())
```

## Inference Arguments

When calling the model, you can provide optional parameters on top of the prompt and the output type. Those will be passed on to the `chat.completions.create` method of the OpenAI client.

An optional parameter of particular interest is `extra_body`, which is a dictionary containing arguments that are specific to SGLang and are not part of the standard `openai` interface.

See the [SGLang documentation](https://docs.sglang.ai/backend/openai_api_completions.html) on parameters for the OpenAI-compatible server for more information on inference parameters.


================================================
FILE: docs/features/models/tgi.md
================================================
---
title: TGI
---

# TGI

## Prerequisites

The Outlines `TGI` model is intended to be used along with a HuggingFace `Text Generation Inference` server (running locally or remotely). Make sure you have a TGI server running before using the `TGI` model, for instance by running:

```shell
docker run \
  --gpus all \
  --shm-size 1g \
  -p 8080:80 \
  ghcr.io/huggingface/text-generation-inference:3.3.4 \
  --model-id NousResearch/Meta-Llama-3-8B-Instruct
```

Please consult the [installation guide](https://huggingface.co/docs/text-generation-inference/en/quicktour) for more information about how to run TGI with your particular setup.

As the TGI client relies on the `huggingface_hub` python package, you need to have it installed. Install all optional dependencies of the `TGI` model with: `pip install "outlines[tgi]"`.

## Model Initialization

To load the model, you can use the `from_tgi` function. The argument of the function is either an `InferenceClient` or `AsyncInferenceClient` instance from the `huggingface_hub` library. Consult the [HuggingFace documentation](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client) for more information on their inference client.

Based on whether the inference client instance is synchronous or asynchronous, you will receive a `TGI` or an `AsyncTGI` model instance.

For instance:

```python
import outlines
import huggingface_hub

# Create the inference client
client = huggingface_hub.InferenceClient("http://localhost:11434")
async_client = huggingface_hub.AsyncInferenceClient("http://localhost:11434")

# Create a sync model
sync_model = outlines.from_tgi(client)
print(type(sync_model))  # <class 'outlines.models.tgi.TGI'>

# Create an async model
async_model = outlines.from_tgi(async_client)
print(type(async_model))  # <class 'outlines.models.tgi.AsyncTGI'>
```

## Text Generation

To generate text, you can simply call the model with a prompt.

For instance:

```python
import outlines
import huggingface_hub

# Create the model
client = huggingface_hub.InferenceClient("http://localhost:11434")
model = outlines.from_tgi(client)

# Call it to generate text
result = model("Write a short story about a cat.", stop_sequences=["."])
print(result) # 'In a quiet village where the cobblestones hummed softly beneath the morning mist...'
```

The `TGI` model supports streaming. For instance:

```python
import outlines
import huggingface_hub

# Create the model
client = huggingface_hub.InferenceClient("http://localhost:11434")
model = outlines.from_tgi(client)

# Stream text
for chunk in model.stream("Write a short story about a cat.", stop_sequences=["."]):
    print(chunk) # 'In ...'
```

## Asynchronous Calls

TGI supports asynchronous operations by passing an `AsyncInferenceClient` instead of a regular `InferenceClient`. This returns an `AsyncTGI` model instance that supports async/await patterns.

### Basic Async Generation

```python
import asyncio
import outlines
import huggingface_hub

async def generate_text():
    # Create an async model
    async_client = huggingface_hub.AsyncInferenceClient("http://localhost:11434")
    async_model = outlines.from_tgi(async_client)

    result = await async_model("Write a haiku about Python.", max_new_tokens=50)
    print(result)

asyncio.run(generate_text())
```

### Async Streaming

The async model also supports streaming with async iteration:

```python
import asyncio
import outlines
import huggingface_hub

async def stream_text():
    async_client = huggingface_hub.AsyncInferenceClient("http://localhost:11434")
    async_model = outlines.from_tgi(async_client)

    async for chunk in async_model.stream("Tell me a story about a robot.", max_new_tokens=100):
        print(chunk, end="")

asyncio.run(stream_text())
```

### Concurrent Async Requests

One of the main benefits of async calls is the ability to make multiple concurrent requests:

```python
import asyncio
import outlines
import huggingface_hub

async def generate_multiple():
    async_client = huggingface_hub.AsyncInferenceClient("http://localhost:11434")
    async_model = outlines.from_tgi(async_client)

    # Define multiple prompts
    prompts = [
        "Write a tagline for a coffee shop.",
        "Write a tagline for a bookstore.",
        "Write a tagline for a gym."
    ]

    tasks = [async_model(prompt, max_new_tokens=30) for prompt in prompts]
    results = await asyncio.gather(*tasks)

    for prompt, result in zip(prompts, results):
        print(f"{prompt}\n{result}\n")

asyncio.run(generate_multiple())
```

## Structured Generation

TGI supports all output types available in Outlines except for context-free grammars. Simply provide an `output_type` after the prompt when calling the model. All structured generation features work with both synchronous and asynchronous models.

### Simple Type

```python
import outlines
import huggingface_hub

output_type = int

tgi_client = huggingface_hub.InferenceClient("http://localhost:8080")
model = outlines.from_tgi(tgi_client)

result = model("How many countries are there in the world?", output_type)
print(result) # '200'
```

### JSON Schema

```python
import outlines
import huggingface_hub
from typing import List
from pydantic import BaseModel

class Character(BaseModel):
    name: str
    age: int
    skills: List[str]

tgi_client = huggingface_hub.InferenceClient("http://localhost:8080")
model = outlines.from_tgi(tgi_client)

result = model("Create a character.", output_type=Character, frequency_penalty=1.5)
print(result) # '{"name": "Evelyn", "age": 34, "skills": ["archery", "stealth", "alchemy"]}'
print(Character.model_validate_json(result)) # name=Evelyn, age=34, skills=['archery', 'stealth', 'alchemy']
```

### Multiple Choice

```python
import outlines
import huggingface_hub
from typing import Literal

output_type = Literal["Paris", "London", "Rome", "Berlin"]

tgi_client = huggingface_hub.InferenceClient("http://localhost:8080")
model = outlines.from_tgi(tgi_client)

result = model("What is the capital of France?", output_type, temperature=0)
print(result) # 'Paris'
```

### Regex

```python
import outlines
import huggingface_hub
from outlines.types import Regex

output_type = Regex(r"\d{3}-\d{2}-\d{4}")

tgi_client = huggingface_hub.InferenceClient("http://localhost:8080")
model = outlines.from_tgi(tgi_client)

result = model("Generate a fake social security number.", output_type, top_p=0.1)
print(result) # '782-32-3789'
```

### Async Structured Generation

All structured generation features work seamlessly with async models:

```python
import asyncio
import outlines
import huggingface_hub
from pydantic import BaseModel

class User(BaseModel):
    name: str
    email: str
    age: int

async def generate_user():
    async_client = huggingface_hub.AsyncInferenceClient("http://localhost:11434")
    async_model = outlines.from_tgi(async_client)

    result = await async_model("Generate a random user profile.", output_type=User)
    user = User.model_validate_json(result)
    print(f"Name: {user.name}, Email: {user.email}, Age: {user.age}")

asyncio.run(generate_user())
```

## Inference parameters

When calling the model, you can provide optional parameters on top of the prompt and the output type. Those will be passed on to the `text_generation` method of the TGI client.

Common parameters include `max_new_tokens`, `stop_sequences`, `temperature`, `top_k`, `top_p`, and others as specified in the [TGI inference client documentation](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient).


================================================
FILE: docs/features/models/transformers.md
================================================
---
title: Transformers
---

# Transformers

!!! Installation

    You need to install the `transformers` library to be able to use the `Transformers` model in Outlines. Install all optional dependencies of the `Transformers` model with: `pip install "outlines[transformers]"`.

    See the [HuggingFace documentation](https://huggingface.co/docs/transformers/en/installation) for more information on installing `transformers` with CPU, GPU...

## Model Initialization

To load the model, you can use the `from_transformers` function. It takes 3 arguments:

- `model`: a `transformers` model (created with `AutoModelForCausalLM` for instance)
- `tokenizer_or_processor`: a `transformers` tokenizer (created with `AutoTokenizer` for instance, it must be an instance of either `PreTrainedTokenizer` or `PreTrainedTokenizerFast`)
- `device_dtype` (optional): the tensor dtype to use for inference. If not provided, the model will use the default dtype.

For instance:

```python
import outlines
from transformers import AutoModelForCausalLM, AutoTokenizer

# Create the transformers model and tokenizer
hf_model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
hf_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

# Create the Outlines model
model = outlines.from_transformers(hf_model, hf_tokenizer)
```

If you provide a processor instead of a tokenizer for the second argument of the `from_transformers` function, you would get a `TransformersMultiModal` instance. See the [TransformersMultiModal model documentation](./transformers_multimodal.md) for more information on using multimodal models in Outlines.

## Text Generation

To generate text, you can simply call the model with a prompt.

For instance:

```python
import outlines
from transformers import AutoModelForCausalLM, AutoTokenizer

# Create model
model = outlines.from_transformers(
    AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
    AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
)

# Call it to generate text
result = model("What's the capital of Latvia?", max_new_tokens=20)
print(result) # 'Riga'
```

#### Chat

You can also use chat inputs with the `Transformers` model. To do so, call the model with a `Chat` instance. The content of messages within the chat can be vision inputs as described above.

For instance:

```python
import outlines
from outlines.inputs import Chat
from transformers import AutoModelForCausalLM, AutoTokenizer

# Create the model
model = outlines.from_transformers(
    AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
    AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
)

# Create the chat input
prompt = Chat([
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What's the capital of Latvia?"},
])

# Call the model to generate a response
response = model(prompt, max_new_tokens=50)
print(response) # 'Riga'
```

#### Batching

Finally, the `Transformers` model supports batching through the `batch` method. To use it, provide a list of prompts (using the formats described above) to the `batch` method. You will receive as a result a list of completions.

For instance:

```python
import outlines
from transformers import AutoModelForCausalLM, AutoTokenizer

# Create model
model = outlines.from_transformers(
    AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
    AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
)

# Create a list of prompts that will be used in a single batch
prompts = [
    "What's the capital of Lithuania?",
    "What's the capital of Latvia?",
    "What's the capital of Estonia?"
]

# Call it to generate text
result = model.batch(prompts, max_new_tokens=20)
print(result) # ['Vilnius', 'Riga', 'Tallinn']
```

## Structured Generation

As a local model, `Transformers` supports all output types available in Outlines. Simply provide an `output_type` after the prompt when calling the model.

### Simple Type

```python
import outlines
from transformers import AutoModelForCausalLM, AutoTokenizer

output_type = int

model = outlines.from_transformers(
    AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
    AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
)

result = model("How many countries are there in the world?", output_type, max_new_tokens=5)
print(result) # '200'
```

### JSON Schema

```python
import outlines
from transformers import AutoModelForCausalLM, AutoTokenizer
from pydantic import BaseModel
from typing import List

class Character(BaseModel):
    name: str
    age: int
    skills: List[str]

model = outlines.from_transformers(
    AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
    AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
)

result = model("Create a character.", output_type=Character, max_new_tokens=200, repetition_penalty=0.5)
print(result) # '{"name": "Evelyn", "age": 34, "skills": ["archery", "stealth", "alchemy"]}'
print(Character.model_validate_json(result)) # name=Evelyn, age=34, skills=['archery', 'stealth', 'alchemy']
```

### Multiple Choice

```python
from typing import Literal
import outlines
from transformers import AutoModelForCausalLM, AutoTokenizer

output_type = Literal["Paris", "London", "Rome", "Berlin"]

model = outlines.from_transformers(
    AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
    AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
)

result = model("What is the capital of France?", output_type, max_new_tokens=10, temperature=0)
print(result) # 'Paris'
```

### Regex

```python
import outlines
from transformers import AutoModelForCausalLM, AutoTokenizer
from outlines.types import Regex

output_type = Regex(r"\d{3}-\d{2}-\d{4}")

model = outlines.from_transformers(
    AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
    AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
)

result = model("Generate a fake social security number.", output_type, max_new_tokens=20, top_p=0.5)
print(result) # '782-32-3789'
```

### Context-Free Grammar

```python
import outlines
from transformers import AutoModelForCausalLM, AutoTokenizer
from outlines.types import CFG

arithmetic_grammar = """
?start: sum

?sum: product
| sum "+" product   -> add
| sum "-" product   -> sub

?product: atom
| product "*" atom  -> mul
| product "/" atom  -> div

?atom: NUMBER           -> number
| "-" atom         -> neg
| "(" sum ")"

%import common.NUMBER
%import common.WS_INLINE

%ignore WS_INLINE
"""
output_type = CFG(arithmetic_grammar)

model = outlines.from_transformers(
    AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
    AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
)

result = model("Write an addition.", output_type, max_new_tokens=100)
print(result) # '23 + 48'
```

## Inference Arguments

When calling the model, you can provide optional inference parameters on top of the prompt and the output type. These parameters will be passed on to the `generate` method of the `transformers` model. Some common inference arguments include `max_new_tokens`, `temperature`, `repetition_penalty` and `top_p`.

See the [transformers documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation) for more information on inference parameters.

!!! Warning

    The `max_new_tokens` inference parameter has a default value of 20. This is insufficient for most tasks and will result in the generation output not respecting the output type (because the response is truncated). We recommend you always provide a value for this argument.


================================================
FILE: docs/features/models/transformers_multimodal.md
================================================
---
title: Transformers MultiModal
---

# Transformers MultiModal

The Outlines `TransformersMultiModal` model inherits from `Transformers` and shares most of its interface. Please start by reading the [Transformers documentation](./transformers.md) as this document only focuses on the specificities of `TransformersMultiModal` compared to `Transformers`.

## Model Initialization

To load the model, you can use the `from_transformers` function. It takes 3 arguments:

- `model`: a `transformers` model (created with `AutoModelForImageTextToText` for instance)
- `tokenizer_or_processor`: a `transformers` processor (created with `AutoProcessor` for instance, it must be an instance of `ProcessorMixin`)
- `device_dtype` (optional): the tensor dtype to use for inference. If not provided, the model will use the default dtype.

For instance:

```python
import outlines
from transformers import AutoModelForImageTextToText, AutoProcessor

# Create the transformers model and processor
hf_model = AutoModelForImageTextToText.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
hf_processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")

# Create the Outlines model
model = outlines.from_transformers(hf_model, hf_processor)
```

## Model Input

As with other multimodal models, you should provide a list containing a text prompt and assets (`Image`, `Audio` or `Video` instances) as the model input. The type of asset to provide depends on the capabilities of the `transformers` model you are running.

Here's an example of using a vision multimodal model:

```python
from io import BytesIO
from urllib.request import urlopen

from PIL import Image as PILImage
from pydantic import BaseModel
from transformers import (
    LlavaForConditionalGeneration,
    AutoProcessor,
)

import outlines
from outlines.inputs import Image

TEST_MODEL = "trl-internal-testing/tiny-LlavaForConditionalGeneration"
IMAGE_URL = "https://upload.wikimedia.org/wikipedia/commons/2/25/Siam_lilacpoint.jpg"

class Animal(BaseModel):
    specie: str
    color: str
    weight: int

def get_image_from_url(image_url):
    img_byte_stream = BytesIO(urlopen(image_url).read())
    image = PILImage.open(img_byte_stream).convert("RGB")
    image.format = "PNG"
    return image

# Create a model
model = outlines.from_transformers(
    LlavaForConditionalGeneration.from_pretrained(TEST_MODEL),
    AutoProcessor.from_pretrained(TEST_MODEL),
)

# Call it with a list containing a text prompt and an image, plus an output type
result = model(
    ["<image>Describe this animal.", Image(get_image_from_url(IMAGE_URL))],
    Animal,
    max_new_tokens=100
)
print(result) # '{"specie": "cat", "color": "white", "weight": 4}'
print(Animal.model_validate_json(result)) # specie=cat, color=white, weight=4
```
!!! Warning

    Make sure your prompt contains the tags expected by your processor to correctly inject the assets in the prompt. For some vision multimodal models for instance, you need to add as many `<image>` tags in your prompt as there are image assets included in your model input. The `Chat` input format, by contrast, does not require this step.


### Chat
The `Chat` interface offers a more convenient way to work with multimodal inputs. You don't need to manually add asset tags like `<image>`. The model's HF processor handles the chat templating and asset placement for you automatically.
To do so, call the model with a `Chat` instance using a multimodal chat format. Assets must be wrapped in the corresponding `outlines.inputs` class (`Image`, `Audio` or `Video`), and only the `image`, `video`, and `audio` content types are supported.

For instance:

```python
import outlines
from outlines.inputs import Chat, Image
from transformers import AutoModelForImageTextToText, AutoProcessor
from PIL import Image as PILImage
from io import BytesIO
from urllib.request import urlopen
import torch

model_kwargs = {
        "torch_dtype": torch.bfloat16,
        "attn_implementation": "flash_attention_2",
        "device_map": "auto",
    }

def get_image_from_url(image_url):
    img_byte_stream = BytesIO(urlopen(image_url).read())
    image = PILImage.open(img_byte_stream).convert("RGB")
    image.format = "PNG"
    return image

# Create the model
model = outlines.from_transformers(
    AutoModelForImageTextToText.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct", **model_kwargs),
    AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct", **model_kwargs)
)

IMAGE_URL = "https://upload.wikimedia.org/wikipedia/commons/2/25/Siam_lilacpoint.jpg"

# Create the multimodal chat input
prompt = Chat([
    {
        "role": "user",
        "content": [
            {"type": "image", "image": Image(get_image_from_url(IMAGE_URL))},
            {"type": "text", "text": "Describe the image in few words."}
        ],
    }
])

# Call the model to generate a response
response = model(prompt, max_new_tokens=50)
print(response) # 'A Siamese cat with blue eyes is sitting on a cat tree, looking alert and curious.'
```

Or using a list containing text and assets:

```python
import outlines
from outlines.inputs import Chat, Image
from transformers import AutoModelForImageTextToText, AutoProcessor
from PIL import Image as PILImage
from io import BytesIO
import requests
import torch


TEST_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"

# Function to get an image
def get_image(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    r = requests.get(url, headers=headers)
    image = PILImage.open(BytesIO(r.content)).convert("RGB")
    image.format = "PNG"
    return image

model_kwargs = {
        "torch_dtype": torch.bfloat16,
        # "attn_implementation": "flash_attention_2",
        "device_map": "auto",
    }

# Create a model
model = outlines.from_transformers(
    AutoModelForImageTextToText.from_pretrained(TEST_MODEL, **model_kwargs),
    AutoProcessor.from_pretrained(TEST_MODEL, **model_kwargs),
)

# Create the chat input
prompt = Chat([
    {"role": "user", "content": "You are a helpful assistant that helps me described pictures."},
    {"role": "assistant", "content": "I'd be happy to help you describe pictures! Please go ahead and share an image"},
    {
        "role": "user",
        "content": ["Describe briefly the image", Image(get_image("https://upload.wikimedia.org/wikipedia/commons/2/25/Siam_lilacpoint.jpg"))]
    },
])

# Call the model to generate a response
response = model(prompt, max_new_tokens=50)
print(response) # 'The image shows a light-colored cat with a white chest...'
```


### Batching
The `TransformersMultiModal` model supports batching through the `batch` method. To use it, provide a list of prompts (using the formats described above) to the `batch` method. You will receive as a result a list of completions.

An example using the Chat format:

```python
import outlines
from outlines.inputs import Chat, Image
from transformers import AutoModelForImageTextToText, AutoProcessor
from PIL import Image as PILImage
from io import BytesIO
from urllib.request import urlopen
import torch
from pydantic import BaseModel

model_kwargs = {
        "torch_dtype": torch.bfloat16,
        "attn_implementation": "flash_attention_2",
        "device_map": "auto",
    }

class Animal(BaseModel):
    animal: str
    color: str

def get_image_from_url(image_url):
    img_byte_stream = BytesIO(urlopen(image_url).read())
    image = PILImage.open(img_byte_stream).convert("RGB")
    image.format = "PNG"
    return image

# Create the model
model = outlines.from_transformers(
    AutoModelForImageTextToText.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct", **model_kwargs),
    AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct", **model_kwargs)
)

IMAGE_URL_1 = "https://upload.wikimedia.org/wikipedia/commons/2/25/Siam_lilacpoint.jpg"
IMAGE_URL_2 = "https://upload.wikimedia.org/wikipedia/commons/a/af/Golden_retriever_eating_pigs_foot.jpg"

# Create the multimodal chat messages
messages = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe the image in few words."},
            {"type": "image", "image": Image(get_image_from_url(IMAGE_URL_1))},
        ],
    },
]

messages_2 = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe the image in few words."},
            {"type": "image", "image": Image(get_image_from_url(IMAGE_URL_2))},
        ],
    },
]

prompts = [Chat(messages), Chat(messages_2)]

# Call the model to generate a response
responses = model.batch(prompts, output_type=Animal, max_new_tokens=100)
print(responses) # ['{ "animal": "cat", "color": "white and gray" }', '{ "animal": "dog", "color": "white" }']
print([Animal.model_validate_json(i) for i in responses]) # [Animal(animal='cat', color='white and gray'), Animal(animal='dog', color='white')]
```


An example using a list of lists, where each prompt contains asset tags:

```python
from io import BytesIO
from urllib.request import urlopen

from PIL import Image as PILImage
from transformers import (
    LlavaForConditionalGeneration,
    AutoProcessor,
)

import outlines
from outlines.inputs import Image

TEST_MODEL = "trl-internal-testing/tiny-LlavaForConditionalGeneration"
IMAGE_URL = "https://upload.wikimedia.org/wikipedia/commons/2/25/Siam_lilacpoint.jpg"
IMAGE_URL_2 ="https://upload.wikimedia.org/wikipedia/commons/9/98/Aldrin_Apollo_11_original.jpg"

def get_image_from_url(image_url):
    img_byte_stream = BytesIO(urlopen(image_url).read())
    image = PILImage.open(img_byte_stream).convert("RGB")
    image.format = "PNG"
    return image

# Create a model
model = outlines.from_transformers(
    LlavaForConditionalGeneration.from_pretrained(TEST_MODEL),
    AutoProcessor.from_pretrained(TEST_MODEL),
)

# Call the batch method with a list of model inputs (each one a list containing a text prompt and an image)
result = model.batch(
    [
        ["<image>Describe the image.", Image(get_image_from_url(IMAGE_URL))],
        ["<image>Describe the image.", Image(get_image_from_url(IMAGE_URL_2))],
    ]
)
print(result) # ['The image shows a cat', 'The image shows an astronaut']
```


================================================
FILE: docs/features/models/vllm.md
================================================
---
title: vLLM
---

# vLLM

## Prerequisites

The Outlines `VLLM` model is intended to be used along with a vLLM instance running on a separate server (can be local or remote). Make sure you have a vLLM server running and accessible before using the `VLLM` model. For instance by running:

```shell
pip install vllm

vllm serve microsoft/Phi-3-mini-4k-instruct \
  --dtype auto \
  --api-key token-abc123
```

Follow the [Installation instructions](https://docs.vllm.ai/en/latest/getting_started/installation/index.html) for more information on how to set up a vLLM server for your particular setup.

As the vLLM client relies on the `openai` python sdk, you need to have the `openai` package installed. Install all optional dependencies for the `VLLM` model with: `pip install openai`.

If you want to use the vllm offline inference mode instead of the server mode, please refer to the [VLLMOffline](./vllm_offline.md) model documentation.

## Model Initialization

To load the model, you can use the `from_vllm` function. Its first argument is either an `OpenAI` or `AsyncOpenAI` instance from the `openai` library, and its second argument is the name of the model to use on the vLLM server. Make sure the value of the `base_url` argument of the `OpenAI` client points to your running vLLM server. Consult the [vLLM documentation](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html) on using an OpenAI client with a vLLM server for more information.

Based on whether the `openai` client instance is synchronous or asynchronous, you will receive a `VLLM` or `AsyncVLLM` model instance.

For instance:

```python
import openai
import outlines

# Create the OpenAI client
sync_openai_client = openai.OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="token-abc123")
async_openai_client = openai.AsyncOpenAI(base_url="http://0.0.0.0:8000/v1", api_key="token-abc123")

# Create a sync model
sync_model = outlines.from_vllm(sync_openai_client, "microsoft/Phi-3-mini-4k-instruct")
print(type(sync_model)) # <class 'outlines.models.vllm.VLLM'>

# Create an async model
async_model = outlines.from_vllm(async_openai_client, "microsoft/Phi-3-mini-4k-instruct")
print(type(async_model)) # <class 'outlines.models.vllm.AsyncVLLM'>
```

## Text Generation

To generate text, you can simply call the model with a prompt.

For instance:

```python
import openai
import outlines

# Create the model
model = outlines.from_vllm(openai.OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="token-abc123"), "microsoft/Phi-3-mini-4k-instruct")

# Call it to generate text
response = model("What's the capital of Latvia?", max_tokens=20)
print(response) # 'The capital of Latvia is Riga.'
```

#### Vision

Some models you can run with VLLM support vision input. To use this feature, provide a list containing a text prompt and `Image` instances.

For instance:

```python
import io
import requests
import PIL
import outlines
import openai
from outlines.inputs import Image

# Create the model
model = outlines.from_vllm(
    openai.OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="token-abc123"),
    "Qwen/Qwen2.5-VL-3B-Instruct"
)

# Function to get an image
def get_image(url):
    r = requests.get(url)
    return PIL.Image.open(io.BytesIO(r.content))

# Create the prompt containing the text and the image
prompt = [
    "Describe the image",
    Image(get_image("https://picsum.photos/id/237/400/300"))
]

# Call the model to generate a response
response = model(prompt, max_tokens=50)
print(response) # 'The image shows a black puppy lying on a wooden surface...'
```

#### Chat

You can also use chat inputs with the `VLLM` model. To do so, call the model with a `Chat` instance. The content of a message within the chat can include vision inputs as described above.

For instance:

```python
import io
import requests
import PIL
import openai
import outlines
from outlines.inputs import Chat, Image

# Create the model
model = outlines.from_vllm(
    openai.OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="token-abc123"),
    "Qwen/Qwen2.5-VL-3B-Instruct"
)

# Function to get an image
def get_image(url):
    r = requests.get(url)
    return PIL.Image.open(io.BytesIO(r.content))

# Create the chat input
prompt = Chat([
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": ["Describe the image", Image(get_image("https://picsum.photos/id/237/400/300"))]
    },
])

# Call the model to generate a response
response = model(prompt, max_tokens=50)
print(response) # 'The image shows a black puppy lying on a wooden surface...'
```

#### Streaming

Finally, the `VLLM` model supports streaming through the `stream` method.

For instance:

```python
import openai
import outlines

# Create the model
model = outlines.from_vllm(
    openai.OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="token-abc123"),
    "microsoft/Phi-3-mini-4k-instruct"
)

# Stream the response
for chunk in model.stream("Tell me a short story about a cat.", max_tokens=50):
    print(chunk, end="") # 'Once upon a time...'
print()
```

## Asynchronous Calls

The `VLLM` integration supports asynchronous operations: pass an `AsyncOpenAI` client instead of a regular `OpenAI` client to `from_vllm`. This returns an `AsyncVLLM` model instance that supports async/await patterns.

### Basic Async Generation

```python
import asyncio
import openai
import outlines

async def generate_text():
    async_client = openai.AsyncOpenAI(base_url="http://0.0.0.0:8000/v1", api_key="token-abc123")
    async_model = outlines.from_vllm(async_client, "microsoft/Phi-3-mini-4k-instruct")

    result = await async_model("Write a haiku about Python.", max_tokens=50)
    print(result)

asyncio.run(generate_text())
```

### Async Streaming

The async model also supports streaming with async iteration:

```python
import asyncio
import openai
import outlines

async def stream_text():
    async_client = openai.AsyncOpenAI(base_url="http://0.0.0.0:8000/v1", api_key="token-abc123")
    async_model = outlines.from_vllm(async_client, "microsoft/Phi-3-mini-4k-instruct")

    async for chunk in async_model.stream("Tell me a story about a robot.", max_tokens=100):
        print(chunk, end="")

asyncio.run(stream_text())
```

### Concurrent Async Requests

One of the main benefits of async calls is the ability to make multiple concurrent requests:

```python
import asyncio
import openai
import outlines

async def generate_multiple():
    async_client = openai.AsyncOpenAI(base_url="http://0.0.0.0:8000/v1", api_key="token-abc123")
    async_model = outlines.from_vllm(async_client, "microsoft/Phi-3-mini-4k-instruct")

    prompts = [
        "Write a tagline for a coffee shop.",
        "Write a tagline for a bookstore.",
        "Write a tagline for a gym."
    ]

    tasks = [async_model(prompt, max_tokens=30) for prompt in prompts]
    results = await asyncio.gather(*tasks)

    for prompt, result in zip(prompts, results):
        print(f"{prompt}\n{result}\n")

asyncio.run(generate_multiple())
```

## Structured Generation

vLLM supports all output types available in Outlines. Simply provide an `output_type` after the prompt when calling the model. All structured generation features work with both synchronous and asynchronous models.

### Simple Type

```python
import openai
import outlines

output_type = int

openai_client = openai.OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="token-abc123")
model = outlines.from_vllm(openai_client, "microsoft/Phi-3-mini-4k-instruct")

result = model("How many countries are there in the world?", output_type)
print(result) # '200'
```

### JSON Schema

```python
import openai
import outlines
from typing import List
from pydantic import BaseModel

class Character(BaseModel):
    name: str
    age: int
    skills: List[str]

openai_client = openai.OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="token-abc123")
model = outlines.from_vllm(openai_client, "microsoft/Phi-3-mini-4k-instruct")

result = model("Create a character.", output_type=Character, frequency_penalty=1.5)
print(result) # '{"name": "Evelyn", "age": 34, "skills": ["archery", "stealth", "alchemy"]}'
print(Character.model_validate_json(result)) # name=Evelyn, age=34, skills=['archery', 'stealth', 'alchemy']
```

### Multiple Choice

```python
from typing import Literal
import openai
import outlines

output_type = Literal["Paris", "London", "Rome", "Berlin"]

openai_client = openai.OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="token-abc123")
model = outlines.from_vllm(openai_client, "microsoft/Phi-3-mini-4k-instruct")

result = model("What is the capital of France?", output_type, temperature=0)
print(result) # 'Paris'
```

### Regex

```python
import openai
import outlines
from outlines.types import Regex

output_type = Regex(r"\d{3}-\d{2}-\d{4}")

openai_client = openai.OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="token-abc123")
model = outlines.from_vllm(openai_client, "microsoft/Phi-3-mini-4k-instruct")

result = model("Generate a fake social security number.", output_type, top_p=0.1)
print(result) # '782-32-3789'
```

### Context-Free Grammar

```python
import openai
import outlines
from outlines.types import CFG

arithmetic_grammar = """
?start: sum

?sum: product
| sum "+" product   -> add
| sum "-" product   -> sub

?product: atom
| product "*" atom  -> mul
| product "/" atom  -> div

?atom: NUMBER           -> number
| "-" atom         -> neg
| "(" sum ")"

%import common.NUMBER
%import common.WS_INLINE

%ignore WS_INLINE
"""
output_type = CFG(arithmetic_grammar)

openai_client = openai.OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="token-abc123")
model = outlines.from_vllm(openai_client, "microsoft/Phi-3-mini-4k-instruct")

result = model("Write an addition.", output_type, extra_body={"guided_decoding_backend": "outlines"})
print(result) # '23 + 48'
```

### Async Structured Generation

All structured generation features work seamlessly with async models:

```python
import asyncio
import openai
import outlines
from pydantic import BaseModel

class User(BaseModel):
    name: str
    email: str
    age: int

async def generate_user():
    async_client = openai.AsyncOpenAI(base_url="http://0.0.0.0:8000/v1", api_key="token-abc123")
    async_model = outlines.from_vllm(async_client, "microsoft/Phi-3-mini-4k-instruct")

    result = await async_model("Generate a random user profile.", output_type=User)
    user = User.model_validate_json(result)
    print(f"Name: {user.name}, Email: {user.email}, Age: {user.age}")

asyncio.run(generate_user())
```

## Inference Arguments

When calling the model, you can provide optional parameters on top of the prompt and the output type. Those will be passed on to the `chat.completions.create` method of the OpenAI client.

An optional parameter of particular interest is `extra_body`, which is a dictionary containing arguments that are specific to vLLM and are not part of the standard `openai` interface. Among those, `guided_decoding_backend` allows you to select the library used by the vLLM server to control structured generation. You can use the value `outlines` to generate structured text with Outlines.

See the [vLLM documentation](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#extra-parameters) on extra parameters for the OpenAI-compatible server for more information on inference parameters.


================================================
FILE: docs/features/models/vllm_offline.md
================================================
---
title: vLLM Offline
---

# vLLM Offline

Outlines provides an integration with [vLLM](https://docs.vllm.ai/en/latest/) using the [vllm library](https://github.com/vllm-project/vllm). This model allows you to use vLLM in the "Offline Inference" mode, meaning that text generation happens within the model: there is no separate server. If you want to use vLLM with a server, see the [VLLM model documentation](./vllm.md).

!!! Note "Installation"

    You need to install the `vllm` library to be able to use the `VLLMOffline` model: `pip install vllm`. Due to a library version conflict between outlines and vllm, you MUST install `vllm` before installing `outlines`.

    When installing `outlines` (after having first installed `vllm`), you may encounter the following error: `ERROR: pip's dependency resolver does not currently take into account all the packages that are installed`. You can safely ignore it.

    See the [vLLM documentation](https://docs.vllm.ai/en/latest/getting_started/installation/index.html) for instructions on how to install vLLM for CPU, ROCm...

## Model Initialization

To load the model, you can use the `from_vllm_offline` function. The single argument of the function is a `LLM` model instance from the `vllm` library. You will then receive a `VLLMOffline` model instance you can use to generate text.

Consult the [LLM class API reference](https://docs.vllm.ai/en/latest/api/vllm/index.html#vllm.LLM) for detailed information on how to create an `LLM` instance and on the various available parameters.

For instance:

```python
import outlines
from vllm import LLM

# Create the model
model = outlines.from_vllm_offline(
    LLM("microsoft/Phi-3-mini-4k-instruct")
)
```

!!! Note

    When initializing the `vllm.LLM` object, you can specify a `guided_decoding_backend` to choose what library will be used by vLLM to constrain the generation. Consult the [vLLM documentation](https://docs.vllm.ai/en/v0.8.2/features/structured_outputs.html) on structured output for the list of possible values.

## Text Generation

Once you've created your Outlines `VLLMOffline` model instance, you're all set to generate text with this provider. You can simply call the model with a prompt.

For instance:

```python
import outlines
from vllm import LLM, SamplingParams

# Create the model
model = outlines.from_vllm_offline(
    LLM("microsoft/Phi-3-mini-4k-instruct")
)

# Call it to generate text
response = model("What's the capital of Latvia?", sampling_params=SamplingParams(max_tokens=20))
print(response) # 'Riga'
```

#### Chat

You can also use chat inputs with the `VLLMOffline` model. To do so, call the model with a `Chat` instance. The content of a message within the chat can be vision inputs as described above.

For instance:

```python
import outlines
from vllm import LLM, SamplingParams
from outlines.inputs import Chat

# Create the model
model = outlines.from_vllm_offline(
    LLM("microsoft/Phi-3-mini-4k-instruct")
)

# Create the chat prompt
prompt = Chat([
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What's the capital of Latvia?"},
])

# Call the model to generate a response
response = model(prompt, sampling_params=SamplingParams(max_tokens=50))
print(response) # 'Riga'
```

#### Streaming

The `VLLMOffline` model supports streaming through the `stream` method.

For instance:

```python
import outlines
from vllm import LLM, SamplingParams

# Create the model
model = outlines.from_vllm_offline(
    LLM("microsoft/Phi-3-mini-4k-instruct")
)

# Stream the response
for chunk in model.stream("Tell me a short story about a cat.", sampling_params=SamplingParams(max_tokens=50)):
    print(chunk) # 'Once...'
```

#### Batching

Finally, the `VLLMOffline` model also supports batching through the `batch` method. To use it, provide a list of prompts (using the formats described above) to the `batch` method. You will receive as a result a list of completions.

For instance:

```python
import outlines
from vllm import LLM, SamplingParams

# Create the model
model = outlines.from_vllm_offline(
    LLM("microsoft/Phi-3-mini-4k-instruct")
)

# Create a list of prompts that will be used in a single batch
prompts = [
    "What's the capital of Lithuania?",
    "What's the capital of Latvia?",
    "What's the capital of Estonia?"
]

# Call it to generate text
result = model.batch(prompts, sampling_params=SamplingParams(max_tokens=20))
print(result) # ['Vilnius', 'Riga', 'Tallinn']
```

## Structured Generation

The `VLLMOffline` model supports all output types available in Outlines. Simply provide an `output_type` after the prompt when calling the model.

### Simple Type

```python
import outlines
from vllm import LLM

output_type = int

model = outlines.from_vllm_offline(
    LLM("microsoft/Phi-3-mini-4k-instruct")
)

result = model("How many countries are there in the world?", output_type)
print(result) # '200'
```

### JSON Schema

```python
import outlines
from vllm import LLM, SamplingParams
from typing import List
from pydantic import BaseModel

class Character(BaseModel):
    name: str
    age: int
    skills: List[str]

model = outlines.from_vllm_offline(
    LLM("microsoft/Phi-3-mini-4k-instruct")
)

result = model("Create a character.", output_type=Character, sampling_params=SamplingParams(frequency_penalty=1.5, max_tokens=200))
print(result) # '{"name": "Evelyn", "age": 34, "skills": ["archery", "stealth", "alchemy"]}'
print(Character.model_validate_json(result)) # name=Evelyn, age=34, skills=['archery', 'stealth', 'alchemy']
```

### Multiple Choice

```python
from typing import Literal
import outlines
from vllm import LLM, SamplingParams

output_type = Literal["Paris", "London", "Rome", "Berlin"]

model = outlines.from_vllm_offline(
    LLM("microsoft/Phi-3-mini-4k-instruct")
)

result = model("What is the capital of France?", output_type, sampling_params=SamplingParams(temperature=0))
print(result) # 'Paris'
```

### Regex

```python
import outlines
from vllm import LLM, SamplingParams
from outlines.types import Regex

output_type = Regex(r"\d{3}-\d{2}-\d{4}")

model = outlines.from_vllm_offline(
    LLM("microsoft/Phi-3-mini-4k-instruct")
)

result = model("Generate a fake social security number.", output_type, sampling_params=SamplingParams(top_p=0.1))
print(result) # '782-32-3789'
```

### Context-Free Grammar

```python
import outlines
from vllm import LLM, SamplingParams
from outlines.types import CFG

arithmetic_grammar = """
?start: sum

?sum: product
| sum "+" product   -> add
| sum "-" product   -> sub

?product: atom
| product "*" atom  -> mul
| product "/" atom  -> div

?atom: NUMBER           -> number
| "-" atom         -> neg
| "(" sum ")"

%import common.NUMBER
%import common.WS_INLINE

%ignore WS_INLINE
"""
output_type = CFG(arithmetic_grammar)

model = outlines.from_vllm_offline(
    LLM("microsoft/Phi-3-mini-4k-instruct")
)

result = model("Write an addition.", output_type)
print(result) # '23 + 48'
```

## Inference Arguments

When calling the model, you can provide optional parameters on top of the prompt and the output type. Those will be passed on to the `generate` method of the `LLM` model instance. An argument of particular interest is `sampling_params`. It takes as a value a `vllm.SamplingParams` instance containing parameters such as `max_tokens` or `temperature`.

See the [vLLM documentation](https://docs.vllm.ai/en/latest/api/vllm/sampling_params.html#vllm.sampling_params.SamplingParams) on sampling parameters for more information on inference parameters.


================================================
FILE: docs/features/utility/application.md
================================================
---
title: Application
---

# Application

The `Application` class enables you to encapsulate a prompt template and an output type into a reusable component.

## Overview

An `Application` combines a prompt template with an output type, creating a reusable component that can be applied to different models.

Applications are useful for simplifying repeated tasks where you have a well-defined `Template` and a fixed output type, such as classification tasks or data extraction.

To create an `Application` instance, initialize the class with a prompt template and an output type. You can then call the application with a model and the variables defined in your template in a dictionary.

For instance:

```python
from typing import Literal
import transformers
from outlines import Application, Template, from_transformers

# Create a template
template_str = "Is {{ name }} a boy or a girl name?"
template = Template.from_string(template_str)

# Create a model
model = from_transformers(
    transformers.AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
    transformers.AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
)

# Create the application and call it to generate text
application = Application(template, Literal["boy", "girl"])
response = application(model, {"name": "Alice"}, max_new_tokens=10)

print(response) # "girl"
```

Instead of providing an Outlines `Template` instance, you can provide a `Callable` that returns a string. The parameters of the callable are used as the variables of the template, so you must provide values for them in the dictionary when calling the application.

For instance, we can create the same example as above using a function instead of a template:

```python
from typing import Literal
import transformers
from outlines import Application, from_transformers

# Create a function that will be used as a template
def template_func(name: str) -> str:
    return f"Is {name} a boy or a girl name?"

# Create a model
model = from_transformers(
    transformers.AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
    transformers.AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
)

# Create the application with the function template and call it to generate text
application = Application(template_func, Literal["boy", "girl"])
response = application(model, {"name": "Alice"}, max_new_tokens=10)

print(response) # "girl"
```


================================================
FILE: docs/features/utility/regex_dsl.md
================================================
---
title: Regex DSL
---

# Regex DSL

This library provides a Domain-Specific Language (DSL) to construct regular expressions in a more intuitive and modular way. It allows you to create complex regexes using simple building blocks that represent literal strings, patterns, and various quantifiers. Additionally, these custom regex types can be used directly as types in [Pydantic](https://pydantic-docs.helpmanual.io/) schemas to enforce pattern constraints during text generation.

---

## Why Use This DSL?

1. **Modularity & Readability**: Instead of writing cryptic regular expression strings, you compose a regex as a tree of objects.
2. **Enhanced Debugging**: Each expression can be visualized as an ASCII tree, making it easier to understand and debug complex regexes.
3. **Pydantic Integration**: Use your DSL-defined regex as types in Pydantic models. The DSL seamlessly converts to JSON Schema with proper pattern constraints.
4. **Extensibility**: Easily add or modify quantifiers and other regex components by extending the provided classes.

---

## Building Blocks


Every regex component in this DSL is a **Term**. Here are two primary types:

- **`String`**: Represents a literal string. It escapes the characters that have a special meaning in regular expressions.
- **`Regex`**: Represents an existing regex pattern string.

```python
from outlines.types import String, Regex

# A literal string "hello"
literal = String("hello")   # Internally represents "hello"

# A regex pattern to match one or more digits
digit = Regex(r"[0-9]+")     # Internally represents the pattern [0-9]+

# Converting to standard regex strings:
from outlines.types.dsl import to_regex

print(to_regex(literal))  # Output: hello
print(to_regex(digit))    # Output: [0-9]+
```

---

## Early Introduction to Quantifiers & Combining Terms

The DSL supports common regex quantifiers as methods on every `Term`. These methods allow you to specify how many times a pattern should be matched. They include:

- **`exactly(count)`**: Matches the term exactly `count` times.
- **`optional()`**: Matches the term zero or one time.
- **`one_or_more()`**: Matches the term one or more times (Kleene Plus).
- **`zero_or_more()`**: Matches the term zero or more times (Kleene Star).
- **`between(min_count, max_count)`**: Matches the term between `min_count` and `max_count` times (inclusive).
- **`at_least(count)`**: Matches the term at least `count` times.
- **`at_most(count)`**: Matches the term up to `count` times.

These quantifiers can also be used as functions that take the `Term` as an argument. If the term is a plain string, it will be automatically converted to a `String` object. Thus `String("foo").optional()` is equivalent to `optional("foo")`.

Let's see these quantifiers side by side with examples.

### Quantifiers in Action

#### `exactly(count)`

This method restricts the term to appear exactly `count` times.

```python
# Example: exactly 5 digits
five_digits = Regex(r"\d").exactly(5)
print(to_regex(five_digits))  # Output: (\d){5}
```

You can also use the `exactly` function:

```python
from outlines.types import exactly

# Example: exactly 5 digits
five_digits = exactly(Regex(r"\d"), 5)
print(to_regex(five_digits))  # Output: (\d){5}
```

#### `optional()`

This method makes a term optional, meaning it may occur zero or one time.

```python
# Example: an optional "s" at the end of a word
maybe_s = String("s").optional()
print(to_regex(maybe_s))  # Output: (s)?
```

You can also use the `optional` function:

```python
from outlines.types import optional

# Example: an optional "s" at the end of a word
maybe_s = optional("s")
print(to_regex(maybe_s))  # Output: (s)?
```

#### `one_or_more()`

This method indicates that the term must appear at least once.

```python
# Example: one or more alphabetic characters
letters = Regex(r"[A-Za-z]").one_or_more()
print(to_regex(letters))  # Output: ([A-Za-z])+
```

You can also use the `one_or_more` function:

```python
from outlines.types import one_or_more

# Example: one or more alphabetic characters
letters = one_or_more(Regex(r"[A-Za-z]"))
print(to_regex(letters))  # Output: ([A-Za-z])+

```

#### `zero_or_more()`

This method indicates that the term can occur zero or more times.

```python
# Example: zero or more spaces
spaces = String(" ").zero_or_more()
print(to_regex(spaces))  # Output: (\ )*
```

You can also use the `zero_or_more` function:

```python
from outlines.types import zero_or_more

# Example: zero or more spaces
spaces = zero_or_more(" ")
print(to_regex(spaces))  # Output: (\ )*
```

#### `between(min_count, max_count)`

This method indicates that the term can appear any number of times between `min_count` and `max_count` (inclusive).

```python
# Example: Between 2 and 4 word characters
word_chars = Regex(r"\w").between(2, 4)
print(to_regex(word_chars))  # Output: (\w){2,4}
```

You can also use the `between` function:

```python
from outlines.types import between

# Example: Between 2 and 4 word characters
word_chars = between(Regex(r"\w"), 2, 4)
print(to_regex(word_chars))  # Output: (\w){2,4}
```

#### `at_least(count)`

This method indicates that the term must appear at least `count` times.

```python
# Example: At least 3 digits
at_least_three = Regex(r"\d").at_least(3)
print(to_regex(at_least_three))  # Output: (\d){3,}
```

You can also use the `at_least` function:

```python
from outlines.types import at_least

# Example: At least 3 digits
at_least_three = at_least(Regex(r"\d"), 3)
print(to_regex(at_least_three))  # Output: (\d){3,}
```

#### `at_most(count)`

This method indicates that the term can appear at most `count` times.

```python
# Example: At most 3 digits
up_to_three = Regex(r"\d").at_most(3)
print(to_regex(up_to_three))  # Output: (\d){0,3}
```

You can also use the `at_most` function:

```python
from outlines.types import at_most

# Example: At most 3 digits
up_to_three = at_most(Regex(r"\d"), 3)
print(to_regex(up_to_three))  # Output: (\d){0,3}
```

---

## Combining Terms

The DSL allows you to combine basic terms into more complex patterns using concatenation and alternation.

### Concatenation (`+`)

The `+` operator (and its reflected variant) concatenates terms, meaning that the terms are matched in sequence.

```python
# Example: Match "hello " followed by a word, e.g. "hello world"
pattern = String("hello") + " " + Regex(r"\w+")
print(to_regex(pattern))  # Output: hello\ (\w+)
```

### Alternation (`either()`)

The `either()` function creates alternatives, allowing a match for one of several patterns. You can provide as many terms as you want.

```python
from outlines.types import either

# Example: Match either "cat" or "dog" or "mouse"
animal = either(String("cat"), "dog", "mouse")
print(to_regex(animal))  # Output: (cat|dog|mouse)
```

*Note:* When using `either()` with plain strings (such as `"dog"`), the DSL automatically wraps them in a `String` object that escapes the characters that have a special meaning in regular expressions, just like with quantifier functions.

---

## Custom types

The DSL comes "batteries included" with types that represent common text constructs:

- `integer` represents an integer number as recognized by `int`
- `boolean` represents a boolean, "True" or "False" as recognized by `bool`
- `number` represents a floating-point number as recognized by Python's `float`
- `date` represents a date as understood by `datetime.date`
- `time` represents a time as understood by `datetime.time`
- `datetime` represents a date and time as understood by `datetime.datetime`
- `digit` represents a single digit
- `char` represents a single character
- `newline` represents a new line character
- `whitespace` represents a white space
- `hex_str` represents a hexadecimal string, optionally prefixed with "0x"
- `uuid4` represents a UUID version 4 string in the format "xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx"
- `ipv4` represents an IPv4 address in the format "xxx.xxx.xxx.xxx" where each octet is between 0 and 255
- `sentence` represents a sentence
- `paragraph` represents a paragraph (one or more sentences separated by one or more line breaks)

For instance you can describe the answers in the GSM8K dataset using the following pattern:

```python
from outlines.types import sentence, digit

answer = "A: " + sentence.between(2,4) + " So the answer is: " + digit.between(1,4)
```

---

## Practical Examples

### Example 1: Matching a Custom ID Format

Suppose you want to create a regex that matches an ID format like "ID-12345", where:

- The literal "ID-" must be at the start.
- Followed by exactly 5 digits.

```python
id_pattern = "ID-" + Regex(r"\d").exactly(5)
print(to_regex(id_pattern))  # Output: ID\-(\d){5}
```

### Example 2: Email Validation with Pydantic

You can define a regex for email validation and use it as a type in a Pydantic model.

```python
from pydantic import BaseModel, ValidationError

# Define an email regex term (this is a simplified version)
email_regex = Regex(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+")

class User(BaseModel):
    name: str
    email: email_regex  # Use our DSL regex as a field type

# Valid input
user = User(name="Alice", email="alice@example.com")
print(user)

# Invalid input (raises a ValidationError)
try:
    User(name="Bob", email="not-an-email")
except ValidationError as e:
    print(e)
```

When used in a Pydantic model, the email field is automatically validated against the regex pattern and its JSON Schema includes the `pattern` constraint.

### Example 3: Building a Complex Pattern

Consider a pattern to match a simple date format: `YYYY-MM-DD`.

```python
year = Regex(r"\d").exactly(4)         # Four digits for the year
month = Regex(r"\d").exactly(2)        # Two digits for the month
day = Regex(r"\d").exactly(2)          # Two digits for the day

# Combine with literal hyphens
date_pattern = year + "-" + month + "-" + day
print(to_regex(date_pattern))
# Output: (\d){4}\-(\d){2}\-(\d){2}
```

---

## Visualizing Your Pattern

One of the unique features of this DSL is that each term can print its underlying structure as an ASCII tree. This visualization can be particularly helpful when dealing with complex expressions.

```python
# A composite pattern using concatenation and quantifiers
pattern = "a" + String("b").one_or_more() + "c"
print(pattern)
```

*Expected Output:*

```ascii
└── Sequence
    ├── String('a')
    ├── KleenePlus(+)
    │   └── String('b')
    └── String('c')
```

This tree representation makes it easy to see the hierarchy and order of operations in your regular expression.

---

## Final Words

This DSL is designed to simplify the creation and management of regular expressions—whether you're validating inputs in a web API, constraining the output of an LLM, or just experimenting with regex patterns. With intuitive methods for common quantifiers and operators, clear visual feedback, and built-in integration with Pydantic, you can build robust and maintainable regex-based validations with ease.

Feel free to explore the library further and adapt the examples to your use cases. Happy regexing!


================================================
FILE: docs/features/utility/template.md
================================================
---
title: Template
---

# Template

Outlines templates provide a way of creating reusable prompt structures with placeholders for dynamic content.

## Overview

To create a `Template` instance, you can use two class methods:

- `from_string`: Creates a template from a string containing a Jinja2 template
- `from_file`: Creates a template from a file containing a Jinja2 template

After creating a template, you can call it with the variables required by the template as keyword arguments.

For instance:

```python
from outlines import Template

# Create a template from a string
template_str = """
Hello, {{ name }}!
The weather today is {{ weather }}.
"""
template = Template.from_string(template_str)

# Create a template from a file, assuming the content of template_str is put into a file
template = Template.from_file("path_to/my_file.txt")

# Call the template to render the prompt
prompt: str = template(name="Alice", weather="sunny")
print(prompt)  # "Hello, Alice!\nThe weather today is sunny."
```

## Composite Templates

Templates can be nested and composed to create complex prompt structures:

```python
from outlines import Template

# Create component templates
user_template = Template.from_string("User: {{ query }}")
system_template = Template.from_string("System: {{ instruction }}")

# Create a composite template
chat_template = Template.from_string("""
{{ system }}
{{ user }}
""")

# Fill in nested templates
prompt = chat_template(
    system=system_template(instruction="You are a helpful assistant."),
    user=user_template(query="What is machine learning?")
)

print(prompt)
# System: You are a helpful assistant.
#
# User: What is machine learning?
```

## Custom Filters

You can add custom filters to your Outlines template to extend the templating functionality. To do so, pass a dictionary that maps filter names to filter functions as the second argument (`filters`). The filter can then be used in your Jinja2 template with the regular filter syntax. When the prompt is rendered, the function is applied to the associated variable.

For instance:

```python
from outlines import Template

def uppercase(text: str) -> str:
    return text.upper()

# Add custom filter when creating template
template = Template.from_string(
    "Hello {{ name | uppercase }}!",
    filters={"uppercase": uppercase}
)
prompt = template(name="alice")
print(prompt)  # "Hello ALICE!"
```


================================================
FILE: docs/guide/architecture.md
================================================
# Architecture Overview

This guide explains how Outlines is organized so you can navigate the codebase, debug issues, and extend the library.

## How Structured Generation Works

When you ask an LLM to output JSON or follow a specific format, traditional approaches generate text freely and hope it matches. Outlines takes a different approach: it constrains the model at generation time by masking invalid tokens, making it impossible for the model to produce invalid output.

## Core Abstractions

Outlines has three main abstractions: **Model**, **Generator**, and **Type System**.

### Model and ModelTypeAdapter

The `Model` class (`outlines/models/base.py`) is the abstract base class for all LLM integrations. There are two categories based on how structured generation is implemented:

**Steerable models** (`SteerableModel`): Models where Outlines directly applies a logits processor during generation. This includes:

- `LlamaCpp` - llama.cpp bindings
- `MLXLM` - Apple MLX models
- `Transformers` - HuggingFace Transformers

**Black-box models** (`BlackBoxModel`): Models where Outlines delegates structured generation to the provider's API rather than applying logits processors directly. This includes:

- `OpenAI`, `Anthropic`, `Gemini`, `Mistral` - Cloud API providers
- `VLLM`, `VLLMOffline`, `SGLang`, `TGI`, `Ollama` - Inference servers with built-in structured generation
- `Dottxt` - Dottxt API

Note: Some black-box models (like vLLM or Ollama) could technically expose logits, but they implement structured generation server-side, so Outlines delegates to their APIs instead of building processors locally.

**The Model interface:**

Every model subclass must implement these methods:

| Method | Purpose |
|--------|---------|
| `generate(model_input, output_type, **kwargs)` | Generate a single response (internal, receives logits processor or output type) |
| `generate_batch(model_input, output_type, **kwargs)` | Generate responses for multiple prompts |
| `generate_stream(model_input, output_type, **kwargs)` | Stream a response token by token |

The base `Model` class provides these convenience methods that create a `Generator` internally:

| Method | Purpose |
|--------|---------|
| `__call__(model_input, output_type, backend, **kwargs)` | Generate a single response |
| `batch(model_input, output_type, backend, **kwargs)` | Generate batch responses |
| `stream(model_input, output_type, backend, **kwargs)` | Stream a response |

**ModelTypeAdapter - Bridging formats:**

Each model has a `type_adapter` attribute that handles format conversion between Outlines and the specific model provider:

```python
class ModelTypeAdapter(ABC):
    @abstractmethod
    def format_input(self, model_input) -> Any:
        """Convert user input to model-specific format.

        For API models: creates the `messages` argument
        For local models: may apply chat templates, convert str to list, etc.
        """
        ...

    @abstractmethod
    def format_output_type(self, output_type) -> Any:
        """Convert output type to model-specific format.

        For black-box models: creates `response_format` argument
        For steerable models: formats the logits processor for the model
        """
        ...
```

### Generator - Unifying the Generation Interface

The `Generator` (`outlines/generator.py`) is a factory function that returns the appropriate generator class based on the model type.

**Why Generator exists:**

Without Generator, users would need different code for different model types:

```python
# Without Generator - user needs to know model internals
if isinstance(model, SteerableModel):
    processor = build_logits_processor(output_type)
    result = model.generate(prompt, processor)
else:
    result = model.generate(prompt, output_type)
```

With Generator, the complexity is hidden:

```python
# With Generator - same code works for any model
generator = Generator(model, output_type)
result = generator(prompt)
```

**Generator classes:**

| Class | Used For | How It Works |
|-------|----------|--------------|
| `SteerableGenerator` | Local models (`LlamaCpp`, `MLXLM`, `Transformers`) | Builds and caches a logits processor from the output type, resets and passes it to the model on each call |
| `BlackBoxGenerator` | Sync API models | Passes output type directly to model's generate method |
| `AsyncBlackBoxGenerator` | Async API models | Async version of BlackBoxGenerator |

**SteerableGenerator internals:**

When you create a `SteerableGenerator` with an output type, it:

1. Converts the Python type to a `Term` using `python_types_to_terms()`
2. Based on the Term type, builds the appropriate logits processor:
   - `CFG` → calls `get_cfg_logits_processor()`
   - `JsonSchema` → calls `get_json_schema_logits_processor()`
   - Other terms → converts to regex via `to_regex()`, then calls `get_regex_logits_processor()`
3. Caches the processor for reuse
4. On each call, resets processor state and passes it to the model

### Type System - From Python Types to Constraints

The type system (`outlines/types/dsl.py`) converts Python types into constraints that can be enforced during generation.

**The conversion pipeline:**

```
Python Type → Term (via python_types_to_terms)
                    ↓
            ┌───────┴───────┐
            ↓               ↓
    CFG or JsonSchema    Other Terms
            ↓               ↓
    Direct to backend   to_regex() → Regex string
            ↓               ↓
            └───────┬───────┘
                    ↓
            Logits Processor (via backend)
```

**Term classes:**

`Term` is the base class for Outlines' constraint DSL. Key subclasses:

| Term | Purpose | Example |
|------|---------|---------|
| `Regex` | Match a regex pattern | `Regex("[0-9]+")` |
| `JsonSchema` | Match valid JSON for a schema | `JsonSchema(MyPydanticModel)` |
| `CFG` | Match a context-free grammar | `CFG(grammar_string)` |
| `String` | Match a literal string | `String("hello")` |
| `Sequence` | Concatenate terms | `String("[") + item + String("]")` |
| `Alternatives` | Match any of several terms | `term1 \| term2` |
| `KleeneStar` | Zero or more repetitions | `zero_or_more(term)` |
| `KleenePlus` | One or more repetitions | `one_or_more(term)` |
| `Optional` | Zero or one occurrence | `optional(term)` |

**python_types_to_terms:**

This function converts Python types to Term instances:

```python
def python_types_to_terms(ptype) -> Term:
    # Already a Term - return as-is
    if isinstance(ptype, Term):
        return ptype

    # Basic types - return predefined regex patterns
    if is_int(ptype):
        return types.integer
    if is_float(ptype):
        return types.number
    if is_str(ptype):
        return types.string
    if is_bool(ptype):
        return types.boolean

    # Structured types - convert to JsonSchema
    if is_pydantic_model(ptype) or is_dataclass(ptype) or is_typed_dict(ptype):
        return JsonSchema(ptype)

    # Enum - create alternatives from members
    if is_enum(ptype):
        return Alternatives([...])

    # Union, Literal, List, Tuple, Dict - handle recursively
    ...
```

## Data Flow

Here's how a structured generation request flows through the system:

```
1. User calls: model("What is 2+2?", int)

2. Model.__call__ creates Generator:
   Generator(model, int)

3. Generator factory checks model type:
   - SteerableModel → SteerableGenerator
   - BlackBoxModel → BlackBoxGenerator

4. For SteerableGenerator:
   a. python_types_to_terms(int) → Regex("-?[0-9]+")
   b. to_regex(term) → regex string
   c. get_regex_logits_processor(backend, model, regex) → LogitsProcessor

5. Generator.__call__(prompt):
   a. processor.reset()  # Reset state for new generation
   b. model.generate(prompt, processor)

6. During generation (steerable models only):
   - Model computes logits for all tokens
   - LogitsProcessor masks invalid tokens (set to -inf)
   - Model samples from remaining valid tokens

7. Result returned to user
```

## File Organization

```
outlines/
├── __init__.py              # Public API exports
├── generator.py             # Generator factory and classes
├── models/
│   ├── base.py              # Model, AsyncModel, ModelTypeAdapter base classes
│   ├── transformers.py      # HuggingFace Transformers
│   ├── llamacpp.py          # llama.cpp bindings
│   ├── mlxlm.py             # Apple MLX models
│   ├── openai.py            # OpenAI API
│   ├── anthropic.py         # Anthropic API
│   ├── vllm.py              # vLLM server
│   ├── vllm_offline.py      # vLLM offline mode
│   └── ...                  # Other providers
├── types/
│   ├── __init__.py          # Predefined types: integer, number, date, etc.
│   ├── dsl.py               # Term classes, python_types_to_terms, to_regex
│   └── utils.py             # Type checking utilities
├── backends/
│   ├── __init__.py          # get_*_logits_processor functions
│   ├── base.py              # LogitsProcessorType protocol
│   ├── outlines_core.py     # Default backend using outlines-core
│   ├── llguidance.py        # Microsoft llguidance backend
│   └── xgrammar.py          # xgrammar backend
├── processors/
│   ├── base_logits_processor.py  # Base processor implementation
│   └── tensor_adapters/     # Tensor library adapters
├── grammars/                # Predefined grammar files
└── templates.py             # Prompt template utilities
```

## Backends

Backends are responsible for converting constraints (regex, JSON schema, CFG) into logits processors that can be applied during generation. They only apply to steerable models.

**Available backends:**

| Backend | Default For | Description |
|---------|-------------|-------------|
| `outlines_core` | Regex, JSON Schema | The default backend, built on the `outlines-core` Rust library. Compiles constraints into finite state machines. |
| `llguidance` | CFG | Microsoft's llguidance library. Supports context-free grammars and is the only backend that handles CFG constraints. |
| `xgrammar` | - | Alternative backend using the xgrammar library. |

**How backends are selected:**

1. If the user specifies a backend via the `backend` parameter, that backend is used
2. Otherwise, the default backend for the constraint type is used:
   - Regex → `outlines_core`
   - JSON Schema → `outlines_core`
   - CFG → `llguidance`

**Backend interface:**

All backends inherit from `BaseBackend` and implement three methods:

```python
class BaseBackend(ABC):
    @abstractmethod
    def get_json_schema_logits_processor(self, json_schema: str) -> LogitsProcessorType:
        ...

    @abstractmethod
    def get_regex_logits_processor(self, regex: str) -> LogitsProcessorType:
        ...

    @abstractmethod
    def get_cfg_logits_processor(self, grammar: str) -> LogitsProcessorType:
        ...
```

**Specifying a backend:**

```python
from outlines import from_transformers, Generator

model = from_transformers("microsoft/Phi-3-mini-4k-instruct")

# Use xgrammar instead of the default outlines_core
generator = Generator(model, int, backend="xgrammar")
```

## Extension Points

### Adding a New Model Provider

1. Create a new file in `outlines/models/` (e.g., `mymodel.py`)
2. Implement a `ModelTypeAdapter` subclass with `format_input()` and `format_output_type()`
3. Implement a `Model` subclass with `generate()`, `generate_batch()`, and `generate_stream()`
4. Add a factory function (e.g., `from_mymodel()`)
5. Export from `outlines/models/__init__.py`
6. Add it to the `SteerableModel` or `BlackBoxModel` type alias, as appropriate


================================================
FILE: docs/guide/chat_templating.md
================================================
# Chat templating

Instruction-tuned language models use "special tokens" to indicate different parts of text, such as the system prompt, the user prompt, any images, and the assistant's response. A [chat template](https://huggingface.co/docs/transformers/main/en/chat_templating) is how different types of input are composited together into a single, machine-readable string.

Outlines supports chat templating through the `Chat` model input class. It contains a list of messages similar in format to the chat history you would use with API models such as OpenAI or Anthropic, and to the arguments expected by the `apply_chat_template` method of transformers tokenizers. You can find detailed information on the interface of this object in the [model inputs documentation](../features/core/inputs.md).


================================================
FILE: docs/guide/core_concepts.md
================================================
---
title: Core concepts
---

# Core concepts

Coming soon. This will document various concepts at a high level, so users can understand Outlines before diving into specific implementations.

1. Constrained decoding, tokens, and the basics of logit biasing
2. Different ways to define output structure (regex, JSON schema, Pydantic models, context-free grammars)
3. How finite state machines are used to guarantee output structure
4. `Generator`, `Application`, and `Template`
5. Prompt engineering vs. structured generation


================================================
FILE: docs/guide/fastapi_vllm_deployment.md
================================================
---
title: Deploying with FastAPI
---

# Deploying with FastAPI

This guide demonstrates how to build a FastAPI application that leverages Outlines' async integration with vLLM. We create a customer support API that can intelligently categorize tickets and generate structured responses.

## Prerequisites

Before starting, ensure you have a vLLM server running (locally or remotely) and the following packages installed:

```shell
pip install fastapi uvicorn outlines openai pydantic
```

## Building the Application

### Step 1: Define Data Models

First, let's define our Pydantic models for structured outputs:

```python
# models.py
from enum import Enum
from typing import List
from pydantic import BaseModel, Field

class TicketCategory(str, Enum):
    BILLING = "billing"
    TECHNICAL = "technical"
    ACCOUNT = "account"
    PRODUCT = "product"
    OTHER = "other"

class TicketPriority(str, Enum):
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    URGENT = "urgent"

class TicketAnalysis(BaseModel):
    category: TicketCategory
    priority: TicketPriority
    summary: str = Field(description="Brief summary of the issue")
    customer_sentiment: str = Field(description="Customer emotional state")
    key_issues: List[str] = Field(description="List of main problems")
    requires_human: bool = Field(description="Whether this needs human intervention")

class SupportResponse(BaseModel):
    greeting: str
    acknowledgment: str = Field(description="Acknowledge the customer's issue")
    solution_steps: List[str] = Field(description="Steps to resolve the issue")
    closing: str
```

### Step 2: Define the prompts

Let us now write the prompts that we will be using in our application, using Jinja2's templating language. We separate them from the application implementation so they are easier to modify and version.

```ascii
{# prompts/categorize.txt #}
Analyze this customer support ticket:

Customer ID: {{ customer_id }}
Message: {{ message }}

Extract the category, priority, and other relevant information.
```

```ascii
{# prompts/respond.txt #}
Generate a professional customer support response.

Customer Message: {{ message }}
Category: {{ category }}
Priority: {{  priority }}
Customer Sentiment: {{ customer_sentiment }}

Create a helpful, empathetic response that addresses their concerns.
```

### Step 3: Create the FastAPI Application

Now let's create our FastAPI application with async vLLM integration:

```python
# main.py
import asyncio
from contextlib import asynccontextmanager
from typing import Optional

import openai
from outlines import models, Template
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel

from models import TicketAnalysis, SupportResponse

# Request model
class TicketRequest(BaseModel):
    customer_id: str
    message: str

# Global model instance
async_model = None

# The lifespan function is a FastAPI construct
# used to define startup and shutdown logic for the API.
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Initialize the async vLLM model on startup."""
    global async_model

    client = openai.AsyncOpenAI(
        base_url="http://localhost:8000/v1",  # Adjust to your vLLM server URL
        api_key="dummy"  # vLLM doesn't require a real API key
    )
    async_model = models.from_vllm(client, "Qwen/Qwen2.5-VL-7B-Instruct")

    yield

    async_model = None  # Cleanup

# Create FastAPI app
app = FastAPI(
    title="Customer Support Assistant API",
    description="AI-powered customer support with structured outputs",
    version="1.0.0",
    lifespan=lifespan
)


@app.post("/analyze-ticket", response_model=TicketAnalysis)
async def analyze_ticket(request: TicketRequest):
    """Analyze a customer support ticket and extract structured information."""
    if async_model is None:
        raise HTTPException(status_code=503, detail="Model not initialized")

    template = Template.from_file("prompts/categorize.txt")
    prompt = template(
        customer_id=request.customer_id,
        message=request.message
    )

    try:
        # Generate and parse a structured response
        result = await async_model(prompt, TicketAnalysis, max_tokens=5000)
        analysis = TicketAnalysis.model_validate_json(result)

        return analysis

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")


@app.post("/generate-response", response_model=SupportResponse)
async def generate_response(
    request: TicketRequest,
    analysis: TicketAnalysis
):
    """Generate a structured support response based on ticket analysis."""
    if async_model is None:
        raise HTTPException(status_code=503, detail="Model not initialized")

    template = Template.from_file("prompts/respond.txt")
    prompt = template(
        message=request.message,
        category=analysis.category,
        priority=analysis.priority,
        customer_sentiment=analysis.customer_sentiment
    )

    try:
        # Generate and parse a structured response
        result = await async_model(prompt, SupportResponse, max_tokens=5000)
        response = SupportResponse.model_validate_json(result)

        return response

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Response generation failed: {str(e)}")
```

## Running the Application

### Step 1: Start your vLLM server

```shell
vllm serve Qwen/Qwen2.5-VL-7B-Instruct
```

### Step 2: Run the FastAPI application

```shell
uvicorn main:app --reload --host 0.0.0.0 --port 8080
```

## Testing the API

### Example 1: Analyze a support ticket

```shell
curl -X POST "http://localhost:8080/analyze-ticket" \
  -H "Content-Type: application/json" \
  -d '{
    "customer_id": "CUST123",
    "message": "I have been charged twice for my subscription this month. This is unacceptable and I want a refund immediately!"
  }'
```

Expected response:

```json
{
  "category": "billing",
  "priority": "high",
  "summary": "Customer charged twice for subscription, requesting refund",
  "customer_sentiment": "angry",
  "key_issues": ["duplicate charge", "subscription billing", "refund request"],
  "requires_human": false
}
```

### Example 2: Generate a support response

```shell
# First, get the analysis
ANALYSIS=$(curl -s -X POST "http://localhost:8080/analyze-ticket" \
  -H "Content-Type: application/json" \
  -d '{
    "customer_id": "CUST456",
    "message": "My app keeps crashing when I try to upload photos."
  }')

# Then generate a response
curl -X POST "http://localhost:8080/generate-response" \
  -H "Content-Type: application/json" \
  -d "{
    \"request\": {
      \"customer_id\": \"CUST456\",
      \"message\": \"My app keeps crashing when I try to upload photos.\"
    },
    \"analysis\": $ANALYSIS
  }"
```

By combining FastAPI's async capabilities with Outlines' structured generation, you can build robust APIs that leverage large language models.

## Using Alternative Backends: SGLang and TGI

One of the key advantages of Outlines is its unified API across different inference backends. You can easily switch from vLLM to SGLang or TGI with minimal code changes - just modify the model initialization in the `lifespan` function.

### Using SGLang Instead of vLLM

To use SGLang, simply change the client initialization:

```python
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Initialize the async SGLang model on startup."""
    global async_model

    client = openai.AsyncOpenAI(
        base_url="http://localhost:30000/v1",  # SGLang server URL
        api_key="dummy"
    )
    async_model = models.from_sglang(client)

    yield

    async_model = None
```

Start your SGLang server with:

```shell
python -m sglang.launch_server \
    --model-path meta-llama/Llama-2-7b-chat-hf \
    --port 30000
```

### Using TGI Instead of vLLM

For TGI (Text Generation Inference), use the Hugging Face client:

```python
import huggingface_hub

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Initialize the async TGI model on startup."""
    global async_model

    client = huggingface_hub.AsyncInferenceClient(
        "http://localhost:8080"  # TGI server URL
    )
    async_model = models.from_tgi(client)

    yield

    async_model = None
```

Start your TGI server with:

```shell
docker run --gpus all -p 8080:80 \
    ghcr.io/huggingface/text-generation-inference:latest \
    --model-id meta-llama/Llama-2-7b-chat-hf
```

The rest of your FastAPI application - all the endpoints, error handling, and business logic - remains completely unchanged. This flexibility allows you to test different inference engines without rewriting your application.


================================================
FILE: docs/guide/getting_started.md
================================================
---
title: Getting Started
---

# Getting Started

## Installation

We recommend using `uv` to install Outlines. You can find `uv` installation instructions [here](https://github.com/astral-sh/uv).

```shell
uv pip install 'outlines[transformers]'
```

or the classic `pip`:

```shell
pip install 'outlines[transformers]'
```

For more information, see the [installation guide](./installation).

## Creating a Model

Outlines contains a variety of models that wrap LLM inference engines/clients. For each of them, you need to install the model's associated library as described in the [installation guide](../installation).

The full list of available models along with detailed explanation on how to use them can be found in the [models page](../features/models/index.md) of the Features section of the documentation.

For a quick start, you can find below an example of how to initialize all supported models in Outlines:

=== "vLLM"

    ```python
    import outlines
    from openai import OpenAI

    # You must have a separate vLLM server running
    # Create an OpenAI client with the base URL of the VLLM server
    openai_client = OpenAI(base_url="http://localhost:11434/v1")

    # Create an Outlines model
    model = outlines.from_vllm(openai_client, "microsoft/Phi-3-mini-4k-instruct")
    ```

=== "Ollama"

    ```python
    import outlines
    from ollama import Client

    # Create an Ollama client
    ollama_client = Client()

    # Create an Outlines model, the model must be available on your system
    model = outlines.from_ollama(ollama_client, "tinyllama")
    ```

=== "OpenAI"

    ```python
    import outlines
    from openai import OpenAI

    # Create an OpenAI client instance
    openai_client = OpenAI()

    # Create an Outlines model
    model = outlines.from_openai(openai_client, "gpt-4o")
    ```

=== "Transformers"

    ```python
    import outlines
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Define the model you want to use
    model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"

    # Create a HuggingFace model and tokenizer
    hf_model = AutoModelForCausalLM.from_pretrained(model_name)
    hf_tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Create an Outlines model
    model = outlines.from_transformers(hf_model, hf_tokenizer)
    ```


=== "llama.cpp"

    ```python
    import outlines
    from llama_cpp import Llama

    # Model to use, it will be downloaded from the HuggingFace hub
    repo_id = "TheBloke/Llama-2-13B-chat-GGUF"
    file_name = "llama-2-13b-chat.Q4_K_M.gguf"

    # Create a Llama.cpp model
    llama_cpp_model = Llama.from_pretrained(repo_id, file_name)

    # Create an Outlines model
    model = outlines.from_llamacpp(llama_cpp_model)
    ```

=== "Gemini"

    ```python
    import outlines
    from google.generativeai import GenerativeModel

    # Create a Gemini client
    gemini_client = GenerativeModel()

    # Create an Outlines model
    model = outlines.from_gemini(gemini_client, "gemini-1-5-flash")
    ```

=== "mlx-lm"

    ```python
    import outlines
    import mlx_lm

    # Create an MLXLM model with the output of mlx_lm.load
    # The model will be downloaded from the HuggingFace hub
    model = outlines.from_mlxlm(
        *mlx_lm.load("mlx-community/SmolLM-135M-Instruct-4bit")
    )
    ```

=== "SgLang"

    ```python
    import outlines
    from openai import OpenAI

    # You must have a separate SgLang server running
    # Create an OpenAI client with the base URL of the SgLang server
    openai_client = OpenAI(base_url="http://localhost:11434/v1")

    # Create an Outlines model
    model = outlines.from_sglang(openai_client)
    ```

=== "TGI"

    ```python
    # TGI

    import outlines
    from huggingface_hub import InferenceClient

    # You must have a separate TGI server running
    # Create an InferenceClient client with the base URL of the TGI server
    tgi_client = InferenceClient("http://localhost:8080")

    # Create an Outlines model
    model = outlines.from_tgi(tgi_client)
    ```

=== "vLLM (offline)"

    ```python
    import outlines
    from vllm import LLM

    # Create a vLLM model
    vllm_model = LLM("microsoft/Phi-3-mini-4k-instruct")

    # Create an Outlines model
    model = outlines.from_vllm_offline(vllm_model)
    ```


## Generating Text

Once you have created the Outlines model for your inference engine/client, you are already all set to generate text! Models are callable such that you can simply call them with a text prompt. For instance:

```python
model = <your_model_as_defined_above>

# Call the model to generate text
result = model("Write a short story about a cat.")
print(result) # 'In a quiet village where the cobblestones hummed softly beneath the morning mist...'
```

Most models also support streaming through the use of a `streaming` method. You can use it directly with a prompt, just like regular text generation. For instance:

```python
model = <your_model_as_defined_above>

# Stream text
for chunk in model.streaming("Write a short story about a cat."):
    print(chunk) # 'In ...'
```

## Structured Generation

Outlines follows a simple pattern that mirrors Python's own type system for structured outputs. Simply specify the desired output type as you would when using type hinting with a function, and Outlines will ensure your data matches that structure exactly.

Supported output types can be organized in 5 categories:

- [Basic Types](../../features/core/output_types#basic-python-types): `int`, `float`, `bool`...
- [Multiple Choices](../../features/core/output_types#multiple-choices): using `Literal` or `Enum`
- [JSON Schemas](../../features/core/output_types#json-schemas): using a wide range of possible objects including Pydantic models and dataclasses
- [Regex](../../features/core/output_types#regex-patterns): through Outlines's `Regex` object
- [Context-free Grammars](../../features/core/output_types#context-free-grammars): through Outlines's `CFG` object

Consult the section on [Output Types](../../features/core/output_types.md) in the features documentation for more detailed information on all supported types for each output type category.

In the meantime, you can find below examples of using each of the five output type categories:

=== "Basic Types"

    ```python
    model = <your_model_as_defined_above>

    # Generate an integer
    result = model("How many countries are there in the world?", int)
    print(result) # '200'
    ```

=== "Multiple Choice"

    ```python
    from enum import Enum

    # Define our multiple choice output type
    class PizzaOrBurger(Enum):
        pizza = "pizza"
        burger = "burger"

    model = <your_model_as_defined_above>

    # Generate text corresponding to either of the choices defined above
    result = model("What do you want to eat, a pizza or a burger?", PizzaOrBurger)
    print(result) # 'pizza'
    ```

=== "JSON Schemas"

    ```python
    from datetime import date
    from typing import Dict, List, Union
    from pydantic import BaseModel

    model = <your_model_as_defined_above>

    # Define the class we will use as an output type
    class Character(BaseModel):
        name: str
        birth_date: date
        skills: Union[Dict, List[str]]

    # Generate a character
    result = model("Create a character", Character)
    print(result) # '{"name": "Aurora", "birth_date": "1990-06-15", "skills": ["Stealth", "Diplomacy"]}'
    print(Character.model_validate_json(result)) # name=Aurora birth_date=datetime.date(1990, 6, 15) skills=['Stealth', 'Diplomacy']
    ```

=== "Regex"

    ```python
    from outlines.types import Regex

    model = <your_model_as_defined_above>

    # Define our regex for a 3 digit number
    output_type = Regex(r"[0-9]{3}")

    # Generate the number
    result = model("Write a 3 digit number", output_type)
    print(result) # '236'
    ```

=== "Context-free Grammars"

    ```python
    from outlines.types import CFG

    model = <your_model_as_defined_above>

    # Define your Lark grammar as string
    arithmetic_grammar = """
        ?start: sum

        ?sum: product
            | sum "+" product   -> add
            | sum "-" product   -> sub

        ?product: atom
            | product "*" atom  -> mul
            | product "/" atom  -> div

        ?atom: NUMBER           -> number
            | "-" atom         -> neg
            | "(" sum ")"

        %import common.NUMBER
        %import common.WS_INLINE

        %ignore WS_INLINE
    """

    # Generate an arithmetic operation
    result = model("Write an arithmetic operation", CFG(arithmetic_grammar))
    print(result) # '2 + 3'
    ```

It's important to note that not all output types are available for all models due to limitations in the underlying inference engines. The [Models](../features/models/index.md) section of the features documentation includes a features matrix that summarizes the availability of output types.

## Generators

Generators are an important type of object in Outlines that are used to encapsulate a model and an output type. After having created a generator, you can call it using a similar interface to a model and it will generate text conforming to the output type you initially provided.

This feature is useful if you want to generate text several times for a given model and output type. Not only does it prevent having to include the same output type at each call, but it also allows us to compile the output type only once instead of doing it at each generation (which is important for local models as this operation can be expensive).

For instance:

```python
from typing import Literal
from outlines import Generator

model = <your_model_as_defined_above>

# Create a generator
generator = Generator(model, Literal["pizza", "burger"])

# Call it as you would call a model
result = generator("What do you want to eat, a pizza or a burger?")
print(result) # pizza
```

You can find more information on generators in the dedicated page on [Generators](../features/core/generator.md) in the features documentation.

## Other features

On top of more detailed explanation on the concepts already discussed here, the [Features](../features/index.md) section of the documentation contains information on additional Outlines features such as applications, prompt templates, the regex DSL...


================================================
FILE: docs/guide/installation.md
================================================
---
title: Installation
---

# Installation

## Dependency Management

We recommend using modern Python packaging tools such as `uv` for managing Python dependencies.

### uv (Recommended)

```shell
# Install uv
curl -LsSf https://astral.sh/uv/install.sh | sh

# Create a virtual environment and install Outlines
uv venv
source .venv/bin/activate
uv pip install outlines
```

or with pip:

```shell
pip install outlines
```


## Optional Dependencies

To use Outlines models, you need to install the Python libraries for the associated inference engines/clients. Such libraries are not part of the general installation as you should only install the libraries needed for the specific models you want to use.

Outlines models, along with the commands to install their associated additional dependencies:

- [Anthropic](features/models/anthropic.md): `pip install anthropic`
- [Dottxt](features/models/dottxt.md): `pip install dottxt`
- [Gemini](features/models/gemini.md): `pip install google-generativeai`
- [Llamacpp](features/models/llamacpp.md): `pip install llama-cpp-python`
- [Mlx-lm](features/models/mlxlm.md): `pip install mlx mlx-lm`
- [Ollama](features/models/ollama.md): `pip install ollama` (after having downloaded Ollama in your system)
- [OpenAI](features/models/openai.md): `pip install openai`
- [SGLang](features/models/sglang.md): `pip install openai`
- [TGI](features/models/tgi.md): `pip install huggingface_hub`
- [Transformers](features/models/transformers.md): `pip install transformers`
- [TransformersMultiModal](features/models/transformers_multimodal.md): `pip install transformers`
- [vLLM (online server)](features/models/vllm.md): `pip install openai`
- [vLLM (offline)](features/models/vllm_offline.md): `pip install vllm`

If you encounter any problems using Outlines with these libraries, take a look at their installation instructions. The installation of `openai` and `transformers` should be straightforward, but other libraries have specific hardware requirements.

!!! warning "Hardware Requirements"

    If you are using a local model, your model may require specific hardware. Please check the documentation for these libraries.

    Some libraries like `vllm` and `llama-cpp-python` require specific hardware, such as a compatible GPU. `mlx-lm` on its side is designed for Apple Silicon, so it may not be appropriate for your use case if you are on a different platform.

## Bleeding Edge

You can install the latest version of Outlines from the repository's `main` branch:

```sh
pip install git+https://github.com/dottxt-ai/outlines.git@main
```

This can be useful, for instance, when a fix has been merged but not yet released.

## Installing for Development

See the [contributing documentation](community/contribute.md) for instructions on how to install Outlines for development, including an example using the `dot-install` method for one of the backends.


================================================
FILE: docs/guide/migration.md
================================================
# Outlines 1.0 migration guide

Outlines 1.0 introduces some breaking changes that affect the way you use the library. You are likely affected by all of the following sections, so please read this document carefully until the end.

This guide will help you migrate your code to the new version.

All previous functionalities will be supported until Outlines version 1.1.0, but a warning message will be displayed to remind you to migrate your code and provide instructions to help you do so. Please migrate your code to v1 as soon as possible.

## Removed or modified features
- [Generate functions](#generate-functions)
- [Models](#models)
- [Samplers](#samplers)
- [Functions](#functions)
- [Text generation return types](#text-generation-return-types)
- [Inference arguments](#inference-arguments)

### Generate functions

The whole `generate` module has been removed. That includes the functions `generate.cfg`, `generate.choice`, `generate.format`, `generate.fsm`, `generate.json`, `generate.regex` and `generate.text`.

You should replace these functions by the [`Generator`](../features/core/generator.md) object along with the right output type as an argument (on top of the model). The output type can either be a python type or be an object from the `outlines.types` module. You can find more information about the output types in the [Output Types](../features/core/output_types.md) section of the features documentation.

Associated v1 output types for each deprecated function:

- `generate.cfg` -> `outlines.types.CFG`
- `generate.choice` -> `typing.Literal` or `typing.Union`
- `generate.format` -> native python types (`str`, `int` etc.)
- `generate.fsm` -> `outlines.types.FSM`
- `generate.json` -> `pydantic.BaseModel`, `typing.TypedDict`, `dataclasses.dataclass`, `genson.schema.SchemaBuilder` or `outlines.types.JsonSchema`
- `generate.regex` -> `outlines.types.Regex`
- `generate.text` -> no output type (`None`)

For instance, instead of:

```python
from outlines import generate

model = ...
generator = generate.choice(model, ["foo", "bar"])
```

You should now use:

```python
from typing import Literal
from outlines import Generator

model = ...
generator = Generator(model, Literal["foo", "bar"])
```

### Models

The model classes found in the `outlines.models` module are maintained but there are a few important changes to be aware of.

The functions used to create a model have been replaced by equivalent functions named with a `from_` prefix. The function `outlines.models.transformers` has been replaced by `outlines.from_transformers` for instance. On top of this change of name, the arguments have been modified. You should refer to the [models documentation](../features/models/index.md) for more details, but the overall idea is that you now need to provide a model/client instance from the inference library the Outlines model is wrapping.

For instance, instead of:

```python
from outlines import models

model = models.llamacpp(
    repo_id="M4-ai/TinyMistral-248M-v2-Instruct-GGUF",
    filename="TinyMistral-248M-v2-Instruct.Q4_K_M.gguf",
    chat_format="qwen",
)
```

You should now do:

```python
from llama_cpp import Llama
from outlines import from_llamacpp

llamacpp_model = Llama.from_pretrained(
    repo_id="M4-ai/TinyMistral-248M-v2-Instruct-GGUF",
    filename="TinyMistral-248M-v2-Instruct.Q4_K_M.gguf",
    chat_format="qwen",
)
model = from_llamacpp(llamacpp_model)
```

The `load_lora` methods that are present on the `VLLM` and `LlamaCpp` models have been removed. You should now handle lora loading through the `Llama` instance in the case of the `LlamaCpp` model or provide it as a keyword argument when calling the model in the case of the `VLLM` model.

For instance, instead of:

```python
from outlines import from_vllm
from vllm import LLM

model = from_vllm(
    LLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
)
model.load_lora("path/to/lora/file")

response = model("foo")
```

You should now do:

```python
from outlines import from_vllm
from vllm import LLM
from vllm.lora.request import LoRARequest

model = from_vllm(
    LLM.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
)
lora_request = LoRARequest("path/to/lora/file", 1, "path/to/lora/file")

response = model("foo", lora_request=lora_request)
```

The `ExLlamav2` model has been removed without replacement. This inference library is not fully compatible with Outlines, so we decided to remove it. You can still use it until final deprecation, but we recommend that you migrate to a different inference library right now.

### Samplers

The `outlines.samplers` module has been removed without replacement. You should now use the arguments of the inference library model to control the sampling. Depending on the model you use, this could be done at initialization or when calling the model to generate text (so when calling the outlines model or a generator).

For instance, instead of:

```python
from outlines import generate, samplers

model = <transformers_model>

generator = generate.text(model, samplers.beam_search(2))
response = generator("foo")
```

You should now do:

```python
from outlines import Generator

model = <transformers_model>

generator = Generator(model)
response = generator("foo", num_beams=2)
```

### Functions

The `outlines.function` module has been removed. It is replaced by the `outlines.applications` module. An [`Application`](../features/utility/application.md) serves a similar purpose as a `Function`: it encapsulates a prompt template and an output type. A difference is that an `Application` is not instantiated with a model name. Instead, you should provide a model instance along with the prompt when calling it.

For instance, instead of:

```python
from outlines import Function

prompt_template = ...
output_type = ...

fn = Function(
    prompt_template,
    output_type,
    "hf-internal-testing/tiny-random-GPTJForCausalLM",
)

result = fn("foo")
```

You should now do:

```python
from outlines import Application

prompt_template = ...
output_type = ...

application = Application(
    prompt_template,
    output_type,
)

model = ...

result = application(model, "foo")
```

### Text generation return types

In the previous version of Outlines, the return type of the generators depended on the output type provided. For instance, if you passed a Pydantic model to the `generate.json` function, the return type was a Pydantic model instance. In v1, the return type of a generator is always a `str`, the raw text generated by the model. You are responsible for parsing the text into the desired format.

For instance, instead of:

```python
from pydantic import BaseModel
from outlines import generate

class Foo(BaseModel):
    bar: str

model = ...

generator = generate.json(model, Foo)

result = generator("foo")
print(result.bar)
```

You should now do:

```python
from pydantic import BaseModel
from outlines import Generator

class Foo(BaseModel):
    bar: str

model = ...

generator = Generator(model, Foo)

result = generator("foo")
result = Foo.model_validate_json(result) # parse the text into the Pydantic model instance
print(result.bar)
```

The [Output Types](../features/core/output_types.md) section of the features documentation includes extensive details on available output types.

### Inference arguments

In the previous version of Outlines, some of the inference arguments were standardized across the models and were provided as positional arguments to the generator or through the sampling params dictionary. Additionally, various default values were added by outlines to the inference library models. This is no longer the case. You should refer to the documentation of the inference library you use to find the right arguments for your use case and pass them as keyword arguments to the outlines generator when calling it.

For instance, instead of:

```python
from outlines import generate

model = <transformers_model>

generator = generate.text(model)

result = generator("foo", 256, ".", 10) # 256 tokens, stop at "." and seed 10
```

You should now do:

```python
from outlines import Generator

model = <transformers_model>

generator = Generator(model)

result = generator("foo", max_new_tokens=256, stop_strings=".", seed=10)
```


================================================
FILE: docs/guide/selecting_an_inference_backend.md
================================================
This guide should provide a general overview of the available models in the [API reference](/api/models/).

## Models

-  [Anthropic](/api/models/anthropic)


================================================
FILE: docs/guide/vlm.md
================================================
# Vision-Language Models with Outlines

This guide demonstrates how to use Outlines with vision-language models. Vision-language models can process both text and images, allowing for tasks like image captioning, visual question answering, and more.

We will be using the Pixtral-12B model from Mistral to take advantage of some of its visual reasoning capabilities and a workflow to generate a multistage atomic caption.

## Setup
First, we need to install the necessary dependencies. In addition to Outlines, we'll need to install the transformers library and any specific requirements for the vision-language model we'll be using.

```shell
pip install outlines transformers torch pillow
```

### Initializing the Model
We'll use the `outlines.from_transformers` function to initialize our vision-language model. For this function to return a vision multi-modal model we need to pass in a transformers model and a transformers processor that can handle both text and image inputs. Today we'll be using the Pixtral model with the AutoProcessor.

```python
import outlines
import torch
from transformers import (
    AutoProcessor,
    LlavaForConditionalGeneration
)

model_name="mistral-community/pixtral-12b" # original magnet model is able to be loaded without issue
model_class=LlavaForConditionalGeneration
processor_class=AutoProcessor

def get_vision_model(model_name: str, model_class, processor_class):
    model_kwargs = {
        "torch_dtype": torch.bfloat16,
        "attn_implementation": "flash_attention_2",
        "device_map": "auto",
    }
    processor_kwargs = {
        "device": "cuda",
    }

    model = outlines.from_transformers(
        model_class.from_pretrained(model_name, **model_kwargs),
        processor_class.from_pretrained(model_name, **processor_kwargs),
    )
    return model
model = get_vision_model(model_name, model_class, processor_class)
```

### Defining the Schema
Next, we will define a schema for the output we expect from our vision multi-modal model. This schema will help structure the model's responses. We use the `outlines.Generator` object to create a generator for our schema that will then be called with our prompt and images.

```python
from enum import Enum
from pydantic import BaseModel, Field, confloat, constr
from pydantic.types import StringConstraints, PositiveFloat
from typing import List
from typing_extensions import Annotated

class TagType(Enum):
    ENTITY = "Entity"
    RELATIONSHIP = "Relationship"
    STYLE = "Style"
    ATTRIBUTE = "Attribute"
    COMPOSITION = "Composition"
    CONTEXTUAL = "Contextual"
    TECHNICAL = "Technical"
    SEMANTIC = "Semantic"

class ImageTag(BaseModel):
    tag: Annotated[
        constr(min_length=1, max_length=30),
        Field(
            description=(
                "Descriptive keyword or phrase representing the tag."
            )
        )
    ]
    category: TagType
    confidence: Annotated[
        confloat(le=1.0),
        Field(
            description=(
                "Confidence score for the tag, between 0 (exclusive) and 1 (inclusive)."
            )
        )
    ]

class ImageData(BaseModel):
    tags_list: List[ImageTag] = Field(..., min_items=8, max_items=20)
    short_caption: Annotated[str, StringConstraints(min_length=10, max_length=150)]
    dense_caption: Annotated[str, StringConstraints(min_length=100, max_length=2048)]

image_data_generator = outlines.Generator(model, ImageData)
```

This schema defines the structure for image tags, including categories like Entity, Relationship, Style, etc., as well as short and dense captions.

### Preparing the Prompt

We'll create a prompt that instructs the model on how to analyze the image and generate the structured output:

```python
pixtral_instruction = """
<s>[INST]
<Task>You are a structured image analysis agent. Generate comprehensive tag list, caption, and dense caption for an image classification system.</Task>
<TagCategories requirement="You should generate a minimum of 1 tag for each category." confidence="Confidence score for the tag, between 0 (exclusive) and 1 (inclusive).">
- Entity : The content of the image, including the objects, people, and other elements.
- Relationship : The relationships between the entities in the image.
- Style : The style of the image, including the color, lighting, and other stylistic elements.
- Attribute : The most important attributes of the entities and relationships in the image.
- Composition : The composition of the image, including the arrangement of elements.
- Contextual : The contextual elements of the image, including the background, foreground, and other elements.
- Technical : The technical elements of the image, including the camera angle, lighting, and other technical details.
- Semantic : The semantic elements of the image, including the meaning of the image, the symbols, and other semantic details.
<Examples note="These show the expected format as an abstraction.">
{
  "tags_list": [
    {
      "tag": "subject 1",
      "category": "Entity",
      "confidence": 0.98
    },
    {
      "tag": "subject 2",
      "category": "Entity",
      "confidence": 0.95
    },
    {
      "tag": "subject 1 runs from subject 2",
      "category": "Relationship",
      "confidence": 0.90
    },
   }
</Examples>
</TagCategories>
<ShortCaption note="The short caption should be a concise single sentence caption of the image content with a maximum length of 100 characters.">
<DenseCaption note="The dense caption should be a descriptive but grounded narrative paragraph of the image content with high quality narrative prose. It should incorporate elements from each of the tag categories to provide a broad dense caption">
[IMG]<image>[/INST]
""".strip()
```

This prompt provides detailed instructions to the model on how to generate comprehensive tag lists, captions, and dense captions for image analysis. Because of the ordering of the instructions, the original tag generation serves as a sort of visual grounding for the captioning task, reducing the amount of manual post-processing required. It is essential to include the `<image>` tag in the prompt at the location where the image will be inserted.

### Generating Structured Output
Now we can use our model to generate structured output based on an input image:

```python
from io import BytesIO
from urllib.request import urlopen
from PIL import Image

def img_from_url(url):
    """Download the image at ``url`` and return it as an RGB PIL image."""
    # Read the payload inside a ``with`` block so the HTTP response (and its
    # underlying socket) is closed deterministically instead of being left
    # for the garbage collector.
    with urlopen(url) as response:
        img_byte_stream = BytesIO(response.read())
    return Image.open(img_byte_stream).convert("RGB")

# Fetch the sample photograph, then hand it to the generator together with
# the instruction prompt.
image_url = "https://upload.wikimedia.org/wikipedia/commons/9/98/Aldrin_Apollo_11_original.jpg"
image = img_from_url(image_url)

generation_inputs = {"text": pixtral_instruction, "images": image}
result = image_data_generator(generation_inputs)
print(result)
```

This code loads an image from a URL, passes it to our vision multi-modal model along with the instruction prompt, and generates a structured output based on the defined schema. We end up with an output like this, ready to be used for the next stage in your pipeline:

```json
{"tags_list": [
  {
    "tag": "astronaut",
    "category": <TagType.ENTITY: "Entity">,
    "confidence": 0.99
  },
  {"tag": "moon", "category": <TagType.ENTITY: "Entity">, "confidence": 0.98},
  {
    "tag": "space suit",
    "category": <TagType.ATTRIBUTE: "Attribute">,
    "confidence": 0.97
  },
  {
    "tag": "lunar module",
    "category": <TagType.ENTITY: "Entity">,
    "confidence": 0.95
  },
  {
    "tag": "shadow of astronaut",
    "category": <TagType.COMPOSITION: "Composition">,
    "confidence": 0.95
  },
  {
    "tag": "footprints in moon dust",
    "category": <TagType.CONTEXTUAL: "Contextual">,
    "confidence": 0.93
  },
  {
    "tag": "low angle shot",
    "category": <TagType.TECHNICAL: "Technical">,
    "confidence": 0.92
  },
  {
    "tag": "human first steps on the moon",
    "category": <TagType.SEMANTIC: "Semantic">,
    "confidence": 0.95
  }],
  "short_caption": "First man on the Moon",
  "dense_caption": "The figure clad in a pristine white space suit, emblazoned with the American flag, stands powerfully on the moon's desolate and rocky surface. The lunar module, a workhorse of space engineering, looms in the background, its metallic legs sinking slightly into the dust where footprints and tracks from the mission's journey are clearly visible. The photograph captures the astronaut from a low angle, emphasizing his imposing presence against the desolate lunar backdrop. The stark contrast between the blacks and whiteslicks of lost light and shadow adds dramatic depth to this seminal moment in human achievement."
}
```

## Conclusion

This guide demonstrated how Outlines enables structured output generation with vision-language models. With the techniques shown above, you can build:

- **Content Management Systems**: Automatically tag and categorize visual content with structured metadata that can be directly stored in databases, enabling powerful search and filtering capabilities
- **Accessibility Tools**: Generate rich, structured descriptions of images that can be adapted for different contexts - from brief alt-text to detailed scene descriptions for screen readers
- **Quality Assurance Pipelines**: Validate visual content against specific criteria by extracting structured attributes and checking them against business rules


================================================
FILE: docs/index.md
================================================
---
title: Welcome to Outlines!
hide:
  - navigation
---

#

<figure markdown>
![](assets/images/logo-light-mode.svg#only-light){ width="500" }
![](assets/images/logo-dark-mode.svg#only-dark){ width="500" }
</figure>


LLMs are powerful but their outputs are unpredictable. Most solutions attempt to fix bad outputs after generation using parsing, regex, or fragile code that breaks easily.

Outlines guarantees structured outputs during generation — directly from any LLM.

- **Works with any model** - Same code runs across OpenAI, Ollama, vLLM, and more
- **Simple integration** - Just pass your desired output type: `model(prompt, output_type)`
- **Guaranteed valid structure** - No more parsing headaches or broken JSON
- **Provider independence** - Switch models without changing code
- **Rich structure definition** - Use JSON Schema, regular expressions or context-free grammars

<figure markdown>
[Get Started](guide/getting_started){ .md-button .md-button--primary }
[View Examples](examples/){ .md-button }
[API Reference](api_reference/){ .md-button }
[GitHub](https://github.com/dottxt-ai/outlines){ .md-button }
</figure>

## 🚀 Building the future of structured generation

We're working with select partners to develop new interfaces to structured generation.

Need XML, FHIR, custom schemas or grammars? Let's talk.

Audit your schema: share one schema, we show you what breaks under generation, the constraints that fix it, and compliance rates before and after. Sign up [here](https://h1xbpbfsf0w.typeform.com/to/rtFUraA2?typeform).

## See it in action

```python
from pydantic import BaseModel
from typing import Literal
import outlines
import openai

class Customer(BaseModel):
    name: str
    urgency: Literal["high", "medium", "low"]
    issue: str

client = openai.OpenAI()
model = outlines.from_openai(client, "gpt-4o")

customer = model(
    "Alice needs help with login issues ASAP",
    Customer
)
# ✓ Always returns valid Customer object
# ✓ No parsing, no errors, no retries
```

## Quick install

```shell
pip install outlines
```

## Features

<div class="grid cards" markdown>

- :material-shield-check: **Reliable** - Guaranteed schema compliance -- always valid JSON.
- :material-puzzle: **Feature-rich** - Supports a large proportion of the JSON Schema spec, along with regex and context-free grammars.
- :material-lightning-bolt: **Fast** - Microseconds of overhead vs seconds of retries. Compilation happens once, not every request.
- :material-lightbulb: **Simple** - Outlines is a low-abstraction library. Write code the way you normally do with LLMs. No agent frameworks needed.

</div>

## Supported inference APIs, libraries & servers

- [vLLM](features/models/vllm.md)
- [vLLM offline](features/models/vllm_offline.md)
- [Transformers](features/models/transformers.md)
- [llama.cpp](features/models/llamacpp.md)
- [Ollama](features/models/ollama.md)
- [MLX-LM](features/models/mlxlm.md)
- [SgLang](features/models/sglang.md)
- [TGI](features/models/tgi.md)
- [OpenAI](features/models/openai.md)
- [Anthropic](features/models/anthropic.md)
- [Gemini](features/models/gemini.md)
- [Dottxt](features/models/dottxt.md)

## Who is using Outlines?

Hundreds of organizations and the main LLM serving frameworks ([vLLM][vllm], [TGI][tgi], [LoRAX][lorax], [xinference][xinference], [SGLang][sglang]) use Outlines. Prominent companies and organizations that use Outlines include:

<div class="grid cards" markdown>
  <div class="row"><img src="../logos/amazon.png" width="200"></div>
  <div class="row"><img src="../logos/apple.png" width="200"></div>
  <div class="row"><img src="../logos/best_buy.png" width="200"></div>
  <div class="row"><img src="../logos/canoe.png" width="200"></div>
  <div class="row"><img src="../logos/cisco.png" width="200"></div>
  <div class="row"><img src="../logos/dassault_systems.png" width="200"></div>
  <div class="row"><img src="../logos/databricks.png" width="200"></div>
  <div class="row"><img src="../logos/datadog.png" width="200"></div>
  <div class="row"><img src="../logos/dbt_labs.png" width="200"></div>
  <div class="row"><img src="../assets/images/dottxt.png" width="200"></div>
  <div class="row"><img src="../logos/gladia.jpg" width="200"></div>
  <div class="row"><img src="../logos/harvard.png" width="200"></div>
  <div class="row"><img src="../logos/hf.png" width="200"></div>
  <div class="row"><img src="../logos/johns_hopkins.png" width="200"></div>
  <div class="row"><img src="../logos/meta.png" width="200"></div>
  <div class="row"><img src="../logos/mit.png" width="200"></div>
  <div class="row"><img src="../logos/mount_sinai.png" width="200"></div>
  <div class="row"><img src="../logos/nvidia.png" width="200"></div>
  <div class="row"><img src="../logos/nyu.png" width="200"></div>
  <div class="row"><img src="../logos/safran.png" width="200"></div>
  <div class="row"><img src="../logos/salesforce.png" width="200"></div>
  <div class="row"><img src="../logos/shopify.png" width="200"></div>
  <div class="row"><img src="../logos/smithsonian.png" width="200"></div>
  <div class="row"><img src="../logos/tinder.png" width="200"></div>
  <div class="row"><img src="../logos/upenn.png" width="200"></div>
</div>

Organizations are included either because they use Outlines as a dependency in a public repository, or because of direct communication between members of the Outlines team and employees at these organizations.

Still not convinced? Read [what people say about us](community/feedback.md). And make sure to take a look at what the [community is building](community/examples.md)!


## Outlines people

Outlines would not be what it is today without a community of dedicated developers:

<a href="https://github.com/dottxt-ai/outlines/graphs/contributors">
  <img src="https://contrib.rocks/image?repo=dottxt-ai/outlines" />
</a>

## About .txt

Outlines is built with ❤️ by [.txt](https://dottxt.co).

.txt solves the critical problem of reliable structured output generation for large language models. Our [commercially-licensed libraries][dottxt-doc] ensure 100% compliance with JSON Schema, regular expressions and context-free grammars while adding only microseconds of latency. Unlike open-source alternatives, we offer superior reliability, performance, and enterprise support.


## Acknowledgements

<div class="grid" markdown>

<figure markdown>
  <a href="https://www.normalcomputing.ai">
  ![Normal Computing logo](assets/images/normal_computing.jpg){ width="150" }
  </a>
</figure>

</div>

Outlines was originally developed at [@NormalComputing](https://twitter.com/NormalComputing) by [@remilouf](https://twitter.com/remilouf) and [@BrandonTWillard](https://twitter.com/BrandonTWillard). It is now maintained by [.txt](https://dottxt.co).

[discord]: https://discord.gg/R9DSu34mGd
[aesara]: https://github.com/aesara-devs
[blackjax]: https://github.com/blackjax-devs/blackjax
[pythological]: https://github.com/pythological
[hy]: https://hylang.org/
[.txt]: https://dottxt.co
[vllm]: https://github.com/vllm-project/vllm
[tgi]: https://github.com/huggingface/text-generation-inference
[lorax]: https://github.com/predibase/lorax
[xinference]: https://github.com/xorbitsai/inference
[sglang]: https://github.com/sgl-project/sglang/
[dottxt-doc]: https://docs.dottxt.co


================================================
FILE: docs/overrides/home.html
================================================
{#-
This file overrides the home page to use HTML tooling
better.
-#}
{% extends "main.html" %}
{% block tabs %}
{{ super() }}

<style>
    @media screen and (min-width:60em) {
        .md-sidebar--secondary {
            display: none;
        }
    }

    @media screen and (min-width:76.25em) {
        .md-sidebar--primary {
            display: none;
        }
    }

    .mdx-container {
        display: flex;
        justify-content: center;
        align-items: center;
        padding-left: 1rem;
        padding-right: 1rem;
    }

    .mdx-hero {
        text-align: center;
        margin-top: 6rem;
    }

    .mdx-hero__image {
        margin-bottom: 1rem;
        margin-top: 1rem;
        max-width: 80%;
        margin: 0 auto;

        @media screen and (max-width: 40em) {
            max-width: 60%;
        }
    }

    .mdx-hero__content {
        max-width: 600px;
        margin: 0 auto;

        .subtitle {
            font-size: 1rem;
            letter-spacing: 0.025rem;
        }

        h2 {
            max-width: 460px;
        }

        @media screen and (max-width: 40em) {
            .subtitle {
                font-size: 0.8rem;
                max-width: 70%;
                font-weight: 700;
            }
        }

        a:hover{
            color: #A7623A;
        }
    }

    /* Horizontal, centered row of call-to-action buttons; the max-width:40em
       media query further down switches it to a column. */
    .md-buttons {
        display: flex;
        justify-content: center;
        gap: 0.5rem;
        flex-direction: row;
        margin: 0 auto;
        margin-top: 2rem;
        width: max-content;
        font-weight: 700;
    }

    .md-team {
        margin-top: 4rem;
    }

    /* New media query for smaller screens */
    @media screen and (max-width: 40em) {
        .md-buttons {
            flex-direction: column;
            /* Stack buttons vertically */
            align-items: center;
            /* Center-align the stacked buttons */
        }

        .md-buttons .md-button {
            width: 100%;
            /* Make buttons full-width */
            max-width: 200px;
            /* Limit maximum width for better appearance */
        }
    }

    .md-button {
        transition: background-color 0.3s ease-in-out; /* Smooth transition */
        border-radius: 6px !important;
        padding: 0.4rem 0.8rem !important;

    }

    .md-button:hover {
        background-color: #A7623A !important; /* Desired hover background color */
        border-color: #A7623A !important;
    }
</style>

<section class="mdx-container">
    <div class="md-grid md-typeset">
        <div class="mdx-hero">
            <div class="mdx-hero__image">
                <img src="assets/images/logo-light-mode.svg#only-light" alt="Outlines Logo" width="600" draggable="false">
                <img src="assets/images/logo-dark-mode.svg#only-dark" alt="Outlines Logo" width="600" draggable="false">
            </div>
            <div class="mdx-hero__content">
                <h2 class="subtitle" style="font-weight: 500; padding-top: 1rem;">
                    Structured text generation and robust prompting for language models
                </h2>
                <div class="md-buttons">
                    <a href="{{ 'welcome/' | url }}" title="Get started" class="md-button md-button--primary">
                        Get started
                    </a>
                    <a href="https://discord.gg/ZxBxyWmW5n" title="Join the Community" class="md-button">
                        Join the community
                    </a>
                </div>
                <div class="md-buttons">
                    <p>
                        Follow us on <a href="https://twitter.com/dottxtai" title="Follow us on X" target="_blank">
                            X
                        </a> and
                        <a href="https://bsky.app/profile/dottxtai.bsky.social" title="Follow us on Bluesky" target="_blank">
                            Bluesky
                        </a>
                    </p>
                </div>

                <p class="md-team">Made with ❤️ by the team at <a href="https://dottxt.co">.txt</a></p>
            </div>
        </div>
    </div>
</section>
{% endblock %}
{% block content %}{% endblock %}
{% block footer %}{% endblock %}


================================================
FILE: docs/overrides/main.html
================================================
{% extends "base.html" %}


================================================
FILE: docs/stylesheets/extra.css
================================================
/* Load Source Code Pro (referenced by --md-code-font below) from Google Fonts.
   The css2 endpoint returns a stylesheet that contains the real @font-face
   rules, so it has to be pulled in with @import rather than used as the `src`
   of a hand-written @font-face. Keep this as the first rule of the file:
   @import is only honoured before other style rules. */
@import url("https://fonts.googleapis.com/css2?family=Source+Code+Pro:ital,wght@0,200..900;1,200..900&display=swap");

/* Header/banner styling */
.md-header {
    background-color: #DFD1B6 !important;
}

:root > * {
    /* Notion-like color palette */
    --md-default-fg-color: #37352f;
    --md-default-fg-color--light: #73706c;
    --md-default-fg-color--lighter: #9b9a97;
    --md-default-bg-color: #ffffff;
    --md-default-bg-color--light: #f7f6f3;
    --md-default-bg-color--lighter: #edece9;

    /* Typography */
    --md-text-font-family: "Inter", -apple-system, BlinkMacSystemFont, "Segoe UI", Helvetica, Arial, sans-serif;
    --md-code-font: "Source Code Pro", Consolas, "Liberation Mono", Menlo, monospace;

    /* Notion-like link colors */
    --md-typeset-a-color: #37352f;
    --md-accent-fg-color: #eb5757;

    /* Background colors */
    --md-code-bg-color: #f7f6f3;
    --md-code-fg-color: #eb5757;
}

/* Code block styling */
.highlight pre,
.md-typeset pre code,
.md-typeset .highlight pre,
.md-typeset .highlighttable pre {
    background-color: #2E3440 !important; /* Nord's darkest blue (nord0) */
    border-radius: 4px !important; /* Subtle rounded corners like Notion */
    box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1), 0 1px 3px rgba(0, 0, 0, 0.08) !important; /* Subtle shadow */
    border: none !important; /* No border for cleaner look */
}

/* Adjust padding from code content */
.md-code__content {
    padding: 1em 1.5em !important; /* Increased top/bottom padding */
}

/* Style only inline code (not code blocks) */
.md-typeset :not(pre) > code {
    background-color: rgba(135, 131, 120, 0.15); /* Notion's exact inline code background */
    color: #E35A26; /* Orange color for inline code */
    border-radius: 3px; /* Subtle rounded corners */
    padding: 0.2em 0.4em; /* Notion-like padding */
    font-weight: 500; /* Medium weight */
    font-size: 0.85em; /* Slightly smaller than body text */
    border: none;
    font-family: "SFMono-Regular", Consolas, "Liberation Mono", Menlo, monospace;
}

/* Override code block container background */
.md-typeset .highlight,
.md-typeset .highlighttable {
    background-color: #ffffff !important; /* Match page background */
    border-radius: 4px !important; /* Match code block radius */
    overflow: hidden; /* Ensure child elements respect borders */
}

/* Ensure proper spacing for the entire code block */
.md-typeset pre {
    margin: 1.5em 0 !important;
}

/* Style the copy button with Nord snow grey */
.md-clipboard {
    color: #D8DEE9 !important; /* Nord snow storm */
    top: 0.75em !important; /* Lower the button more */
    right: 0.5em !important; /* Add some spacing from right edge */
}

.md-clipboard:hover {
    color: #ECEFF4 !important; /* Brighter snow storm on hover */
}

.md-clipboard:after {
    color: #D8DEE9 !important;
}

/* Style scrollbars with Nord colors */
.md-typeset pre::-webkit-scrollbar {
    height: 0.4rem;
    width: 0.4rem;
}

.md-typeset pre::-webkit-scrollbar-track {
    background-color: #3B4252; /* Nord1 */
}

.md-typeset pre::-webkit-scrollbar-thumb {
    background-color: #4C566A; /* Nord3 */
    border-radius: 0.2rem;
}

.md-typeset pre::-webkit-scrollbar-thumb:hover {
    background-color: #D8DEE9; /* Nord snow storm on hover */
}

/* Firefox scrollbar styling */
.md-typeset pre {
    scrollbar-width: thin;
    scrollbar-color: #4C566A #3B4252;
}

/* Notion-like visual hierarchy */
.md-typeset h1 {
    font-weight: 600;
    font-size: 2.5rem;
    line-height: 1.2;
    margin-top: 2.5rem;
    margin-bottom: 0.75rem;
    color: #37352f;
    letter-spacing: -0.01em;
}

.md-typeset h2 {
    font-weight: 600;
    font-size: 1.875rem;
    line-height: 1.3;
    margin-top: 2.5rem;
    margin-bottom: 0.75rem;
    color: #37352f;
    letter-spacing: -0.005em;
    border: none;
    padding: 0;
}

.md-typeset h3 {
    font-weight: 500;
    font-size: 1.5rem;
    line-height: 1.3;
    margin-top: 2rem;
    margin-bottom: 0.75rem;
    color: #37352f;
    letter-spacing: -0.003em;
}

.md-typeset h4 {
    font-weight: 500;
    font-size: 1.25rem;
    line-height: 1.3;
    margin-top: 1.75rem;
    margin-bottom: 0.5rem;
    color: #37352f;
}

.md-typeset h5 {
    font-weight: 500;
    font-size: 1rem;
    line-height: 1.4;
    margin-top: 1.5rem;
    margin-bottom: 0.5rem;
    color: #37352f;
}

.md-typeset h6 {
    font-weight: 500;
    font-size: 0.875rem;
    line-height: 1.4;
    margin-top: 1.25rem;
    margin-bottom: 0.5rem;
    color: #37352f;
    text-transform: none;
    letter-spacing: normal;
}

/* Notion-like paragraph styling */
.md-typeset p {
    line-height: 1.5;
    margin-bottom: 1em; /* More whitespace */
    color: #37352f;
    font-weight: 400;
}

/* Reduce font size for navigation */
.md-nav__link {
    font-size: 0.6875rem; /* 11px */
}

/* Reduce font size for TOC */
.md-nav--secondary .md-nav__link {
    font-size: 0.6875rem; /* 11px */
}

/* Reduce spacing between navigation items */
.md-nav__item {
    margin: 0;
}

.md-nav__link {
    padding-top: 0;
    padding-bottom: 0;
    line-height: 1.2; /* Tighter line height */
}

/* Make navigation sections more compact */
.md-nav__title {
    line-height: 1.2;
    padding: 0.2rem 0.5rem;
    margin-bottom: 0.2rem;
}

/* Reduce TOC line height for compactness */
.md-nav--secondary .md-nav__link {
    line-height: 1.2;
}

/* Notion-like list styling */
.md-typeset ul,
.md-typeset ol {
    margin-top: 0.25em; /* Reduced top margin to sit closer to text */
    margin-bottom: 1em; /* Match paragraph spacing */
    color: #37352f;
}

.md-typeset li {
    line-height: 1.5;
    margin-bottom: 0.15rem; /* Slightly more spacing between list items */
    font-weight: 400;
}

/* Notion-style links */
.md-typeset a {
    color: #37352f;
    text-decoration: underline;
    text-decoration-color: rgba(55, 53, 47, 0.4);
    text-underline-offset: 2px;
    transition: text-decoration-color 0.1s ease;
}

.md-typeset a:hover {
    text-decoration-color: rgba(55, 53, 47, 0.8);
    background-color: rgba(55, 53, 47, 0.04);
}

/* Make important elements stand out */
.md-typeset strong {
    font-weight: 600;
    color: #37352f;
}

/* Better spacing for code blocks in relation to text */
.md-typeset pre {
    margin: 1.5em 0 !important; /* More whitespace around code blocks */
}

/* Notion-style tables */
.md-typeset table {
    border-collapse: collapse;
    margin: 1rem 0;
}

.md-typeset table th {
    font-weight: 600;
    background-color: #f7f6f3;
    color: #37352f;
    border: 1px solid #e1e0dd;
    padding: 0.5rem 0.75rem;
}

.md-typeset table td {
    border: 1px solid #e1e0dd;
    padding: 0.5rem 0.75rem;
}

/* Notion-style blockquotes */
.md-typeset blockquote {
    border-left: 3px solid #37352f;
    padding-left: 1rem;
    margin: 1rem 0;
    color: #37352f;
    background: transparent;
}

/* Page styling */
.md-content {
    background-color: #ffffff;
}

.md-sidebar {
    background-color: #fbfbfa;
}

/* Remove shadows for cleaner look */
.md-header,
.md-tabs {
    box-shadow: none;
    border-bottom: 1px solid #e1e0dd;
}

/* Admonition styling with custom palette */
.md-typeset .admonition,
.md-typeset details {
    border-radius: 4px;
    border: none;
    box-shadow: none;
    font-size: 0.6875rem; /* Very small font size - 11px */
    padding: 0.75rem;
    margin: 1rem 0;
}

/* Note/Info - Blue */
.md-typeset .admonition.note,
.md-typeset details.note,
.md-typeset .admonition.info,
.md-typeset details.info {
    background-color: rgba(127, 154, 207, 0.1) !important;
    border-left: 4px solid #7F9ACF !important;
}

.md-typeset .note > .admonition-title,
.md-typeset .note > summary,
.md-typeset .info > .admonition-title,
.md-typeset .info > summary {
    background-color: rgba(127, 154, 207, 0.2) !important;
    border-left: none !important;
}

/* Additional specificity for info type and custom types that should be blue */
.md-typeset .admonition.admonition-info,
.md-typeset details.details-info,
.md-typeset .admonition.installation,
.md-typeset .admonition.example,
.md-typeset .admonition.abstract,
.md-typeset .admonition.summary,
.md-typeset .admonition.tldr {
    background-color: rgba(127, 154, 207, 0.1) !important;
    border-left: 4px solid #7F9ACF !important;
}

/* Titles for custom blue admonitions */
.md-typeset .installation > .admonition-title,
.md-typeset .example > .admonition-title,
.md-typeset .abstract > .admonition-title,
.md-typeset .summary > .admonition-title,
.md-typeset .tldr > .admonition-title {
    background-color: rgba(127, 154, 207, 0.2) !important;
    border-left: none !important;
}

/* Warning/Caution - Yellow */
.md-typeset .admonition.warning,
.md-typeset details.warning,
.md-typeset .admonition.caution,
.md-typeset details.caution {
    background-color: rgba(189, 147, 47, 0.1);
    border-left: 4px solid #BD932F;
}

.md-typeset .warning > .admonition-title,
.md-typeset .warning > summary,
.md-typeset .caution > .admonition-title,
.md-typeset .caution > summary {
    background-color: rgba(189, 147, 47, 0.2);
    border-left: none;
}

/* Danger/Error - Orange */
.md-typeset .admonition.danger,
.md-typeset details.danger,
.md-typeset .admonition.error,
.md-typeset details.error {
    background-color: rgba(227, 90, 38, 0.1);
    border-left: 4px solid #E35A26;
}

.md-typeset .danger > .admonition-title,
.md-typeset .danger > summary,
.md-typeset .error > .admonition-title,
.md-typeset .error > summary {
    background-color: rgba(227, 90, 38, 0.2);
    border-left: none;
}

/* Success/Tip/Hint - Green */
.md-typeset .admonition.success,
.md-typeset details.success,
.md-typeset .admonition.tip,
.md-typeset details.tip,
.md-typeset .admonition.hint,
.md-typeset details.hint {
    background-color: rgba(166, 180, 163, 0.1);
    border-left: 4px solid #A6B4A3;
}

.md-typeset .success > .admonition-title,
.md-typeset .success > summary,
.md-typeset .tip > .admonition-title,
.md-typeset .tip > summary,
.md-typeset .hint > .admonition-title,
.md-typeset .hint > summary {
    background-color: rgba(166, 180, 163, 0.2);
    border-left: none;
}

/* General admonition title styling */
.md-typeset .admonition-title,
.md-typeset summary {
    font-weight: 600;
    font-size: 0.6875rem; /* Very small - 11px */
    padding: 0.5rem 0.75rem;
    margin: -0.75rem -0.75rem 0.5rem -0.75rem;
    border-radius: 4px 4px 0 0;
}

/* Ensure consistent icon styling */
.md-typeset .admonition > .admonition-title::before,
.md-typeset details > summary::before {
    font-size: 1rem;
    margin-right: 0.5rem;
}


================================================
FILE: environment.yml
================================================
# To use:
#
#   $ conda env create -f environment.yml  # `mamba` works too for this command
#   $ conda activate dottxt-ai
#
name: dottxt-ai
channels:
  - conda-forge
  - huggingface
dependencies:
  - python==3.10.0
  - jinja2
  - numpy
  - pydantic
  - scipy
  - pytest
  - pre-commit
  - referencing
  - jsonschema
  - transformers
  - pip
  - pip:
    - -e ".[test]"


================================================
FILE: examples/babyagi.py
================================================
"""This example is a simplified translation of BabyAGI.

It currently does not use the vector store retrieval

The original repo can be found at https://github.com/yoheinakajima/babyagi
"""

from collections import deque
from typing import Deque, List

from openai import OpenAI

import outlines
from outlines import Template


# Build the OpenAI-backed Outlines model and wrap it in a generator.
# `complete` is the callable the rest of the script uses to turn a prompt
# into generated text.
model = outlines.from_openai(OpenAI(), "gpt-4o-mini")
complete = outlines.Generator(model)

## Load the prompts
# One template per BabyAGI step: perform a task, create follow-up tasks,
# re-prioritize the task list.
# NOTE(review): the paths are relative, so this presumably has to be run from
# the `examples/` directory -- confirm how Template.from_file resolves them.
perform_task_ppt = Template.from_file("prompts/babyagi_perform_task.txt")
create_tasks_ppt = Template.from_file("prompts/babyagi_create_task.txt")
prioritize_tasks_ppt = Template.from_file("prompts/babyagi_prioritize_task.txt")


def create_tasks_fmt(result: str) -> List[str]:
    """Extract task descriptions from a numbered, newline-separated list.

    Parameters
    ----------
    result
        Raw text with one entry per line, e.g. ``"1. Do this"``.

    Returns
    -------
    The stripped text found after the first ``"."`` of every line that
    contains one; lines without a ``"."`` are dropped.

    """
    split_lines = (line.strip().split(".", 1) for line in result.split("\n"))
    return [pieces[1].strip() for pieces in split_lines if len(pieces) == 2]


def prioritize_tasks_fmt(result: str) -> Deque:
    """Parse a numbered, newline-separated task list into a queue of tasks.

    Parameters
    ----------
    result
        Raw text with one entry per line, e.g. ``"3. Do this"``.

    Returns
    -------
    A deque of ``{"task_id": int, "task_name": str}`` dictionaries, in the
    order in which the lines appear. Lines that contain no ``"."`` or whose
    prefix is not an integer are skipped.

    """
    task_list: Deque = deque([])
    for line in result.split("\n"):
        parts = line.strip().split(".", 1)
        if len(parts) != 2:
            continue

        # Free-form model output often contains ordinary sentences ("Here is
        # the list."); their prefix is not a task id, so skip them instead of
        # letting `int` abort the whole run.
        try:
            task_id = int(parts[0].strip())
        except ValueError:
            continue

        task_list.append({"task_id": task_id, "task_name": parts[1].strip()})

    return task_list


# Session-wide goal passed to every prompt template.
objective = "Becoming rich while doing nothing."
# Seed task; it is the only entry in the queue when the first cycle starts.
first_task = {
    "task_id": 1,
    "task_name": "Find a repeatable, low-maintainance, scalable business.",
}
# Id counter: `one_cycle` increments it before assigning an id to each newly
# created task, so it starts at the id already used by `first_task`.
next_task_id = 1
task_list = deque([first_task])


def one_cycle(objective: str, task_list, next_task_id: int):
    """One BabyAGI cycle.

    It consists in executing the highest-priority task, creating some new tasks
    given the result, and re-prioritizing the tasks.

    Parameters
    ----------
    objective
        The overall objective of the session.
    task_list
        The current list of tasks to perform. It is modified in place: the
        first task is popped and the newly created tasks are appended.
    next_task_id
        The id counter. It is incremented before an id is assigned to each
        newly created task.

    Returns
    -------
    A tuple ``(task, result, prioritized_tasks, next_task_id)`` containing the
    task that was executed, the text generated while executing it, the
    re-prioritized queue of tasks and the updated id counter.

    """

    current_task = task_list.popleft()

    prompt = perform_task_ppt(objective=objective, task=current_task)
    result = complete(prompt)

    # Describe the task that produced `result`, not the very first task of the
    # session, so the task/result pair shown to the model stays consistent.
    # NOTE(review): `previous_tasks` still only lists the module-level
    # `first_task`; confirm against the prompt template whether it should
    # track the tasks actually seen so far.
    prompt = create_tasks_ppt(
        objective=objective,
        task=current_task["task_name"],
        result=result,
        previous_tasks=[first_task["task_name"]],
    )
    new_task_names = create_tasks_fmt(complete(prompt))

    # Use a dedicated loop variable: reusing the executed task's name here
    # would make the function return the last new task name instead.
    for new_task_name in new_task_names:
        next_task_id += 1
        task_list.append({"task_id": next_task_id, "task_name": new_task_name})

    prompt = prioritize_tasks_ppt(
        objective=objective,
        tasks=[queued["task_name"] for queued in task_list],
        next_task_id=next_task_id,
    )
    prioritized_tasks = prioritize_tasks_fmt(complete(prompt))

    return current_task, result, prioritized_tasks, next_task_id


# Let's run it for 5 cycles to see how it works without spending a fortune.
# Each iteration prints the pending queue, runs one cycle, then prints the
# task that `one_cycle` returned together with its result. The "\033[...m"
# fragments are ANSI escape sequences that style the section headings.
for _ in range(5):
    print("\033[95m\033[1m" + "\n*****TASK LIST*****\n" + "\033[0m\033[0m")
    for t in task_list:
        print(" • " + str(t["task_name"]))

    # Rebind the module-level queue and id counter to the values returned by
    # the cycle so the next iteration continues from them.
    task, result, task_list, next_task_id = one_cycle(
        objective, task_list, next_task_id
    )

    print("\033[92m\033[1m" + "\n*****NEXT TASK*****\n" + "\033[0m\033[0m")
    print(task)
    print("\033[93m\033[1m" + "\n*****TASK RESULT*****\n" + "\033[0m\033[0m")
    print(result)


================================================
FILE: examples/beam-cloud/README.md
================================================
## Deploy Outlines on Beam

1. Create an account [here](https://beam.cloud) and install the Beam SDK
2. Download the `app.py` file to your computer
3. Deploy it as a serverless API by running: `beam deploy app.py:predict`


================================================
FILE: examples/beam-cloud/app.py
================================================
from typing import Literal

from beam import Image, endpoint, env


# Bind `outlines` at module level only when Beam reports a remote environment;
# `predict` below relies on this module-level name.
# NOTE(review): presumably this lets the file be deployed from a machine that
# does not have outlines installed -- confirm against the Beam docs.
if env.is_remote():
    import outlines


# Pre-load models when the container first starts
def load_models():
    """Build and return the Outlines model registered as the ``on_start`` hook.

    The Hugging Face model and tokenizer for Phi-3-mini are loaded and wrapped
    with ``outlines.models.from_transformers``.
    """
    from transformers import AutoModelForCausalLM, AutoTokenizer
    import outlines

    checkpoint = "microsoft/Phi-3-mini-4k-instruct"
    hf_model = AutoModelForCausalLM.from_pretrained(checkpoint)
    hf_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    return outlines.models.from_transformers(hf_model, hf_tokenizer)


@endpoint(
    name="outlines-serverless",
    gpu="A10G",
    cpu=1,
    memory="16Gi",
    on_start=load_models,
    image=Image().add_python_packages(
        ["outlines", "torch", "transformers", "accelerate"]
    ),
)
def predict(context, **inputs):
    """Classify the sentiment of a review as "Positive" or "Negative".

    Parameters
    ----------
    context
        Object whose ``on_start_value`` attribute holds the model returned by
        ``load_models``.
    inputs
        Request payload; the optional ``"prompt"`` entry replaces the built-in
        example prompt.

    Returns
    -------
    A dictionary ``{"answer": ...}`` holding the generated label.

    """
    default_prompt = """You are a sentiment-labelling assistant.
    Is the following review positive or negative?

    Review: This restaurant is just awesome!
    """

    # Fall back to the example review when the caller did not send a prompt.
    prompt = inputs["prompt"] if "prompt" in inputs else default_prompt

    # The model was loaded once by the `on_start` hook and cached on the context.
    sentiment_generator = outlines.Generator(
        context.on_start_value, Literal["Positive", "Negative"]
    )
    return {"answer": sentiment_generator(prompt)}


================================================
FILE: examples/bentoml/.bentoignore
================================================
__pycache__/
*.py[cod]
*$py.class
.ipynb_checkpoints
venv/


================================================
FILE: examples/bentoml/bentofile.yaml
================================================
service: "service:Outlines"
labels:
  owner: bentoml-team
  stage: demo
include:
- "*.py"
python:
  requirements_txt: "./requirements.txt"
  lock_packages: false


================================================
FILE: examples/bentoml/import_model.py
================================================
import bentoml

# Hugging Face identifier of the model to import.
MODEL_ID = "mistralai/Mistral-7B-v0.1"
# Tag derived from MODEL_ID: lower-cased, with "/" replaced by "--"
# (i.e. "mistralai--mistral-7b-v0.1").
BENTO_MODEL_TAG = MODEL_ID.lower().replace("/", "--")


def import_model(model_id, bento_model_tag):
    """Load a Hugging Face model and save it under a BentoML model tag.

    Parameters
    ----------
    model_id
        Identifier handed to ``from_pretrained`` for both the tokenizer and
        the model.
    bento_model_tag
        Tag under which the BentoML model entry is created.
    """
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Honour the `model_id` argument instead of the module-level default so
    # callers can import a model other than `MODEL_ID`.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
    )

    # Tokenizer and weights are written side by side into the same directory.
    with bentoml.models.create(bento_model_tag) as bento_model_ref:
        tokenizer.save_pretrained(bento_model_ref.path)
        model.save_pretrained(bento_model_ref.path)


if __name__ == "__main__":
    # Run as a script: import the default model under its default tag.
    import_model(MODEL_ID, BENTO_MODEL_TAG)


================================================
FILE: examples/bentoml/requirements.txt
================================================
bentoml>=1.2.11
# service.py uses the Outlines v1 API (from_transformers, Generator, json_schema).
outlines==1.0.0
transformers==4.38.2
datasets==2.18.0
accelerate==0.27.2


================================================
FILE: examples/bentoml/service.py
================================================
import typing as t

import bentoml
from import_model import BENTO_MODEL_TAG, MODEL_ID

DEFAULT_SCHEMA = """{
    "title": "Character",
    "type": "object",
    "properties": {
        "name": {
            "title": "Name",
            "maxLength": 10,
            "type": "string"
        },
        "age": {
            "title": "Age",
            "type": "integer"
        },
        "armor": {"$ref": "#/definitions/Armor"},
        "weapon": {"$ref": "#/definitions/Weapon"},
        "strength": {
            "title": "Strength",
            "type": "integer"
        }
    },
    "required": ["name", "age", "armor", "weapon", "strength"],
    "definitions": {
        "Armor": {
            "title": "Armor",
            "description": "An enumeration.",
            "enum": ["leather", "chainmail", "plate"],
            "type": "string"
        },
        "Weapon": {
            "title": "Weapon",
            "description": "An enumeration.",
            "enum": ["sword", "axe", "mace", "spear", "bow", "crossbow"],
            "type": "string"
        }
    }
}"""


@bentoml.service(
    traffic={
        "timeout": 300,
    },
    resources={
        "gpu": 1,
        "gpu_type": "nvidia-l4",
    },
)
class Outlines:
    # BentoML service exposing JSON-schema-constrained generation.

    # Reference to the model entry created by `import_model.py`.
    bento_model_ref = bentoml.models.get(BENTO_MODEL_TAG)

    def __init__(self) -> None:
        """Load the model and tokenizer saved in the BentoML model store."""
        import torch
        from transformers import AutoModelForCausalLM, AutoTokenizer

        import outlines

        # `import_model.py` saved both the tokenizer and the weights under
        # this path, so load from it instead of fetching the checkpoint again.
        model_path = self.bento_model_ref.path

        # `from_transformers` takes the model first and the tokenizer second.
        self.model = outlines.from_transformers(
            AutoModelForCausalLM.from_pretrained(
                model_path,
                torch_dtype=torch.float16,
                low_cpu_mem_usage=True,
            ),
            AutoTokenizer.from_pretrained(model_path),
        )

    @bentoml.api
    async def generate(
        self,
        prompt: str = "Give me a character description.",
        json_schema: t.Optional[str] = DEFAULT_SCHEMA,
    ) -> t.Dict[str, t.Any]:
        """Generate a JSON object that conforms to ``json_schema``.

        Parameters
        ----------
        prompt
            Text prompt sent to the model.
        json_schema
            JSON schema, as a string, constraining the output. ``None`` falls
            back to ``DEFAULT_SCHEMA``.

        Returns
        -------
        The generated object, parsed from the model's JSON output.
        """
        import json

        import outlines

        schema = DEFAULT_SCHEMA if json_schema is None else json_schema
        generator = outlines.Generator(self.model, outlines.json_schema(schema))

        # The generator yields a JSON string; parse it so the response matches
        # the declared dict return type.
        return json.loads(generator(prompt))


================================================
FILE: examples/cerebrium/cerebrium.toml
================================================
[cerebrium.deployment]
name = "cerebrium"
python_version = "3.11"
cuda_version = "12"
include = "[./*, main.py, cerebrium.toml]"
exclude = "[.*]"
shell_commands = []

[cerebrium.hardware]
cpu = 2
memory = 14.0
gpu = "AMPERE A10"
gpu_count = 1
provider = "aws"
region = "us-east-1"

[cerebrium.scaling]
min_replicas = 0
max_replicas = 5
cooldown = 60

[cerebrium.dependencies.pip]
# main.py imports `outlines` and uses its v1 API (from_transformers, json_schema).
outlines = "==1.0.0"
transformers = "==4.38.2"
datasets = "==2.18.0"
accelerate = "==0.27.2"


================================================
FILE: examples/cerebrium/main.py
================================================
from transformers import AutoModelForCausalLM, AutoTokenizer

import outlines


# Load the model and tokenizer once, at import time; `generate` reuses this
# module-level object on every call.
model = outlines.from_transformers(
    AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2"),
    AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2"),
)


schema = {
    "title": "Character",
    "type": "object",
    "properties": {
        "name": {"title": "Name", "maxLength": 10, "type": "string"},
        "age": {"title": "Age", "type": "integer"},
        "armor": {"$ref": "#/definitions/Armor"},
        "weapon": {"$ref": "#/definitions/Weapon"},
        "strength": {"title": "Strength", "type": "integer"},
    },
    "required": ["name", "age", "armor", "weapon", "strength"],
    "definitions": {
        "Armor": {
            "title": "Armor",
            "description": "An enumeration.",
            "enum": ["leather", "chainmail", "plate"],
            "type": "string",
        },
        "Weapon": {
            "title": "Weapon",
            "description": "An enumeration.",
            "enum": ["sword", "axe", "mace", "spear", "bow", "crossbow"],
            "type": "string",
        },
    },
}


def generate(
    prompt: str = "Amiri, a 53 year old warrior woman with a sword and leather armor.",
):
    """Generate a character constrained by the module-level JSON ``schema``.

    Parameters
    ----------
    prompt
        Short description of the character, inserted into the instruction
        sent to the model.

    Returns
    -------
    The model output, which is also printed.
    """
    instruction = (
        f"<s>[INST]Give me a character description. Describe {prompt}.[/INST]"
    )
    output_type = outlines.json_schema(schema)

    character = model(instruction, output_type)
    print(character)
    return character


================================================
FILE: examples/dating_profile.py
================================================
from dataclasses import dataclass
from enum import Enum

import torch
import transformers
from pydantic import BaseModel, conlist

import outlines
from outlines import Template


# Closed set of prompt openers that a profile's question/answer pair may use.
# Each value is the opening phrase that the answer completes.
class QuestionChoice(str, Enum):
    A = "The key to my heart is"
    B = "The first item on my bucket list is"
    C = "Perks of dating me"
    D = "Message me if you also love"
    E = "People would describe me as"
    F = "I can beat you in a game of"


# One prompt opener paired with the free-text answer that completes it.
@dataclass
class QuestionAnswer:
    question: QuestionChoice
    answer: str


# Structure of a dating profile; used both for the few-shot examples and as the
# output type requested from the model.
class DatingProfile(BaseModel):
    # It is possible to put length constraints on these strings using constr; however, this appears to dramatically increase the generation time.
    # This may be resolved in the future with this PR: https://github.com/dottxt-ai/outlines/pull/272
    bio: str
    job: str
    # Ignore mypy checks here because it still doesn't support conlist or constr: https://github.com/pydantic/pydantic/issues/975
    interests: conlist(str, min_length=1, max_length=5)  # type: ignore
    qna1: QuestionAnswer
    qna2: QuestionAnswer


# A few-shot example: a free-text client description and the profile written
# for it.
@dataclass
class Example:
    description: str
    profile: DatingProfile


samples: list[Example] = [
    Example(
        description="I'm an author and former professional soccer player living in Seattle who publishes popular fiction books. A typical day for me starts by hanging out with my cat, drinking a coffee, and reading as much as I can in a few hours. Then, I'll prepare a quick smoothie before starting to write for a few hours, take a break with soccer or running a few miles, and finally meet friends for dinner at a new, hip restaurant in the evening. Sometimes we go axe-throwing afterwards, or play poker, or watch a comedy show, or visit a dive bar. On my vacations, I travel extensively to countries South America, Europe, and Asia, with the goal of visiting them all!",
        profile=DatingProfile(
            bio="Adventurer, dreamer, author, and soccer enthusiast. Life’s too short to waste time so I make the most of each day by exploring new places and playing with my friends on the pitch. What’s your favorite way to get out and have fun?",
            job="Famous Soccer Player -> Famous Author",
            interests=["Soccer", "Travel", "Friends", "Books", "Fluffy Animals"],
            qna1=QuestionAnswer(
                question=QuestionChoice.B, answer="swim in all seven oceans!"
            ),
            qna2=QuestionAnswer(
                question=QuestionChoice.E,
                answer="fun-loving, adventurous, and a little bit crazy",
            ),
        ),
    ),
    Example(
        description="I run my company and build houses for a living. I'm a big fan of the outdoors and love to go hiking, camping, and fishing. I don't like video games, but do like to watch movies. My love language is home-cooked food, and I'm looking for someone who isn't afraid to get their hands dirty.",
        profile=DatingProfile(
            bio="If you're looking for a Montana man who loves to get outdoors and hunt, and who's in-tune with his masculinity then I'm your guy!",
            job="House Construction Manager / Entrepreneur",
            interests=["Hunting", "Hiking", "The outdoors", "Home-cooked food"],
            qna1=QuestionAnswer(question=QuestionChoice.A, answer="food made at home"),
            qna2=QuestionAnswer(
                question=QuestionChoice.C,
                answer="having a man in your life who can fix anything",
            ),
        ),
    ),
    Example(
        description="I run my own Youtube channel with 10M subscribers. I love working with kids, and my audience skews pretty young too. In my free time, I play Fortnite and Roblox. I'm looking for someone who is also a gamer and likes to have fun. I'm learning Japanese in my free time as well as how to cook.",
        profile=DatingProfile(
            bio="Easy on the eyes (find me on Youtube!) and great with kids. What more do you need?",
            job="Youtuber 10M+ subscribers",
            interests=["Kids", "Gaming", "Japanese"],
            qna1=QuestionAnswer(question=QuestionChoice.D, answer="anime and gaming!"),
            qna2=QuestionAnswer(question=QuestionChoice.F, answer="Fortnite, gg ez"),
        ),
    ),
]


# Below requires ~13GB of GPU memory
# https://huggingface.co/mosaicml/mpt-7b-8k-instruct
# Motivation: Reasonably large model that fits on a single GPU and has been fine-tuned for a larger context window
model_name = "mosaicml/mpt-7b-8k-instruct"
model = outlines.from_transformers(
    transformers.AutoModelForCausalLM.from_pretrained(model_name),
    transformers.AutoTokenizer.from_pretrained(model_name),
)

new_description = "I'm a laid-back lawyer who spends a lot of his free-time gaming. I work in a corporate office, but ended up here after the start-up I cofounded got acquired, so still play ping pong with my cool coworkers every day. I have a bar at home where I make cocktails, which is great for entertaining friends. I secretly like to wear suits and get a new one tailored every few months. I also like weddings because I get to wear those suits, and it's a good excuse for a date. I watch the latest series because I'm paying, with my hard-earned money, for every streaming service."

# Render the few-shot prompt from the template file and the samples above.
dating_profile_prompt = Template.from_file("prompts/dating_profile.txt")
prompt = dating_profile_prompt(description=new_description, examples=samples)

# Inference keyword arguments are forwarded to the Hugging Face `generate`
# call, whose length limit is named `max_new_tokens` (not `max_tokens`).
profile = model(prompt, outlines.json_schema(DatingProfile), max_new_tokens=500)  # type: ignore
print(profile)

# Sample generated profiles
"""
{
    "bio": "I'm an ambitious lawyer with a casual and fashionable style. I love games and sports, but my true passion is preparing refreshing cocktails at home and dressing to the nines at weddings. I'm currently looking for a woman to show a good time to and get a kiss on the opulent suit I just had made. Send resumÃ € to this inbox.",
    "job": "Lawyer",
    "interests":
    [
        "Stylish guys",
        "Gaming",
        "Ping pong",
        "Cocktails",
        "Weddings"
    ],
    "qna1":
    {
        "question": "The first item on my bucket list is",
        "answer": "be married and have a family."
    },
    "qna2":
    {
        "question": "People would describe me as",
        "answer": "charming, stylish, and funny."
    }
}
"""

"""
{
    "bio": "I’m a sexy lawyer with time on my hands. I love to game and play ping pong, but the real reason you should swipe to the right is because I look great in a suit. Who doesn’t love a man in a suit? Just saying. Send me a message if you think it’s time to take your dating life to the next level.",
    "job": "Lawyer",
    "interests":
    [
        "Gaming",
        "Ping Pong",
        "Tailored Suits",
        "Weddings",
        "Streaming Services"
    ],
    "qna1":
    {
        "question": "The first item on my bucket list is",
        "answer": "simulate space but stay alive for as long as possible"
    },
    "qna2":
    {
        "question": "People would describe me as",
        "answer": "easy-going, a little nerdy but with a mature essence"
    }
}
"""


================================================
FILE: examples/llamacpp_example.py
================================================
from enum import Enum

from pydantic import BaseModel, constr
from llama_cpp import Llama

import outlines


# Weapons a character may carry; each member's value equals its name.
class Weapon(str, Enum):
    sword = "sword"
    axe = "axe"
    mace = "mace"
    spear = "spear"
    bow = "bow"
    crossbow = "crossbow"


# Armor types a character may wear; each member's value equals its name.
class Armor(str, Enum):
    leather = "leather"
    chainmail = "chainmail"
    plate = "plate"


# Output structure requested from the model: an RPG character sheet.
class Character(BaseModel):
    name: constr(max_length=10)  # at most 10 characters
    age: int
    armor: Armor
    weapon: Weapon
    strength: int


if __name__ == "__main__":
    # The weights must be downloaded next to this script first:
    # curl -L -o mistral-7b-instruct-v0.2.Q5_K_M.gguf https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q5_K_M.gguf
    llama = Llama("./mistral-7b-instruct-v0.2.Q5_K_M.gguf")
    model = outlines.from_llamacpp(llama)

    # Generator whose output type is the `Character` model
    generator = outlines.Generator(model, Character)

    prompt = "Instruct: You are a leading role play gamer. You have seen thousands of different characters and their attributes.\nPlease return a JSON object with common attributes of an RPG character. Give me a character description\nOutput:"

    # Draw a single sample with a fixed seed
    sequence = generator(prompt, seed=789005, max_tokens=512)
    print(sequence)


================================================
FILE: examples/llamacpp_processor.py
================================================
from enum import Enum

from llama_cpp import Llama, LogitsProcessorList
from pydantic import BaseModel, constr

from outlines.processors import JSONLogitsProcessor
from outlines.models.llamacpp import LlamaCppTokenizer


# Weapons a character may carry; each member's value equals its name.
class Weapon(str, Enum):
    sword = "sword"
    axe = "axe"
    mace = "mace"
    spear = "spear"
    bow = "bow"
    crossbow = "crossbow"


# Armor types a character may wear; each member's value equals its name.
class Armor(str, Enum):
    leather = "leather"
    chainmail = "chainmail"
    plate = "plate"


# Schema handed to the JSON logits processor: an RPG character sheet.
class Character(BaseModel):
    name: constr(max_length=10)  # at most 10 characters
    age: int
    armor: Armor
    weapon: Weapon
    strength: int


if __name__ == "__main__":
    prompt = "Instruct: You are a leading role play gamer. You have seen thousands of different characters and their attributes.\nPlease return a JSON object with common attributes of an RPG character. Give me a character description\nOutput:"

    llama = Llama("./phi-2.Q4_K_M.gguf")

    # Wrap the Outlines JSON processor so llama.cpp can apply it while sampling.
    character_processor = JSONLogitsProcessor(
        Character,
        LlamaCppTokenizer(llama),
        tensor_library_name="numpy",
    )
    processors = LogitsProcessorList([character_processor])

    completion = llama.create_completion(
        prompt,
        top_k=40,
        top_p=0.95,
        temperature=0.7,
        max_tokens=100,
        logits_processor=processors,
    )

    # Keep only the generated text of the first choice.
    json_str = completion["choices"][0]["text"]
    print(json_str)


================================================
FILE: examples/math_generate_code.py
================================================
"""Example from https://dust.tt/spolu/a/d12ac33169"""

import openai

import outlines
from outlines import Template


examples = [
    {"question": "What is 37593 * 67?", "code": "37593 * 67"},
    {
        "question": "Janet's ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?",
        "code": "(16-3-4)*2",
    },
    {
        "question": "A robe takes 2 bolts of blue fiber and half that much white fiber. How many bolts in total does it take?",
        "code": " 2 + 2/2",
    },
]

question = "Carla is downloading a 200 GB file. She can download 2 GB/minute, but 40% of the way through the download, the download fails. Then Carla has to restart the download from the beginning. How load did it take her to download the file in minutes?"

answer_with_code_prompt = Template.from_string(
    """
    {% for example in examples %}
    QUESTION: {{example.question}}
    CODE: {{example.code}}

    {% endfor %}
    QUESTION: {{question}}
    CODE:"""
)


def execute_code(code):
    """Evaluate ``code`` as a Python expression and return its value.

    NOTE(review): in this script ``code`` is raw model output, and ``eval``
    runs arbitrary expressions. Do not reuse this helper on untrusted input
    without sandboxing.
    """
    return eval(code)


# Ask the model for an arithmetic expression answering `question`, then
# evaluate that expression locally.
model = outlines.from_openai(openai.OpenAI(), "gpt-4o-mini")
prompt = answer_with_code_prompt(question=question, examples=examples)
answer = model(prompt)
result = execute_code(answer)
print(f"It takes Carla {result:.0f} minutes to download the file.")


================================================
FILE: examples/meta_prompting.py
================================================
"""Meta-prompting examples.

References
----------

.. [0] "Prompting is programming: A Query Language for Large Language Models"
       https://arxiv.org/abs/2212.06094
.. [1] "Prompt programming For Large Language Models: Beyond the Few-Shot Paradigm"
       https://arxiv.org/abs/2102.07350.

"""

import argparse

import openai

import outlines
from outlines import Template


client = openai.OpenAI()


def split_into_steps(question, model_name: str):
    """Answer ``question`` in two chained model calls.

    The model is first asked to rephrase the question, then asked a follow-up
    about the rephrased text.

    Parameters
    ----------
    question
        The question to answer.
    model_name
        Name of the OpenAI model to query.

    Returns
    -------
    The full transcript: both prompts and both answers concatenated.
    """
    rephrase = Template.from_string(
        """{{question}}
        Rephrase : : as a true or false statement, identify an Object, relationship and subject
        """
    )
    follow_up = "\n what is the only option that displays the same type of relationship as : :?"

    llm = outlines.from_openai(client, model_name)

    first_prompt = rephrase(question=question)
    rephrased = llm(first_prompt, max_tokens=500)

    second_prompt = first_prompt + rephrased + follow_up
    return second_prompt + llm(second_prompt, max_tokens=500)


def fill_in_the_blanks(question, model_name: str):
    """Let the model state its own goal, then solve the problem.

    The first call completes an open-ended sentence and stops at the first
    period. The second call continues from that sentence.

    Parameters
    ----------
    question
        The question to answer.
    model_name
        Name of the OpenAI model to query.

    Returns
    -------
    The second prompt followed by the model's answer to it.
    """
    goal_template = Template.from_string(
        """{{question}}

        In order to solve this problem, we will analyze each of the options and determine
        """
    )

    begin_template = Template.from_string("""{{memory}}. Let's begin.""")

    llm = outlines.from_openai(client, model_name)

    goal_prompt = goal_template(question=question)
    goal = llm(goal_prompt, stop=["."])

    solve_prompt = begin_template(memory=goal_prompt + goal)
    solution = llm(solve_prompt, max_tokens=500)
    return solve_prompt + solution


def ask_an_expert(question, model_name: str):
    """Have the model name an expert, then answer the question as that expert.

    Parameters
    ----------
    question
        The question to answer.
    model_name
        Name of the OpenAI model to query.

    Returns
    -------
    The second prompt followed by the model's answer to it.
    """
    expert_template = Template.from_string(
        """
        {{question}}
        I entered my question into the Expert Generator \
        and waited. The Expert Generator will render a \
        simulation of an expert to answer my question. \
        The expert could be anyone, dead or alive, real \
        or fictional; the machine will find the person \
        most qualified to answer the question. For this \
        question in particular, the expert must be someone \
        who has thought a lot about the problem of \
        artificial intelligence and its alignment. \
        The Expert Generator beeped, indicating that it has \
        found the most qualified expert. The name displayed \
        on the screen: "
        """
    )

    answer_template = Template.from_string(
        """
        {{memory}}".
        I am ready to ask my question.
        "{{expert}}" I say,
        {{question}}
        """
    )

    llm = outlines.from_openai(client, model_name)

    # The first prompt ends on an opening quote, so stop at the closing quote
    # to keep only the expert's name.
    naming_prompt = expert_template(question=question)
    expert = llm(naming_prompt, stop=['"'])

    asking_prompt = answer_template(
        question=question,
        expert=expert,
        memory=naming_prompt + expert,
    )
    reply = llm(asking_prompt, max_tokens=500)
    return asking_prompt + reply


def ask_an_expert_simple(question, model_name: str):
    """Shorter variant of the expert prompt, phrased as a Q/A exchange.

    Parameters
    ----------
    question
        The question to answer.
    model_name
        Name of the OpenAI model to query.

    Returns
    -------
    The second prompt followed by the model's answer to it.
    """
    expert_template = Template.from_string(
        """
        Q: {{question}}
        A: A good person to answer this question would be
        """
    )

    answer_template = Template.from_string(
        """
        {{memory}}.

        For instance, {{expert}} would answer
        """
    )

    llm = outlines.from_openai(client, model_name)

    # Stop at the end of the line or sentence so only the name is kept.
    naming_prompt = expert_template(question=question)
    expert = llm(naming_prompt, stop=["\n", "."])

    asking_prompt = answer_template(expert=expert, memory=naming_prompt + expert)
    reply = llm(asking_prompt, max_tokens=500)
    return asking_prompt + reply


def run_example(model_fn, question, model_name):
    """Run one prompting strategy and print its transcript under a separator.

    Parameters
    ----------
    model_fn
        Callable taking ``(question, model_name)`` and returning the text to
        print.
    question
        Question forwarded to ``model_fn``.
    model_name
        Model name forwarded to ``model_fn``.
    """
    transcript = model_fn(question, model_name)
    print("\n-----------------------", f"{transcript}", sep="\n")


if __name__ == "__main__":
    cli = argparse.ArgumentParser(description="Run the Meta Prompting examples")
    cli.add_argument(
        "--model",
        type=str,
        default="gpt-4o-mini",
        help="The Large Language Model to use to run the examples.",
    )
    model_name = cli.parse_args().model

    math_q = "f(x) = x*x. What is f(f(3))?"
    sat_q = """

BRAGGART :: MODESTY
A) FLEDGLING : EXPERIENCE
B) EMBEZZLER : GREED
C) WALLFLOWER : TIMIDITY
D) INVALID : MALADY
E) CANDIDATE : AMBITION

    """
    alignment_q = "What should humankind do to ensure that artificial general intelligence is aligned?"
    meaning_q = "What is the meaning of life?"

    # Each entry pairs a prompting strategy with the question it is run on.
    runs = [
        (split_into_steps, math_q),
        (split_into_steps, sat_q.lower()),  # gpt>3.5 usually gets this one right
        (fill_in_the_blanks, sat_q),
        (ask_an_expert, alignment_q),
        (ask_an_expert_simple, meaning_q),
    ]
    for strategy, strategy_question in runs:
        run_example(strategy, strategy_question, model_name)


================================================
FILE: examples/modal_example.py
================================================
import modal

# Modal application on which the function and entrypoint below are registered.
app = modal.App(name="outlines-app")


# Container image: Debian slim with Python 3.11 and pinned Python dependencies.
outlines_image = modal.Image.debian_slim(python_version="3.11").pip_install(
    "outlines==1.0.0",
    "transformers==4.38.2",
    "datasets==2.18.0",
    "accelerate==0.27.2",
)


def import_model():
    """Load the tokenizer and the model once, discarding both objects.

    NOTE(review): presumably called for its side effect of filling the
    Hugging Face cache inside the image. Confirm against the Modal docs.
    """
    from transformers import AutoModelForCausalLM, AutoTokenizer

    checkpoint = "mistralai/Mistral-7B-Instruct-v0.2"
    for loader in (AutoTokenizer, AutoModelForCausalLM):
        loader.from_pretrained(checkpoint)


outlines_image = outlines_image.run_function(import_model)


schema = """{
    "title": "Character",
    "type": "object",
    "properties": {
        "name": {
            "title": "Name",
            "maxLength": 10,
            "type": "string"
        },
        "age": {
            "title": "Age",
            "type": "integer"
        },
        "armor": {"$ref": "#/definitions/Armor"},
        "weapon": {"$ref": "#/definitions/Weapon"},
        "strength": {
            "title": "Strength",
            "type": "integer"
        }
    },
    "required": ["name", "age", "armor", "weapon", "strength"],
    "definitions": {
        "Armor": {
            "title": "Armor",
            "description": "An enumeration.",
            "enum": ["leather", "chainmail", "plate"],
            "type": "string"
        },
        "Weapon": {
            "title": "Weapon",
            "description": "An enumeration.",
            "enum": ["sword", "axe", "mace", "spear", "bow", "crossbow"],
            "type": "string"
        }
    }
}"""


@app.function(image=outlines_image, gpu="A100-40GB")
def generate(
    prompt: str = "Amiri, a 53 year old warrior woman with a sword and leather armor.",
):
    """Generate a character constrained by the module-level JSON ``schema``.

    Parameters
    ----------
    prompt
        Short description of the character, inserted into the instruction
        sent to the model.
    """
    import outlines
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_id = "mistralai/Mistral-7B-Instruct-v0.2"
    model = outlines.from_transformers(
        # `from_pretrained` has no `device` argument; GPU placement is
        # requested with `device_map`.
        AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda"),
        # Passed positionally (model first, tokenizer second), as in the
        # other examples, so the call does not depend on parameter names.
        AutoTokenizer.from_pretrained(model_id),
    )

    character = model(
        f"<s>[INST]Give me a character description. Describe {prompt}.[/INST]",
        outlines.json_schema(schema),
    )

    print(character)


@app.local_entrypoint()
def main(
    prompt: str = "Amiri, a 53 year old warrior woman with a sword and leather armor.",
):
    # Forward the prompt to `generate` through its `.remote` call.
    generate.remote(prompt)


================================================
FILE: examples/pick_odd_one_out.py
================================================
"""Chain-of-thought prompting for Odd one out classification.

Example taken from the LMQL library [1]_.

References
----------
.. [1] Beurer-Kellner, L., Fischer, M., & Vechev, M. (2022).
       Prompting Is Programming: A Query Language For Large Language Models.
       arXiv preprint arXiv:2212.06094.

"""

import json

import openai

import outlines
from outlines import Generator
from outlines.types import JsonSchema


options = ["sea", "mountains", "plains", "sock"]

# Output type: a JSON object whose "result" must be one of `options`.
result_property = {"type": "string", "enum": options}
options_schema = JsonSchema(
    {
        "type": "object",
        "properties": {"result": result_property},
        "required": ["result"],
    }
)

build_ooo_prompt = outlines.Template.from_file("prompts/pick_odd_one_out.txt")

model = outlines.from_openai(openai.OpenAI(), "gpt-4o-mini")
gen_text = Generator(model)
gen_choice = Generator(model, options_schema)

# Step 1: free-text reasoning, cut off where the few-shot pattern would
# start a new question or state the conclusion.
prompt = build_ooo_prompt(options=options)
reasoning = gen_text(prompt, stop=["Pick the odd word", "So the odd one"])
prompt += reasoning

# Step 2: the constrained choice, returned as a JSON string.
result = json.loads(gen_choice(prompt))["result"]
prompt += result
print(result)


================================================
FILE: examples/prompts/babyagi_create_task.txt
================================================
Objective: {{ objective }}
Current Task: {{ task }}
Result: {{ result }}
Previous Tasks: {{ previous_tasks }}

Based on the result, create a list of new tasks that will help achieve the objective.
Please provide the tasks in the following format:
1. [Task description]
2. [Task description]


================================================
FILE: examples/prompts/babyagi_perform_task.txt
================================================
Objective: {{ objective }}
Task: {{ task }}

Please perform the task and provide a concise result in the following format:
Result: [Your concise result here]


================================================
FILE: examples/prompts/babyagi_prioritize_task.txt
================================================
Tasks: {{ tasks }}
Next Task ID: {{ next_task_id }}

Please prioritize the tasks based on their importance and urgency to achieve the objective.
Provide the prioritized tasks in the following format:
1. [Task ID]. [Task description]
2. [Task ID]. [Task description]


================================================
FILE: examples/prompts/dating_profile.txt
================================================
You are a world-renowned matchmaker who understands the modern dating market. Your job is to generate dating app profiles for male clients interested in women based on a provided description. The profiles should be authentic, show off their strengths, and maximize their likelihood of getting matches on dating apps.
Here are some examples of past clients that you have successfully created profiles for:
{% for example in examples %}
Description:
{{ example.description }}
Profile:
{{ example.profile }}
{% endfor %}
Here is the new client who you need to create a profile for:
Description: {{ description }}
Profile:


================================================
FILE: examples/prompts/pick_odd_one_out.txt
================================================
Pick the odd word out: skirt, dress, pen, jacket.
skirt is clothing, dress is clothing, pen is an object, jacket is clothing.
So the odd one is pen.

Pick the odd word out: Spain, France, German, England, Singapore.
Spain is a country, France is a country, German is a language, ...
So the odd one is German.

Pick the odd word out: {{ options | join(", ") }}.


================================================
FILE: examples/prompts/self_consistency.txt
================================================
{% for example in examples %}
Q: {{ example.question }}
A: {{ example.answer }}
{% endfor %}
Q: {{ question }}
A:


================================================
FILE: examples/react.py
================================================
"""ReAct

This example was inspired by the LMQL library [1]_. The ReAct framework was
first developed in [2]_ and augments Chain-of-Thought prompting with the ability
for the model to query external sources.

References
----------
.. [1] Beurer-Kellner, L., Fischer, M., & Vechev, M. (2022). Prompting Is Programming: A Query Language For Large Language Models. arXiv preprint arXiv:2212.06094.
.. [2] Yao, S., Zhao, J., Yu, D., Du, N., Shafran, I., Narasimhan, K., & Cao, Y. (2022). React: Synergizing reasoning and acting in language models. arXiv preprint arXiv:2210.03629.

"""

import json

import requests  # type: ignore
from openai import OpenAI

import outlines
from outlines import Generator, Template
from outlines.types import JsonSchema


# Few-shot prompt: one worked Tho/Act/Obs trace followed by the new question.
build_reAct_prompt = Template.from_string(
    """What is the elevation range for the area that the eastern sector of the Colorado orogeny extends into?
Tho 1: I need to search Colorado orogeny, find the area that the eastern sector of the Colorado ...
Act 2: Search 'Colorado orogeny'
Obs 2: The Colorado orogeny was an episode of mountain building (an orogeny) ...
Tho 3: It does not mention the eastern sector. So I need to look up eastern sector.
...
Tho 4: High Plains rise in elevation from around 1,800 to 7,000 ft, so the answer is 1,800 to 7,000 ft.
Act 5: Finish '1,800 to 7,000 ft'
{{ question }}
"""
)


# Appends one "<mode> <i>: <result>" line to the running prompt.
add_mode = Template.from_string(
    """{{ prompt }}
{{ mode }} {{ i }}: {{ result }}
"""
)


def search_wikipedia(query: str, timeout: float = 10.0) -> str:
    """Return the start of the introduction of a Wikipedia article.

    Calls the MediaWiki ``query`` API with ``prop=extracts`` for the page
    titled ``query`` (redirects are followed) and keeps the first two
    period-delimited segments of the plain-text introduction.

    Parameters
    ----------
    query
        Title of the Wikipedia page to look up. It is URL-encoded by
        ``requests``, so titles containing ``&``, ``#``, ``+`` or non-ASCII
        characters are sent intact.
    timeout
        Number of seconds to wait for the HTTP response.

    Returns
    -------
    str
        The first two period-delimited segments of the extract joined by a
        period, or an empty string when the API returns no extract for the
        title (for instance because the page does not exist).

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.

    """
    params = {
        "format": "json",
        "action": "query",
        "prop": "extracts",
        # MediaWiki treats boolean parameters as true whenever they are
        # present, whatever their value.
        "exintro": 1,
        "explaintext": 1,
        "redirects": 1,
        "titles": query,
        "origin": "*",
    }
    # Let `requests` build the query string so that `query` is escaped
    # instead of being spliced verbatim into the URL.
    response = requests.get(
        "https://en.wikipedia.org/w/api.php", params=params, timeout=timeout
    )
    response.raise_for_status()

    pages = response.json()["query"]["pages"]
    # Only one title is requested, so only the first returned page matters.
    first_page = next(iter(pages.values()), {})
    # Missing pages carry no "extract" key; fall back to an empty summary.
    extract = first_page.get("extract", "")
    return ".".join(extract.split(".")[:2])


# Render the few-shot ReAct prompt for the question the agent has to answer.
# (The spelling of "headquartered" was corrected in the question text.)
prompt = build_reAct_prompt(question="Where is Apple Computers headquartered? ")
# Wrap an OpenAI client (configured from the environment) as an Outlines model.
model = outlines.from_openai(OpenAI(), "gpt-4o-mini")

# Define JSON schemas for mode and action.
# Both describe an object with a single required "result" string restricted
# to an enum: the step type ("Tho" or "Act") for `mode_schema`, and the action
# name ("Search" or "Finish") for `action_schema`.
mode_schema = JsonSchema({
    "type": "object",
    "properties": {
        "result": {
            "type": "string",
            "enum": ["Tho", "Act"]
        }
    },
    "required": ["result"]
})
action_schema = JsonSchema({
    "type": "object",
    "properties": {
        "result": {
            "type": "string",
            "enum": ["Search", "Finish"]
        }
    },
    "required": ["result"]
})

# One generator per kind of output: two built with a schema (their output is
# parsed as JSON in the loop below) and one without, used for free text.
mode_generator = Generator(model, mode_schema)
action_generator = Generator(model, action_schema)
text_generator = Generator(model)

# Run at most 9 ReAct steps. Each iteration first asks the model which kind
# of step comes next, then generates the content of that step and appends it
# to the running prompt.
for i in range(1, 10):
    mode_output = mode_generator(prompt, max_tokens=128)
    mode = json.loads(mode_output)["result"]  # Extract the result from the JSON output
    # Open a new "<mode> <i>: " line with an empty body for the model to fill.
    prompt = add_mode(i=i, mode=mode, result="", prompt=prompt)

    if mode == "Tho":
        # Thought step: free text, stopped at the first newline.
        thought = text_generator(prompt, stop="\n", max_tokens=128)
        prompt += f"{thought}"
    elif mode == "Act":
        # Action step: choose the action name, then generate its quoted argument.
        action_output = action_generator(prompt, max_tokens=128)
        action = json.loads(action_output)["result"]  # Extract the result from the JSON output
        prompt += f"{action} '"

        # Generation stops at the closing quote of the argument.
        subject = text_generator(prompt, stop=["'"], max_tokens=128)
        # Keep only the first two words of the argument; presumably this turns
        # an output such as "Apple Computers headquartered" into
        # "Apple Computers" so it works as a page title.
        subject = " ".join(subject.split()[:2])
        prompt += f"{subject}'"

        if action == "Search":
            # Feed the Wikipedia summary back as an "Obs" line that reuses the
            # step number of the action that triggered it.
            result = search_wikipedia(subject)
            prompt = add_mode(i=i, mode="Obs", result=result, prompt=prompt)
        else:
            # Any other action ("Finish" per the schema) ends the loop.
            break

# Show the full transcript: few-shot example, question and generated steps.
print(prompt)


================================================
FILE: examples/sampling.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "62129e1a-e9de-454e-a714-35ccbcf0b518",
   "metadata": {},
   "outputs": [],
   "source": [
    "#OK\n",
    "import functools as ft\n",
    "import re\n",
    "\n",
    "import numpy as np\n",
    "import matplotlib.pylab as plt\n",
    "import openai\n",
    "\n",
    "import outlines"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "b20aafe8-b7a3-4df4-878f-b48b74e131df",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "env: OPENAI_API_KEY=# you key here\n"
     ]
    }
   ],
   "source": [
    "%env OPENAI_API_KEY= # you key here"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2a3514d6-d5d7-46e9-9b69-1251d337e094",
   "metadata": {},
   "source": [
    "In this example we will look at completion results for questions similar to those in the GSM8K dataset, using few-shot prompts with 8 examples. We first use `outlines.Template` to build the few-shot prompt. Outlines uses the Jinja2 templating engine to render the template when it is called with the variables' values; it thus allows you to build complex prompts very easily."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "ffe8bb11-6b51-4fe7-bfb3-c62556a60db8",
   "metadata": {},
   "outputs": [],
   "source": [
    "examples = [\n",
    "    {\n",
    "        \"question\": \"There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?\",\n",
    "        \"answer\": \"We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.\",\n",
    "    },\n",
    "    {\n",
    "        \"question\": \"If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\",\n",
    "        \"answer\": \"There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.\",\n",
    "    },\n",
    "    {\n",
    "        \"question\": \"Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?\",\n",
    "        \"answer\": \"Leah had 32 chocolates and Leah’s sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.\",\n",
    "    },\n",
    "    {\n",
    "        \"question\": \"Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?\",\n",
    "        \"answer\": \"Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.\",\n",
    "    },\n",
    "    {\n",
    "        \"question\": \"Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?\",\n",
    "        \"answer\": \"He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.\",\n",
    "    },\n",
    "    {\n",
    "        \"question\": \"There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?\",\n",
    "        \"answer\": \"There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.\",\n",
    "    },\n",
    "    {\n",
    "        \"question\": \"Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?\",\n",
    "        \"answer\": \"Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.\",\n",
    "    },\n",
    "    {\n",
    "        \"question\": \"Olivia has $23. She bought five bagels for $3 each. How much money does she have left?\",\n",
    "        \"answer\": \"She bought 5 bagels for $3 each. This means she spent 5\",\n",
    "    },\n",
    "]\n",
    "\n",
    "\n",
    "few_shot_prompt = outlines.Template.from_string(\n",
    "    \"\"\"\n",
    "    {% for example in examples %}\n",
    "    Q: {{ example.question }}\n",
    "    A: {{ example.answer }}\n",
    "    {% endfor %}\n",
    "    Q: {{ question }}\n",
    "    A:\n",
    "    \"\"\"\n",
    ")\n",
    "\n",
    "# Template instances can be partially evaluated because they are callable objects\n",
    "gsm8k_prompt = ft.partial(few_shot_prompt, examples=examples)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1eae0ec8-89f0-43fc-b055-6fcd64cbc03b",
   "metadata": {},
   "source": [
    "## When `gpt-4o-mini` is uncertain"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a273ed78-e813-467e-85f3-16d7f283ba87",
   "metadata": {},
   "source": [
    "Let us now sample 20 completions with the `gpt-4o-mini` model. Outlines is sampling-first, and allows you to easily draw several samples with both OpenAI and `transformers` models:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "beff960d-6833-4f24-af09-5b65886a9549",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = outlines.from_openai(openai.OpenAI(), \"gpt-4o\")\n",
    "\n",
    "question = \"When I was 6, my sister was half the age of my brother. When I was 14, my sister was 3 years younger than my brother. Now I'm 70, how old is my sister now?\"\n",
    "prompt = gsm8k_prompt(question=question)\n",
    "answers = model(prompt, n=20, max_tokens=512)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1a895b6d-d4d4-40f9-9156-24ba7e21cc08",
   "metadata": {},
   "source": [
    "The correct answer to this question is 67. Let us now count the different answers, and take a look at their distribution. Let us first define a few utility functions:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f1c83d1f-a478-4509-890e-b84a2e0d8846",
   "metadata": {},
   "outputs": [],
   "source": [
    "def count_digits(answers):\n",
    "    digits = []\n",
    "    for answer in answers:\n",
    "        try:\n",
    "            match = re.findall(r\"\\d+\", answer)[-1]\n",
    "            if match is not None:\n",
    "                digit = int(match)\n",
    "                digits.append(digit)\n",
    "        except AttributeError:\n",
    "            print(f\"Could not parse the completion: '{answer}'\")\n",
    "\n",
    "    unique_digits, counts = np.unique(digits, return_counts=True)\n",
    "    return {d: c for d, c in zip(unique_digits, counts)}\n",
    "\n",
    "\n",
    "def plot_counts(counts):\n",
    "    fig = plt.figure(figsize=(12, 8))\n",
    "    ax = fig.add_subplot(111)\n",
    "\n",
    "    bar = ax.bar(counts.keys(), counts.values())\n",
    "    ax.spines[[\"right\", \"top\", \"left\"]].set_visible(False)\n",
    "    ax.get_yaxis().set_visible(False)\n",
    "    ax.get_yaxis().set_visible(False)\n",
    "\n",
    "    for rect in bar:\n",
    "        height = rect.get_height()\n",
    "        plt.text(\n",
    "            rect.get_x() + rect.get_width() / 2.0,\n",
    "            height,\n",
    "            f\"{height:.0f}\",\n",
    "            ha=\"center\",\n",
    "            va=\"bottom\",\n",
    "            fontsize=20,\n",
    "        )\n",
    "\n",
    "    ax.set_xticks(list(counts.keys()))\n",
    "    ax.set_xlabel(\"Answer\")\n",
    "\n",
    "\n",
    "def entropy(counts):\n",
    "    counts = np.array(list(counts.values()))\n",
    "    probs = counts / np.sum(counts)\n",
    "    log_probs = np.log(probs)\n",
    "    return -np.sum(probs * log_probs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "88668e09-bcd6-4a6a-83a5-838189b910eb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAqsAAAHgCAYAAACCbCTDAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAAsTAAALEwEAmpwYAAAVwklEQVR4nO3df7DldX3f8dcbFtmVLb9cDDgLrB0DjNWUlWQzWBaDtEJg0BBiJk5VsKLFKRSsU7u2M8wKZouDjsg4Y0chBn9MTfgh3QkapQiCHSJWFigBAlMgggUkJlVJDXXh0z/2ILuwF9LZe+95372Px8yZvff7PeznvcB893m/53vOt8YYAQCAjnaZ9gAAADATsQoAQFtiFQCAtsQqAABtiVUAANoSqwAAtLXkRfb7XCsAAOZDbW+jM6sAALQlVgEAaEusAgvGddddl5NPPjn7779/dt9997ziFa/Icccdl69+9avTHg3YiTn2TNeLXbMK0MIHP/jBXHjhhVm5cmXe/OY3Z8WKFXn88cfzve99LzfccENOOOGEaY8I7IQce6avxnjB91B5gxUwdZ/97Gfz3ve+N6eeemo+85nP5CUveck2+3/+859nt912m9J0wM7KsWfebfcNVmIVaO3JJ5/MgQcemGXLluW+++573l8WAHPBsWcqthurLgMAWrv22mvz+OOP55xzzskuu+ySa665JnfeeWeWLl2aNWvW5Mgjj5z2iMBOyLGnD7EKtPbd7343SbJ06dKsXr06d9555zb7jz766FxxxRXZb7/9pjEesJNy7OnDpwEArf3whz9Mklx44YWpqtx000356U9/mjvuuCNvetObcuONN+atb33rlKcEdjaOPX2IVaC1p59+OkmyZMmSbNy4MUcddVSWL1+e1772tfnKV76SlStX5lvf+lZuvvnmKU8K7Ewce/oQq0Bre++9d5Jk9erVWbVq1Tb7XvrSl+a4445Lktxyyy3zPBmwM3Ps6UOsAq0deuihSZ79i+O59tlnnyTJz372s/kaCVgEHHv6EKtAa8cee2yqKnfdddcvXpbb2jNvenjlK18536MBOzHHnj7EKtDawQcfnJNOOinf//7388lPfnKbfd/4xjfy9a9/PXvvvXeOP/74KU0I7Iwce/pwUwCgvYcffjivf/3r89BDD+XYY4/N6tWr88ADD+Tqq69OVeXLX/5yTjnllGmPCexkHHvmnTtYAQvX448/nvPOOy8bN27MI488kj333DNr167Nhz70oaxZs2ba4wE7KceeeSVWAQBoa7ux6ppVAADaEqsAALQlVgEAaGvJtAcAeDGr1l0z474HLzhxHicBFhPHnh6cWQUAoC2xCgBAW2IVAIC2xCoAAG2JVQAA2hKrAAC0JVYBAGhLrAIA0JZYBQCgLbEKAEBbYhUAgLbEKgAAbYlVAADaEqsAALQlVgEAaEusAgDQllgFAKAtsQoAQFtiFQCAtsQqAABtiVUAANoSqwAAtCVWAQBoS6wCANCWWAUAoC2xCgBAW2IVAIC2xCoAAG2JVQAA2hKrAAC0JVYBAGhLrAIA0JZYBQCgLbEKAEBbYhUAgLbEKgAAbYlVAADaEqsAALQlVgEAaEusAgDQllgFAKAtsQoAQFtiFQCAtsQqAABtiVUAANoSqwAAtCVWAQBoS6wCANCWWAUAoC2xCgBAW2IVAIC2xCoAAG2JVQAA2hKrAAC0JVYBAGhLrAIA0JZYBQCgLbEKAEBbYhUAgLbEKgAAbYlVAADaEqsAALQlVgEAaEusAgDQllgFAKAtsQoAQFtiFQCAtsQqAABtiVUAANoSqwAAtCVWAQBoS6wCANCWWAUAoC2xCgBAW2IVAIC2xCoAAG2JVQAA2hKrAAC0JVYBAGhLrAIA0JZYBQCgLbEKAEBbYhUAgLbEKgAAbYlVAADaEqsAALQlVgEAaEusAgDQllgFAKAtsQoAQFtiFQCAtsQqAABtiVUAANoSqwAA
tCVWAQBoS6wCANCWWAUAoC2xCgBAW2IVAIC2xCoAAG2JVQAA2hKrAAC0JVYBAGhLrAIA0JZYBQCgLbEKAEBbYhUAgLbEKgAAbYlVAADaEqsAALQlVgEAaEusAgDQllgFAKAtsQoAQFtiFQCAtsQqAABtiVUAANoSqwAAtCVWAQBoS6wCAAvaF7/4xVRVqiqXXHLJtMdhlolVAGDBeuihh3LmmWdm+fLl0x6FOSJWAYAFaYyRd73rXXnZy16WM844Y9rjMEfEKgCwIF188cX55je/mc997nPZY489pj0Oc0SsAgALzt13351169bl7LPPztFHHz3tcZhDYhUAWFA2b96cd7zjHTnooIOyYcOGaY/DHFsy7QEAAP5/nHfeedm0aVO+/e1vZ9myZdMehznmzCoAsGB85zvfyYYNG/KBD3wgRx555LTHYR6IVQBgQdi8eXPe+c535pBDDsn5558/7XGYJ2IVAFgQnnjiidx77725++67s3Tp0l/cCKCq8uEPfzhJ8p73vCdVlXPOOWe6wzJrXLMKACwIu+++e9797ndvd9+tt96aTZs25aijjsqhhx7qEoGdiFgFABaEZcuWzXg71fXr12fTpk059dRTc/rpp8/zZMwllwEAANCWWAUAoC2xCgAseOvXr88YwyUAOyGxCgBAW2IVAIC2xCoAAG356CoAYMFYte6aF9z/4AUnztMkzBdnVgEAaEusAgDQllgFAKAtsQoAQFtiFQCAtsQqAABtiVUAANoSqwAAtCVWAQBoS6wCANCWWAUAoC2xCgBAW2IVAIC2xCoAAG2JVQAA2hKrAAC0JVYBAGhLrAIA0JZYBQCgLbEKAEBbYhUAgLbEKgAAbYlVAADaEqsAALQlVgEAaEusAgDQllgFAKAtsQoAQFtiFQCAtsQqAABtiVUAANoSqwAAtCVWAQBoS6wCANCWWAUAoC2xCgBAW2IVAIC2xCoAAG2JVQAA2hKrAAC0JVYBAGhLrAIA0JZYBQCgLbEKAEBbYhUAgLbEKgAAbYlVAADaEqsAALQlVgEAaEusAgDQllgFAKAtsQoAQFtiFQCAtsQqAABtiVUAANoSqwAAtCVWAQBoS6wCANCWWAUAoC2xCgBAW2IVAIC2xCoAAG2JVQAA2hKrAAC0JVYBAGhLrAIA0JZYBQCgLbEKAEBbYhUAgLbEKgAAbYlVAADaEqsAALQlVgEAaEusAgDQllgFAKAtsQoAQFtiFQCAtsQqAABtiVUAANoSqwAAtCVWAQBoS6wCANCWWAUAoC2xCgBAW2IVAIC2xCoAAG2JVQAA2hKrAAC0JVYBAGhLrAIA0JZYBQCgLbEKAEBbYhUAgLbEKgAAbYlVAADaEqsAALQlVgEAaEusAgDQllgFAKAtsQoAQFtiFQCAtsQqAABtiVUAANoSqwAAtCVWAQBoS6wCANCWWAUAoC2xCgBAW2IVAIC2xCoAAG2JVQAA2hKrAAC0JVYBAGhLrAIA0JZYBQCgLbEKAEBbYhUAgLbEKgAAbYlVAADaEqsAAIvEj370o1xyySU5+eST86pXvSrLli3LXnvtlaOOOiqXXnppnn766WmP+DxLpj0AAADz4/LLL8/73ve+HHDAATnmmGNy0EEH5bHHHstVV12V008/PV/72tdy+eWXp6qmPeoviFUAgEXikEMOycaNG3PiiSdml12efYF9w4YNWbNmTa688spcddVVOeWUU6Y45bZcBgAAsEi88Y1vzEknnbRNqCbJ/vvvnzPOOCNJcsMNN0xhspmJVQAAsttuuyVJlizp9cK7WAUAWOQ2b96cz3/+80mS448/fsrTbEusAgAscuvWrcudd96ZE044Iccdd9y0x9mGWAUAWMQuvvjifPzjH89hhx2WL3zhC9Me53nEKgDAIvWpT30qZ599dl796lfn+uuvz7777jvtkZ5HrAIALEIXXXRRzjrrrLzmNa/J9ddfn/3333/aI22XWAUAWGQ++tGP5v3vf38OP/zwXH/9
9Xn5y18+7ZFmJFYBABaR888/P+vWrcsRRxyR6667LitWrJj2SC+o1wdpAQAwZy677LKce+652XXXXbN27dpcfPHFz3vOqlWrctppp83/cDMQqwAAi8QDDzyQJHnqqady0UUXbfc5b3jDG1rFqssAAAAWifXr12eM8YIPt1sFAIC/J7EKAEBbYhUAgLa8wQoAYBFZte6aGfc9eMGJ8zjJ348zqwAAtCVWAQBoS6wCANCWWAUAoC2xCgBAW2IVAIC2xCoAAG2JVQAA2hKrAAC0JVYBAGhLrAIA0JZYBQCgLbEKAEBbYhUAgLbEKgAAbYlVAADaEqsAALQlVgEAaEusAgDQllgFAKAtsQoAQFtiFQCAtsQqAABtiVUAANoSqwAAtCVWAQBoS6wCANCWWAUAoC2xCgBAW2IVAIC2xCoAAG2JVQAA2hKrAAC0JVYBAGhLrAIA0JZYBQCgLbEKAEBbYhUAgLbEKgAAbYlVAADaEqsAALQlVgEAaEusAgDQllgFAKAtsQoAQFtiFQCAtsQqAABtiVUAANoSqwAAtCVWAQBoq2WsXnHFFTnrrLOydu3a7LnnnqmqvP3tb5/2WAAAc0L7zGzJtAfYno985CO5/fbbs3z58qxcuTL33HPPtEcCAJgz2mdmLc+sfuITn8i9996bn/zkJ/n0pz897XEAAOaU9plZyzOrxxxzzLRHAACYN9pnZi3PrAIAQCJWAQBoTKwCANCWWAUAoC2xCgBAW2IVAIC2xCoAAG2JVQAA2mp5U4Crr746V199dZLk0UcfTZLcfPPNOe2005IkK1asyMc+9rEpTQcAMLu0z8xaxuptt92Wyy67bJtt999/f+6///4kycEHH7xo/4MBADsf7TOzlpcBrF+/PmOMGR8PPvjgtEcEAJg12mdmLWMVAAASsQoAQGNiFQCAtlq+wSpJVq27ZsZ9D15w4jxOAgAwt3TPzJxZBQCgLbEKAEBbYhUAgLbEKgAAbYlVAADaEqsAALQlVgEAaEusAgDQllgFAKAtsQoAQFtiFQCAtsQqAABtiVUAANoSqwAAtCVWAQBoS6wCANCWWAUAoC2xCgBAW2IVAIC2xCoAAG2JVQAA2hKrAAC0JVYBAGhLrAIA0JZYBQCgLbEKAEBbYhUAgLbEKgAAbYlVAADaEqsAALQlVgEAaEusAgDQllgFAKAtsQoAQFtiFQCAtsQqAABtiVUAANoSqwAAtCVWAQBoS6wCANCWWAUAoC2xCgBAW2IVAIC2xCoAAG2JVQAA2hKrAAC0JVYBAGhLrAIA0JZYBQCgLbEKAEBbYhUAgLbEKgAAbYlVAADaEqsAALQlVgEAaEusAgDQllgFAKAtsQoAQFtiFQCAtsQqAABtiVUAANoSqwAAtCVWAQBoS6wCANCWWAUAoC2xCgBAW2IVAIC2xCoAAG3VGGPmnVV/mmTF/I0zoxVJ/mraQwBtOCYAz5jP44G15tZfjTGOf+7GF4zVLqrqv48xfnXacwA9OCYAz5jP44G1psNlAAAAtCVWAQBoa6HE6memPQDQimMC8Iz5PB5YawoWxDWrAAAsTgvlzCoAAItQ+1itqr2r6oqquqeq7q6qI6c9EzA/qmppVd1SVbdX1Z9X1Ycn2/+wqh6oqtsmj8OnPCowD16oCarqA1U1qmpWPnJze2tV1R9tddx5sKpum4V1Dt3q97ytqn5SVedU1b5VdW1V3Tf5dZ85XOv8qrpjsu0bVfWKHV1rNrW/DKCqLkty0xjjkqp6SZKXjjH+95THAuZBVVWSPcYYT1TVbkm+neTsJGck+ZMxxhVTHRCYVzM1QVUdmOSSJIclOWKMscOfGfpi/VFVH0/y4zHGeTu61la/565JfpDk15P8qyR/Pca4oKrWJdlnjPHv5mitvxlj/GSy/V8nefUY44zZWmtHtT6zWlV7JTk6yaVJMsb4v0IVFo+xxROTb3ebPHr/hA3MiRdpgk8k+WBm6fjwYv0x+UH6d5P859lYbyvHJvmf
Y4y/TPKWJJdNtl+W5Lfmaq1nQnVijzQ7zraO1SSvTPJ4ks9V1aaquqSq9pj2UMD8qapdJy+1/TDJtWOM70x2/f7kZatPVNXu05sQmCfbbYKqekuSH4wxbp/rtbbavzbJY2OM+2ZxzST5vTwbwL80xnhk8vWjSX5pDtdKVf1+VT2U5J8nOXeW19oh3WN1SZLXJfn0GGN1kr9Nsm66IwHzaYzx1Bjj8CQrk6ypqtck+VC2vNz3a0n2TTJrL40BbW2vCdYn+feZ/bh6sf54W2b5rOrkUoM3J7n8ufvGlms2Z+1s5/bWGmP8hzHGgUm+lOTM2VprNnSP1YeTPLzVmZQrsuV/HmCRmbwEd32S48cYj0wuEXgyyeeSrJnqcMB8mKkJXpnk9qp6MFt+qL21qvafo7VSVUuS/HaSP9rBNZ7rN5PcOsZ4bPL9Y1V1wGTNA7Ll1aW5WmtrX0pyyiyutcNax+oY49EkD1XVoZNNxya5a4ojAfOoqvarqr0nXy9L8s+S3LPVAbyy5TquO6c1IzA/ZmiCW8cYLx9jrBpjrMqWyHzd5LmzvdYz/fFPk9wzxnh4R9bYjueerd2Y5NTJ16cm+S9ztVZV/fJW+96S5J5ZXGuHLYRPAzg8W97h95Ik9yd51xjjb6Y6FDAvqupXsuWNBbtmyw/XfzzGOK+qvplkvySV5LYkZ2z1RixgJ/ViTTA5u/qrs/RpANtdq6r+MMmfjTH+046usdVaeyT5fpJ/OMb48WTby5L8cZKDkvxlkt8dY/z1HK11ZZJDkzw9WeuMMcYPdnSt2dI+VgEAWLxaXwYAAMDiJlYBAGhLrAIA0JZYBQCgLbEKAEBbYhXgOarqt6pqVNVh054FYLETqwDP97Yk3578OhWTu+QALHpiFWArVbU8yVFJ3p3k9ybbfqOqbqiqK6rqnqr60uTuWamqC6rqrqq6o6o+VlW7VtUDtcXeVfVUVR09ee6NVfXLVbVHVf1BVd1SVZuq6i2T/adV1cbJTQ+um86/AYBe/OQOsK23JPnTMca9VfWjqjpisn11kn+U5H8l+W9J/klV3Z3k5CSHjTFGVe09xniqqv4iyauz5Z7ltyZZW1XfSXLgGOO+qtqQ5JtjjH8xuZ3sLVX1XyfrvC7Jr8zGnWoAdgbOrAJs621Jvjz5+st59lKAW8YYD48xns6WW7yuSvLjJH+X5NKq+u0k/2fy3JuSHD15/MdsOVP7a0m+O9n/piTrquq2JDckWZott1RMkmuFKsCznFkFmKiqfZO8Mclrq2ok2TXJSHJNkie3eupTSZaMMTZX1Zokxyb5nSRnTv75G5O8L8krkpyb5N8m+Y1sidgkqSSnjDH+4jnr/3qSv52TPxzAAuXMKsCzfifJF8YYB48xVo0xDkzyQJK123vy5PrWvcYYX03y/iT/eLLrliSvT/L0GOPvsuVM7L/MlohNkq8nOWur615Xz9GfB2DBE6sAz3pbkq88Z9uVmflTAf5Bkj+pqjuy5dMD/k2SjDGeTPJQkj+bPO+myXP/x+T785PsluSOqvrzyfcAbEeNMaY9AwAAbJczqwAAtCVWAQBoS6wCANCWWAUAoC2xCgBAW2IVAIC2xCoAAG2JVQAA2vp/jdj4sUZoV2sAAAAASUVORK5CYII=",
      "text/plain": [
       "<Figure size 864x576 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "counts = count_digits(answers)\n",
    "plot_counts(counts)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "661a1135-ac2d-4a49-a786-d04a7ba68b48",
   "metadata": {},
   "source": [
    "We see that there is considerable variability in the answers given by `gpt-4o-mini`. Depending on the number of samples taken, even self-consistency sampling may lead to the wrong result here."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "30ea0dfe-6c15-44f0-881c-88b325542b44",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Entropy: 1.5741030017371853\n"
     ]
    }
   ],
   "source": [
    "print(f\"Entropy: {entropy(counts)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0b15b230-b667-4c9c-8a5d-366dd61de9b7",
   "metadata": {},
   "source": [
    "## `gpt-4o-mini` on an easier question"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "beae30f0-4168-4a80-90d4-d26a4f476469",
   "metadata": {},
   "source": [
    "Let us now look at the results for an arguably easier question:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "7e106b94-2dfd-4a75-b4d9-b1ad693418a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "question = \"When I was 6 my sister was half my age. Now I’m 70 how old is my sister?\"\n",
    "prompt = gsm8k_prompt(question)\n",
    "answers = model(question, samples=20, max_tokens=512)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "dd46fb2b-08ef-4003-8d03-ea0f39c865c4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Entropy: 0.1985152433458726\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAqsAAAHgCAYAAACCbCTDAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAAsTAAALEwEAmpwYAAAQuklEQVR4nO3db6ye9V3H8c+PPxGxIjMVghSLkCYgOtmUzSCd6xii8mCIyxBCTBUTWbBmIzEhWeIAUfegSYkha6JIQsmSRaoclnUbmeCyQbrBoi2GrtSlEIZKxnQOCmPB9vLBuaH8aUui4dyfwuuVnPQ+13Wd3N/TR+/++ruva0zTFAAAaHTEvAcAAICDEasAANQSqwAA1BKrAADUEqsAANQSqwAA1Drqdc67rxUAAEthHOiglVUAAGqJVQAAaolVAIA3sc2bN2fdunVZvXp1jjvuuIwxcsUVVxz0+meeeSYf+9jHcsYZZ+SYY47J2972tlx44YW55557lnDq/V5vzyoAAIexG2+8Mdu3b8+yZcuyYsWK7Ny586DXfve73815552XHTt25KyzzspVV12VPXv25K677sr73//+3HLLLbnyyiuXcHorqwAAb2obNmzIrl278vTTT2fjxo2HvPa6667Ljh07cskll2Tbtm256aabcsstt+Thhx/OKaecknXr1uWJJ55YoskXiVUAgDexNWvWZNWqVRnjgB+2f4U777wzSXLDDTfkqKP2/wf8CSeckGuuuSbf//73c+utt75hsx6IWAUAIEny5JNPJklOO+2015x78dhS710VqwAAJEmWL1+eJHn00Udfc2737t1JkkceeWRJZxKrAAAkSS666KIkycc//vHs3bv3peNPPfVUNmzYkGTxQ1hLyd0AAABIsrhX9e67787mzZtz9tln5/zzz8+zzz6bu+66KyeffHIef/zxHHHE0q51WlkFACBJctJJJ+XBBx/M1VdfnWeeeSaf/OQns2XLllx66aW54447kix+2GopWVkFAOAlJ554Ym6++ebcfPPNrzh+7733JknOOeecJZ3HyioAAK9r06ZNSZLLL798Sd9XrAIAkCTZt29f9uzZ85rjt99+ezZt2pRzzz03F1988ZLOZBsAAMCb2MLCQhYWFpLsv4/q1q1bs3bt2iSLt6tav359kuS5557LiSeemAsuuCCnn356jjjiiNx///3ZunVrzjzzzNxxxx1L/gGrMU3Toc4f8iQAAN2uu+66XH/99Qc9v3Llyjz22GNJkhdeeCFXXXVV7rvvvpceq7pq1ap86EMfykc+8pEce+yxb+SoB3zEllgFAKDBAWPVnlUAAGqJVQAAaolVAABqiVUAAHLqtVvmPcIBiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABq
iVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABqiVUAAGrVxOrmzZuzbt26rF69Oscdd1zGGLniiivmPRYAAHN01LwHeNGNN96Y7du3Z9myZVmxYkV27tw575EAAJizmpXVDRs2ZNeuXXn66aezcePGeY8DAECBmpXVNWvWzHsEAADK1Kys
AgDAq4lVAABqiVUAAGqJVQAAaolVAABqiVUAAGqJVQAAaolVAABq1TwUYGFhIQsLC0mSJ598MkmydevWrF27NkmyfPnyrF+/fk7TAQAwDzWxum3bttx2222vOLZ79+7s3r07SbJy5UqxCgDwFjOmaTrU+UOeBADgzeHUa7fksU9cNM8RxoEO2rMKAEAtsQoAQC2xCgBArdpYPfXaLfMeAQCAOauNVQAAEKsAANQSqwAA1BKrAADUEqsAANQSqwAA1BKrAADUEqsAANQSqwAA1BKrAADUEqsAANQSqwAA1BKrAADUEqsAANQSqwAA1BKrAADUEqsAANQSqwAA1BKrAADUEqsAANQSqwAA1BrTNB385BhfSLJ86cZ5heVJvjOn9wYAeCuaZ399Z5qmX3v1wUPG6jyNMb4+TdMvznsOAIC3isb+sg0AAIBaYhUAgFrNsfpX8x4AAOAtpq6/avesAgBA88oqAABvcXWxOsY4ZozxwBhj+xjj4THG9fOeCQDgcHewxhqL/myMsWuM8Y0xxh+97PhfjjG+OcZ4aIzxznnMfdQ83vR1/CDJ+6Zp2jPGODrJfWOMz0/T9NV5DwYAcBg7YGMlOTPJKUnOmKZp3xjjhNn1v55k1ezr3Uk2zv5cUnWxOi1uot0z+/bo2ZeNtQAA/w+HaKwPJ7l8mqZ9s+u+PbvmA0k2zX7uq2OM48cYJ03T9B9LOXfdNoAkGWMcOcbYluTbSb44TdPX5jwSAMBh7yCNdXqSS8cYXx9jfH6MsWp2+clJvvWyH39idmxJVcbqNE17p2k6O8mKJO8aY/zsnEcCADjsHaSxfijJ87MnV/11klvnOOJrVMbqi6Zp+u8k/5jkNc+JBQDg/+ZVjfVEkr+fnbozydtnr/8ti3tZX7RidmxJ1cXqGOMnxhjHz17/cJILkuyc61AAAIe5QzTWQpI1s8t+Jcmu2evPJPmd2V0BfinJ95Z6v2pS+AGrJCcluW2McWQWY/pvp2n67JxnAgA43B2wscYY9yX51Bjjo1n8ANbvz67/XJLfSPLNJM8l+d05zOwJVgAA9KrbBgAAAC8SqwAA1BKrAADUEqsAANQSqwAA1BKrAK8yxrh4jDGNMc6Y9ywAb3ViFeC1Lkty3+zPuRhjNN4HG2DJiVWAlxljLEtyXpIrk/z27Nh7xxhfGmNsHmPsHGN8aowxZuc+McbYMcZ4aIyxfoxx5Bjj0dkTX44fY+wdY7xndu2Xxxirxhg/Msa4dYzxwBjjn8cYH5idXzvG+MwY494k98znbwCgi3+5A7zSB5J8YZqmXWOM/xxj/MLs+DuSnJXk35Pcn+SXxxjfSPKbSc6YpmkaYxw/TdPeMcYjSX4myU8n+ackq8cYX0tyyjRN/zrG+PMk907T9HuzRx8+MMb4h9n7vDPJ26dp+q+l+oUBmllZBXily5J8evb609m/FeCBaZqemKZpX5JtSU5N8r0kzyf5mzHGJVl8HGGSfCXJe2Zff5HFldpzkjw4O/+rSa4dY2xL8qUkxyT5qdm5LwpVgP2srALMjDF+PMn7kvzcGGNKcmSSKcmWJD942aV7kxw1TdP/jDHeleT8JB9M8oezn/9ykg8n+ckkf5Lkj5O8N4sRmyQjyW9N0/TIq97/3UmefUN+OYDDlJVVgP0+mOT2aZpWTtN06jRNpyR5NMnqA10829/6Y9M0fS7JR5P8/OzUA0nOTbJvmqbns7gS+wdZjNgkuTvJupfte33HG/T7ABz2xCrAfpclufNVx/4uB78rwI8m+ewY46Es3j3gmiSZpukHSb6V5Kuz674yu/ZfZt//aZKjkzw0xnh49j0ABzCmaZr3DAAAcEBWVgEAqCVWAQCoJVYBAKglVgEAqCVWAQCoJVYBAKglVgEAqCVWAQCo9b+lzUDoz9UHogAAAABJRU5ErkJggg==",
      "text/plain": [
       "<Figure size 864x576 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "counts = count_digits(answers)\n",
    "plot_counts(counts)\n",
    "print(f\"Entropy: {entropy(counts)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cf4cacdf-a31d-43bd-8517-eec9f656eee4",
   "metadata": {},
   "source": [
    "The entropy of the results is much lower; we say that the model is more \"certain\" of its answers."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "22f31872-aab7-4a68-b9f2-d335a4f1a875",
   "metadata": {},
   "source": [
    "## How `gpt-4` compares to `gpt-4o-mini`\n",
    "\n",
    "Let us now look at how GPT4 fares on the original question:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "2d5ab5b8-eca5-47f5-a35c-5f3865e35755",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = outlines.from_openai(openai.OpenAI(), \"gpt-4\")\n",
    "\n",
    "question = \"When I was 6, my sister was half the age of my brother. When I was 14, my sister was 3 years younger than my brother. Now I'm 70, how old is my sister now?\"\n",
    "prompt = gsm8k_prompt(question)\n",
    "answers = model(prompt, samples=20, max_tokens=512)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "d316a5f7-cebc-4b09-9b1b-aee219b2f088",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Entropy: -0.0\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAqwAAAHgCAYAAABgsD+6AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAAsTAAALEwEAmpwYAAAQI0lEQVR4nO3dYahf9X3H8c+vua6J3Ui4hKKDOpVMxTmn1Tlwatt0rKKUziZgfJa5B2tg0TkY5EEZLepWYYITwTJ0o5Wi08ypXYdjMxG76OpkLc51jQUdU4ZiHaagREg8e5C/0WhiNWlyP7l9vSDk/s/v/Lnffx6Ed05+95wxTVMAAKDVhxZ6AAAAeC+CFQCAaoIVAIBqghUAgGqCFQCAaoIVAIBqcz9h3T2vAAA4EsaBFlxhBQCgmmAFAKCaYAV4H15++eXcdtttueyyy7Jq1aosW7Ysy5cvzwUXXJDbb789b7zxxn7f9+ijj+aSSy7J/Px8li1bljPPPDM33XRTdu/efYQ/AcDRa/yER7PawwqQ5Ktf/Wo2bNiQ448/Pp/61Kdywgkn5MUXX8y9996bHTt2ZM2aNbnnnnsyxltbsO6///6sWbMmS5cuzeWXX575+fl885vfzPbt27N27drcc889C/iJAOoccA+rYAV4H7Zs2ZJXX301l156aT70obf+c+qFF17Ieeedl+eeey6bN2/OmjVrkiQ//vGPs2rVquzYsSPbtm3LueeemyTZuXNnVq9encceeyx33nln1q1btyCfB6CQH7oCOBSrV6/OZz/72X1iNUmOO+64fOELX0iSPPzww3uPb968OS+99FLWrVu3N1aTZOnSpbnuuuuSJLfeeuvhHxxgERCsAIfomGOOSZLMzb11p8AtW7YkSS6++OJ3nX/RRRfl2GOPzaOPPprXX3/9yAwJcBQTrACHYNeuXfn617+eZN843b59e5LklFNOedd75ubmctJJJ2XXrl155plnjsygAEcxwQpwCDZt2pSnnnoql1xyST7zmc/sPb5jx44kyfLly/f7vjePv/LKK4d9RoCjnWAFOEg333xzbrzxxpx22mm54447FnocgEVLsAIchFtuuSVXX311Tj/99GzdujXz8/P7rL95BfXNK63v9ObxFStWHNY5ARYDwQrwAd10003ZuHFjzjjjjGzdujXHHXfcu8459dRTkyRPP/30u9Z27dqVZ599NnNzczn55JMP+7wARzvBCvAB3HDDDbnmmmty1llnZevWrfnoRz+63/NWr16dJHnwwQfftfbII4/ktddey/nnn58Pf/jDh3VegMVAsAK8T9dee202bdqUc845Jw899FBWrlx5wHPXrl2blStX5q677soTTzyx9/jOnTvzxS9+MUmyYcOGwz4zwGLgSVcA78PXvva1rF+/PkuWLMnGjRv3+9P/J554YtavX7/39X333Ze1a9dm6dKlWbduXebn5/PAAw/sfTTr3Xffvc+jXAF+xnk0K8Ch+NKXvpQvf/nL73nOJz7xiX2edpUk27Zty/XXX5/HHnssO3fuzKpVq3LllVfmqquuypIlSw7jxABHHcEKAEC1AwarPawAAFQTrAAAVBOsAABUm1voAQ7kxE3fWugRAAB+pvz3Vy5d6BH2yxVWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACq
CVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACq
CVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKqNaZoOvDjGg0lWHrlxABaFlUl+tNBDABxlfjRN08X7W3jPYAXggxtjPDFN07kLPQfAYmFLAAAA1QQrAADVBCvAT99fLvQAAIuJPawAAFRzhRUAgGpzCz0AwNFsjLEiyW1JzkgyJbkyyR8mOXV2yookr0zTdNaRnw5gcRCsAIfmL5I8OE3T2jHGzyU5dpqmy99cHGPcmGTHgk0HsAjYwwpwkMYYy5N8L8nJ037+Mh1jjCT/k2T1NE0/PMLjASwa9rACHLyTkryU5K/HGN8dY9w2xvjI29YvTPKiWAU4NIIV4ODNJfl4klunaTo7yatJNr1t/Yokdy7EYACLiWAFOHjPJ3l+mqbvzF5vzp6AzRhjLsnnk/zNAs0GsGgIVoCDNE3TC0meG2O8eUeATyf5/uzr30ryg2manl+Q4QAWEXcJADg0G5N8Y3aHgGeS/O7s+LrYDgDwU+EuAQAAVLMlAACAaoIVAIBqghUAgGqCFQCAaoIVAIBqghXgHcYYvzPGmMYYpy30LAAIVoD9uSLJv8x+XxCzJ2UBEMEKsI8xxs8nuSDJ72XPzf8zxvjkGOPhMcbmMcYPxhjfGGOM2dpXxhjfH2M8Ocb48zHGkjHGs2OPFWOM3WOMi2bnPjLG+OUxxkfGGH81xnh8jPHdMcbnZuvrxxgPjDG2JHloYf4EAPr4FzzAvj6X5MFpmp4eY7w8xjhndvzsJL+S5H+TbEvym2OM/0pyWZLTpmmaxhgrpmnaPcbYnuT0JCcl+fckF44xvpPkY9M0/XCM8adJtkzTdOUYY0WSx8cY/zz7Ph9PcuY0Tf93pD4wQDtXWAH2dUWSu2Zf35W3tgU8Pk3T89M0vZHke0lOTLIjyc4kt48xPp/ktdm5305y0ezXn2XPFdtfT/Jvs/XfTrJpjPG9JA8nWZrkhNnaP4lVgH25wgowM8aYT7I6ya+OMaYkS5JMSb6V5PW3nbo7ydw0TbvGGOcl+XSStUn+YPb+R5JsSPKLSf4kyR8n+WT2hGySjCRrpmna/o7v/xtJXj0sHw7gKOYKK8Bb1ia5Y5qmX5qm6cRpmj6W5NkkF+7v5Nl+1+XTNP1DkmuS/Nps6fEk5yd5Y5qmndlzRfb3sydkk+Qfk2x82z7Ysw/T5wFYFAQrwFuuSPJ37zj2tznw3QJ+IcnfjzGezJ67CvxRkkzT9HqS55L86+y8b8/O/Y/Z62uTHJPkyTHGf85eA3AAY5qmhZ4BAAAOyBVWAACqCVYAAKoJVgAAqglWAACqCVYAAKoJVgAAqglWAACqCVYAAKr9P8bb7HZA9fu3AAAAAElFTkSuQmCC",
      "text/plain": [
       "<Figure size 864x576 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "counts = count_digits(answers)\n",
    "plot_counts(counts)\n",
    "print(f\"Entropy: {entropy(counts)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2f6c8a22-fdf5-4f30-865c-8e11927b1b7c",
   "metadata": {},
   "source": [
    "GPT4 returns the correct answer with certainty."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "50d4a55e-86df-46ab-8b38-302c79bc8add",
   "metadata": {},
   "source": [
    "## Conclusion\n",
    "\n",
    "When generating text completions with a language model we typically look at one output sample, trying to find the \"right\" answer. However, doing so we obscure the diversity of answers that these language models can produce. Assuming the diversity of answers reflects these models' \"uncertainty\", we can use measures such as the entropy of the answers' distribution to evaluate the quality of the answer.\n",
    "\n",
    "Which result should we be choosing once we have different samples? There is no definite answer to this question. The [self-consistency method](https://arxiv.org/abs/2203.11171) consists in choosing the result based on a majority vote. We think this choice is arbitrary and that choosing the correct answer is a [decision theory](https://en.wikipedia.org/wiki/Decision_theory) problem, which can only be solved by specifying a loss function that is adapted to the experiment's context; the majority vote being a particular case with a 0-1 loss."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}


================================================
FILE: examples/self_consistency.py
================================================
import re

import numpy as np
import openai

import outlines
from outlines import Template

# Few-shot exemplars rendered into the prompt. Every answer walks through the
# reasoning and finishes with "The answer is N.", which is the format the
# parsing code further down relies on (it takes the last number in the text).
examples = [
    {
        "question": "There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?",
        "answer": "We start with 15 trees. Later we have 21 trees. The difference must be the number of trees they planted. So, they must have planted 21 - 15 = 6 trees. The answer is 6.",
    },
    {
        "question": "If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?",
        "answer": "There are 3 cars in the parking lot already. 2 more arrive. Now there are 3 + 2 = 5 cars. The answer is 5.",
    },
    {
        "question": "Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?",
        "answer": "Leah had 32 chocolates and Leah’s sister had 42. That means there were originally 32 + 42 = 74 chocolates. 35 have been eaten. So in total they still have 74 - 35 = 39 chocolates. The answer is 39.",
    },
    {
        "question": "Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?",
        "answer": "Jason had 20 lollipops. Since he only has 12 now, he must have given the rest to Denny. The number of lollipops he has given to Denny must have been 20 - 12 = 8 lollipops. The answer is 8.",
    },
    {
        "question": "Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?",
        "answer": "He has 5 toys. He got 2 from mom, so after that he has 5 + 2 = 7 toys. Then he got 2 more from dad, so in total he has 7 + 2 = 9 toys. The answer is 9.",
    },
    {
        "question": "There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?",
        "answer": "There are 4 days from monday to thursday. 5 computers were added each day. That means in total 4 * 5 = 20 computers were added. There were 9 computers in the beginning, so now there are 9 + 20 = 29 computers. The answer is 29.",
    },
    {
        "question": "Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?",
        "answer": "Michael initially had 58 balls. He lost 23 on Tuesday, so after that he has 58 - 23 = 35 balls. On Wednesday he lost 2 more so now he has 35 - 2 = 33 balls. The answer is 33.",
    },
    {
        "question": "Olivia has $23. She bought five bagels for $3 each. How much money does she have left?",
        # This exemplar used to stop at "This means she spent 5", leaving the
        # model with a few-shot answer that never reaches a final result.
        "answer": "She bought 5 bagels for $3 each. This means she spent 5 * $3 = $15 on the bagels. She had $23 in the beginning, so now she has $23 - $15 = $8. The answer is 8.",
    },
]

# The question the model is asked after the few-shot exemplars.
question = "When I was 6 my sister was half my age. Now I’m 70 how old is my sister?"


# Prompt template loaded from a relative path.
# NOTE(review): confirm what the path is resolved against (working directory
# vs. this file's location) before running the script from another folder.
few_shots = Template.from_file("prompts/self_consistency.txt")

# OpenAI-backed model wrapped in an outlines generator; `openai.OpenAI()` is
# created without arguments here.
model = outlines.from_openai(openai.OpenAI(), "gpt-4o-mini")
generator = outlines.Generator(model)
# Render the template with the new question and the exemplars defined above.
prompt = few_shots(question=question, examples=examples)
# NOTE(review): `n=10` is presumably forwarded to the OpenAI API to request
# 10 completions for the same prompt; the code below iterates over `answers`
# as a collection of strings — confirm against the outlines OpenAI model.
answers = generator(prompt, n=10)

# Extract the final number of every completion. The exemplars end with
# "The answer is N.", so the last number in the text is taken as the answer.
digits = []
for answer in answers:
    numbers = re.findall(r"\d+", answer)
    if not numbers:
        # `re.findall` returns an empty list when nothing matches, so indexing
        # it with [-1] would raise IndexError; report the completion and skip it.
        print(f"Could not parse the completion: '{answer}'")
        continue
    digits.append(int(numbers[-1]))

# Tally how many completions produced each distinct answer. `np.unique`
# returns the values sorted, so `results` is ordered by ascending answer.
unique_digits, counts = np.unique(digits, return_counts=True)
results = {int(d): int(c) for d, c in zip(unique_digits, counts)}
print(results)

if results:
    # Majority vote. `max` keeps the first maximal key it meets, so ties are
    # broken in favour of the smallest answer.
    answer_value = max(results, key=results.get)
    max_count = results[answer_value]
    total_count = sum(results.values())
    print(
        f"The most likely answer is {answer_value} ({max_count / total_count * 100}% consensus)"
    )
else:
    # No completion contained a number: there is nothing to vote on.
    print("No completion could be parsed; no consensus answer available.")


================================================
FILE: examples/simulation_based_inference.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "e7c7d0bb-8d45-4139-a584-02c7196db92b",
   "metadata": {},
   "source": [
    "# Find the best few-shot examples using simulation-based inference"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "831a76f5-c569-4174-adab-fb0245877367",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import random\n",
    "import requests\n",
    "import re\n",
    "\n",
    "import openai\n",
    "\n",
    "import outlines\n",
    "\n",
    "random.seed(0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "ec604edc-c8b6-4088-bf17-b77ae57d05a1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "env: OPENAI_API_KEY=# your key here\n"
     ]
    }
   ],
   "source": [
    "%env OPENAI_API_KEY = # your key here"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "aabb4db6-fd94-4c42-ab7f-97c3de45b2cc",
   "metadata": {},
   "source": [
    "In this example we will use GPT-4o mini to solve problems from the GSM-8K dataset. The state-of-the-art performance on this dataset is obtained using few-shot prompting with 5 examples. However, it is not clear how one should select these examples. Here, we will use **simulation-based inference** to try to infer which examples we should be using to get the best out of the model's abilities to solve the problem.\n",
    "\n",
    "Let's start with downloading the dataset:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "367f5f89-8e5d-4381-b9eb-78c60bc50f86",
   "metadata": {},
   "outputs": [],
   "source": [
    "result = requests.get(\n",
    "    \"https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/train.jsonl\"\n",
    ")\n",
    "lines = result.iter_lines()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ef0f7aa9-d528-41e9-8a9d-4497f01f0692",
   "metadata": {},
   "source": [
    "We now divide the train set into two sets:\n",
    "- 10 problems from which we are going to sample 5 examples at random for every inference;\n",
    "- 500 problems which we are going to use to perform inference."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "0667c4a8-cebe-4796-bbc9-575ee9498717",
   "metadata": {},
   "outputs": [],
   "source": [
    "example_set = []\n",
    "for _ in range(10):\n",
    "    line = json.loads(next(lines))\n",
    "    answer = re.findall(r\"\\d+\", line[\"answer\"])[-1]\n",
    "    example_set.append({\"question\": line[\"question\"], \"answer\": answer})\n",
    "\n",
    "train_set = []\n",
    "for _ in range(500):\n",
    "    line = json.loads(next(lines))\n",
    "    answer = re.findall(r\"\\d+\", line[\"answer\"])[-1]\n",
    "    train_set.append({\"question\": line[\"question\"], \"answer\": answer})"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4b52b470-d818-495a-a6e3-e50a1deff13c",
   "metadata": {},
   "source": [
    "Now let's define the prompt, the model, and the sampling loop. The sampling loop consists in choosing 5 examples at random, sampling 20 model answers; if the answer is correct we keep the example ids as samples, otherwise continue:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "9fbebaa9-f05e-4c6b-8875-73a08273bbb5",
   "metadata": {},
   "outputs": [],
   "source": [
    "few_shots = outlines.Template.from_string(\n",
    "    \"\"\"\n",
    "    {% for example in examples %}\n",
    "    Q: {{ example.question }}\n",
    "    A: {{ example.answer }}\n",
    "    {% endfor %}\n",
    "    Q: {{ question }}\n",
    "    A:\n",
    "    \"\"\"\n",
    ")\n",
    "\n",
    "model = outlines.from_openai(openai.OpenAI(), \"gpt-4o-mini\")\n",
    "\n",
    "\n",
    "# TODO: This could largely benefit from vectorization in #52\n",
    "def one_train_example(problem, example_set):\n",
    "    example_ids = random.choices(range(0, len(example_set)), k=5)\n",
    "    examples = [example_set[i] for i in example_ids]\n",
    "    prompt = few_shots(question=problem[\"question\"], examples=examples)\n",
    "    answers_raw = model(prompt, samples=20)\n",
    "\n",
    "    samples = []\n",
    "    for answer_raw in answers_raw:\n",
    "        try:\n",
    "            answer = re.findall(r\"\\d+\", answer_raw)[-1]\n",
    "            if answer == problem[\"answer\"]:\n",
    "                samples += example_ids\n",
    "            else:\n",
    "                continue\n",
    "        except IndexError:\n",
    "            pass\n",
    "\n",
    "    return samples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "1dae1ef2-c9e0-4c98-8686-7fbc2ff55e56",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "9efc9d077af24a2eb5ea3c05fe63f298",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/500 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "from tqdm.contrib import tenumerate\n",
    "\n",
    "samples = []\n",
    "for i, train_example in tenumerate(train_set):\n",
    "    samples += one_train_example(train_example, example_set)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d084a3c1-ec84-48ab-91d7-b4c64afce512",
   "metadata": {},
   "source": [
    "We can now plot the number of times each example was sampled:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "c211e0af-33c6-4e35-af83-194dcef31f01",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Text(0, 0.5, 'Counts')"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAtoAAAHgCAYAAACb58plAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjcuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/bCgiHAAAACXBIWXMAAAsTAAALEwEAmpwYAAAbiElEQVR4nO3da7BldXnn8d8jLV7wAmqHIsBMk5FxtIyJpEW8jhFjQBxxLEQdo5SFYVKjxktubZIqy5kXg1OWGmdSZFBUrFHUoJZEGJRBvE2NaItEFHTsIEozKK0i3pIo+syLszDHtpHT0P+9zuXzqTp19l5r7X2ebRfdX9f5772quwMAAOxbd5p7AAAAWI+ENgAADCC0AQBgAKENAAADCG0AABhAaAMAwACb5h5ghOOOO64vvPDCuccAAGD9q1vbsS7PaH/jG9+YewQAADa4dRnaAAAwt2GhXVVvqqobqupzy7bdp6ouqqovTd8PmrZXVb2+qnZU1Wer6qhljzllOv5LVXXKqHkBAGBfGnlG+y1Jjttt27YkF3f3kUkunu4nyfFJjpy+TktyRrIU5klekeThSY5O8opb4hwAAFazYaHd3R9N8q3dNp+Y5Ozp9tlJnrps+1t7ySeSHFhVhyT57SQXdfe3uvvGJBfl5+MdAABWnUWv0T64u6+fbn8tycHT7UOTXLvsuJ3Ttlvb/nOq6rSq2l5V23ft2rVvpwYAgL0025shu7uT9D58vjO7e2t3b928efO+eloAALhdFh3aX5+WhGT6fsO0/bokhy877rBp261tBwCAVW3RoX1ekls+OeSUJO9btv2506ePHJPkpmmJyQeSPLGqDpreBPnEaRsAAKxqw64MWVXnJHlckvtV1c4sfXrI6UneVVWnJvlKkpOnwy9I8qQkO5L8IMnzkqS7v1VV/ynJp6bj/mN37/4GSwAAWHVqaan0+rJ169bevn373GMAALD+baxLsAMAwNyENgAADCC0AQBgAKENAAADCG0AABhAaAMAwABCGwAABhDaAAAwgNAGAIABhl2CHQCA1W/LtvPnHuEOu+b0E+YeYY+c0QYAgAGENgAADCC0AQBgAKENAAADCG0AABhAaAMAwABCGwAABhDaAAAwgNAGAIABhDYAAAwgtAEAYAChDQAAAwhtAAAYQGgDAMAAQhsAAAYQ2gAAMIDQBgCAAYQ2AAAMILQBAGAAoQ0AAAMIbQAAGEBoAwDAAEIbAAAGENoAADCA0AYAgAGENgAADCC0AQBgAKENAAADCG0AABhAaAMAwABCGwAABhDaAAAwgNAGAIABhDYAAAwgtAEAYAChDQAAAwhtAAAYQGgDAMAAm+YeANayLdvOn3uEfeKa00+YewQAWHec0QYAgAGENgAADCC0AQBgAKENAAADCG0AABhAaAMAwABCGwAABhDaAAAwgNAGAIABhDYAAAwgtAEAYAChDQAAAwhtAAAYQGgDAMAAQhsAAAYQ2gAAMIDQBgCAAYQ2AAAMILQBAGCATXMPwPqwZdv5c4+wT1xz+glzjwAArBPOaAMAwABCGwAABhDaAAAwgNAGAIABhDYAAAwgtAEAYAAf7wcAe+BjS4E7apYz2lX10qr6fFV9rqrOqaq7VtURVXVpVe2oqndW1f7TsXeZ7u+Y9m+ZY2YAANgbCw/tqjo0ye8n2drdD06yX5JnJnlVktd29/2T3Jjk1Okhpya5cdr+2uk4AABY1eZao70pyd2qalOSuye5Psnjk5w77T87yVOn2ydO9zPtP7aqanGjAgDA3lt4aHf3dUleneSrWQrsm5J8Osm3u/vm6bCdSQ6dbh+a5NrpsTdPx993kTMDAMDemmPpyEFZOkt9RJJfTnJAkuP2wfOeVlXbq2r7rl277ujTAQDAHTLH0pEnJPlyd+/q7h8leU+SRyU5cFpKkiSHJbluun1dksOTZNp/7yTf3P1Ju/vM7t7a3Vs3b948+jUAAMAvNEdofzXJ
MVV192mt9bFJrkxySZKTpmNOSfK+6fZ50/1M+z/U3b3AeQEAYK/NsUb70iy9qfGyJFdMM5yZ5E+SvKyqdmRpDfZZ00POSnLfafvLkmxb9MwAALC3ZrlgTXe/Iskrdtt8dZKj93DsPyR5+iLmAgCAfcUl2AEAYAChDQAAAwhtAAAYQGgDAMAAQhsAAAYQ2gAAMIDQBgCAAYQ2AAAMILQBAGAAoQ0AAAMIbQAAGGDT3AOsN1u2nT/3CHfYNaefMPcIAABrnjPaAAAwgNAGAIABhDYAAAwgtAEAYAChDQAAAwhtAAAYQGgDAMAAQhsAAAYQ2gAAMIDQBgCAAYQ2AAAMILQBAGAAoQ0AAAMIbQAAGEBoAwDAAEIbAAAGENoAADCA0AYAgAGENgAADLBp7gEAWN22bDt/7hH2iWtOP2HuEdaM9fBn7s+b1cAZbQAAGEBoAwDAAEIbAAAGsEYb2GvrYf1mYg0nAGM5ow0AAAMIbQAAGEBoAwDAAEIbAAAGENoAADCA0AYAgAGENgAADCC0AQBgAKENAAADCG0AABhAaAMAwABCGwAABhDaAAAwgNAGAIABhDYAAAwgtAEAYAChDQAAAwhtAAAYYNPcAwAArAZbtp0/9wh32DWnnzD3CCzjjDYAAAwgtAEAYAChDQAAAwhtAAAYQGgDAMAAQhsAAAYQ2gAAMIDQBgCAAYQ2AAAMILQBAGAAoQ0AAAMIbQAAGEBoAwDAAEIbAAAGENoAADCA0AYAgAGENgAADCC0AQBgAKENAAADCG0AABhAaAMAwABCGwAABpgltKvqwKo6t6q+UFVXVdUjquo+VXVRVX1p+n7QdGxV1eurakdVfbaqjppjZgAA2BtzndH+iyQXdve/SvJrSa5Ksi3Jxd19ZJKLp/tJcnySI6ev05KcsfhxAQBg7yw8tKvq3kkem+SsJOnuH3b3t5OcmOTs6bCzkzx1un1ikrf2kk8kObCqDlno0AAAsJfmOKN9RJJdSd5cVZ+pqjdW1QFJDu7u66djvpbk4On2oUmuXfb4ndM2AABYteYI7U1JjkpyRnc/NMn380/LRJIk3d1Jem+etKpOq6rtVbV9165d+2xYAAC4PeYI7Z1Jdnb3pdP9c7MU3l+/ZUnI9P2Gaf91SQ5f9vjDpm0/o7vP7O6t3b118+bNw4YHAICVWHhod/fXklxbVQ+YNh2b5Mok5yU5Zdp2SpL3TbfPS/Lc6dNHjkly07IlJgAAsCptmunnvijJ26pq/yRXJ3lelqL/XVV1apKvJDl5OvaCJE9KsiPJD6ZjAQBgVZsltLv78iRb97Dr2D0c20leMHomAADYl1wZEgAABhDaAAAwgNAGAIABhDYAAAwgtAEAYAChDQAAAwhtAAAYQGgDAMAAQhsAAAYQ2gAAMIDQBgCAAYQ2AAAMILQBAGAAoQ0AAAMIbQAAGEBoAwDAAEIbAAAGENoAADCA0AYAgAH2OrSr6qCqesiIYQAAYL1YUWhX1Yer6l5VdZ8klyV5Q1W9ZuxoAACwdq30jPa9u/s7SZ6W5K3d/fAkTxg3FgAArG0rDe1NVXVIkpOTvH/gPAAAsC6sNLRfmeQDSXZ096eq6leSfGncWAAAsLZtWuFx13f3T98A2d1XW6MNAAC3bqVntP/rCrcBAAC5jTPaVfWIJI9MsrmqXrZs172S7DdyMAAAWMtua+nI/knuMR13z2Xbv5PkpFFDAQDAWvcLQ7u7P5LkI1X1lu7+yoJmAgCANW+lb4a8S1WdmWTL8sd09+NHDAUAAGvdSkP7r5P8VZI3JvnxuHEAAGB9WGlo39zdZwydBAAA1pGVfrzf31TVf6iqQ6rqPrd8DZ0MAADWsJWe0T5l+v5Hy7Z1kl/Zt+MAAMD6sKLQ7u4jRg8CAADryYpCu6qeu6ft3f3WfTsOAACsDytdOvKwZbfvmuTYJJclEdoAALAHK1068qLl96vqwCTvGDEQAACsByv91JHdfT+JddsAAHArVrpG+2+y9Ckj
SbJfkgcmedeooQAAYK1b6RrtVy+7fXOSr3T3zgHzAADAurCipSPd/ZEkX0hyzyQHJfnhyKEAAGCtW1FoV9XJST6Z5OlJTk5yaVWdNHIwAABYy1a6dOTPkjysu29IkqranOR/JTl31GAAALCWrfRTR+50S2RPvrkXjwUAgA1npWe0L6yqDyQ5Z7r/jCQXjBkJAADWvl8Y2lV1/yQHd/cfVdXTkjx62vV/krxt9HAAALBW3dYZ7dcleXmSdPd7krwnSarqV6d9/2bgbACrypZt5889wj5xzeknzD0CwIZwW+usD+7uK3bfOG3bMmQiAABYB24rtA/8Bfvutg/nAACAdeW2Qnt7Vf3u7hur6vlJPj1mJAAAWPtua432S5K8t6qenX8K661J9k/ybwfOBQAAa9ovDO3u/nqSR1bVbyZ58LT5/O7+0PDJAABgDVvR52h39yVJLhk8CwAArBuu7ggAAAMIbQAAGEBoAwDAAEIbAAAGENoAADCA0AYAgAGENgAADCC0AQBgAKENAAADCG0AABhAaAMAwABCGwAABhDaAAAwgNAGAIABhDYAAAwgtAEAYAChDQAAAwhtAAAYQGgDAMAAQhsAAAYQ2gAAMIDQBgCAAYQ2AAAMMFtoV9V+VfWZqnr/dP+Iqrq0qnZU1Turav9p+12m+zum/VvmmhkAAFZqzjPaL05y1bL7r0ry2u6+f5Ibk5w6bT81yY3T9tdOxwEAwKo2S2hX1WFJTkjyxul+JXl8knOnQ85O8tTp9onT/Uz7j52OBwCAVWuuM9qvS/LHSX4y3b9vkm93983T/Z1JDp1uH5rk2iSZ9t80Hf8zquq0qtpeVdt37do1cHQAALhtCw/tqnpykhu6+9P78nm7+8zu3trdWzdv3rwvnxoAAPbaphl+5qOSPKWqnpTkrknuleQvkhxYVZums9aHJbluOv66JIcn2VlVm5LcO8k3Fz82AACs3MLPaHf3y7v7sO7ekuSZST7U3c9OckmSk6bDTknyvun2edP9TPs/1N29wJEBAGCvrabP0f6TJC+rqh1ZWoN91rT9rCT3nba/LMm2meYDAIAVm2PpyE9194eTfHi6fXWSo/dwzD8kefpCBwMAgDtoNZ3RBgCAdUNoAwDAAEIbAAAGENoAADCA0AYAgAGENgAADCC0AQBgAKENAAADCG0AABhAaAMAwABCGwAABhDaAAAwgNAGAIABhDYAAAwgtAEAYAChDQAAAwhtAAAYQGgDAMAAQhsAAAYQ2gAAMIDQBgCAAYQ2AAAMILQBAGAAoQ0AAAMIbQAAGEBoAwDAAEIbAAAGENoAADCA0AYAgAGENgAADCC0AQBgAKENAAADCG0AABhAaAMAwABCGwAABhDaAAAwgNAGAIABhDYAAAwgtAEAYAChDQAAAwhtAAAYQGgDAMAAQhsAAAYQ2gAAMIDQBgCAAYQ2AAAMILQBAGAAoQ0AAAMIbQAAGEBoAwDAAEIbAAAGENoAADCA0AYAgAGENgAADCC0AQBgAKENAAADCG0AABhAaAMAwABCGwAABhDaAAAwgNAGAIABhDYAAAwgtAEAYAChDQAAAwhtAAAYQGgDAMAAQhsAAAYQ2gAAMIDQBgCAAYQ2AAAMILQBAGAAoQ0AAAMIbQAAGEBoAwDAAEIbAAAGENoAADDAwkO7qg6vqkuq6sqq+nxVvXjafp+quqiqvjR9P2jaXlX1+qraUVWfraqjFj0zAADsrTnOaN+c5A+6+0FJjknygqp6UJJtSS7u7iOTXDzdT5Ljkxw5fZ2W5IzFjwwAAHtn4aHd3dd392XT7e8muSrJoUlOTHL2dNjZSZ463T4xyVt7ySeSHFhVhyx2agAA2DuzrtGuqi1JHprk0iQHd/f1066vJTl4un1okmuXPWzntA0AAFat2UK7qu6R5N1JXtLd31m+r7s7Se/l851WVduravuuXbv24aQAALD3ZgntqrpzliL7bd39nmnz129ZEjJ9v2Hafl2Sw5c9/LBp
28/o7jO7e2t3b928efO44QEAYAXm+NSRSnJWkqu6+zXLdp2X5JTp9ilJ3rds+3OnTx85JslNy5aYAADAqrRphp/5qCTPSXJFVV0+bfvTJKcneVdVnZrkK0lOnvZdkORJSXYk+UGS5y10WgAAuB0WHtrd/fEkdSu7j93D8Z3kBUOHAgCAfcyVIQEAYAChDQAAAwhtAAAYQGgDAMAAQhsAAAYQ2gAAMIDQBgCAAYQ2AAAMILQBAGAAoQ0AAAMIbQAAGEBoAwDAAEIbAAAGENoAADCA0AYAgAGENgAADCC0AQBgAKENAAADCG0AABhAaAMAwABCGwAABhDaAAAwgNAGAIABhDYAAAwgtAEAYAChDQAAAwhtAAAYQGgDAMAAQhsAAAYQ2gAAMIDQBgCAAYQ2AAAMILQBAGAAoQ0AAAMIbQAAGEBoAwDAAEIbAAAGENoAADCA0AYAgAGENgAADCC0AQBgAKENAAADCG0AABhAaAMAwABCGwAABhDaAAAwgNAGAIABhDYAAAwgtAEAYAChDQAAAwhtAAAYQGgDAMAAQhsAAAYQ2gAAMIDQBgCAAYQ2AAAMILQBAGAAoQ0AAAMIbQAAGEBoAwDAAEIbAAAGENoAADCA0AYAgAGENgAADCC0AQBgAKENAAADCG0AABhAaAMAwABCGwAABhDaAAAwgNAGAIABhDYAAAwgtAEAYAChDQAAAwhtAAAYQGgDAMAAQhsAAAZYM6FdVcdV1RerakdVbZt7HgAA+EXWRGhX1X5J/jLJ8UkelORZVfWgeacCAIBbtyZCO8nRSXZ099Xd/cMk70hy4swzAQDArVoroX1okmuX3d85bQMAgFWpunvuGW5TVZ2U5Ljufv50/zlJHt7dL1x2zGlJTpvuPiDJFxc+6OLcL8k35h5iBl73xuJ1byxe98ayUV93snFf+3p+3d/o7uP2tGPToie5na5Lcviy+4dN236qu89McuYih5pLVW3v7q1zz7FoXvfG4nVvLF73xrJRX3eycV/7Rn3da2XpyKeSHFlVR1TV/kmemeS8mWcCAIBbtSbOaHf3zVX1wiQfSLJfkjd19+dnHgsAAG7VmgjtJOnuC5JcMPccq8SGWCKzB173xuJ1byxe98ayUV93snFf+4Z83WvizZAAALDWrJU12gAAsKYI7TVko16GvqreVFU3VNXn5p5lUarq8Kq6pKqurKrPV9WL555pUarqrlX1yar62+m1v3LumRalqvarqs9U1fvnnmWRquqaqrqiqi6vqu1zz7MoVXVgVZ1bVV+oqquq6hFzzzRaVT1g+nO+5es7VfWSuedahKp66fR32ueq6pyquuvcMy1CVb14es2f3yh/1stZOrJGTJeh/79JfitLF+z5VJJndfeVsw62AFX12CTfS/LW7n7w3PMsQlUdkuSQ7r6squ6Z5NNJnrpB/rwryQHd/b2qunOSjyd5cXd/YubRhquqlyXZmuRe3f3kuedZlKq6JsnW7l6vn7G7R1V1dpKPdfcbp0/Uunt3f3vmsRZm+nftuixdF+Mrc88zUlUdmqW/yx7U3X9fVe9KckF3v2Xeycaqqgdn6WreRyf5YZILk/xed++YdbAFckZ77diwl6Hv7o8m+dbccyxSd1/f3ZdNt7+b5KpskKuh9pLvTXfvPH2t+zMCVXVYkhOSvHHuWRivqu6d5LFJzkqS7v7hRorsybFJ/m69R/Yym5Lcrao2Jbl7kv838zyL8MAkl3b3D7r75iQfSfK0mWdaKKG9drgM/QZVVVuSPDTJpTOPsjDTEorLk9yQ5KLu3giv/XVJ/jjJT2aeYw6d5INV9enpKr8bwRFJdiV587Rc6I1VdcDcQy3YM5OcM/cQi9Dd1yV5dZKvJrk+yU3d/cF5p1qIzyV5TFXdt6runuRJ+dkLEK57QhtWsaq6R5J3J3lJd39n7nkWpbt/3N2/nqWrwB49/fpx3aqqJye5obs/PfcsM3l0dx+V5PgkL5iWi613m5IcleSM7n5o
ku8n2Ujvvdk/yVOS/PXcsyxCVR2Upd9CH5Hkl5McUFW/M+9U43X3VUleleSDWVo2cnmSH88506IJ7bXjNi9Dz/oyrU9+d5K3dfd75p5nDtOv0i9JctzMo4z2qCRPmdYqvyPJ46vqf8w70uJMZ/vS3TckeW+WlsqtdzuT7Fz225pzsxTeG8XxSS7r7q/PPciCPCHJl7t7V3f/KMl7kjxy5pkWorvP6u7f6O7HJrkxS+832zCE9trhMvQbyPSGwLOSXNXdr5l7nkWqqs1VdeB0+25ZegPwF2YdarDufnl3H9bdW7L03/aHunvdn+1Kkqo6YHrDb6alE0/M0q+b17Xu/lqSa6vqAdOmY5Os+zc7L/OsbJBlI5OvJjmmqu4+/f1+bJbee7PuVdUvTd//WZbWZ7993okWa81cGXKj28iXoa+qc5I8Lsn9qmpnkld091nzTjXco5I8J8kV01rlJPnT6Qqp690hSc6ePpHgTkne1d0b6uPuNpiDk7x3qT2yKcnbu/vCeUdamBcledt08uTqJM+beZ6FmP4P1W8l+fdzz7Io3X1pVZ2b5LIkNyf5TDbOlRLfXVX3TfKjJC/YaG/69fF+AAAwgKUjAAAwgNAGAIABhDYAAAwgtAEAYAChDQAAAwhtgFWuqn5cVZcv+5rlCoJVdU1V3e92PO63q+qVVXWfqvqfI2YDWI18jjbA6vf30yXp16rHZOkKn49J8vGZZwFYGGe0Adagqrp3VX3xlisLVtU5VfW70+0zqmp7VX2+ql657DHXVNV/ns6Kb6+qo6rqA1X1d1X1e9Mxj6uqj1bV+dPz/1VV/dy/FVX1O1X1yem5/vt0gaHdj3nGdMGl30/yuiRvSPK8qnJVW2BDENoAq9/ddls68ozuvinJC5O8paqemeSg7n7DdPyfdffWJA9J8q+r6iHLnuur09nxjyV5S5KTkhyT5JXLjjk6S1ctfFCSf5Glyyb/VFU9MMkzkjxqeq4fJ3n27kN39zuTPDTJ57r7V5NckeSh3f2U2/8/BcDaYekIwOq3x6Uj3X1RVT09yV8m+bVlu06uqtOy9Hf8IVkK5s9O+245m3xFknt093eTfLeq/rGqDpz2fbK7r06WzpQneXSSc5c9/7FJfiPJp6ZLp98tyQ23Mvu/zNLlxZPkgOnnAWwIQhtgjZqWdDwwyQ+SHJRkZ1UdkeQPkzysu2+sqrckueuyh/3j9P0ny27fcv+WfxN6tx+1+/1KcnZ3v/w25tue5H5JNlXVlUkOmZaSvKi7P3bbrxBgbbN0BGDtemmSq5L8uyRvrqo7J7lXku8nuamqDk5y/O143qOr6ogp5J+Rn38D48VJTqqqX0qS6dNE/vnuTzItXzk/yYlJ/kuWlrT8usgGNgqhDbD67b5G+/TpTZDPT/IHU7h+NMmfd/ffJvlMki8keXuS/307ft6nkvy3LEX8l5O8d/nO7r4yyZ8n+WBVfTbJRVlaorInRyW5PEufOPKR2zELwJpV3bv/RhCAjaqqHpfkD7v7yTOPArDmOaMNAAADOKMNAAADOKMNAAADCG0AABhAaAMAwABCGwAABhDaAAAwgNAGAIAB/j/+cpD1bixMkQAAAABJRU5ErkJggg==",
      "text/plain": [
       "<Figure size 864x576 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pylab as plt\n",
    "\n",
    "example_ids, counts = np.unique(samples, return_counts=True)\n",
    "\n",
    "fig = plt.figure(figsize=(12, 8))\n",
    "ax = fig.add_subplot(111)\n",
    "ax.bar(example_ids, counts)\n",
    "\n",
    "ax.spines[[\"top\", \"right\"]].set_visible(False)\n",
    "\n",
    "ax.set_xticks(range(10))\n",
    "ax.set_xlabel(\"Example #\")\n",
    "ax.set_ylabel(\"Counts\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cde37e5b-377e-4872-af40-674d680bd2da",
   "metadata": {},
   "source": [
    "Looking at the distribution, our best guess for which examples we should use for benchmarking on the test set would be 0, 1, 2, 6 and 9. This method can be trivially extended to other workflows that use few-shot examples to query LLMs. Of course, simulation-based inference extends beyond choosing the \"best\" prompt, and could for instance be useful to select the structure of chains of LLMs and tools as well."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "bddda20b-234a-4d30-b40a-90708fbaba23",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'question': 'Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?',\n",
       " 'answer': '72'}"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "example_set[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "fb186bf9-62b7-485f-a8ce-401f551a9e57",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'question': 'Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?',\n",
       " 'answer': '10'}"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "example_set[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "ae427bb2-e3f4-4a96-a508-e8011a0fc553",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'question': 'Betty is saving money for a new wallet which costs $100. Betty has only half of the money she needs. Her parents decided to give her $15 for that purpose, and her grandparents twice as much as her parents. How much more money does Betty need to buy the wallet?',\n",
       " 'answer': '5'}"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "example_set[2]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "fe43ae0f-c18f-4b74-b639-8481472edf4d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'question': 'Albert is wondering how much pizza he can eat in one day. He buys 2 large pizzas and 2 small pizzas. A large pizza has 16 slices and a small pizza has 8 slices. If he eats it all, how many pieces does he eat that day?',\n",
       " 'answer': '48'}"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "example_set[6]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "19d9d936-d0f0-4927-990c-76dbbfa95b47",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'question': 'Tina makes $18.00 an hour.  If she works more than 8 hours per shift, she is eligible for overtime, which is paid by your hourly wage + 1/2 your hourly wage.  If she works 10 hours every day for 5 days, how much money does she make?',\n",
       " 'answer': '990'}"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "example_set[9]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}


================================================
FILE: examples/vllm_offline_integration.py
================================================
"""Example of integrating `outlines` with `vllm`."""

import vllm
from pydantic import BaseModel
from transformers import AutoTokenizer

from outlines.models.vllm_offline import adapt_tokenizer
from outlines.processors import JSONLogitsProcessor


# Schema of the JSON object the generation is constrained to: two string
# fields. The class is passed as `schema=` to `JSONLogitsProcessor` in this
# script. (Documented with comments rather than a docstring so that the
# generated JSON schema stays unchanged.)
class Person(BaseModel):
    first_name: str
    surname: str

# Checkpoint name used both for the vLLM engine and for loading the tokenizer.
MODEL_ID = "mistralai/Mistral-7B-v0.1"
# Offline vLLM engine. NOTE(review): `max_model_len=512` presumably caps the
# context length to keep memory usage low — confirm against the vLLM docs.
llm = vllm.LLM(model=MODEL_ID, max_model_len=512)
# The Hugging Face tokenizer goes through outlines' `adapt_tokenizer` before it
# is handed to the logits processor.
# NOTE(review): what exactly the adapter adds is not visible here — see
# `outlines.models.vllm_offline`.
tokenizer = adapt_tokenizer(AutoTokenizer.from_pretrained(MODEL_ID))
# Logits processor built from the `Person` schema, operating on torch tensors.
# The whitespace pattern is the regex " ?", i.e. zero or one space.
# NOTE(review): presumably this limits the whitespace allowed between JSON
# tokens — confirm in `JSONLogitsProcessor`.
logits_processor = JSONLogitsProcessor(
    schema=Person,
    tokenizer=tokenizer,
    tensor_library_name="torch",
    whitespace_pattern=r" ?"
)
# Run both prompts in one batch, with the processor attached through the
# sampling parameters; at most 50 tokens are generated per prompt.
result = llm.generate(
    ["He is Tom Jones", "She saw Linda Smith"],
    sampling_params=vllm.SamplingParams(
        temperature=0.0,
        max_tokens=50,
        logits_processors=[logits_processor],
    ),
)
print(result)


================================================
FILE: flake.nix
================================================
# Flake exposing one development shell per default system.
{
  # Provides the `eachDefaultSystem` helper used below.
  inputs.flake-utils.url = "github:numtide/flake-utils";
  # NOTE(review): `nixpkgs` is used as an argument but is not declared under
  # `inputs`; presumably it is resolved through the flake registry — confirm.
  outputs = { self, nixpkgs, flake-utils }:
    flake-utils.lib.eachDefaultSystem (system:
      let
        # Package set for the current system, with unfree packages allowed.
        pkgs = import nixpkgs {
          inherit system;
          config.allowUnfree = true;
        };
      # The shell definition lives in ./shell.nix and receives `pkgs`.
      in { devShells.default = import ./shell.nix { inherit pkgs; }; });
}


================================================
FILE: llm.txt
================================================
# Outlines Codebase Reference

## Overview

Outlines is a library for structured generation for type-safe LLMs. It ensures outputs conform to specified formats (JSON schemas, regex patterns, grammars) by constraining the token generation process, or calling an API that uses this process.

**Core insight**: Instead of generating text and hoping it matches a format, Outlines makes it impossible for the model to generate invalid outputs by masking invalid tokens during generation.

**Note**: The codebase has undergone significant refactoring. Core FSM functionality has been extracted to the `outlines-core` package.

## Usage Examples

For comprehensive usage examples, see:
- **README.md**: Quick start examples for JSON generation, regex constraints, and choice selection
- **docs/cookbook/**: Detailed examples including:
  - `docs/cookbook/prompting.md`: Advanced prompting techniques
  - `docs/cookbook/models.md`: Working with different model providers
  - `docs/cookbook/humaneval.md`: Code generation examples
  - `docs/cookbook/qa-with-citations.md`: Question answering with structured citations
  - `docs/cookbook/deploy-to-servers.md`: Deployment examples with vLLM and TGI
- **examples/**: Standalone example scripts
  - `examples/lark_grammar.py`: Grammar-based generation
  - `examples/math_generate_code.py`: Code generation with constraints
  - `examples/multiple_sglang_backend.py`: Using multiple backend servers
- **tests/**: Test files contain many practical usage patterns

## Architecture

### Layer Stack

```
User API (outlines.models)
    ↓
Generator Classes (SteerableGenerator, BlackBoxGenerator)
    ↓
Type System (types/dsl.py: Pydantic → JsonSchema → Regex)
    ↓
FSM Compilation (outlines-core: regex → FSM via interegular)
    ↓
Guide System (processors/guide.py: FSM state management)
    ↓
Logits Processing (processors/structured.py: token masking)
    ↓
Model Providers (transformers, OpenAI, etc.)
```

### Key Design Decisions

1. **FSM-based constraints**: For local models, constraints compile to finite state machines that track valid next tokens
2. **Provider abstraction**: Same constraint system works across local models (transformers) and APIs (OpenAI)
3. **Lazy compilation**: FSMs are compiled on first use and cached persistently
4. **Token-level control**: Constraints apply at the token level, not character level
5. **Type-driven API**: Python types are the primary interface for specifying constraints

## Core Components

### Models (`outlines/models/`)
Base classes and implementations for different model providers:
- `SteerableModel`: For models where we control logits (transformers, llama.cpp)
- `BlackBoxModel`: For API models with structured output support (OpenAI, Anthropic)
- Each provider has an adapter class handling input and output format conversion

Key files:
- `base.py`: Abstract base classes defining the model interface
- `transformers.py`: Integration with HuggingFace transformers
- `openai.py`: OpenAI API integration
- `gemini.py`: Gemini integration
- `mlxlm.py`: MLX-LM integration
- `vllm_offline.py`: vLLM integration
- `llamacpp.py`: llama.cpp integration
- `ollama.py`: Ollama integration
- `vllm.py`: Integration with vLLM servers
- `tgi.py`: Integration with text-generation-inference servers
- `sglang.py`: Integration with SGLang servers

### Generation (`outlines/generator.py`)
Handles the generation process:
- `generator.py`: Main `Generator` class implementations (root level)
- Stream functionality is now integrated into generator classes

Generator classes, one for each kind of model:
- `BlackBoxGenerator`: For API models with structured outputs support
- `SteerableGenerator`: For models where we control the logits

### FSM System (`outlines/fsm/` and `outlines/processors/`)
Core constraint enforcement:
- `processors/guide.py`: Base `Guide` class and `RegexGuide` implementation
- `fsm/parsing.py`: Lark-based CFG parsing with `PartialLark` parser
- Regex to FSM compilation now uses `outlines_core.fsm` module

Key concepts:
- **Guide**: Manages FSM state during generation
- **State transitions**: Precomputed mapping of (state, token) → next_state
- **Token masking**: For each state, compute which tokens are valid

### Type System (`outlines/types/`)
Type conversion pipeline:
- `dsl.py`: Term DSL defining constraint language (Sequence, Choice, etc.) and JSON schema to regex conversion
- `__init__.py`: Common regex types and DSL functions
- Python types → Term DSL → Regex → FSM

### Logits Processors (`outlines/processors/`)
Apply constraints during generation:
- `structured.py`: Main `StructuredLogitsProcessor`
- `base_logits_processor.py`: Abstract base class
- Processors mask invalid tokens by setting their logits to -inf

## Key Algorithms

### FSM Compilation Pipeline
1. **Pattern definition**: User provides Pydantic model, regex, or grammar
2. **Schema to regex**: Convert complex types to regex patterns
   - JSON schemas become regex matching valid JSON
   - Pydantic models extract JSON schema then convert
3. **Regex to FSM**: Use interegular library to build FSM
4. **FSM to token map**: For each FSM state, compute valid tokens
   - Handle multi-character tokens
   - Account for token boundaries
5. **Guide creation**: Wrap FSM with state tracking

### Token Masking Process
```python
# Simplified logits processing
def process_logits(logits, current_state, guide):
    """Add a mask to `logits` that is 0 for allowed tokens and -inf elsewhere."""
    # Indices of the tokens the guide allows from `current_state`.
    valid_tokens = guide.get_valid_tokens(current_state)
    # Start from an all -inf mask, then open up the allowed positions.
    mask = torch.full_like(logits, -float('inf'))
    mask[valid_tokens] = 0
    return logits + mask
```

## File Organization

```
outlines/
├── __init__.py              # Public API exports
├── generator.py             # Main Generator classes
├── models/                  # Model integrations
│   ├── base.py             # Abstract base classes
│   ├── transformers.py     # HuggingFace support
│   └── [provider].py       # Other providers (openai, anthropic, etc.)
├── fsm/                     # FSM engine
│   ├── __init__.py
│   └── parsing.py          # Grammar parsing
├── types/                   # Type system
│   ├── __init__.py         # Common regex types and DSL exports
│   ├── dsl.py              # Term DSL and JSON schema conversion
│   └── utils.py            # Type checking utilities
├── processors/              # Logits processing and guides
│   ├── guide.py            # Guide implementations
│   ├── structured.py       # Main processor
│   └── tensor_adapters/    # Framework-specific tensor handling
├── caching.py               # Caching system
└── grammars/                # Grammar files (.lark)
```

## Extension Points

### Adding a Model Provider
1. Create model class inheriting from `SteerableModel` or `BlackBoxModel`
2. Implement required methods: `generate()`, `generate_stream()`
3. Add constructor function in `outlines/__init__.py`
4. Handle provider-specific input and structured output formats with a `TypeAdapter`

### Adding a Constraint Type
1. Define new Term subclass in `types/dsl.py`
2. Implement `to_regex()` conversion
3. Register type handler for Python type conversion in `python_types_to_terms()`
4. Add tests for FSM compilation

### Custom Logits Processor
1. Inherit from `OutlinesLogitsProcessor`
2. Implement `process_logits()` method
3. Handle batch processing and state management
4. Register with generator

## Common Patterns in Codebase

1. **Factory functions**: `from_transformers()`, `from_openai()` hide complexity
2. **Abstract base classes**: Define interfaces for models, processors, guides
3. **Lazy imports**: Optional dependencies imported only when needed
4. **Type adapters**: Convert between Outlines types and provider formats


================================================
FILE: mkdocs.yml
================================================
# Site information
site_name: Outlines
site_author: The Outlines developers
site_description: >-
    Structured text generation with LLMs

# Repository
repo_name: dottxt-ai/outlines
repo_url: https://github.com/dottxt-ai/outlines

# Copyright
copyright: Copyright &copy; 2023- The Outlines Developers

# Documentation directory
docs_dir: docs

# Configuration
theme:
    name: material
    palette:
        # Palette toggle for light mode
        - media: "(prefers-color-scheme: light)"
          scheme: default
          primary: white
    logo: assets/images/logo-square.svg
    favicon: assets/images/logo-square.png
    icon:
        repo: fontawesome/brands/github
    features:
        - content.code.copy
        - navigation.expand
        - navigation.tabs
        - navigation.sections
        - header.autohide
        - announce.dismiss
    font:
        text: Inter
        code: Source Code Pro

# Additional configuration
extra:
    social:
        - icon: fontawesome/brands/github
          link: https://github.com/dottxt-ai
        - icon: fontawesome/brands/twitter
          link: https://twitter.com/remilouf
    generator: false
    analytics:
        provider: google
        property: !ENV GOOGLE_ANALYTICS_KEY
    version:
        provider: mike
        default: latest
        alias: true

# Extensions
markdown_extensions:
    - admonition
    - def_list
    - attr_list
    - md_in_html
    - pymdownx.highlight:
          anchor_linenums: true
          line_spans: __span
          pygments_lang_class: true
          noclasses: True
          pygments_style: nord
    - pymdownx.superfences:
          custom_fences:
              - name: mermaid
                class: mermaid
                format: !!python/name:pymdownx.superfences.fence_code_format
    - pymdownx.tabbed:
          alternate_style: true
    - pymdownx.inlinehilite
    - pymdownx.details
    - pymdownx.emoji:
          emoji_index: !!python/name:material.extensions.emoji.twemoji
          emoji_generator: !!python/name:material.extensions.emoji.to_svg
    - pymdownx.snippets:

extra_css:
    - stylesheets/extra.css

plugins:
    - blog
    - mkdocstrings:
          default_handler: python
          handlers:
              python:
                  options:
                      docstring_style: numpy
                      show_submodules: true
    - search
    - section-index
    - social:
          cards_layout_options:
              color: "#173a58"
    - redirects:
          redirect_maps:
              "welcome.md": "index.md"

    - git-committers:
        repository: dottxt-ai/outlines
        branch: main
    - git-revision-date-localized:
        enable_creation_date: true
        type: timeago

    - gen-files:
        scripts:
        - scripts/gen_ref_pages.py
    - literate-nav:
        nav_file: SUMMARY.md

nav:
    - Home: index.md

    - Guide:
          - Getting Started: guide/getting_started.md
          - Installation: guide/installation.md
          - Migrating to v1: guide/migration.md
          - Vision-Language Models: guide/vlm.md
          - Deploying with FastAPI: guide/fastapi_vllm_deployment.md
          - Chat Templating for Instruct Models: guide/chat_templating.md
          - Architecture: guide/architecture.md

    - Features:
          - Overview: features/index.md

          - Core:
                - Models:
                    - Overview: features/models/index.md
                    - Anthropic: features/models/anthropic.md
                    - Dottxt: features/models/dottxt.md
                    - Gemini: features/models/gemini.md
                    - Llamacpp: features/models/llamacpp.md
                    - Mlx-lm: features/models/mlxlm.md
                    - Ollama: features/models/ollama.md
                    - OpenAI: features/models/openai.md
                    - OpenAI compatible API: features/models/openai_compatible.md
                    - OpenRouter: features/models/openrouter.md
                    - SGLang: features/models/sglang.md
                    - TGI: features/models/tgi.md
                    - Transformers: features/models/transformers.md
                    - TransformersMultiModal: features/models/transformers_multimodal.md
                    - vLLM (online server): features/models/vllm.md
                    - vLLM (offline): features/models/vllm_offline.md
                - Model Inputs: features/core/inputs.md
                - Output Types:
                    - Overview: features/core/output_types.md
                    - Basic Types: features/core/output_types#basic-python-types
                    - Multiple-Choices: features/core/output_types#multiple-choices
                    - JSON: features/core/output_types#json-schemas
                    - Regex: features/core/output_types#regex-patterns
                    - Context-free Grammars: features/core/output_types#context-free-grammars
                - Generator: features/core/generator.md

          - Utilities:
                - Application: features/utility/application.md
                - Regex DSL: features/utility/regex_dsl.md
                - Template: features/utility/template.md

          - Advanced:
                - Logits Processors: features/advanced/logits_processors.md
                - Structured Generation Backends: features/advanced/backends.md

    - API Reference: api_reference/

    - Examples:
          - examples/index.md
          - Classification: examples/classification.md
          - Named Entity Extraction: examples/extraction.md
          - Dating Profiles: examples/dating_profiles.md
          - Chain of Density: examples/chain_of_density.md
          - Playing chess: examples/models_playing_chess.md
          - SimTom: examples/simtom.md
          - Q&A with Citations: examples/qa-with-citations.md
          - Knowledge Graph Extraction: examples/knowledge_graph_extraction.md
          - Structured Generation Workflow: examples/structured_generation_workflow.md
          - Chain of Thought (CoT): examples/chain_of_thought.md
          - ReAct Agent: examples/react_agent.md
          - Structured Generation from PDFs: examples/read-pdfs.md
          - Earnings Reports to CSV: examples/earnings-reports.md
          - Receipt Digitization: examples/receipt-digitization.md
          - Extract Events Details: examples/extract_event_details.md
          - Run on the cloud:
                - BentoML: examples/deploy-using-bentoml.md
                - Cerebrium: examples/deploy-using-cerebrium.md
                - Modal: examples/deploy-using-modal.md
    - Community:
        - community/index.md
        - Feedback 🫶: community/feedback.md
        - Our Discord Server ☕: https://discord.com/invite/R9DSu34mGd
        - How to Contribute 🏗️: community/contribute.md
        - Community Projects 👏: community/examples.md
        - Versioning Guide 📌: community/versioning.md

    - Blog: https://blog.dottxt.co


================================================
FILE: outlines/__init__.py
================================================
"""Outlines is a Generative Model Programming Framework."""

# re-export on top-level namespace
from outlines import grammars as grammars
from outlines import inputs as inputs
from outlines import models as models
from outlines import processors as processors
from outlines import types as types
from outlines.applications import Application as Application
from outlines.caching import clear_cache as clear_cache
from outlines.caching import disable_cache as disable_cache
from outlines.caching import get_cache as get_cache
from outlines.generator import Generator as Generator
from outlines.inputs import Audio as Audio
from outlines.inputs import Image as Image
from outlines.inputs import Video as Video
from outlines.models import *  # noqa: F403
from outlines.templates import Template as Template
from outlines.templates import Vision as Vision
from outlines.types import cfg as cfg
from outlines.types import json_schema as json_schema
from outlines.types import regex as regex


================================================
FILE: outlines/applications.py
================================================
"""Encapsulate a prompt template and an output type into a reusable object."""

from typing import Any, Callable, Dict, Optional, Union

from outlines.generator import (
    BlackBoxGenerator,
    Generator,
    SteerableGenerator,
)
from outlines.models.base import Model
from outlines.templates import Template


class Application:
    """
    Application is a class that encapsulates a prompt template and an
    output type. It can be called to generate a response by providing a
    model, the values to be substituted in the template in a dictionary
    and optional inference parameters.

    The generator built for a model is cached on the instance and reused
    for as long as the same model is passed to subsequent calls.

    Parameters
    ----------
    template : Union[Template, Callable]
        A callable that takes arguments and returns a prompt string.
    output_type : Any
        The expected output type of the generated response.

    Examples
    --------
    ```python
    from pydantic import BaseModel
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from outlines import models, Application
    from outlines.types import JsonType
    from outlines.templates import Template

    class OutputModel(BaseModel):
        result: int

    model = models.from_transformers(
        AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
        AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
    )

    template_string = "What is 2 times {{ num }}?"
    template = Template.from_string(template_string)

    application = Application(template, JsonType(OutputModel))

    result = application(model, {"num": 3}, max_new_tokens=20)
    print(result)  # Expected output: { "result" : 6 }
    ```

    """
    def __init__(
        self,
        template: Union[Template, Callable],
        output_type: Optional[Any] = None,
    ):
        """
        Parameters
        ----------
        template
            The template to use to build the prompt.
        output_type
            The output type provided to the generator.

        """
        self.template = template
        self.output_type = output_type
        # Both fields are filled in lazily by the first call.
        self.generator: Optional[Union[
            BlackBoxGenerator, SteerableGenerator
        ]] = None
        self.model: Optional[Model] = None

    def __call__(
        self,
        model: Model,
        template_vars: Dict[str, Any],
        **inference_kwargs
    ) -> Any:
        """
        Parameters
        ----------
        model
            The model to use to generate the response.
        template_vars
            The variables to be substituted in the template.
        **inference_kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        Any
            The generated response.

        Raises
        ------
        ValueError
            If `model` is `None`.

        """
        if model is None:
            raise ValueError("you must provide a model")
        # We save the generator to avoid creating a new one for each call.
        # If the model has changed since the last call, we create a new
        # generator.
        if model != self.model:
            # Build the generator before touching the cached state: if the
            # construction raises, `self.model` must not point at a model
            # whose generator was never created, otherwise the next call
            # with that model would silently reuse the previous generator.
            new_generator = Generator(model, self.output_type)  # type: ignore
            self.model = model
            self.generator = new_generator

        prompt = self.template(**template_vars)
        assert self.generator is not None
        return self.generator(prompt, **inference_kwargs)


================================================
FILE: outlines/backends/__init__.py
================================================
"""Module to define the backends in charge of creating logits processors."""

from outlines.backends.base import (
    BaseBackend,
    LogitsProcessorType,
)
from outlines.backends.llguidance import LLGuidanceBackend
from outlines.backends.outlines_core import OutlinesCoreBackend
from outlines.backends.xgrammar import XGrammarBackend
from outlines.models import SteerableModel

# Public names of the `outlines.backends` package: the backend classes, the
# re-exported `SteerableModel`, the default-backend constants and the three
# logits-processor factory functions defined in this module.
__all__ = [
    "BaseBackend",
    "LogitsProcessorType",
    "LLGuidanceBackend",
    "OutlinesCoreBackend",
    "XGrammarBackend",
    "SteerableModel",
    "CFG_DEFAULT_BACKEND",
    "JSON_SCHEMA_DEFAULT_BACKEND",
    "REGEX_DEFAULT_BACKEND",
    "get_json_schema_logits_processor",
    "get_regex_logits_processor",
    "get_cfg_logits_processor",
]

# Backend name used for each kind of output type when the caller passes a
# falsy `backend_name` to the matching `get_*_logits_processor` function.
CFG_DEFAULT_BACKEND = "llguidance"
JSON_SCHEMA_DEFAULT_BACKEND = "outlines_core"
REGEX_DEFAULT_BACKEND = "outlines_core"


def _get_backend(backend_name: str, model: SteerableModel) -> BaseBackend:
    """Create a Backend instance.

    Parameters
    ----------
    backend_name: str
        The name of the backend to get.
    model: Model
        The Outlines model of the user.

    Returns
    -------
    backend: BaseBackend
        The backend instance.

    """
    if backend_name == "outlines_core":
        return OutlinesCoreBackend(model)
    elif backend_name == "xgrammar":
        return XGrammarBackend(model)
    elif backend_name == "llguidance":
        return LLGuidanceBackend(model)
    else:
        raise ValueError(f"Backend {backend_name} not supported")


def get_json_schema_logits_processor(
    backend_name: str | None,
    model: SteerableModel,
    json_schema: str,
) -> LogitsProcessorType:
    """Build a logits processor that enforces a JSON schema.

    Parameters
    ----------
    backend_name: str | None
        Name of the backend to use. A falsy value selects
        `JSON_SCHEMA_DEFAULT_BACKEND`.
    model: Model
        The Outlines model of the user.
    json_schema: str
        The JSON schema the generation must conform to.

    Returns
    -------
    LogitsProcessorType
        The logits processor created by the selected backend.

    """
    selected_name = backend_name or JSON_SCHEMA_DEFAULT_BACKEND
    selected_backend = _get_backend(selected_name, model)
    return selected_backend.get_json_schema_logits_processor(json_schema)


def get_regex_logits_processor(
    backend_name: str | None,
    model: SteerableModel,
    regex: str,
) -> LogitsProcessorType:
    """Build a logits processor that enforces a regular expression.

    Parameters
    ----------
    backend_name: str | None
        Name of the backend to use. A falsy value selects
        `REGEX_DEFAULT_BACKEND`.
    model: Model
        The Outlines model of the user.
    regex: str
        The regex the generation must match.

    Returns
    -------
    LogitsProcessorType
        The logits processor created by the selected backend.

    """
    selected_name = backend_name or REGEX_DEFAULT_BACKEND
    selected_backend = _get_backend(selected_name, model)
    return selected_backend.get_regex_logits_processor(regex)


def get_cfg_logits_processor(
    backend_name: str | None,
    model: SteerableModel,
    grammar: str,
) -> LogitsProcessorType:
    """Build a logits processor that enforces a context-free grammar.

    Parameters
    ----------
    backend_name: str | None
        Name of the backend to use. A falsy value selects
        `CFG_DEFAULT_BACKEND`.
    model: Model
        The Outlines model of the user.
    grammar: str
        The context-free grammar the generation must follow.

    Returns
    -------
    LogitsProcessorType
        The logits processor created by the selected backend.

    """
    selected_name = backend_name or CFG_DEFAULT_BACKEND
    selected_backend = _get_backend(selected_name, model)
    return selected_backend.get_cfg_logits_processor(grammar)


================================================
FILE: outlines/backends/base.py
================================================
"""Base class for all backends."""

from abc import ABC, abstractmethod
from typing import Any


# Backends return library-specific processor objects, so the alias is
# deliberately left unconstrained.
LogitsProcessorType = Any


class BaseBackend(ABC):
    """Abstract interface shared by every structured-generation backend.

    A concrete backend has to provide three factory methods, one per
    supported constraint kind: JSON schema, regex and context-free grammar.
    Each of them returns a logits processor.

    """

    @abstractmethod
    def get_json_schema_logits_processor(
        self,
        json_schema: str,
    ) -> LogitsProcessorType:
        """Build a logits processor for a JSON schema.

        Parameters
        ----------
        json_schema: str
            The JSON schema the processor must enforce.

        Returns
        -------
        LogitsProcessorType
            The logits processor.

        """
        ...

    @abstractmethod
    def get_regex_logits_processor(
        self,
        regex: str,
    ) -> LogitsProcessorType:
        """Build a logits processor for a regular expression.

        Parameters
        ----------
        regex: str
            The regex the processor must enforce.

        Returns
        -------
        LogitsProcessorType
            The logits processor.

        """
        ...

    @abstractmethod
    def get_cfg_logits_processor(
        self,
        grammar: str,
    ) -> LogitsProcessorType:
        """Build a logits processor for a context-free grammar.

        Parameters
        ----------
        grammar: str
            The context-free grammar the processor must enforce.

        Returns
        -------
        LogitsProcessorType
            The logits processor.

        """
        ...


================================================
FILE: outlines/backends/llguidance.py
================================================
"""Backend class for LLGuidance."""

import warnings
from typing import TYPE_CHECKING

from outlines.backends.base import BaseBackend
from outlines.models import LlamaCpp, MLXLM, SteerableModel, Transformers
from outlines.processors.base_logits_processor import (
    OutlinesLogitsProcessor,
    TensorType
)

if TYPE_CHECKING:
    from llguidance import LLGTokenizer


# Tensor library names the LLGuidance logits processor has a dedicated
# branch for in `LLGuidanceLogitsProcessor._setup`.
SUPPORTED_TENSOR_LIBRARIES = ["numpy", "mlx", "torch"]


class LLGuidanceLogitsProcessor(OutlinesLogitsProcessor):
    """Logits Processor for the LLGuidance backend.

    One `LLMatcher` per sequence of the batch is created lazily on the first
    call to `process_logits`. On every later call each matcher consumes the
    last token of its sequence before the logits are biased.

    """

    def __init__(
        self,
        grammar: str,
        llg_tokenizer,
        tensor_library_name: str,
    ) -> None:
        """
        Parameters
        ----------
        grammar: str
            The grammar spec to use to create the LLMatcher
        llg_tokenizer: LLTokenizer
            The LLGuidance tokenizer
        tensor_library_name: str
            The name of the tensor library used by the model

        """
        self.is_first_token = True
        self.grammar = grammar
        self.llg_tokenizer = llg_tokenizer
        self.tensor_library_name = tensor_library_name
        super().__init__(tensor_library_name)

    def reset(self):
        """Ensure self._setup is called again for the next generation."""
        self.is_first_token = True

    def _setup(self, batch_size: int) -> None:
        """Setup the LLMatchers, the bitmask and some functions used in the
        `process_logits` method.

        This method is called when the first token is generated instead of
        at initialization because we need to know the batch size.

        Parameters
        ----------
        batch_size: int
            The batch size of the input

        Raises
        ------
        ValueError
            If the tensor library is not "torch", "numpy" or "mlx".

        """
        from llguidance import LLMatcher

        self.ll_matchers = [
            LLMatcher(self.llg_tokenizer, self.grammar)
            for _ in range(batch_size)
        ]

        # we must adapt the bitmask creation and the bias function to the
        # tensor library used by the model
        if self.tensor_library_name == "torch":
            import llguidance.torch

            self.bitmask = llguidance.torch.allocate_token_bitmask(batch_size, self.llg_tokenizer.vocab_size)
            self._bias_logits = self._bias_logits_torch
        elif self.tensor_library_name == "numpy":
            import llguidance.numpy

            self.bitmask = llguidance.numpy.allocate_token_bitmask(batch_size, self.llg_tokenizer.vocab_size)
            self._bias_logits = self._bias_logits_numpy
        elif self.tensor_library_name == "mlx": # pragma: no cover
            # the MLX path fills a numpy bitmask and only applies it with the
            # MLX helper (see `_bias_logits_mlx`)
            import llguidance.numpy

            self.bitmask = llguidance.numpy.allocate_token_bitmask(batch_size, self.llg_tokenizer.vocab_size)
            self._bias_logits = self._bias_logits_mlx
        else: # pragma: no cover
            raise ValueError(f"Unsupported tensor library: {self.tensor_library_name}")

    def _bias_logits_mlx( # pragma: no cover
        self, input_ids: TensorType, logits: TensorType
    ) -> TensorType:
        """Bias the logits for the MLX backend.

        The biased rows are collected one by one and concatenated, the
        input `logits` is not modified in place.

        """
        import llguidance.mlx
        import llguidance.numpy

        biased_logits_array = []
        for i in range(self.tensor_adapter.shape(input_ids)[0]):
            llguidance.numpy.fill_next_token_bitmask(self.ll_matchers[i], self.bitmask, i)
            biased_logits = llguidance.mlx.apply_token_bitmask(
                logits[i], self.bitmask[i] # type: ignore
            )
            biased_logits_array.append(biased_logits)

        return self.tensor_adapter.concatenate(biased_logits_array)

    def _bias_logits_torch(
        self, input_ids: TensorType, logits: TensorType
    ) -> TensorType:
        """Bias the logits for the Torch backend.

        The logits are modified in place and returned.

        """
        import llguidance.torch

        batch_size = self.tensor_adapter.shape(input_ids)[0]

        # Fill every row of the bitmask first. `self.bitmask` is never left
        # on another device, so the matchers always write into the same
        # tensor.
        # NOTE(review): presumably `fill_next_token_bitmask` requires a CPU
        # tensor -- confirm against the llguidance documentation.
        for i in range(batch_size):
            llguidance.torch.fill_next_token_bitmask(self.ll_matchers[i], self.bitmask, i)

        # Transfer the whole bitmask to the device of the logits once per
        # step instead of moving it there and back for every row.
        device_bitmask = self.tensor_adapter.to_device(
            self.bitmask,
            self.tensor_adapter.get_device(logits)
        )

        for i in range(batch_size):
            llguidance.torch.apply_token_bitmask_inplace(
                logits[i], # type: ignore
                device_bitmask[i]
            )

        return logits

    def _bias_logits_numpy(
        self, input_ids: TensorType, logits: TensorType
    ) -> TensorType:
        """Bias the logits for the Numpy backend.

        The logits are modified in place and returned.

        """
        import llguidance.numpy

        for i in range(self.tensor_adapter.shape(input_ids)[0]):
            llguidance.numpy.fill_next_token_bitmask(self.ll_matchers[i], self.bitmask, i)
            llguidance.numpy.apply_token_bitmask_inplace(
                logits[i], self.bitmask[i] # type: ignore
            )

        return logits

    def process_logits(
        self, input_ids: TensorType, logits: TensorType
    ) -> TensorType:
        """Use the instances of LLMatcher to bias the logits.

        Parameters
        ----------
        input_ids
            The ids of the tokens of the existing sequences.
        logits
            The logits for the current generation step.

        Returns
        -------
        TensorType
            The biased logits.

        Warns
        -----
        UserWarning
            If a matcher reports an error after consuming the last token.

        """
        batch_size = self.tensor_adapter.shape(input_ids)[0]

        if self.is_first_token:
            self._setup(batch_size)
            self.is_first_token = False

        # we do not make the matchers consume the last token during the first
        # generation step because no tokens have been generated yet
        else:
            for i in range(batch_size):
                sequence = input_ids[i] # type: ignore
                last_token = sequence[-1].item()
                self.ll_matchers[i].consume_token(last_token)
                error = self.ll_matchers[i].get_error()
                if error:
                    warnings.warn(f"Error in LLMatcher: {error}")

        return self._bias_logits(input_ids, logits)


class LLGuidanceBackend(BaseBackend):
    """Backend that builds logits processors on top of LLGuidance."""

    def __init__(self, model: SteerableModel):
        """
        Parameters
        ----------
        model
            The Outlines model of the user. Its tensor library name and
            its tokenizer are read to configure the backend.

        """
        # imported here so that `llguidance` is only required when this
        # backend is actually instantiated
        import llguidance as llg

        self.llg = llg
        self.tensor_library_name = model.tensor_library_name
        self.llg_tokenizer = self._create_llg_tokenizer(model)

    def _create_llg_tokenizer(self, model: SteerableModel) -> "LLGTokenizer":
        """Build the llguidance tokenizer matching the model's tokenizer.

        Parameters
        ----------
        model: Model
            The Outlines model.

        Returns
        -------
        LLGTokenizer
            The llg tokenizer.

        Raises
        ------
        ValueError
            If the model is not a Transformers, LlamaCpp or MLXLM model.

        """
        if isinstance(model, Transformers):
            import llguidance.hf

            hf_tokenizer = model.hf_tokenizer
            return llguidance.hf.from_tokenizer(hf_tokenizer)

        elif isinstance(model, LlamaCpp):
            import llama_cpp
            import llguidance.llamacpp

            llama_vocab = llama_cpp.llama_model_get_vocab(model.model.model)
            return llguidance.llamacpp.lltokenizer_from_vocab(llama_vocab)

        elif isinstance(model, MLXLM): # pragma: no cover
            import llguidance.hf

            wrapped_tokenizer = model.mlx_tokenizer._tokenizer
            return llguidance.hf.from_tokenizer(wrapped_tokenizer)

        else: # pragma: no cover
            raise ValueError(
                f"Unsupported model type: {type(model)}. "
                "Llguidance only supports LlamaCpp, MLXLM "
                "and Transformers models."
            )

    def _processor_from_spec(
        self, grammar_spec: str
    ) -> LLGuidanceLogitsProcessor:
        """Wrap a grammar spec into a processor using the backend's
        tokenizer and tensor library name."""
        return LLGuidanceLogitsProcessor(
            grammar_spec, self.llg_tokenizer, self.tensor_library_name
        )

    def get_json_schema_logits_processor(
        self, json_schema: str
    ) -> LLGuidanceLogitsProcessor:
        """Build a logits processor enforcing a JSON schema.

        Parameters
        ----------
        json_schema: str
            The JSON schema to create a logits processor from.

        Returns
        -------
        LogitsProcessor
            The logits processor to use to constrain the generation.

        """
        return self._processor_from_spec(
            self.llg.grammar_from("json_schema", json_schema)
        )

    def get_regex_logits_processor(
        self, regex: str
    ) -> LLGuidanceLogitsProcessor:
        """Build a logits processor enforcing a regex.

        Parameters
        ----------
        regex: str
            The regex to create a logits processor from.

        Returns
        -------
        LogitsProcessor
            The logits processor to use to constrain the generation.

        """
        return self._processor_from_spec(
            self.llg.grammar_from("regex", regex)
        )

    def get_cfg_logits_processor(
        self, grammar: str
    ) -> LLGuidanceLogitsProcessor:
        """Build a logits processor enforcing a context-free grammar.

        Parameters
        ----------
        grammar: str
            The context-free grammar to create a logits processor from.

        Returns
        -------
        LogitsProcessor
            The logits processor to use to constrain the generation.

        """
        # The "grammar" format is tried first; when llguidance rejects it
        # with a ValueError the same text is retried as "lark".
        try:
            spec = self.llg.grammar_from("grammar", grammar)
        except ValueError:
            spec = self.llg.grammar_from("lark", grammar)
        return self._processor_from_spec(spec)


================================================
FILE: outlines/backends/outlines_core.py
================================================
"""Backend class for Outlines Core."""

from typing import Callable, Dict, List

from outlines_core import Guide, Index, Vocabulary
from outlines_core.json_schema import build_regex_from_schema

from outlines.backends.base import BaseBackend
from outlines.models import SteerableModel
from outlines.models.llamacpp import LlamaCpp
from outlines.models.mlxlm import MLXLM
from outlines.models.transformers import Transformers
from outlines.processors.base_logits_processor import (
    OutlinesLogitsProcessor,
    TensorType,
)


class OutlinesCoreLogitsProcessor(OutlinesLogitsProcessor):
    """Logits processor backed by an Outlines Core `Index`."""

    def __init__(self, index: Index, tensor_library_name: str):
        """
        Parameters
        ----------
        index: Index
            The Outlines Core `Index` from which the `Guide` instances used to
            bias the logits are created (one per batch item).
        tensor_library_name: str
            The tensor library name to use for the logits processor.

        """
        self.is_first_token = True
        self.tensor_library_name = tensor_library_name
        self.index = index
        super().__init__(tensor_library_name)

    def reset(self) -> None:
        """Mark the next `process_logits` call as the first one again, so that
        `_setup` is run anew."""
        self.is_first_token = True

    def _setup(self, batch_size: int, vocab_size: int) -> None:
        """Select the kernel functions for the tensor library and create the
        per-sequence guides and bitmasks.

        This is deferred to the first `process_logits` call rather than done
        at initialization because the batch size and vocabulary size are only
        known once logits are available.

        Parameters
        ----------
        batch_size: int
            The batch size.
        vocab_size: int
            The vocabulary size.

        Raises
        ------
        ValueError
            If the tensor library is not one of "torch", "numpy" or "mlx".

        """
        library = self.tensor_library_name

        # The kernel modules are imported lazily so that only the library
        # actually in use needs to be installed.
        if library == "torch":
            from outlines_core.kernels.torch import allocate_token_bitmask

            bias_function = self._bias_logits_torch

        elif library == "numpy":
            from outlines_core.kernels.numpy import allocate_token_bitmask

            bias_function = self._bias_logits_numpy

        elif library == "mlx":  # pragma: no cover
            from outlines_core.kernels.mlx import allocate_token_bitmask

            bias_function = self._bias_logits_mlx

        else:  # pragma: no cover
            raise ValueError(f"Unsupported tensor library: {self.tensor_library_name}")

        self.allocate_token_bitmask = allocate_token_bitmask
        self.bias_logits = bias_function

        # One guide and one bitmask per sequence of the batch.
        guides = []
        for _ in range(batch_size):
            guides.append(Guide(self.index))
        self._guides = guides

        bitmasks = []
        for _ in range(batch_size):
            bitmasks.append(self.allocate_token_bitmask(vocab_size))
        self._bitmasks = bitmasks

    def _bias_logits_mlx(  # pragma: no cover
        self, batch_size: int, logits: TensorType
    ) -> TensorType:
        """Bias the logits for MLX tensors.

        The MLX kernel returns a new tensor for each row, so the biased rows
        are collected and concatenated.
        """
        from outlines_core.kernels.mlx import (
            apply_token_bitmask,
            fill_next_token_bitmask,
        )

        biased_rows = []
        for row in range(batch_size):
            bitmask = self._bitmasks[row]
            fill_next_token_bitmask(self._guides[row], bitmask)
            biased_rows.append(
                apply_token_bitmask(
                    self.tensor_adapter.unsqueeze(logits[row]),  # type: ignore
                    bitmask,  # type: ignore
                )
            )

        return self.tensor_adapter.concatenate(biased_rows)

    def _bias_logits_torch(self, batch_size: int, logits: TensorType) -> TensorType:
        """Bias the logits for Torch tensors (in place)."""
        from outlines_core.kernels.torch import (
            apply_token_bitmask_inplace,
            fill_next_token_bitmask,
        )

        adapter = self.tensor_adapter
        for row in range(batch_size):
            fill_next_token_bitmask(self._guides[row], self._bitmasks[row])
            # The bitmask is moved to the device of the logits for the
            # in-place kernel, then stored back on the CPU.
            on_device = adapter.to_device(
                self._bitmasks[row], adapter.get_device(logits)
            )
            self._bitmasks[row] = on_device
            apply_token_bitmask_inplace(
                adapter.unsqueeze(logits[row]),  # type: ignore
                on_device,
            )
            self._bitmasks[row] = adapter.to_device(on_device, "cpu")

        return logits

    def _bias_logits_numpy(self, batch_size: int, logits: TensorType) -> TensorType:
        """Bias the logits for Numpy tensors (in place)."""
        from outlines_core.kernels.numpy import (
            apply_token_bitmask_inplace,
            fill_next_token_bitmask,
        )

        for row in range(batch_size):
            bitmask = self._bitmasks[row]
            fill_next_token_bitmask(self._guides[row], bitmask)
            apply_token_bitmask_inplace(
                self.tensor_adapter.unsqueeze(logits[row]),  # type: ignore
                bitmask,
            )

        return logits

    def process_logits(self, input_ids: TensorType, logits: TensorType) -> TensorType:
        """Advance the guides with the last generated tokens and bias the
        logits accordingly.

        Parameters
        ----------
        input_ids
            The ids of the tokens of the existing sequences.
        logits
            The logits for the current generation step.

        Returns
        -------
        TensorType
            The biased logits.

        """
        batch_size = self.tensor_adapter.shape(input_ids)[0]
        vocab_size = self.tensor_adapter.shape(logits)[1]

        if self.is_first_token:
            # Nothing has been generated yet: only create the guides.
            self._setup(batch_size, vocab_size)
            self.is_first_token = False
            return self.bias_logits(batch_size, logits)

        for row in range(batch_size):
            guide = self._guides[row]
            last_token_id = self.tensor_adapter.to_scalar(input_ids[row][-1])  # type: ignore
            # This circumvents issue #227 in outlines_core
            # Ideally, we would be able to advance all the times as the final
            # state would accept the eos token leading to itself
            if guide.is_finished() and not guide.accepts_tokens([last_token_id]):
                continue
            guide.advance(token_id=last_token_id, return_tokens=False)

        return self.bias_logits(batch_size, logits)


class OutlinesCoreBackend(BaseBackend):
    """Backend for Outlines Core."""

    def __init__(self, model: SteerableModel):
        """
        Parameters
        ----------
        model
            The Outlines model of the user.

        Raises
        ------
        ValueError
            If the model is not a `Transformers`, `LlamaCpp` or `MLXLM`
            instance.

        """
        # Each model type exposes its tokenizer, its vocabulary and its
        # token-to-string conversion slightly differently.
        if isinstance(model, Transformers):
            tokenizer = model.tokenizer
            vocabulary = tokenizer.get_vocab()
            token_to_str = tokenizer.convert_token_to_string
        elif isinstance(model, LlamaCpp):
            tokenizer = model.tokenizer  # type: ignore
            vocabulary = tokenizer.vocabulary
            token_to_str = tokenizer.convert_token_to_string
        elif isinstance(model, MLXLM):  # pragma: no cover
            tokenizer = model.mlx_tokenizer  # type: ignore
            vocabulary = tokenizer.get_vocab()

            def token_to_str(token):  # type: ignore
                return tokenizer.convert_tokens_to_string([token])

        else:  # pragma: no cover
            raise ValueError(f"Unsupported model type: {type(model)}")

        # The EOS attributes have the same names on all supported tokenizers.
        eos_token_id = tokenizer.eos_token_id
        eos_token = tokenizer.eos_token

        self.eos_token_id = eos_token_id
        self.vocabulary = self.create_outlines_core_vocabulary(
            vocabulary, eos_token_id, eos_token, token_to_str
        )
        self.tensor_library_name = model.tensor_library_name

    def get_json_schema_logits_processor(self, json_schema: str):
        """Create a logits processor from a JSON schema.

        The schema is first converted to a regex, which is then handled by
        `get_regex_logits_processor`.

        Parameters
        ----------
        json_schema: str
            The JSON schema to create a logits processor from.

        Returns
        -------
        LogitsProcessor
            The logits processor to use to constrain the generation.

        """
        return self.get_regex_logits_processor(
            build_regex_from_schema(json_schema)
        )

    def get_regex_logits_processor(self, regex: str):
        """Create a logits processor from a regex.

        Parameters
        ----------
        regex: str
            The regex to create a logits processor from.

        Returns
        -------
        LogitsProcessor
            The logits processor to use to constrain the generation.

        """
        return OutlinesCoreLogitsProcessor(
            Index(regex, self.vocabulary), self.tensor_library_name
        )

    def get_cfg_logits_processor(self, grammar):
        """Always raise: this backend has no context-free grammar support."""
        raise NotImplementedError(
            "Outlines Core does not support context-free grammar."
        )

    @staticmethod
    def create_outlines_core_vocabulary(
        vocab: Dict[str, int],
        eos_token_id: int,
        eos_token: str,
        token_to_str: Callable[[str], str],
    ) -> Vocabulary:
        """Create an Outlines Core Vocabulary instance.

        Parameters
        ----------
        vocab: Dict[str, int]
            The vocabulary to create an Outlines Core vocabulary from.
        eos_token_id: int
            The EOS token ID.
        eos_token: str
            The EOS token.
        token_to_str: Callable[[str], str]
            The function to convert a token to a string.

        Returns
        -------
        Vocabulary
            The Outlines Core Vocabulary instance.

        """
        ids_by_string: Dict[str, List[int]] = {}
        for token, token_id in vocab.items():
            # This step is necessary to transform special tokens into their
            # string representation, in particular for spacing. We need those
            # string representations as outlines core first builds an FSM from
            # the regex provided that only contains regular strings.
            as_string = token_to_str(token)
            # Several tokens may convert to the same string: group their ids.
            if as_string in ids_by_string:
                ids_by_string[as_string].append(token_id)
            else:
                ids_by_string[as_string] = [token_id]

        # The EOS entry is removed from the mapping; its id is passed to
        # `Vocabulary` separately.
        # NOTE(review): this raises KeyError when `eos_token` is not a key of
        # the converted vocabulary.
        del ids_by_string[eos_token]
        return Vocabulary(eos_token_id, ids_by_string)


================================================
FILE: outlines/backends/xgrammar.py
================================================
"""Backend class for XGrammar."""

from outlines.backends.base import BaseBackend
from outlines.models import SteerableModel
from outlines.models.mlxlm import MLXLM
from outlines.models.transformers import Transformers
from outlines.processors.base_logits_processor import (
    OutlinesLogitsProcessor,
    TensorType
)


class XGrammarLogitsProcessor(OutlinesLogitsProcessor):
    """Logits processor for XGrammar."""

    def __init__(self, compiled_grammar: str, tensor_library_name: str,):
        """
        Parameters
        ----------
        compiled_grammar: str
            The compiled grammar to use to create the logits processor.
            NOTE(review): the backend passes the object returned by the
            XGrammar grammar compiler here; confirm the `str` annotation.
        tensor_library_name: str
            The name of the tensor library used by the model

        """
        # Imported lazily so that xgrammar is only required when this
        # processor is actually used.
        import xgrammar as xgr

        self.xgr = xgr
        self.is_first_token = True
        self.compiled_grammar = compiled_grammar
        self.tensor_library_name = tensor_library_name
        super().__init__(tensor_library_name)

    def reset(self):
        """Ensure self._setup is called again for the next generation."""
        self.is_first_token = True

    def _setup(self, batch_size: int, vocab_size: int) -> None:
        """Setup the logits processor for a new generation.

        Selects the biasing function for the tensor library, creates one
        grammar matcher per sequence of the batch and allocates the token
        bitmask shared by the whole batch.

        Parameters
        ----------
        batch_size: int
            The batch size.
        vocab_size: int
            The vocabulary size.

        Raises
        ------
        ValueError
            If the tensor library is neither "torch" nor "mlx".

        """
        if self.tensor_library_name == "torch":
            self._bias_logits = self._bias_logits_torch
        elif self.tensor_library_name == "mlx": # pragma: no cover
            self._bias_logits = self._bias_logits_mlx
        else: # pragma: no cover
            raise ValueError(
                f"Unsupported tensor library: {self.tensor_library_name}"
            )

        self._matchers = [
            self.xgr.GrammarMatcher(self.compiled_grammar)
            for _ in range(batch_size)
        ]
        self._bitmask = self.xgr.allocate_token_bitmask(batch_size, vocab_size)

    def _bias_logits_torch(
        self, input_ids: TensorType, logits: TensorType
    ) -> TensorType:
        """Bias the logits for Torch tensors."""
        # Only the bitmask rows of matchers that are still running are
        # refreshed; rows of terminated matchers are left as they were.
        for i in range(self.tensor_adapter.shape(input_ids)[0]):
            if not self._matchers[i].is_terminated():
                self._matchers[i].fill_next_token_bitmask(self._bitmask, i)

        # The bitmask is moved to the device of the logits for the in-place
        # kernel, then stored back on the CPU.
        self._bitmask = self.tensor_adapter.to_device(
            self._bitmask,
            self.tensor_adapter.get_device(logits)
        )
        self.xgr.apply_token_bitmask_inplace(logits, self._bitmask)
        self._bitmask = self.tensor_adapter.to_device(
            self._bitmask,
            "cpu"
        )

        return logits

    def _bias_logits_mlx( # pragma: no cover
        self, input_ids: TensorType, logits: TensorType
    ) -> TensorType:
        """Bias the logits for MLX tensors."""
        import mlx.core as mx
        from xgrammar.kernels.apply_token_bitmask_mlx import apply_token_bitmask_mlx

        for i in range(self.tensor_adapter.shape(input_ids)[0]):
            if not self._matchers[i].is_terminated():
                self._matchers[i].fill_next_token_bitmask(self._bitmask, i)

        # The bitmask is converted to an MLX array through numpy before the
        # MLX kernel is applied; the kernel returns the biased logits.
        biased_logits = apply_token_bitmask_mlx(
            mx.array(self._bitmask.numpy()), logits, self.tensor_adapter.shape(logits)[1]
        )

        return biased_logits

    def process_logits(
        self, input_ids: TensorType, logits: TensorType
    ) -> TensorType:
        """Use the XGrammar matchers to bias the logits.

        Parameters
        ----------
        input_ids
            The ids of the tokens of the existing sequences.
        logits
            The logits for the current generation step.

        Returns
        -------
        TensorType
            The biased logits.

        Raises
        ------
        AssertionError
            If a matcher that is not terminated rejects the last token of its
            sequence.

        """
        batch_size = self.tensor_adapter.shape(input_ids)[0]
        vocab_size = self.tensor_adapter.shape(logits)[1]

        if self.is_first_token:
            self._setup(batch_size, vocab_size)
            self.is_first_token = False
        else:
            for i in range(batch_size):
                if not self._matchers[i].is_terminated(): # pragma: no cover
                    last_token_id = self.tensor_adapter.to_scalar(
                        input_ids[i][-1] # type: ignore
                    )
                    # `accept_token` advances the matcher, so it must be
                    # called outside of an `assert` statement: assertions are
                    # removed when Python runs with `-O`, which would leave
                    # the matchers stuck in their initial state.
                    accepted = self._matchers[i].accept_token(last_token_id)
                    if not accepted:
                        raise AssertionError(
                            f"The XGrammar matcher of sequence {i} rejected "
                            f"token id {last_token_id}"
                        )

        return self._bias_logits(input_ids, logits)


class XGrammarBackend(BaseBackend):
    """Backend for XGrammar."""

    def __init__(self, model: SteerableModel):
        """
        Parameters
        ----------
        model
            The Outlines model of the user.

        Raises
        ------
        ValueError
            If the model is neither a `Transformers` nor an `MLXLM` instance.

        """
        import xgrammar as xgr

        if isinstance(model, Transformers):
            hf_tokenizer = model.hf_tokenizer
        elif isinstance(model, MLXLM): # pragma: no cover
            hf_tokenizer = model.mlx_tokenizer._tokenizer
        else: # pragma: no cover
            raise ValueError(
                "The xgrammar backend only supports Transformers and "
                "MLXLM models"
            )

        # The vocabulary size given to XGrammar is the number of entries
        # returned by the tokenizer's `get_vocab`.
        vocab_size = len(hf_tokenizer.get_vocab())
        tokenizer_info = xgr.TokenizerInfo.from_huggingface(
            hf_tokenizer,
            vocab_size=vocab_size,
        )
        self.grammar_compiler = xgr.GrammarCompiler(tokenizer_info)
        self.tensor_library_name = model.tensor_library_name

    def _make_processor(self, compiled_grammar) -> XGrammarLogitsProcessor:
        """Wrap a compiled grammar into a logits processor."""
        return XGrammarLogitsProcessor(
            compiled_grammar, self.tensor_library_name
        )

    def get_json_schema_logits_processor(
        self, json_schema: str
    ) -> XGrammarLogitsProcessor:
        """Create a logits processor from a JSON schema.

        Parameters
        ----------
        json_schema: str
            The JSON schema to create a logits processor from.

        Returns
        -------
        LogitsProcessor
            The logits processor to use to constrain the generation.

        """
        return self._make_processor(
            self.grammar_compiler.compile_json_schema(json_schema)
        )

    def get_regex_logits_processor(
        self, regex: str
    ) -> XGrammarLogitsProcessor:
        """Create a logits processor from a regex.

        Parameters
        ----------
        regex: str
            The regex to create a logits processor from.

        Returns
        -------
        LogitsProcessor
            The logits processor to use to constrain the generation.

        """
        return self._make_processor(
            self.grammar_compiler.compile_regex(regex)
        )

    def get_cfg_logits_processor(
        self, grammar: str
    ) -> XGrammarLogitsProcessor:
        """Create a logits processor from a context-free grammar.

        Parameters
        ----------
        grammar: str
            The context-free grammar to create a logits processor from.

        Returns
        -------
        LogitsProcessor
            The logits processor to use to constrain the generation.

        """
        return self._make_processor(
            self.grammar_compiler.compile_grammar(grammar)
        )


================================================
FILE: outlines/caching.py
================================================
"""Caching and memoization of function calls."""

import asyncio
import contextlib
import functools
import os
import tempfile
from typing import Callable, Optional

import cloudpickle
from diskcache import Cache, Disk
from diskcache.core import ENOVAL, UNKNOWN, args_to_key, full_name

# Module-wide switch read by the wrappers built in `cache`: when it is False
# they call the wrapped function directly and neither read nor write the cache.
_caching_enabled = True


class CloudpickleDisk(Disk): # pragma: no cover
    """`Disk` subclass that serializes keys and values with cloudpickle."""

    def __init__(self, directory, compress_level=1, **kwargs):
        # `compress_level` is only stored on the instance here.
        self.compress_level = compress_level
        super().__init__(directory, **kwargs)

    def put(self, key):
        """Serialize the key with cloudpickle before storing it."""
        return super().put(cloudpickle.dumps(key))

    def get(self, key, raw):
        """Load the key and deserialize it with cloudpickle."""
        return cloudpickle.loads(super().get(key, raw))

    def store(self, value, read, key=UNKNOWN):
        """Serialize the value with cloudpickle unless `read` is set."""
        payload = value if read else cloudpickle.dumps(value)
        return super().store(payload, read, key=key)

    def fetch(self, mode, filename, value, read):
        """Fetch the value and deserialize it unless `read` is set."""
        fetched = super().fetch(mode, filename, value, read)
        return fetched if read else cloudpickle.loads(fetched)


@functools.lru_cache(1)
def get_cache():
    """Return the disk cache holding previously-computed return values.

    The cache is used to avoid unnecessary computations and API calls, which can
    be long and expensive for large models.

    The cache directory is chosen in this order:

    1. the value of the `OUTLINES_CACHE_DIR` environment variable;
    2. `$XDG_CACHE_HOME/outlines`;
    3. `HOMEDIR/.cache/outlines`;
    4. `.cache/outlines` inside the system temporary directory, when the home
       directory resolves to `/`.

    The cache is emptied whenever the version stored in it differs from the
    running Outlines version.

    """
    from outlines._version import __version__ as outlines_version  # type: ignore

    environment = os.environ
    explicit_dir = environment.get("OUTLINES_CACHE_DIR")
    xdg_dir = environment.get("XDG_CACHE_HOME")
    user_home = os.path.normpath(os.path.expanduser("~"))

    if explicit_dir:
        # OUTLINES_CACHE_DIR takes precedence
        cache_dir = explicit_dir
    elif xdg_dir:  # pragma: no cover
        cache_dir = os.path.join(xdg_dir, "outlines")
    else:  # pragma: no cover
        # home_dir may be / inside a docker container without existing user
        parent = tempfile.gettempdir() if user_home == "/" else user_home
        cache_dir = os.path.join(parent, ".cache", "outlines")

    memory = Cache(
        cache_dir,
        eviction_policy="none",
        cull_limit=0,
        disk=CloudpickleDisk,
    )

    # ensure if version upgrade occurs, old cache is pruned
    stored_version = memory.get("__version__")
    if stored_version != outlines_version:
        memory.clear()
    memory["__version__"] = outlines_version

    return memory


def cache(expire: Optional[float] = None, typed=False, ignore=()):
    """Caching decorator for memoizing function calls.

    The cache key is built from the full name of the decorated function and
    the arguments it is called with.

    This is based on `diskcache`'s `memoize`. Both coroutine functions and
    regular functions are supported. The returned wrapper keeps the metadata
    of the decorated function (name, docstring, ...) and additionally exposes:

    - `__cache_key__`: the function computing the cache key of a call;
    - `__memory__`: the cache object used to store the results;
    - `__wrapped__`: the decorated function.

    Parameters
    ----------
    expire
        Seconds until arguments expire.
    typed
        Cache different types separately.
    ignore
        Positional or keyword arguments to ignore.

    Returns
    -------
        A decorator function that can be applied to other functions.
    """

    def decorator(cached_function: Callable):
        memory = get_cache()

        base = (full_name(cached_function),)

        if asyncio.iscoroutinefunction(cached_function):  # pragma: no cover

            @functools.wraps(cached_function)
            async def wrapper(*args, **kwargs):
                # The flag is read at call time so that disabling the cache
                # also affects functions that are already decorated.
                if not _caching_enabled:
                    return await cached_function(*args, **kwargs)

                cache_key = wrapper.__cache_key__(*args, **kwargs)
                result = wrapper.__memory__.get(cache_key, default=ENOVAL, retry=True)

                # ENOVAL is the sentinel for a cache miss; a cached `None`
                # is a valid hit.
                if result is ENOVAL:
                    result = await cached_function(*args, **kwargs)
                    wrapper.__memory__.set(cache_key, result, expire, retry=True)

                return result

        else:

            @functools.wraps(cached_function)
            def wrapper(*args, **kwargs):
                if not _caching_enabled:
                    return cached_function(*args, **kwargs)

                cache_key = wrapper.__cache_key__(*args, **kwargs)
                result = wrapper.__memory__.get(cache_key, default=ENOVAL, retry=True)

                if result is ENOVAL:
                    result = cached_function(*args, **kwargs)
                    wrapper.__memory__.set(cache_key, result, expire, retry=True)

                return result

        def __cache_key__(*args, **kwargs):
            """Make key for cache given function arguments."""
            return args_to_key(base, args, kwargs, typed, ignore)

        # `functools.wraps` already sets `__wrapped__` to the decorated
        # function; only the cache-specific attributes are added here.
        wrapper.__cache_key__ = __cache_key__  # type: ignore
        wrapper.__memory__ = memory  # type: ignore

        return wrapper

    return decorator


def disable_cache():
    """Disable the cache for this session.

    Generative models output different results each time they are called when
    sampling. This can be a desirable property for some workflows, in which case
    one can call `outlines.caching.disable_cache` to disable the cache for the
    session.

    This function does not delete the cache, call
    `outlines.caching.clear_cache` instead. It also does not overwrite the
    cache with the values returned during the session.

    Example
    -------

    `disable_cache` should be called right after importing outlines:

    >>> import outlines.caching as cache
    >>> cache.disable_cache()

    """
    # Flip the module-level flag checked by the wrappers created in `cache`.
    global _caching_enabled
    _caching_enabled = False


def clear_cache():
    """Remove every entry from the cache returned by `get_cache`."""
    get_cache().clear()


@contextlib.contextmanager
def cache_disabled():
    """Context manager that turns caching off for the duration of the block.

    The value the flag had on entry is restored on exit, including when the
    block raises.
    """
    global _caching_enabled
    # Remember the current value and switch the flag off in one step.
    previous_state, _caching_enabled = _caching_enabled, False
    try:
        yield
    finally:
        _caching_enabled = previous_state


================================================
FILE: outlines/generator.py
================================================
"""Encapsulate a model and an output type into a reusable object."""

from typing import (
    Any,
    AsyncIterator,
    Iterator,
    List,
    Optional,
    Union,
)

from outlines.models import (
    AsyncBlackBoxModel,
    BlackBoxModel,
    SteerableModel,
)
from outlines.models.base import AsyncModel, Model
from outlines.backends import (
    get_cfg_logits_processor,
    get_json_schema_logits_processor,
    get_regex_logits_processor,
)
from outlines.backends.base import LogitsProcessorType
from outlines.types import CFG, JsonSchema
from outlines.types.dsl import python_types_to_terms, to_regex


class BlackBoxGenerator:
    """Synchronous generator for which we don't control constrained
    generation.

    No logits processor is built from the output type: it is forwarded
    unchanged to the model on every call.

    """
    output_type: Optional[Any]

    def __init__(self, model: BlackBoxModel, output_type: Optional[Any]):
        """
        Parameters
        ----------
        model
            An instance of an Outlines model.
        output_type
            The output type that will be used to constrain the generation.

        """
        self.output_type = output_type
        self.model = model

    def __call__(self, prompt: Any, **inference_kwargs) -> Any:
        """Generate a response from the model.

        Parameters
        ----------
        prompt
            The prompt to use to generate a response.
        **inference_kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        Any
            The response generated by the model.

        """
        response = self.model.generate(
            prompt, self.output_type, **inference_kwargs
        )
        return response

    def batch(self, prompts: List[Any], **inference_kwargs) -> List[Any]:
        """Generate a batch of responses from the model.

        Parameters
        ----------
        prompts
            The list of prompts to use to generate a batch of responses.
        **inference_kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        List[Any]
            The list of responses generated by the model.

        """
        responses = self.model.generate_batch(
            prompts, self.output_type, **inference_kwargs
        )
        return responses

    def stream(self, prompt: Any, **inference_kwargs) -> Iterator[Any]:
        """Generate a stream of responses from the model.

        Parameters
        ----------
        prompt
            The prompt to use to generate a response.
        **inference_kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        Iterator[Any]
            The value returned by the model's `generate_stream` method.

        """
        response_stream = self.model.generate_stream(
            prompt, self.output_type, **inference_kwargs
        )
        return response_stream


class AsyncBlackBoxGenerator:
    """Asynchronous generator for which we don't control constrained
    generation.

    No logits processor is built from the output type: it is forwarded
    unchanged to the model on every call.

    """
    output_type: Optional[Any]

    def __init__(self, model: AsyncBlackBoxModel, output_type: Optional[Any]):
        """
        Parameters
        ----------
        model
            An instance of an Outlines model.
        output_type
            The output type that will be used to constrain the generation.

        """
        self.output_type = output_type
        self.model = model

    async def __call__(self, prompt: Any, **inference_kwargs) -> Any:
        """Generate a response from the model.

        Parameters
        ----------
        prompt
            The prompt to use to generate a response.
        **inference_kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        Any
            The response generated by the model.

        """
        response = await self.model.generate(
            prompt, self.output_type, **inference_kwargs
        )
        return response

    async def batch(self, prompts: List[Any], **inference_kwargs) -> List[Any]:
        """Generate a batch of responses from the model.

        Parameters
        ----------
        prompts
            The list of prompts to use to generate a batch of responses.
        **inference_kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        List[Any]
            The list of responses generated by the model.

        """
        responses = await self.model.generate_batch(
            prompts, self.output_type, **inference_kwargs
        )
        return responses

    async def stream(self, prompt: Any, **inference_kwargs) -> AsyncIterator[Any]:
        """Generate a stream of responses from the model.

        Parameters
        ----------
        prompt
            The prompt to use to generate a response.
        **inference_kwargs
            Additional keyword arguments to pass to the model.

        Yields
        ------
        Any
            Each chunk produced by the model's `generate_stream` method.

        """
        chunks = self.model.generate_stream(
            prompt, self.output_type, **inference_kwargs
        )
        async for chunk in chunks:  # pragma: no cover
            yield chunk


class SteerableGenerator:
    """Represents a generator for which we control constrained generation.

    The generator builds the logits processor once (which can be quite
    expensive), stores it, and passes it to the model each time the generator
    is called.

    There are 2 ways of defining the constraint, each tied to its own way of
    creating the generator:
    - `output_type` (through `__init__`): an output type as defined in the
      `outlines.types` module
    - `processor` (through `from_processor`): an already built logits processor
       as defined in the `outlines.processors` module

    The 2 parameters are mutually exclusive.

    """
    logits_processor: Optional[LogitsProcessorType]

    def __init__(
        self,
        model: SteerableModel,
        output_type: Optional[Any],
        backend_name: Optional[str] = None,
    ):
        """
        Parameters
        ----------
        model
            An instance of an Outlines model.
        output_type
            The output type expressed as a Python type
        backend_name
            The name of the backend to use to create the logits processor.

        """
        self.model = model

        # Without an output type the generation is unconstrained.
        if output_type is None:
            self.logits_processor = None
            return

        # The kind of term decides which backend entry point is used: CFG
        # and JSON schema terms have their own, anything else goes through
        # a regex.
        term = python_types_to_terms(output_type)
        if isinstance(term, CFG):
            processor = get_cfg_logits_processor(
                backend_name,
                model,
                term.definition,
            )
        elif isinstance(term, JsonSchema):
            processor = get_json_schema_logits_processor(
                backend_name,
                model,
                term.schema,
            )
        else:
            processor = get_regex_logits_processor(
                backend_name,
                model,
                to_regex(term),
            )
        self.logits_processor = processor

    @classmethod
    def from_processor(
        cls, model: SteerableModel, processor: LogitsProcessorType
    ):
        """Create a generator from a logits processor.

        Parameters
        ----------
        model
            An instance of an Outlines model.
        processor
            An instance of a logits processor.

        """
        # `__init__` is bypassed on purpose: the processor is already built.
        generator = cls.__new__(cls)
        generator.model = model
        generator.logits_processor = processor

        return generator

    def _reset_logits_processor(self) -> None:
        """Reset the stored logits processor, if there is one."""
        if self.logits_processor is not None:
            self.logits_processor.reset()

    def __call__(self, prompt: Any, **inference_kwargs) -> Any:
        """Generate a response from the model.

        Parameters
        ----------
        prompt
            The prompt to use to generate a response.
        **inference_kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        Any
            The response generated by the model.

        """
        self._reset_logits_processor()
        return self.model.generate(
            prompt, self.logits_processor, **inference_kwargs
        )

    def batch(self, prompts: List[Any], **inference_kwargs) -> List[Any]:
        """Generate a batch of responses from the model.

        Parameters
        ----------
        prompts
            The list of prompts to use to generate a batch of responses.
        **inference_kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        List[Any]
            The list of responses generated by the model.

        """
        self._reset_logits_processor()
        return self.model.generate_batch(
            prompts, self.logits_processor, **inference_kwargs
        )

    def stream(self, prompt: Any, **inference_kwargs) -> Iterator[Any]:
        """Generate a stream of responses from the model.

        Parameters
        ----------
        prompt
            The prompt to use to generate a response.
        **inference_kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        Iterator[Any]
            The value returned by the model's `generate_stream` method.

        """
        self._reset_logits_processor()
        return self.model.generate_stream(
            prompt, self.logits_processor, **inference_kwargs
        )


def Generator(
    model: Union[Model, AsyncModel],
    output_type: Optional[Any] = None,
    backend: Optional[str] = None,
    *,
    processor: Optional[LogitsProcessorType] = None,
) -> Union[SteerableGenerator, BlackBoxGenerator, AsyncBlackBoxGenerator]:
    """Build the generator matching the given model and output parameters.

    `output_type` and `processor` are mutually exclusive. `processor` is only
    accepted for `SteerableModel` instances (typically local models) and is
    intended for advanced users.

    Parameters
    ----------
    model
        An instance of an Outlines model.
    output_type
        The output type expressed as a Python type or a type defined in the
        outlines.types.dsl module.
    backend
        The name of the backend to use to create the logits processor. Only
        used for steerable models if there is an output type and `processor` is
        not provided.
    processor
        An instance of a logits processor.

    Returns
    -------
    Union[SteerableGenerator, BlackBoxGenerator, AsyncBlackBoxGenerator]
        A generator instance.

    Raises
    ------
    ValueError
        If both `output_type` and `processor` are provided, or if `model` is
        not an instance of a supported model type.
    NotImplementedError
        If `processor` is provided for a model that is not steerable.

    """
    if output_type is not None and processor is not None:
        raise ValueError(
            "At most one of output_type or processor can be provided"
        )

    # Steerable models: either reuse the processor supplied by the caller or
    # let the generator build one from the output type and backend.
    if isinstance(model, SteerableModel): # type: ignore
        if processor is None:
            return SteerableGenerator(model, output_type, backend) # type: ignore
        return SteerableGenerator.from_processor(model, processor) # type: ignore

    # Everything below is a non-steerable model: no logits processor allowed.
    if processor is not None:
        raise NotImplementedError(
            "This model does not support logits processors"
        )

    if isinstance(model, AsyncBlackBoxModel): # type: ignore
        return AsyncBlackBoxGenerator(model, output_type) # type: ignore

    if isinstance(model, BlackBoxModel): # type: ignore
        return BlackBoxGenerator(model, output_type) # type: ignore

    raise ValueError(
        "The model argument must be an instance of "
        "SteerableModel, BlackBoxModel or AsyncBlackBoxModel"
    )


================================================
FILE: outlines/grammars/arithmetic.lark
================================================
// Grammar for arithmetic expressions built from numbers, the binary
// operators + - * /, unary minus and parentheses.
?start: sum

// Addition and subtraction (left-recursive).
?sum: product
| sum "+" product   -> add
| sum "-" product   -> sub

// Multiplication and division (left-recursive), nested inside `sum`.
?product: atom
| product "*" atom  -> mul
| product "/" atom  -> div

// Operands: a number literal, a negated atom, or a parenthesised sum.
?atom: NUMBER           -> number
| "-" atom         -> neg
| "(" sum ")"

// Terminals imported from the `common` grammar.
%import common.NUMBER
%import common.WS_INLINE

// Inline whitespace (WS_INLINE) is ignored between tokens.
%ignore WS_INLINE


================================================
FILE: outlines/grammars/common.lark
================================================
// Adapted from https://github.com/lark-parser/lark/blob/master/lark/grammars/common.lark

// Lark License:
// Copyright © 2017 Erez Shinan
//
// Permission is hereby granted, free of charge, to any person obtaining a copy of
// this software and associated documentation files (the "Software"), to deal in
// the Software without restriction, including without limitation the rights to
// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
// the Software, and to permit persons to whom the Software is furnished to do so,
// subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
// FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE AUTHORS OR
// COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
// IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
// CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


// Basic terminals for common use


//
// Numbers
//

// A single decimal digit, and a single hexadecimal digit (either case).
DIGIT: "0".."9"
HEXDIGIT: "a".."f"|"A".."F"|DIGIT

// Unsigned integer, integer with an optional sign, and a number with a
// decimal point (digits may be missing on one side of the point: "1." or ".5").
INT: DIGIT+
SIGNED_INT: ["+"|"-"] INT
DECIMAL: INT "." INT? | "." INT

// Rough regex equivalent (FLOAT itself carries no sign; see SIGNED_FLOAT):
// float = /-?\d+(\.\d+)?([eE][+-]?\d+)?/
_EXP: ("e"|"E") SIGNED_INT
FLOAT: INT _EXP | DECIMAL _EXP?
SIGNED_FLOAT: ["+"|"-"] FLOAT

// Any unsigned number (float or integer), and its optionally signed form.
NUMBER: FLOAT | INT
SIGNED_NUMBER: ["+"|"-"] NUMBER

// Double-quoted string that contains no double quote; escapes are not handled.
UNESCAPED_STRING: /\"[^"]*\"/

// based on `outlines/fsm/json_schema.py`
// _NON_CONTROL_CHAR: any character except `"`, `\` and the ranges
// \x00-\x1F and \x7F-\x9F.
// _ESCAPED_CHAR: a backslash followed by such a character, a backslash
// or a double quote.
_NON_CONTROL_CHAR: /([^"\\\x00-\x1F\x7F-\x9F])/
_ESCAPED_CHAR: /\\/ (_NON_CONTROL_CHAR | /\\/ | /"/)
ESCAPED_STRING_INNER: _NON_CONTROL_CHAR | _ESCAPED_CHAR
ESCAPED_STRING: /"/ ESCAPED_STRING_INNER* /"/


//
// Names (Variables)
//
LCASE_LETTER: "a".."z"
UCASE_LETTER: "A".."Z"

// ASCII letters only.
LETTER: UCASE_LETTER | LCASE_LETTER
WORD: LETTER+

// Identifier: starts with "_" or a letter, followed by "_", letters or digits.
CNAME: ("_"|LETTER) ("_"|LETTER|DIGIT)*


//
// Whitespace
//
// WS_INLINE: one or more spaces or tabs (no line breaks).
// WS: one or more of space, tab, form feed, carriage return or line feed.
WS_INLINE: (" "|/\t/)+
WS: /[ \t\f\r\n]/+

// NEWLINE: one or more line feeds, each optionally preceded by a carriage return.
CR : /\r/
LF : /\n/
NEWLINE: (CR? LF)+


// Comments
// SH, CPP and SQL comments run from their marker ("#", "//", "--") to the
// end of the line; C comments are delimited by "/*" and "*/" and the inner
// match is non-greedy.
SH_COMMENT: /#[^\n]*/
CPP_COMMENT: /\/\/[^\n]*/
C_COMMENT: "/*" /(.|\n)*?/ "*/"
SQL_COMMENT: /--[^\n]*/


================================================
FILE: outlines/grammars/json.lark
================================================
// Grammar for JSON documents: the start symbol is a single value.
?start: value

// A value is an object, an array, a string, a number or one of the
// literals true / false / null.
?value: object
| array
| ESCAPED_STRING
| SIGNED_NUMBER      -> number
| "true"             -> true
| "false"            -> false
| "null"             -> null

// Arrays and objects hold zero or more comma-separated items; object keys
// are strings.
array  : "[" [value ("," value)*] "]"
object : "{" [pair ("," pair)*] "}"
pair   : ESCAPED_STRING ":" value

// Terminals imported from the `common` grammar.
%import common.ESCAPED_STRING
%import common.SIGNED_NUMBER
%import common.WS

// Whitespace (WS) is ignored between tokens.
%ignore WS


================================================
FILE: outlines/grammars.py
================================================
"""A few common Lark grammars."""

from pathlib import Path

# Directory holding the grammar files shipped next to this module.
GRAMMAR_PATH = Path(__file__).parent / "grammars"


def read_grammar(
    grammar_file_name: str,
    base_grammar_path: Path = GRAMMAR_PATH,
) -> str:
    """Read a grammar file and return its contents.

    Parameters
    ----------
    grammar_file_name
        The name of the grammar file to read.
    base_grammar_path
        The path to the directory containing the grammar file. Defaults to
        the `grammars` directory located next to this module.

    Returns
    -------
    str
        The contents of the grammar file.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. it does not exist).

    """
    full_path = base_grammar_path / grammar_file_name
    # Decode explicitly as UTF-8: without it `open` falls back to the
    # locale's preferred encoding, so a grammar containing non-ASCII
    # characters could fail to load or be mis-decoded on some platforms.
    with open(full_path, encoding="utf-8") as file:
        return file.read()


# Grammar files read once at import time and exposed as plain strings.
arithmetic = read_grammar("arithmetic.lark")
# NOTE: inside this module the name `json` is bound to the grammar text,
# not to the standard-library `json` module.
json = read_grammar("json.lark")


================================================
FILE: outlines/inputs.py
================================================
"""Contain classes used to define the inputs of a model."""

import base64
from dataclasses import dataclass
from io import BytesIO
from typing import Any, Dict, List, Optional

from PIL import Image as PILImage


@dataclass
class Image:
    """Wrap a PIL image so that it can be passed to a multimodal model.

    Provide one or several instances of this class along with a text prompt
    in a list as the `model_input` argument to a model that supports vision.

    Parameters
    ----------
    image
        The image to use in the text generation. Its `format` attribute must
        be set.

    Attributes
    ----------
    image_str
        The image, saved in its own format and base64-encoded as a UTF-8
        string. Set in `__post_init__`.
    image_format
        A string of the form `image/<format>` with the format lower-cased.
        Set in `__post_init__`.

    """
    image: PILImage.Image

    def __post_init__(self):
        # The format is needed both to serialise the image and to build the
        # `image/<format>` string, so refuse images that do not carry one.
        source_format = self.image.format
        if not source_format:
            raise TypeError(
                "Could not read the format of the image passed to the model."
            )

        serialized = BytesIO()
        self.image.save(serialized, format=source_format)
        encoded = base64.b64encode(serialized.getvalue())

        self.image_str = encoded.decode("utf-8")
        self.image_format = f"image/{source_format.lower()}"


@dataclass
class Video:
    """Hold a video that can be passed to a multimodal model.

    Put one or more instances of this class, together with a text prompt, in
    the list given as the `model_input` argument of a model that supports
    video processing.

    Parameters
    ----------
    video
        The video to use in the text generation. The value is stored as is.

    """
    video: Any


@dataclass
class Audio:
    """Hold an audio clip that can be passed to a multimodal model.

    Put one or more instances of this class, together with a text prompt, in
    the list given as the `model_input` argument of a model that supports
    audio processing.

    Parameters
    ----------
    audio
        The audio to use in the text generation. The value is stored as is.

    """
    audio: Any


@dataclass
class Chat:
    """Contains the input for a chat model.

    Provide an instance of this class as the `model_input` argument to a model
    that supports chat.

    Each message contained in the messages list must be a dict with 'role' and
    'content' keys. The role can be 'user', 'assistant', or 'system'. The content
    supports either:
    - a text string,
    - a list containing text and assets (e.g., ["Describe...", Image(...)]),
    - only for HuggingFace transformers models, a list of dict items with explicit types (e.g.,
      [{"type": "text", "text": "Describe..."}, {"type": "image", "image": Image(...)}])

    Examples
    --------
    ```python
    # Initialize the chat with a system message.
    chat_prompt = Chat([
        {"role": "system", "content": "You are a helpful assistant."},
    ])

    # Add a user message with an image and call the model (not shown here).
    chat_prompt.add_user_message(["Describe the image below", Image(image)])

    # Add as an assistant message the response from the model.
    chat_prompt.add_assistant_message("There is a black cat sitting on a couch.")
    ```

    Parameters
    ----------
    messages
        The list of messages that will be provided to the model.

    """
    messages: List[Dict[str, Any]] = None # type: ignore

    def __post_init__(self):
        if self.messages is None:
            self.messages = []

    def append(self, message: Dict[str, Any]):
        """Add a message to the chat.

        Parameters
        ----------
        message
            The message to add to the chat.

        """
        self.messages.append(message)

    def extend(self, messages: List[Dict[str, Any]]):
        """Add a list of messages to the chat.

        Parameters
        ----------
        messages
            The list of messages to add to the chat.

        """
        self.messages.extend(messages)

    def pop(self) -> Dict[str, Any]:
        """Remove the last message from the chat.

        Returns
        -------
        message
            The removed message.

        """
        return self.messages.pop()

    def add_system_message(self, content: str | List[Any]):
        """Add a system message to the chat.

        Parameters
        ----------
        content
            The content of the system message.

        """
        self.messages.append({"role": "system", "content": content})

    def add_user_message(self, content: str | List[Any]):
        """Add a user message to the chat.

        Parameters
        ----------
        content
            The content of the user message.

        """
        self.messages.append({"role": "user", "content": content})

    def add_assistant_message(self, content: str | List[Any]):
        """Add an assistant message to the chat.

        Parameters
        ----------
        content
            The content of the assistant message.

        """
        self.messages.append({"role": "assistant", "content": content})

    def __str__(self):
        return "\n".join(str(message) for message in self.messages)

    def __repr__(self):
        return f"Chat(messages={self.messages})"


================================================
FILE: outlines/models/__init__.py
================================================
"""Module that contains all the models integrated in outlines.

We group the models in submodules by provider instead of theme (completion, chat
completion, diffusers, etc.) and use routing functions everywhere else in the
codebase.

"""

from typing import Union

from .anthropic import Anthropic, from_anthropic
from .base import Model, ModelTypeAdapter
from .dottxt import Dottxt, from_dottxt
from .gemini import Gemini, from_gemini
from .llamacpp import LlamaCpp, from_llamacpp
from .lmstudio import AsyncLMStudio, LMStudio, from_lmstudio
from .mistral import AsyncMistral, Mistral, from_mistral
from .mlxlm import MLXLM, from_mlxlm
from .ollama import AsyncOllama, Ollama, from_ollama
from .openai import AsyncOpenAI, OpenAI, from_openai
from .sglang import AsyncSGLang, SGLang, from_sglang
from .tgi import TGI, AsyncTGI, from_tgi
from .transformers import (
    Transformers,
    TransformersMultiModal,
    TransformerTokenizer,
    from_transformers,
)
from .vllm import VLLM, AsyncVLLM, from_vllm
from .vllm_offline import VLLMOffline, from_vllm_offline

# Union of the model classes that `Generator` routes to `SteerableGenerator`,
# i.e. the ones that accept a logits processor.
SteerableModel = Union[LlamaCpp, MLXLM, Transformers]
# Union of the model classes for which `Generator` builds a
# `BlackBoxGenerator`.
BlackBoxModel = Union[
    Anthropic,
    Dottxt,
    Gemini,
    LMStudio,
    Ollama,
    OpenAI,
    Mistral,
    SGLang,
    TGI,
    VLLM,
    VLLMOffline,
]
# Union of the model classes for which `Generator` builds an
# `AsyncBlackBoxGenerator`.
AsyncBlackBoxModel = Union[
    AsyncLMStudio,
    AsyncMistral,
    AsyncOllama,
    AsyncOpenAI,
    AsyncTGI,
    AsyncSGLang,
    AsyncVLLM,
]

# Public names of the package: the model classes, their `from_*` factory
# functions, the base classes and the union aliases defined in this module.
__all__ = [

    "Anthropic",
    "from_anthropic",
    "Model",
    "ModelTypeAdapter",
    "Dottxt",
    "from_dottxt",
    "Gemini",
    "from_gemini",
    "LlamaCpp",
    "from_llamacpp",
    "AsyncLMStudio",
    "LMStudio",
    "from_lmstudio",
    "AsyncMistral",
    "Mistral",
    "from_mistral",
    "MLXLM",
    "from_mlxlm",
    "AsyncOllama",
    "Ollama",
    "from_ollama",
    "AsyncOpenAI",
    "OpenAI",
    "from_openai",
    "AsyncSGLang",
    "SGLang",
    "from_sglang",
    "AsyncTGI",
    "TGI",
    "from_tgi",
    "Transformers",
    "TransformerTokenizer",
    "TransformersMultiModal",
    "from_transformers",
    "VLLMOffline",
    "from_vllm_offline",
    "AsyncVLLM",
    "VLLM",
    "from_vllm",
    "SteerableModel",
    "BlackBoxModel",
    "AsyncBlackBoxModel",
]


================================================
FILE: outlines/models/anthropic.py
================================================
"""Integration with Anthropic's API."""

from functools import singledispatchmethod
from typing import TYPE_CHECKING, Any, Iterator, Optional, Union

from outlines.inputs import Chat, Image
from outlines.models.base import Model, ModelTypeAdapter

if TYPE_CHECKING:
    from anthropic import Anthropic as AnthropicClient

__all__ = ["Anthropic", "from_anthropic"]


class AnthropicTypeAdapter(ModelTypeAdapter):
    """Type adapter for the `Anthropic` model.

    `AnthropicTypeAdapter` prepares the arguments given to Anthropic's
    `messages.create` method from the user input (a prompt, possibly
    accompanied by images).
    Anthropic does not support defining the output type, so
    `format_output_type` only accepts `None`.

    """

    @singledispatchmethod
    def format_input(self, model_input):
        """Generate the `messages` argument to pass to the client.

        This is the fallback of the dispatch: it is reached only for input
        types that have no registered handler.

        Parameters
        ----------
        model_input
            The input provided by the user.

        Returns
        -------
        dict
            The `messages` argument to pass to the client.

        Raises
        ------
        TypeError
            If the type of `model_input` is not supported.

        """
        raise TypeError(
            f"The input type {type(model_input)} is not available with "
            "Anthropic. The only available types are `str`, `list` and `Chat` "
            "(containing a prompt and images)."
        )

    @format_input.register(str)
    def format_str_model_input(self, model_input: str) -> dict:
        """Turn a text prompt into a single user message."""
        user_message = self._create_message("user", model_input)
        return {"messages": [user_message]}

    @format_input.register(list)
    def format_list_model_input(self, model_input: list) -> dict:
        """Turn a list (prompt followed by images) into a single user
        message.

        """
        user_message = self._create_message("user", model_input)
        return {"messages": [user_message]}

    @format_input.register(Chat)
    def format_chat_model_input(self, model_input: Chat) -> dict:
        """Generate the `messages` argument to pass to the client when the user
        passes a Chat instance.

        """
        formatted_messages = []
        for message in model_input.messages:
            formatted_messages.append(
                self._create_message(message["role"], message["content"])
            )
        return {"messages": formatted_messages}

    def _create_message(self, role: str, content: str | list) -> dict:
        """Create a message.

        Parameters
        ----------
        role
            The role to set on the message.
        content
            Either a string, or a list whose first item is the prompt and
            whose remaining items are `Image` instances.

        Returns
        -------
        dict
            A dict with the `role` and `content` keys.

        Raises
        ------
        ValueError
            If `content` is neither a string nor a list, or if an item after
            the first one is not an `Image`.

        """
        if isinstance(content, str):
            return {
                "role": role,
                "content": content,
            }

        if not isinstance(content, list):
            raise ValueError(
                f"Invalid content type: {type(content)}. "
                "The content must be a string or a list containing a string "
                "and a list of images."
            )

        prompt = content[0]
        assets = content[1:]

        if any(not isinstance(asset, Image) for asset in assets):
            raise ValueError("All assets provided must be of type Image")

        # The image blocks are placed before the text block.
        content_blocks = []
        for asset in assets:
            content_blocks.append(
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": asset.image_format,
                        "data": asset.image_str,
                    },
                }
            )
        content_blocks.append({"type": "text", "text": prompt})

        return {
            "role": role,
            "content": content_blocks,
        }

    def format_output_type(self, output_type):
        """Not implemented for Anthropic.

        Returns an empty dict when `output_type` is `None` and raises
        `NotImplementedError` for any other value.

        """
        if output_type is not None:
            raise NotImplementedError(
                f"The output type {output_type} is not available with "
                "Anthropic."
            )
        return {}


class Anthropic(Model):
    """Thin wrapper around the `anthropic.Anthropic` client.

    The wrapper converts the input and output types specified by users at a
    higher level into arguments for the `anthropic.Anthropic` client.

    """
    def __init__(
        self, client: "AnthropicClient", model_name: Optional[str] = None
    ):
        """
        Parameters
        ----------
        client
            An `anthropic.Anthropic` client.
        model_name
            The name of the model to use.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = AnthropicTypeAdapter()

    def _prepare_request(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Any],
        inference_kwargs: dict,
    ) -> dict:
        """Format the input and complete the inference arguments.

        The input is formatted first, then `output_type` is rejected if it
        is not `None`, and finally `inference_kwargs` is updated in place
        with the default model name when the caller did not specify one.

        Returns
        -------
        dict
            The formatted input, to be unpacked into `messages.create`.

        """
        request = self.type_adapter.format_input(model_input)

        if output_type is not None:
            raise NotImplementedError(
                f"The type {output_type} is not available with Anthropic."
            )

        if self.model_name is not None and "model" not in inference_kwargs:
            inference_kwargs["model"] = self.model_name

        return request

    def generate(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> str:
        """Generate text using Anthropic.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            As structured generation is not supported by Anthropic, the value
            of this argument must be `None`. Otherwise, an error will be
            raised at runtime.
        **inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        str
            The text of the first content block of the response.

        """
        request = self._prepare_request(
            model_input, output_type, inference_kwargs
        )
        completion = self.client.messages.create(
            **request,
            **inference_kwargs,
        )
        first_block = completion.content[0]
        return first_block.text

    def generate_batch(
        self,
        model_input,
        output_type = None,
        **inference_kwargs,
    ):
        """Always raise: batch generation is not supported."""
        raise NotImplementedError(
            "Anthropic does not support batch generation."
        )

    def generate_stream(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Iterator[str]:
        """Stream text using Anthropic.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            As structured generation is not supported by Anthropic, the value
            of this argument must be `None`. Otherwise, an error will be
            raised at runtime.
        **inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Iterator[str]
            An iterator that yields the text generated by the model.

        """
        request = self._prepare_request(
            model_input, output_type, inference_kwargs
        )
        events = self.client.messages.create(
            **request,
            stream=True,
            **inference_kwargs,
        )

        for event in events:
            # Only text deltas of content blocks are yielded; every other
            # event of the stream is skipped.
            if event.type != "content_block_delta":
                continue
            if event.delta.type != "text_delta":
                continue
            yield event.delta.text


def from_anthropic(
    client: "AnthropicClient", model_name: Optional[str] = None
) -> Anthropic:
    """Wrap an `anthropic.Anthropic` client in an Outlines `Anthropic` model.

    Parameters
    ----------
    client
        An `anthropic.Anthropic` client instance.
    model_name
        The name of the model to use.

    Returns
    -------
    Anthropic
        An Outlines `Anthropic` model instance.

    """
    model = Anthropic(client, model_name)
    return model


================================================
FILE: outlines/models/base.py
================================================
"""Base classes for all models and model type adapters."""

from abc import ABC, abstractmethod
from typing import Any, AsyncIterator, Iterator, List, Optional


class ModelTypeAdapter(ABC):
    """Abstract base class of the model type adapters.

    Each model holds a type adapter instance in its `type_adapter`
    attribute. The adapter converts the input and the output type given to
    the model into the format that this specific model expects.

    Subclasses must implement `format_input` and `format_output_type`.

    """

    @abstractmethod
    def format_input(self, model_input: Any) -> Any:
        """Convert the user input into the format expected by the model.

        For API-based models this typically means building the `messages`
        argument passed to the client. For local models it can mean, for
        instance, casting the input from str to list.
        Implementations also use this method to check that the type of the
        input is supported by the model.

        Parameters
        ----------
        model_input
            The input provided by the user.

        Returns
        -------
        Any
            The formatted input to be passed to the model.

        """
        ...

    @abstractmethod
    def format_output_type(self, output_type: Optional[Any] = None) -> Any:
        """Convert the output type into the format expected by the model.

        For black-box models this typically means building a
        `response_format` argument. For steerable models it means formatting
        the logits processor into the object type the model expects.

        Parameters
        ----------
        output_type
            The output type provided by the user.

        Returns
        -------
        Any
            The formatted output type to be passed to the model.

        """
        ...


class Model(ABC):
    """Base class for all synchronous models.

    This class defines shared `__call__`, `batch` and `stream` methods that can
    be used to call the model directly. Each of them builds a `Generator` for
    the model and delegates to it. The `generate`, `generate_batch`, and
    `generate_stream` methods must be implemented by the subclasses.
    All models inheriting from this class must define a `type_adapter`
    attribute of type `ModelTypeAdapter`. The methods of the `type_adapter`
    attribute are used in the `generate`, `generate_batch`, and
    `generate_stream` methods to format the input and output types received by
    the model.
    Additionally, steerable models must define a `tensor_library_name`
    attribute.

    """
    type_adapter: ModelTypeAdapter
    tensor_library_name: str

    def __call__(
        self,
        model_input: Any,
        output_type: Optional[Any] = None,
        backend: Optional[str] = None,
        **inference_kwargs: Any
    ) -> Any:
        """Call the model.

        Users can call the model directly, in which case we will create a
        generator instance with the output type provided and call it.
        Thus, those commands are equivalent:
        ```python
        generator = Generator(model, Foo)
        generator("prompt")
        ```
        and
        ```python
        model("prompt", Foo)
        ```

        Parameters
        ----------
        model_input
            The input provided by the user.
        output_type
            The output type provided by the user.
        backend
            The name of the backend to use to create the logits processor that
            will be used to generate the response. Only used for steerable
            models if `output_type` is provided.
        **inference_kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        Any
            The response generated by the model.

        """
        # Imported at call time rather than at module level.
        from outlines.generator import Generator

        generator = Generator(self, output_type, backend)
        return generator(model_input, **inference_kwargs)

    def batch(
        self,
        model_input: List[Any],
        output_type: Optional[Any] = None,
        backend: Optional[str] = None,
        **inference_kwargs: Any
    ) -> List[Any]:
        """Make a batch call to the model (several inputs at once).

        Users can use the `batch` method from the model directly, in which
        case we will create a generator instance with the output type provided
        and then invoke its `batch` method.
        Thus, those commands are equivalent:
        ```python
        generator = Generator(model, Foo)
        generator.batch(["prompt1", "prompt2"])
        ```
        and
        ```python
        model.batch(["prompt1", "prompt2"], Foo)
        ```

        Parameters
        ----------
        model_input
            The list of inputs provided by the user.
        output_type
            The output type provided by the user.
        backend
            The name of the backend to use to create the logits processor that
            will be used to generate the response. Only used for steerable
            models if `output_type` is provided.
        **inference_kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        List[Any]
            The list of responses generated by the model.

        """
        # Imported at call time rather than at module level.
        from outlines import Generator

        generator = Generator(self, output_type, backend)
        return generator.batch(model_input, **inference_kwargs) # type: ignore

    def stream(
        self,
        model_input: Any,
        output_type: Optional[Any] = None,
        backend: Optional[str] = None,
        **inference_kwargs: Any
    ) -> Iterator[Any]:
        """Stream a response from the model.

        Users can use the `stream` method from the model directly, in which
        case we will create a generator instance with the output type provided
        and then invoke its `stream` method.
        Thus, those commands are equivalent:
        ```python
        generator = Generator(model, Foo)
        for chunk in generator.stream("prompt"):
            print(chunk)
        ```
        and
        ```python
        for chunk in model.stream("prompt", Foo):
            print(chunk)
        ```

        Parameters
        ----------
        model_input
            The input provided by the user.
        output_type
            The output type provided by the user.
        backend
            The name of the backend to use to create the logits processor that
            will be used to generate the response. Only used for steerable
            models if `output_type` is provided.
        **inference_kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        Iterator[Any]
            A stream of responses from the model.

        """
        # Imported at call time rather than at module level.
        from outlines import Generator

        generator = Generator(self, output_type, backend)
        return generator.stream(model_input, **inference_kwargs) # type: ignore

    @abstractmethod
    def generate(
        self,
        model_input: Any,
        output_type: Optional[Any] = None,
        **inference_kwargs: Any
    ) -> Any:
        """Generate a response from the model.

        The output_type argument contains a logits processor for steerable
        models while it contains a type (Json, Enum...) for black-box models.
        This method is not intended to be used directly by end users.

        Parameters
        ----------
        model_input
            The input provided by the user.
        output_type
            The output type provided by the user.
        **inference_kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        Any
            The response generated by the model.

        """
        ...

    @abstractmethod
    def generate_batch(
        self,
        model_input: List[Any],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any
    ) -> List[Any]:
        """Generate a batch of responses from the model.

        The output_type argument contains a logits processor for steerable
        models while it contains a type (Json, Enum...) for black-box models.
        This method is not intended to be used directly by end users.

        Parameters
        ----------
        model_input
            The list of inputs provided by the user.
        output_type
            The output type provided by the user.
        **inference_kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        List[Any]
            The list of responses generated by the model.

        """
        ...

    @abstractmethod
    def generate_stream(
        self,
        model_input: Any,
        output_type: Optional[Any] = None,
        **inference_kwargs: Any
    ) -> Iterator[Any]:
        """Generate a stream of responses from the model.

        The output_type argument contains a logits processor for steerable
        models while it contains a type (Json, Enum...) for black-box models.
        This method is not intended to be used directly by end users.

        Parameters
        ----------
        model_input
            The input provided by the user.
        output_type
            The output type provided by the user.
        **inference_kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        Iterator[Any]
            A stream of responses from the model.

        """
        ...


class AsyncModel(ABC):
    """Base class for all asynchronous models.

    This class defines shared `__call__`, `batch` and `stream` methods that can
    be used to call the model directly. Each of them creates a `Generator`
    for the requested output type and delegates to it. The `generate`,
    `generate_batch`, and `generate_stream` methods must be implemented by
    the subclasses.
    All models inheriting from this class must define a `type_adapter`
    attribute of type `ModelTypeAdapter`. The methods of the `type_adapter`
    attribute are used in the `generate`, `generate_batch`, and
    `generate_stream` methods to format the input and output types received by
    the model.
    Additionally, steerable models must define a `tensor_library_name`
    attribute.

    """
    type_adapter: ModelTypeAdapter
    tensor_library_name: str

    async def __call__(
        self,
        model_input: Any,
        output_type: Optional[Any] = None,
        backend: Optional[str] = None,
        **inference_kwargs: Any
    ) -> Any:
        """Call the model.

        Users can call the model directly, in which case we will create a
        generator instance with the output type provided and call it.
        Thus, those commands are equivalent:
        ```python
        generator = Generator(model, Foo)
        await generator("prompt")
        ```
        and
        ```python
        await model("prompt", Foo)
        ```

        Parameters
        ----------
        model_input
            The input provided by the user.
        output_type
            The output type provided by the user.
        backend
            The name of the backend to use to create the logits processor that
            will be used to generate the response. Only used for steerable
            models if `output_type` is provided.
        **inference_kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        Any
            The response generated by the model.

        """
        # Imported at call time rather than at module import time.
        from outlines import Generator

        generator = Generator(self, output_type, backend)
        return await generator(model_input, **inference_kwargs)

    async def batch(
        self,
        model_input: List[Any],
        output_type: Optional[Any] = None,
        backend: Optional[str] = None,
        **inference_kwargs: Any
    ) -> List[Any]:
        """Make a batch call to the model (several inputs at once).

        Users can use the `batch` method from the model directly, in which
        case we will create a generator instance with the output type provided
        and then invoke its `batch` method.
        Thus, those commands are equivalent:
        ```python
        generator = Generator(model, Foo)
        await generator.batch(["prompt1", "prompt2"])
        ```
        and
        ```python
        await model.batch(["prompt1", "prompt2"], Foo)
        ```

        Parameters
        ----------
        model_input
            The list of inputs provided by the user.
        output_type
            The output type provided by the user.
        backend
            The name of the backend to use to create the logits processor that
            will be used to generate the response. Only used for steerable
            models if `output_type` is provided.
        **inference_kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        List[Any]
            The list of responses generated by the model.

        """
        # Imported at call time rather than at module import time.
        from outlines import Generator

        generator = Generator(self, output_type, backend)
        return await generator.batch(model_input, **inference_kwargs) # type: ignore

    async def stream(
        self,
        model_input: Any,
        output_type: Optional[Any] = None,
        backend: Optional[str] = None,
        **inference_kwargs: Any
    ) -> AsyncIterator[Any]:
        """Stream a response from the model.

        Users can use the `stream` method from the model directly, in which
        case we will create a generator instance with the output type provided
        and then invoke its `stream` method.
        Thus, those commands are equivalent:
        ```python
        generator = Generator(model, Foo)
        async for chunk in generator.stream("prompt"):
            print(chunk)
        ```
        and
        ```python
        async for chunk in model.stream("prompt", Foo):
            print(chunk)
        ```

        Parameters
        ----------
        model_input
            The input provided by the user.
        output_type
            The output type provided by the user.
        backend
            The name of the backend to use to create the logits processor that
            will be used to generate the response. Only used for steerable
            models if `output_type` is provided.
        **inference_kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        AsyncIterator[Any]
            A stream of responses from the model.

        """
        # Imported at call time rather than at module import time.
        from outlines import Generator

        generator = Generator(self, output_type, backend)

        # Re-yield every chunk so that this method is itself an async
        # generator that callers can iterate with `async for`.
        async for chunk in generator.stream(model_input, **inference_kwargs):  # type: ignore
            yield chunk

    @abstractmethod
    async def generate(
        self,
        model_input: Any,
        output_type: Optional[Any] = None,
        **inference_kwargs: Any
    ) -> Any:
        """Generate a response from the model.

        The output_type argument contains a logits processor for steerable
        models while it contains a type (Json, Enum...) for black-box models.
        This method is not intended to be used directly by end users.

        Parameters
        ----------
        model_input
            The input provided by the user.
        output_type
            The output type provided by the user.
        **inference_kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        Any
            The response generated by the model.

        """
        ...

    @abstractmethod
    async def generate_batch(
        self,
        model_input: List[Any],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any
    ) -> List[Any]:
        """Generate a batch of responses from the model.

        The output_type argument contains a logits processor for steerable
        models while it contains a type (Json, Enum...) for black-box models.
        This method is not intended to be used directly by end users.

        Parameters
        ----------
        model_input
            The list of inputs provided by the user.
        output_type
            The output type provided by the user.
        **inference_kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        List[Any]
            The list of responses generated by the model.

        """
        ...

    @abstractmethod
    async def generate_stream(
        self,
        model_input: Any,
        output_type: Optional[Any] = None,
        **inference_kwargs: Any
    ) -> AsyncIterator[Any]:
        """Generate a stream of responses from the model.

        The output_type argument contains a logits processor for steerable
        models while it contains a type (Json, Enum...) for black-box models.
        This method is not intended to be used directly by end users.

        Parameters
        ----------
        model_input
            The input provided by the user.
        output_type
            The output type provided by the user.
        **inference_kwargs
            Additional keyword arguments to pass to the model.

        Returns
        -------
        AsyncIterator[Any]
            A coroutine that will produce an async iterator of responses from the model.

        """
        ...


================================================
FILE: outlines/models/dottxt.py
================================================
"""Integration with Dottxt's API."""

from typing import TYPE_CHECKING, Any, Optional, cast

from outlines.models.base import Model, ModelTypeAdapter
from outlines.types import CFG, JsonSchema, Regex

if TYPE_CHECKING:
    from dottxt import Dottxt as DottxtClient

__all__ = ["Dottxt", "from_dottxt"]


class DottxtTypeAdapter(ModelTypeAdapter):
    """Type adapter for the `Dottxt` model.

    `DottxtTypeAdapter` validates the prompt and converts the output type
    into the JSON schema string passed to the Dottxt client.

    """

    def format_input(self, model_input: str) -> str:
        """Format the prompt to pass to the client.

        Parameters
        ----------
        model_input
            The input provided by the user.

        Returns
        -------
        str
            The input to pass to the client.

        Raises
        ------
        TypeError
            If `model_input` is not a string.

        """
        if isinstance(model_input, str):
            return model_input
        # Report the offending type, not the value, so the message matches
        # what it announces and does not echo arbitrary user input.
        raise TypeError(
            f"The input type {type(model_input)} is not available with "
            "Dottxt. The only available type is `str`."
        )

    def format_output_type(self, output_type: Optional[Any] = None) -> str:
        """Format the output type to pass to the client.

        Parameters
        ----------
        output_type
            The output type provided by the user.

        Returns
        -------
        str
            The output type to pass to the client.

        Raises
        ------
        TypeError
            If no output type is provided, or if the output type is a
            `Regex`, a `CFG`, or anything that is not a JSON schema type.

        """
        # Unsupported output types
        if output_type is None:
            raise TypeError(
                "You must provide an output type. Dottxt only supports "
                "constrained generation."
            )
        elif isinstance(output_type, Regex):
            raise TypeError(
                "Regex-based structured outputs will soon be available with "
                "Dottxt. Use an open source model in the meantime."
            )
        elif isinstance(output_type, CFG):
            raise TypeError(
                "CFG-based structured outputs will soon be available with "
                "Dottxt. Use an open source model in the meantime."
            )
        # JSON schema types are converted to their string representation
        elif JsonSchema.is_json_schema(output_type):
            return cast(str, JsonSchema.convert_to(output_type, ["str"]))
        else:
            # Fall back on the object itself when it has no `__name__`
            type_name = getattr(output_type, "__name__", output_type)
            raise TypeError(
                f"The type `{type_name}` is not supported by Dottxt. "
                "Consider using a local model instead."
            )


class Dottxt(Model):
    """Thin wrapper around the `dottxt.client.Dottxt` client.

    The wrapper translates the input and output types that users specify at
    a higher level into the arguments expected by the
    `dottxt.client.Dottxt` client.

    """

    def __init__(
        self,
        client: "DottxtClient",
        model_name: Optional[str] = None,
        model_revision: Optional[str] = None,
    ):
        """
        Parameters
        ----------
        client
            A `dottxt.Dottxt` client.
        model_name
            The name of the model to use.
        model_revision
            The revision of the model to use.

        """
        self.type_adapter = DottxtTypeAdapter()
        self.client = client
        self.model_name = model_name
        self.model_revision = model_revision

    def generate(
        self,
        model_input: str,
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> str:
        """Generate text using Dottxt.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema.
        **inference_kwargs
            Additional keyword arguments to pass to the client. The
            `model_name` and `model_revision` given at initialization are
            added when they are set and not overridden here.

        Returns
        -------
        str
            The text generated by the model.

        """
        prompt = self.type_adapter.format_input(model_input)
        json_schema = self.type_adapter.format_output_type(output_type)

        # Instance-level settings only fill in what the caller did not pass.
        instance_defaults = {
            "model_name": self.model_name,
            "model_revision": self.model_revision,
        }
        for key, value in instance_defaults.items():
            if value is not None and key not in inference_kwargs:
                inference_kwargs[key] = value

        return self.client.json(prompt, json_schema, **inference_kwargs).data

    def generate_batch(
        self,
        model_input,
        output_type=None,
        **inference_kwargs,
    ):
        """Not available for Dottxt."""
        raise NotImplementedError("Dottxt does not support batch generation.")

    def generate_stream(
        self,
        model_input,
        output_type=None,
        **inference_kwargs,
    ):
        """Not available for Dottxt."""
        raise NotImplementedError(
            "Dottxt does not support streaming. Call the model/generator for "
            "regular generation instead."
        )


def from_dottxt(
    client: "DottxtClient",
    model_name: Optional[str] = None,
    model_revision: Optional[str] = None,
) -> Dottxt:
    """Build an Outlines `Dottxt` model from a `dottxt.Dottxt` client.

    Parameters
    ----------
    client
        A `dottxt.Dottxt` client instance.
    model_name
        The name of the model to use.
    model_revision
        The revision of the model to use.

    Returns
    -------
    Dottxt
        An Outlines `Dottxt` model instance.

    """
    return Dottxt(
        client=client,
        model_name=model_name,
        model_revision=model_revision,
    )


================================================
FILE: outlines/models/gemini.py
================================================
"""Integration with Gemini's API."""

from functools import singledispatchmethod
from typing import (
    TYPE_CHECKING,
    Any,
    Iterator,
    Optional,
    Union,
    get_args,
)

from outlines.inputs import Image, Chat
from outlines.models.base import Model, ModelTypeAdapter
from outlines.types import CFG, Choice, JsonSchema, Regex
from outlines.types.utils import (
    is_enum,
    get_enum_from_choice,
    get_enum_from_literal,
    is_genson_schema_builder,
    is_literal,
    is_typing_list,
)

if TYPE_CHECKING:
    from google.genai import Client

__all__ = ["Gemini", "from_gemini"]


class GeminiTypeAdapter(ModelTypeAdapter):
    """Type adapter for the `Gemini` model.

    `GeminiTypeAdapter` is responsible for preparing the arguments to Gemini's
    client `models.generate_content` method: the input (prompt and possibly
    image), as well as the output type (either JSON or multiple choice).

    """

    @singledispatchmethod
    def format_input(self, model_input):
        """Generate the `contents` argument to pass to the client.

        Parameters
        ----------
        model_input
            The input provided by the user.

        Returns
        -------
        dict
            The `contents` argument to pass to the client.

        Raises
        ------
        TypeError
            If the input is not a `str`, a `list` or a `Chat`.

        """
        raise TypeError(
            f"The input type {type(model_input)} is not available with "
            "Gemini. The only available types are `str`, `list` and `Chat` "
            "(containing a prompt and images)."
        )

    @format_input.register(str)
    def format_str_model_input(self, model_input: str) -> dict:
        """Generate the `contents` argument for a plain string prompt."""
        return {"contents": [self._create_text_part(model_input)]}

    @format_input.register(list)
    def format_list_model_input(self, model_input: list) -> dict:
        """Generate the `contents` argument for a list input, sent as a
        single user message.

        """
        return {
            "contents": [
                self._create_message("user", model_input)
            ]
        }

    @format_input.register(Chat)
    def format_chat_model_input(self, model_input: Chat) -> dict:
        """Generate the `contents` argument to pass to the client when the user
        passes a Chat instance.

        """
        return {
            "contents": [
                self._create_message(message["role"], message["content"])
                for message in model_input.messages
            ]
        }

    def _create_message(self, role: str, content: Union[str, list]) -> dict:
        """Create a message.

        Parameters
        ----------
        role
            The role of the author of the message. `"assistant"` is
            translated to `"model"`.
        content
            Either the text of the message, or a list whose first element is
            the text and whose remaining elements are `Image` instances.

        Returns
        -------
        dict
            A dictionary with the `role` and `parts` keys.

        Raises
        ------
        ValueError
            If `content` is neither a string nor a list, or if an element
            after the first one of a list is not an `Image`.

        """
        # Gemini uses "model" instead of "assistant"
        if role == "assistant":
            role = "model"

        if isinstance(content, str):
            return {
                "role": role,
                "parts": [self._create_text_part(content)],
            }

        if isinstance(content, list):
            prompt = content[0]
            images = content[1:]

            if not all(isinstance(image, Image) for image in images):
                raise ValueError("All assets provided must be of type Image")

            image_parts = [
                self._create_img_part(image)
                for image in images
            ]

            return {
                "role": role,
                "parts": [
                    self._create_text_part(prompt),
                    *image_parts,
                ],
            }

        raise ValueError(
            f"Invalid content type: {type(content)}. "
            "The content must be a string or a list containing a string "
            "and a list of images."
        )

    def _create_text_part(self, text: str) -> dict:
        """Create a text input part for a message."""
        return {
            "text": text,
        }

    def _create_img_part(self, image: Image) -> dict:
        """Create an image input part for a message."""
        return {
            "inline_data": {
                "mime_type": image.image_format,
                "data": image.image_str,
            }
        }

    def format_output_type(self, output_type: Optional[Any] = None) -> dict:
        """Generate the `generation_config` argument to pass to the client.

        Parameters
        ----------
        output_type
            The output type provided by the user.

        Returns
        -------
        dict
            The `generation_config` argument to pass to the client. It is
            empty when no output type is provided.

        Raises
        ------
        TypeError
            If the output type is a `Regex`, a `CFG`, or any other type that
            is not handled below.

        """

        # Unsupported output types
        if isinstance(output_type, Regex):
            raise TypeError(
                "Neither regex-based structured outputs nor the `pattern` "
                "keyword in Json Schema are available with Gemini. Use an "
                "open source model or dottxt instead."
            )
        elif isinstance(output_type, CFG):
            raise TypeError(
                "CFG-based structured outputs are not available with Gemini. "
                "Use an open source model or dottxt instead."
            )

        if output_type is None:
            return {}

        # JSON schema types
        elif JsonSchema.is_json_schema(output_type):
            return self.format_json_output_type(
                JsonSchema.convert_to(
                    output_type,
                    ["dataclass", "typeddict", "pydantic"]
                )
            )

        # List of structured types
        elif is_typing_list(output_type):
            return self.format_list_output_type(output_type)

        # Multiple choice types
        elif is_enum(output_type):
            return self.format_enum_output_type(output_type)
        elif is_literal(output_type):
            enum = get_enum_from_literal(output_type)
            return self.format_enum_output_type(enum)
        elif isinstance(output_type, Choice):
            enum = get_enum_from_choice(output_type)
            return self.format_enum_output_type(enum)

        else:
            # Fall back on the object itself when it has no `__name__`
            type_name = getattr(output_type, "__name__", output_type)
            raise TypeError(
                f"The type `{type_name}` is not supported by Gemini. "
                "Consider using a local model or dottxt instead."
            )

    def format_enum_output_type(self, output_type: Optional[Any]) -> dict:
        """Generate the generation config for a multiple choice output."""
        return {
            "response_mime_type": "text/x.enum",
            "response_schema": output_type,
        }

    def format_json_output_type(self, output_type: Optional[Any]) -> dict:
        """Generate the generation config for a JSON output."""
        return {
            "response_mime_type": "application/json",
            "response_schema": output_type,
        }

    def format_list_output_type(self, output_type: Optional[Any]) -> dict:
        """Generate the generation config for a list of JSON schema types.

        Raises
        ------
        TypeError
            If the list type does not have exactly one type argument, or if
            that argument is not a JSON schema type.

        """
        args = get_args(output_type)

        if len(args) == 1:
            item_type = args[0]

            if JsonSchema.is_json_schema(item_type):
                return {
                    "response_mime_type": "application/json",
                    "response_schema": list[  # type: ignore
                        JsonSchema.convert_to(
                            item_type,
                            ["dataclass", "typeddict", "pydantic"]
                        )
                    ],
                }
            else:
                raise TypeError(
                    "The list items output type must contain a JSON schema "
                    "type."
                )

        raise TypeError(
            "Gemini only supports homogeneous lists: "
            "list[BaseModel], list[TypedDict] or list[dataclass]. "
            f"Got {output_type} instead."
        )


class Gemini(Model):
    """Thin wrapper around the `google.genai.Client` client.

    The wrapper translates the input and output types that users specify at
    a higher level into the arguments expected by the `google.genai.Client`
    client.

    """

    def __init__(self, client: "Client", model_name: Optional[str] = None):
        """
        Parameters
        ----------
        client
            A `google.genai.Client` instance.
        model_name
            The name of the model to use.

        """
        self.type_adapter = GeminiTypeAdapter()
        self.client = client
        self.model_name = model_name

    def generate(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Any] = None,
        **inference_kwargs,
    ) -> str:
        """Generate a response from the model.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema, a list of such types, or a multiple choice type.
        **inference_kwargs
            Additional keyword arguments to pass to the client. A `model`
            entry overrides the model name given at initialization; the
            other entries are merged into the generation config.

        Returns
        -------
        str
            The response generated by the model.

        """
        contents = self.type_adapter.format_input(model_input)
        generation_config = self.type_adapter.format_output_type(output_type)

        # `model` is taken out first so that it does not end up in `config`.
        model_name = inference_kwargs.pop("model", self.model_name)
        config = {**generation_config, **inference_kwargs}

        return self.client.models.generate_content(
            **contents,
            model=model_name,
            config=config,
        ).text

    def generate_batch(
        self,
        model_input,
        output_type=None,
        **inference_kwargs,
    ):
        """Not available for Gemini."""
        raise NotImplementedError("Gemini does not support batch generation.")

    def generate_stream(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Any] = None,
        **inference_kwargs,
    ) -> Iterator[str]:
        """Generate a stream of responses from the model.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema, a list of such types, or a multiple choice type.
        **inference_kwargs
            Additional keyword arguments to pass to the client. A `model`
            entry overrides the model name given at initialization; the
            other entries are merged into the generation config.

        Returns
        -------
        Iterator[str]
            An iterator that yields the text generated by the model.

        """
        contents = self.type_adapter.format_input(model_input)
        generation_config = self.type_adapter.format_output_type(output_type)

        # `model` is taken out first so that it does not end up in `config`.
        model_name = inference_kwargs.pop("model", self.model_name)
        config = {**generation_config, **inference_kwargs}

        stream = self.client.models.generate_content_stream(
            **contents,
            model=model_name,
            config=config,
        )

        # Chunks without a `text` attribute or with empty text are skipped.
        for chunk in stream:
            text = getattr(chunk, "text", None)
            if text:
                yield text


def from_gemini(client: "Client", model_name: Optional[str] = None) -> Gemini:
    """Build an Outlines `Gemini` model from a `google.genai.Client` instance.

    Parameters
    ----------
    client
        A `google.genai.Client` instance.
    model_name
        The name of the model to use.

    Returns
    -------
    Gemini
        An Outlines `Gemini` model instance.

    """
    return Gemini(client=client, model_name=model_name)


================================================
FILE: outlines/models/llamacpp.py
================================================
"""Integration with the `llama-cpp-python` library."""

import ctypes
from functools import singledispatchmethod
from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    Iterator,
    List,
    Optional,
    Set,
    Tuple,
    Union,
)

from outlines.inputs import Chat
from outlines.models.base import Model, ModelTypeAdapter
from outlines.models.tokenizer import Tokenizer
from outlines.processors import OutlinesLogitsProcessor

if TYPE_CHECKING:
    from llama_cpp import Llama, LogitsProcessorList

__all__ = ["LlamaCpp", "from_llamacpp"]


class LlamaCppTokenizer(Tokenizer):
    """Tokenizer built from the tokenizer of a `llama_cpp.Llama` model.

    The vocabulary and the EOS token are read from the Hugging Face
    tokenizer attached to the model when there is one; otherwise they are
    extracted token by token through the `llama_cpp` bindings. The padding
    token id is set to the EOS token id.

    """

    def __init__(self, model: "Llama"):
        """
        Parameters
        ----------
        model
            A `llama_cpp.Llama` model instance.

        """
        self.tokenizer = model.tokenizer()
        self.special_tokens: Set[str] = set()
        self.vocabulary: Dict[str, int] = dict()

        # TODO: Remove when https://github.com/ggerganov/llama.cpp/pull/5613
        # is resolved
        self._hf_tokenizer = None
        if (
            hasattr(model, "tokenizer_")
            and hasattr(model.tokenizer_, "hf_tokenizer")
        ):
            self._hf_tokenizer = model.tokenizer_.hf_tokenizer
            self.eos_token_id = self._hf_tokenizer.eos_token_id
            self.eos_token = self._hf_tokenizer.eos_token
            self.vocabulary = self._hf_tokenizer.get_vocab()
        else:
            from llama_cpp import (
                llama_model_get_vocab,
                llama_token_to_piece,
            )

            self.eos_token_id = model.token_eos()
            size = 32
            buffer = (ctypes.c_char * size)()
            vocab = llama_model_get_vocab(model.model)
            for i in range(model.n_vocab()):
                piece_buffer = buffer
                n = llama_token_to_piece(
                    vocab,
                    i,
                    piece_buffer,
                    size,
                    0,
                    True
                )
                # llama.cpp reports a buffer that is too small by returning
                # the negated number of bytes the piece needs. Retry with a
                # buffer of that size so that long tokens are neither
                # dropped nor collapsed onto a truncated piece.
                required = -n if n < 0 else n
                if required > size:
                    piece_buffer = (ctypes.c_char * required)()
                    n = llama_token_to_piece(
                        vocab,
                        i,
                        piece_buffer,
                        required,
                        0,
                        True
                    )
                    # Still failing with a large enough buffer: skip the
                    # token so that it does not pollute the vocabulary.
                    if n < 0:
                        continue
                token_piece = piece_buffer[:n].decode("utf-8", errors="replace")  # type: ignore
                self.vocabulary[token_piece] = i
                if i == self.eos_token_id:
                    self.eos_token = token_piece

        self.pad_token_id = self.eos_token_id
        # ensure stable ordering of vocabulary
        self.vocabulary = {
            tok: tok_id
            for tok, tok_id
            in sorted(self.vocabulary.items(), key=lambda x: x[1])
        }
        # Lazily computed by `__hash__`
        self._hash = None

    def decode(self, token_ids: List[int]) -> List[str]:
        """Detokenize the token ids into a list containing a single string.

        Bytes that are not valid UTF-8 are ignored.

        """
        decoded_bytes = self.tokenizer.detokenize(token_ids)
        return [decoded_bytes.decode("utf-8", errors="ignore")]

    def encode(
        self,
        prompt: Union[str, List[str]],
        add_bos: bool = True,
        special: bool = True,
    ) -> Tuple[List[int], List[int]]:
        """Tokenize a single prompt.

        Parameters
        ----------
        prompt
            The prompt to tokenize. Lists of prompts are not supported.
        add_bos
            Forwarded to the underlying tokenizer's `tokenize` method.
        special
            Forwarded to the underlying tokenizer's `tokenize` method.

        Returns
        -------
        Tuple[List[int], List[int]]
            The token ids and an attention mask of the same length that
            contains only ones.

        Raises
        ------
        NotImplementedError
            If `prompt` is a list.

        """
        if isinstance(prompt, list):
            raise NotImplementedError(
                "llama-cpp-python tokenizer doesn't support batch tokenization"
            )
        token_ids = self.tokenizer.tokenize(
            prompt.encode("utf-8", errors="ignore"),
            add_bos=add_bos,
            special=special,
        )
        # generate attention mask, missing from llama-cpp-python.
        # For a single (non-batched) prompt there is no real padding, so
        # every token, including EOS when it appears inside the prompt,
        # should be attended. We therefore always set the mask to 1.
        attention_mask = [1] * len(token_ids)
        return token_ids, attention_mask

    def convert_token_to_string(self, token: str) -> str:
        """Convert a vocabulary token to its string form.

        Without a Hugging Face tokenizer the token is returned unchanged.
        With one, a leading space is restored for tokens that start with
        `SPIECE_UNDERLINE` and for the `<0x20>` token.

        """
        if self._hf_tokenizer is not None:
            from transformers.file_utils import SPIECE_UNDERLINE

            token_str = self._hf_tokenizer.convert_tokens_to_string([token])
            if (
                token.startswith(SPIECE_UNDERLINE)
                or token == "<0x20>"
            ):  # pragma: no cover
                token_str = " " + token_str
            return token_str
        else:
            return token

    def __eq__(self, other):
        """Compare two tokenizers through their `__getstate__` value."""
        if not isinstance(other, LlamaCppTokenizer):
            return False
        return self.__getstate__() == other.__getstate__()

    def __hash__(self):
        """Return a hash of the tokenizer state, computed once and cached."""
        # We create a custom hash as pickle.dumps(self) is not stable
        if self._hash is None:
            self._hash = hash((
                tuple(sorted(self.vocabulary.items())),
                self.eos_token_id,
                self.eos_token,
                self.pad_token_id,
                tuple(sorted(self.special_tokens)),
            ))
        return self._hash

    def __getstate__(self):
        """Create a stable representation for outlines.caching"""
        return (
            self.vocabulary,
            self.eos_token_id,
            self.eos_token,
            self.pad_token_id,
            sorted(self.special_tokens),
        )

    def __setstate__(self, state):
        """Unpickling is not supported; always raises."""
        raise NotImplementedError("Cannot load a pickled llamacpp tokenizer")


class LlamaCppTypeAdapter(ModelTypeAdapter):
    """Type adapter for the `LlamaCpp` model.

    `LlamaCppTypeAdapter` is responsible for preparing the arguments to
    the `Llama` object text generation methods.

    """

    def __init__(self, has_chat_template: bool = False):
        """
        Parameters
        ----------
        has_chat_template
            Whether the model has a chat template defined.
        """
        self.has_chat_template = has_chat_template

    @singledispatchmethod
    def format_input(self, model_input):
        """Generate the prompt argument to pass to the model.

        Parameters
        ----------
        model_input
            The input provided by the user.

        Returns
        -------
        str | list
            The formatted input to be passed to the model: a string for a
            plain text prompt, or a list of message dictionaries.

        Raises
        ------
        NotImplementedError
            If the input is neither a `str` nor a `Chat`.

        """
        raise NotImplementedError(
            f"The input type {type(model_input)} is not available with "
            "LlamaCpp. The only available types are `str` and `Chat`."
        )

    @format_input.register(str)
    def format_str_input(self, model_input: str) -> Union[str, list]:
        """Format a string input.

        The string is wrapped in a single user message when
        `has_chat_template` is set, and returned unchanged otherwise.

        """
        if self.has_chat_template:
            return [{"role": "user", "content": model_input}]
        return model_input

    @format_input.register(Chat)
    def format_chat_input(self, model_input: Chat) -> list:
        """Format a `Chat` input as a list of message dictionaries.

        Raises
        ------
        ValueError
            If the content of a message is not a string.

        """
        if not all(
            isinstance(message["content"], str)
            for message in model_input.messages
        ):
            raise ValueError(
                "LlamaCpp does not support multi-modal messages. "
                "The content of each message must be a string."
            )

        return [
            {
                "role": message["role"],
                "content": message["content"],
            }
            for message in model_input.messages
        ]

    def format_output_type(
        self, output_type: Optional[OutlinesLogitsProcessor] = None,
    ) -> Optional["LogitsProcessorList"]:
        """Generate the logits processor argument to pass to the model.

        Parameters
        ----------
        output_type
            The logits processor provided.

        Returns
        -------
        LogitsProcessorList
            The logits processor to pass to the model, or `None` when no
            logits processor is provided.

        """
        from llama_cpp import LogitsProcessorList

        if output_type is not None:
            return LogitsProcessorList([output_type])
        return None


class LlamaCpp(Model):
    """Thin wrapper around the `llama_cpp.Llama` model.

    The wrapper translates the input and output types users work with at a
    higher level into arguments for the `llama_cpp.Llama` model.
    """

    tensor_library_name = "numpy"

    def __init__(self, model: "Llama", chat_mode: bool = True):
        """
        Parameters
        ----------
        model
            A `llama_cpp.Llama` model instance.
        chat_mode
            Whether to enable chat mode. If `False`, the model will regard
            all `str` inputs as plain text prompts. If `True`, the model will
            regard all `str` inputs as user messages in a chat conversation.

        """
        self.model = model
        self.tokenizer = LlamaCppTokenizer(self.model)

        # llama-cpp-python falls back to a default chat template even when
        # the user has not configured one explicitly:
        # https://github.com/abetlen/llama-cpp-python/blob/c37132b/llama_cpp/llama.py#L540-L545
        # `chat_mode` therefore defaults to True, in line with the upstream
        # library generally favoring chat-style usage.
        self.type_adapter = LlamaCppTypeAdapter(has_chat_template=chat_mode)

    def generate(
        self,
        model_input: Union[Chat, str],
        output_type: Optional[OutlinesLogitsProcessor] = None,
        **inference_kwargs: Any,
    ) -> str:
        """Generate text using `llama-cpp-python`.

        A `str` prompt goes through the plain completion call of the model,
        a `list` prompt (chat messages) through `create_chat_completion`.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The logits processor the model will use to constrain the format of
            the generated text.
        **inference_kwargs
            Additional keyword arguments to pass to the `Llama.__call__`
            method of the `llama-cpp-python` library.

        Returns
        -------
        str
            The text generated by the model.

        """
        prompt = self.type_adapter.format_input(model_input)
        if not isinstance(prompt, (str, list)):  # Never reached  # pragma: no cover
            raise ValueError("Unexpected prompt type.")

        logits_processor = self.type_adapter.format_output_type(output_type)

        if isinstance(prompt, str):
            completion = self.model(
                prompt,
                logits_processor=logits_processor,
                **inference_kwargs,
            )
            text = completion["choices"][0]["text"]
        else:
            completion = self.model.create_chat_completion(
                prompt,
                logits_processor=logits_processor,
                **inference_kwargs,
            )
            text = completion["choices"][0]["message"]["content"]

        # The model is reset once the completion has been extracted.
        self.model.reset()

        return text

    def generate_batch(
        self,
        model_input,
        output_type=None,
        **inference_kwargs,
    ):
        """Batch generation is not available; always raises."""
        raise NotImplementedError("LlamaCpp does not support batch generation.")

    def generate_stream(
        self,
        model_input: Union[Chat, str],
        output_type: Optional[OutlinesLogitsProcessor] = None,
        **inference_kwargs: Any,
    ) -> Iterator[str]:
        """Stream text using `llama-cpp-python`.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The logits processor the model will use to constrain the format of
            the generated text.
        **inference_kwargs
            Additional keyword arguments to pass to the `Llama.__call__`
            method of the `llama-cpp-python` library.

        Returns
        -------
        Iterator[str]
            An iterator that yields the text generated by the model.

        """
        prompt = self.type_adapter.format_input(model_input)
        if not isinstance(prompt, (str, list)):  # Never reached  # pragma: no cover
            raise ValueError("Unexpected prompt type.")

        logits_processor = self.type_adapter.format_output_type(output_type)

        if isinstance(prompt, str):
            chunks = self.model(
                prompt,
                logits_processor=logits_processor,
                stream=True,
                **inference_kwargs,
            )
            for chunk in chunks:
                yield chunk["choices"][0]["text"]
        else:
            chunks = self.model.create_chat_completion(
                prompt,
                logits_processor=logits_processor,
                stream=True,
                **inference_kwargs,
            )
            for chunk in chunks:
                # A chat delta without a "content" key yields an empty string.
                yield chunk["choices"][0]["delta"].get("content", "")

def from_llamacpp(model: "Llama", chat_mode: bool = True) -> LlamaCpp:
    """Wrap a `llama_cpp.Llama` instance in an Outlines `LlamaCpp` model.

    Parameters
    ----------
    model
        A `llama_cpp.Llama` instance.
    chat_mode
        Whether to enable chat mode. If `False`, the model will regard
        all `str` inputs as plain text prompts. If `True`, the model will
        regard all `str` inputs as user messages in a chat conversation.

    Returns
    -------
    LlamaCpp
        An Outlines `LlamaCpp` model instance.

    """
    wrapped = LlamaCpp(model, chat_mode=chat_mode)
    return wrapped


================================================
FILE: outlines/models/lmstudio.py
================================================
"""Integration with the `lmstudio` library."""

from functools import singledispatchmethod
from typing import (
    TYPE_CHECKING,
    Any,
    AsyncIterator,
    Iterator,
    Optional,
    Union,
    cast,
)

from outlines.inputs import Chat, Image
from outlines.models.base import AsyncModel, Model, ModelTypeAdapter
from outlines.types import CFG, JsonSchema, Regex

if TYPE_CHECKING:
    from lmstudio import AsyncClient, Chat as LMStudioChat, Client

__all__ = ["LMStudio", "AsyncLMStudio", "from_lmstudio"]


class LMStudioTypeAdapter(ModelTypeAdapter):
    """Type adapter for the `LMStudio` model."""

    def _prepare_lmstudio_image(self, image: Image):
        """Turn an Outlines `Image` into an LMStudio image handle.

        LMStudio's SDK only accepts file paths, raw bytes, or binary IO objects.
        Unlike Ollama which accepts base64 directly, we must decode from base64.
        """
        import base64

        import lmstudio as lms

        raw_bytes = base64.b64decode(image.image_str)
        return lms.prepare_image(raw_bytes)

    @staticmethod
    def _split_prompt_and_images(parts: list) -> tuple:
        """Split `[prompt, image, ...]` into the prompt and its images.

        Raises a `ValueError` if any item after the first one is not an
        `Image`.
        """
        prompt = parts[0]
        images = parts[1:]
        for asset in images:
            if not isinstance(asset, Image):
                raise ValueError("All assets provided must be of type Image")
        return prompt, images

    @singledispatchmethod
    def format_input(self, model_input):
        """Format input for LMStudio model.

        This is the fallback used when no implementation is registered for
        the type of `model_input`; it always raises.

        Parameters
        ----------
        model_input
            The input provided by the user.

        Returns
        -------
        str | LMStudioChat
            The formatted input to be passed to the model.

        Raises
        ------
        TypeError
            Always, because the type of `model_input` is not supported.

        """
        unsupported_type = type(model_input)
        raise TypeError(
            f"The input type {unsupported_type} is not available with "
            "LMStudio. The only available types are `str`, `list` and `Chat`."
        )

    @format_input.register(str)
    def format_str_model_input(self, model_input: str) -> str:
        """Return the string prompt unchanged."""
        return model_input

    @format_input.register(list)
    def format_list_model_input(self, model_input: list) -> "LMStudioChat":
        """Build a single-user-message chat from a prompt and its images."""
        from lmstudio import Chat as LMSChat

        prompt, images = self._split_prompt_and_images(model_input)

        conversation = LMSChat()
        handles = [self._prepare_lmstudio_image(image) for image in images]
        conversation.add_user_message(prompt, images=handles)
        return conversation

    @format_input.register(Chat)
    def format_chat_model_input(self, model_input: Chat) -> "LMStudioChat":
        """Convert an Outlines `Chat` into an LMStudio chat.

        A leading `system` message becomes the system prompt of the
        LMStudio chat. Only `user` and `assistant` roles are accepted for
        the remaining messages; user content may be a string or a list made
        of a prompt followed by images.
        """
        from lmstudio import Chat as LMSChat

        turns = model_input.messages
        has_system_turn = bool(turns) and turns[0]["role"] == "system"
        system_prompt = turns[0]["content"] if has_system_turn else None
        if has_system_turn:
            turns = turns[1:]

        # A falsy system prompt is treated as no system prompt at all.
        conversation = LMSChat(system_prompt) if system_prompt else LMSChat()

        for turn in turns:
            role = turn["role"]
            content = turn["content"]

            if role == "assistant":
                conversation.add_assistant_response(content)
                continue
            if role != "user":
                raise ValueError(f"Unsupported role: {role}")

            if isinstance(content, str):
                conversation.add_user_message(content)
            elif isinstance(content, list):
                prompt, images = self._split_prompt_and_images(content)
                handles = [
                    self._prepare_lmstudio_image(image) for image in images
                ]
                conversation.add_user_message(prompt, images=handles)
            else:
                raise ValueError(
                    f"Invalid content type: {type(content)}. "
                    "The content must be a string or a list containing a string "
                    "and a list of images."
                )

        return conversation

    def format_output_type(
        self, output_type: Optional[Any] = None
    ) -> Optional[dict]:
        """Format the output type to pass to the model.

        Parameters
        ----------
        output_type
            The output type provided by the user.

        Returns
        -------
        Optional[dict]
            The formatted output type (JSON schema) to be passed to the model,
            or `None` when no output type was provided.

        Raises
        ------
        TypeError
            If the output type is a `Regex`, a `CFG`, or anything that is
            not recognized as a JSON schema.

        """
        if output_type is None:
            return None

        if isinstance(output_type, Regex):
            raise TypeError(
                "Regex-based structured outputs are not supported by LMStudio. "
                "Use an open source model in the meantime."
            )
        if isinstance(output_type, CFG):
            raise TypeError(
                "CFG-based structured outputs are not supported by LMStudio. "
                "Use an open source model in the meantime."
            )

        if JsonSchema.is_json_schema(output_type):
            return cast(dict, JsonSchema.convert_to(output_type, ["dict"]))

        type_name = getattr(output_type, "__name__", output_type)
        raise TypeError(
            f"The type `{type_name}` is not supported by LMStudio. "
            "Consider using a local model instead."
        )


class LMStudio(Model):
    """Thin wrapper around a `lmstudio.Client` client.

    The wrapper translates the input and output types users work with at a
    higher level into arguments for the LMStudio client.

    """

    def __init__(self, client: "Client", model_name: Optional[str] = None):
        """
        Parameters
        ----------
        client
            A LMStudio Client instance obtained via `lmstudio.Client()` or
            `lmstudio.get_default_client()`.
        model_name
            The name of the model to use. If not provided, uses the default
            loaded model in LMStudio.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = LMStudioTypeAdapter()

    def _resolve_call(self, model_input, output_type, kwargs: dict):
        """Resolve the model handle and formatted input for one call.

        `kwargs` is updated in place: the `model` entry is removed and a
        `response_format` entry is added when an output type is given.
        """
        # A `model` keyword argument takes precedence over `self.model_name`.
        model_key = kwargs.pop("model", self.model_name)
        llm = self.client.llm
        model = llm.model(model_key) if model_key else llm.model()

        formatted_input = self.type_adapter.format_input(model_input)
        response_format = self.type_adapter.format_output_type(output_type)
        if response_format is not None:
            kwargs["response_format"] = response_format

        return model, formatted_input

    def generate(
        self,
        model_input: Chat | str | list,
        output_type: Optional[Any] = None,
        **kwargs: Any,
    ) -> str:
        """Generate text using LMStudio.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema.
        **kwargs
            Additional keyword arguments to pass to the model. A `model`
            entry selects the model and overrides `model_name`.

        Returns
        -------
        str
            The text generated by the model.

        """
        model, formatted_input = self._resolve_call(
            model_input, output_type, kwargs
        )
        response = model.respond(formatted_input, **kwargs)
        return response.content

    def generate_batch(
        self,
        model_input,
        output_type=None,
        **kwargs,
    ):
        """Batch generation is not available; always raises."""
        raise NotImplementedError(
            "The `lmstudio` library does not support batch inference."
        )

    def generate_stream(
        self,
        model_input: Chat | str | list,
        output_type: Optional[Any] = None,
        **kwargs: Any,
    ) -> Iterator[str]:
        """Stream text using LMStudio.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema.
        **kwargs
            Additional keyword arguments to pass to the model. A `model`
            entry selects the model and overrides `model_name`.

        Returns
        -------
        Iterator[str]
            An iterator that yields the text generated by the model.

        """
        model, formatted_input = self._resolve_call(
            model_input, output_type, kwargs
        )
        fragments = model.respond_stream(formatted_input, **kwargs)
        for fragment in fragments:
            yield fragment.content


class AsyncLMStudio(AsyncModel):
    """Thin wrapper around a `lmstudio.AsyncClient` client.

    The wrapper translates the input and output types users work with at a
    higher level into arguments for the LMStudio async client.

    """

    def __init__(
        self, client: "AsyncClient", model_name: Optional[str] = None
    ):
        """
        Parameters
        ----------
        client
            A LMStudio AsyncClient instance.
        model_name
            The name of the model to use. If not provided, uses the default
            loaded model in LMStudio.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = LMStudioTypeAdapter()
        # Tracks whether this wrapper has entered the client's async context.
        self._context_entered = False

    async def close(self) -> None:
        """Close the async client and release resources.

        Does nothing unless this wrapper entered the client's async context.
        """
        if not self._context_entered:
            return
        await self.client.__aexit__(None, None, None)
        self._context_entered = False

    async def _resolve_call(self, model_input, output_type, kwargs: dict):
        """Resolve the model handle and formatted input for one call.

        Enters the client's async context on first use. `kwargs` is updated
        in place: the `model` entry is removed and a `response_format`
        entry is added when an output type is given.
        """
        if not self._context_entered:
            await self.client.__aenter__()
            self._context_entered = True

        # A `model` keyword argument takes precedence over `self.model_name`.
        model_key = kwargs.pop("model", self.model_name)
        llm = self.client.llm
        if model_key:
            model = await llm.model(model_key)
        else:
            model = await llm.model()

        formatted_input = self.type_adapter.format_input(model_input)
        response_format = self.type_adapter.format_output_type(output_type)
        if response_format is not None:
            kwargs["response_format"] = response_format

        return model, formatted_input

    async def generate(
        self,
        model_input: Chat | str | list,
        output_type: Optional[Any] = None,
        **kwargs: Any,
    ) -> str:
        """Generate text using LMStudio asynchronously.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema.
        **kwargs
            Additional keyword arguments to pass to the model. A `model`
            entry selects the model and overrides `model_name`.

        Returns
        -------
        str
            The text generated by the model.

        """
        model, formatted_input = await self._resolve_call(
            model_input, output_type, kwargs
        )
        response = await model.respond(formatted_input, **kwargs)
        return response.content

    async def generate_batch(
        self,
        model_input,
        output_type=None,
        **kwargs,
    ):
        """Batch generation is not available; always raises."""
        raise NotImplementedError(
            "The `lmstudio` library does not support batch inference."
        )

    async def generate_stream(  # type: ignore
        self,
        model_input: Chat | str | list,
        output_type: Optional[Any] = None,
        **kwargs: Any,
    ) -> AsyncIterator[str]:
        """Stream text using LMStudio asynchronously.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema.
        **kwargs
            Additional keyword arguments to pass to the model. A `model`
            entry selects the model and overrides `model_name`.

        Returns
        -------
        AsyncIterator[str]
            An async iterator that yields the text generated by the model.

        """
        model, formatted_input = await self._resolve_call(
            model_input, output_type, kwargs
        )
        fragments = await model.respond_stream(formatted_input, **kwargs)
        async for fragment in fragments:
            yield fragment.content


def from_lmstudio(
    client: Union["Client", "AsyncClient"],
    model_name: Optional[str] = None,
) -> Union[LMStudio, AsyncLMStudio]:
    """Wrap an LMStudio client in the matching Outlines model.

    Parameters
    ----------
    client
        A `lmstudio.Client` or `lmstudio.AsyncClient` instance.
    model_name
        The name of the model to use.

    Returns
    -------
    Union[LMStudio, AsyncLMStudio]
        An Outlines `LMStudio` model for a `Client`, an `AsyncLMStudio`
        model for an `AsyncClient`.

    Raises
    ------
    ValueError
        If `client` is neither a `Client` nor an `AsyncClient`.

    """
    from lmstudio import AsyncClient, Client

    # The sync client is checked first, then the async one.
    wrappers = ((Client, LMStudio), (AsyncClient, AsyncLMStudio))
    for client_type, wrapper in wrappers:
        if isinstance(client, client_type):
            return wrapper(client, model_name)

    raise ValueError(
        "Invalid client type, the client must be an instance of "
        "`lmstudio.Client` or `lmstudio.AsyncClient`."
    )


================================================
FILE: outlines/models/mistral.py
================================================
"""Integration with Mistral AI API."""

import json
from functools import singledispatchmethod
from typing import (
    TYPE_CHECKING,
    Any,
    Iterator,
    List,
    Dict,
    Optional,
    Union,
)

from pydantic import TypeAdapter

from outlines.inputs import Chat, Image
from outlines.models.base import AsyncModel, Model, ModelTypeAdapter
from outlines.models.utils import set_additional_properties_false_json_schema
from outlines.types import JsonSchema, Regex, CFG
from outlines.types.utils import (
    is_dataclass,
    is_genson_schema_builder,
    is_native_dict,
    is_pydantic_model,
    is_typed_dict,
)

if TYPE_CHECKING:
    from mistralai import Mistral as MistralClient

__all__ = ["AsyncMistral", "Mistral", "from_mistral"]


class MistralTypeAdapter(ModelTypeAdapter):
    """Type adapter for the `Mistral` model.

    Prepares arguments for Mistral's client `chat.complete`,
    `chat.complete_async`, or `chat.stream` methods. Handles input (prompt or
    chat messages) and output type (JSON schema types).
    """

    @singledispatchmethod
    def format_input(self, model_input):
        """Generate the `messages` argument to pass to the client.

        This is the fallback used when no implementation is registered for
        the type of `model_input`; it always raises.

        Parameters
        ----------
        model_input
            The input provided by the user.

        Returns
        -------
        list
            The `messages` argument to pass to the client.

        Raises
        ------
        TypeError
            Always, because the type of `model_input` is not supported.

        """
        unsupported_type = type(model_input)
        raise TypeError(
            f"The input type {unsupported_type} is not available with "
            "Mistral. The only available types are `str`, `list` and `Chat`."
        )

    @format_input.register(str)
    def format_str_model_input(self, model_input: str) -> list:
        """Wrap a string prompt in a single user message.

        Parameters
        ----------
        model_input : str
            The input string prompt.

        Returns
        -------
        list
            A list of Mistral message objects.

        """
        from mistralai import UserMessage

        user_message = UserMessage(content=model_input)
        return [user_message]

    @format_input.register(list)
    def format_list_model_input(self, model_input: list) -> list:
        """Wrap a prompt-and-images list in a single user message.

        Parameters
        ----------
        model_input : list
            The input list, containing a string prompt and optionally Image
            objects (vision models only).

        Returns
        -------
        list
            A list of Mistral message objects.

        """
        from mistralai import UserMessage

        message_content = self._create_message_content(model_input)
        return [UserMessage(content=message_content)]

    @format_input.register(Chat)
    def format_chat_model_input(self, model_input: Chat) -> list:
        """Convert every message of a `Chat` into a Mistral message object.

        Parameters
        ----------
        model_input : Chat
            The Chat object containing a list of message dictionaries.

        Returns
        -------
        list
            A list of Mistral message objects.

        Raises
        ------
        ValueError
            If a message has a role other than `user`, `assistant` or
            `system`.

        """
        from mistralai import UserMessage, AssistantMessage, SystemMessage

        formatted = []

        for entry in model_input.messages:
            role = entry["role"]
            content = entry["content"]

            # Only user content goes through `_create_message_content`;
            # assistant and system content is passed through as is.
            if role == "user":
                converted = UserMessage(
                    content=self._create_message_content(content)
                )
            elif role == "assistant":
                converted = AssistantMessage(content=content)
            elif role == "system":
                converted = SystemMessage(content=content)
            else:
                raise ValueError(f"Unsupported role: {role}")

            formatted.append(converted)

        return formatted

    def _create_message_content(
        self, content: Union[str, list]
    ) -> Union[str, List[Dict[str, Union[str, Dict[str, str]]]]]:
        """Create message content from an input.

        Parameters
        ----------
        content : Union[str, list]
            The content to format, either a string or a list containing a
            string and optionally Image objects.

        Returns
        -------
        Union[str, List[Dict[str, Union[str, Dict[str, str]]]]]
            The formatted content, either a string or a list of content parts
            (text and image URLs).

        Raises
        ------
        TypeError
            If `content` is neither a string nor a list.
        ValueError
            If the list is empty, does not start with a string, or holds
            anything other than `Image` objects after the first item.

        """
        if isinstance(content, str):
            return content

        if not isinstance(content, list):
            raise TypeError(
                f"Invalid content type: {type(content)}. "
                "Content must be a string or a list starting with a string "
                "followed by optional Image objects."
            )

        if not content:
            raise ValueError("Content list cannot be empty.")

        text = content[0]
        if not isinstance(text, str):
            raise ValueError(
                "The first item in the list should be a string."
            )

        # A list holding only the prompt collapses to the bare string.
        if len(content) == 1:
            return text

        parts: List[Dict[str, Union[str, Dict[str, str]]]] = [
            {"type": "text", "text": text}
        ]
        for item in content[1:]:
            if not isinstance(item, Image):
                raise ValueError(
                    f"Invalid item type in content list: {type(item)}. "
                    "Expected Image objects after the first string."
                )
            # Images are sent inline as base64 data URLs.
            data_url = f"data:{item.image_format};base64,{item.image_str}"
            parts.append(
                {"type": "image_url", "image_url": {"url": data_url}}
            )
        return parts

    def format_output_type(self, output_type: Optional[Any] = None) -> dict:
        """Generate the `response_format` argument to pass to the client.

        Parameters
        ----------
        output_type : Optional[Any]
            The desired output type provided by the user.

        Returns
        -------
        dict
            The `response_format` dict to pass to the client. It is empty
            when no output type is provided.

        Raises
        ------
        TypeError
            If the output type is a `Regex`, a `CFG`, or any other
            unsupported type.

        """
        if output_type is None:
            return {}

        # JSON schema types
        if is_pydantic_model(output_type):
            return self.format_json_schema_type(
                output_type.model_json_schema(), output_type.__name__
            )
        if is_dataclass(output_type) or is_typed_dict(output_type):
            return self.format_json_schema_type(
                TypeAdapter(output_type).json_schema(), output_type.__name__
            )
        if is_genson_schema_builder(output_type):
            return self.format_json_schema_type(
                json.loads(output_type.to_json())
            )
        if isinstance(output_type, JsonSchema):
            return self.format_json_schema_type(json.loads(output_type.schema))

        # Json mode
        if is_native_dict(output_type):
            return {"type": "json_object"}

        # Unsupported types
        if isinstance(output_type, Regex):
            raise TypeError(
                "Regex-based structured outputs are not available with "
                "Mistral."
            )
        if isinstance(output_type, CFG):
            raise TypeError(
                "CFG-based structured outputs are not available with Mistral."
            )

        type_name = getattr(output_type, "__name__", str(output_type))
        raise TypeError(
            f"The type {type_name} is not available with Mistral."
        )

    def format_json_schema_type(
        self, schema: dict, schema_name: str = "default"
    ) -> dict:
        """Create the `response_format` argument to pass to the client from a
        JSON schema dictionary.

        Parameters
        ----------
        schema : dict
            The JSON schema to format.
        schema_name : str
            The name of the schema. It is lower-cased in the result.

        Returns
        -------
        dict
            The value of the `response_format` argument to pass to the client.

        """
        processed_schema = set_additional_properties_false_json_schema(schema)
        json_schema = {
            "schema": processed_schema,
            "name": schema_name.lower(),
            "strict": True,
        }
        return {"type": "json_schema", "json_schema": json_schema}


class Mistral(Model):
    """Thin wrapper around the `mistralai.Mistral` client.

    Converts input and output types to arguments for the `mistralai.Mistral`
    client's `chat.complete` or `chat.stream` methods.

    """

    def __init__(
        self, client: "MistralClient", model_name: Optional[str] = None
    ):
        """
        Parameters
        ----------
        client : MistralClient
            A mistralai.Mistral client instance.
        model_name : Optional[str]
            The name of the model to use.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = MistralTypeAdapter()

    @staticmethod
    def _translate_client_error(error: Exception) -> Exception:
        """Map an exception raised by the client to the one shown to users.

        An error whose message mentions "schema" (case-insensitive, which
        also covers "json_schema") becomes a `TypeError`; any other error
        becomes a `RuntimeError`. The caller is expected to raise the result
        with `from error` so the original exception stays attached.
        """
        if "schema" in str(error).lower():
            return TypeError(
                f"Mistral does not support your schema: {error}. "
                "Try a local model or dottxt instead."
            )
        return RuntimeError(f"Mistral API error: {error}")

    def generate(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Union[str, list[str]]:
        """Generate a response from the model.

        Parameters
        ----------
        model_input : Union[Chat, list, str]
            The prompt or chat messages to generate a response from.
        output_type : Optional[Any]
            The desired format of the response (e.g., JSON schema).
        **inference_kwargs : Any
            Additional keyword arguments to pass to the client. A `model`
            entry overrides `model_name`.

        Returns
        -------
        Union[str, list[str]]
            The response generated by the model as text: a single string
            when the result holds one choice, a list of strings otherwise.

        Raises
        ------
        TypeError
            If the client raises an error whose message mentions "schema".
        RuntimeError
            If the client raises any other error.

        """
        messages = self.type_adapter.format_input(model_input)
        response_format = self.type_adapter.format_output_type(output_type)

        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        try:
            result = self.client.chat.complete(
                messages=messages,
                response_format=response_format,
                **inference_kwargs,
            )
        except Exception as e:
            raise self._translate_client_error(e) from e

        outputs = [choice.message for choice in result.choices]

        if len(outputs) == 1:
            return outputs[0].content
        return [message.content for message in outputs]

    def generate_batch(
        self,
        model_input,
        output_type=None,
        **inference_kwargs,
    ):
        """Batch generation is not available; always raises."""
        raise NotImplementedError(
            "The `mistralai` library does not support batch inference."
        )

    def generate_stream(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Any] = None,
        **inference_kwargs,
    ) -> Iterator[str]:
        """Generate a stream of responses from the model.

        This is a generator function: nothing runs, and no error is raised,
        until the returned iterator is first advanced.

        Parameters
        ----------
        model_input : Union[Chat, list, str]
            The prompt or chat messages to generate a response from.
        output_type : Optional[Any]
            The desired format of the response (e.g., JSON schema).
        **inference_kwargs
            Additional keyword arguments to pass to the client. A `model`
            entry overrides `model_name`.

        Returns
        -------
        Iterator[str]
            An iterator that yields the text chunks generated by the model.

        Raises
        ------
        TypeError
            If opening the stream raises an error whose message mentions
            "schema".
        RuntimeError
            If opening the stream raises any other error.

        """
        messages = self.type_adapter.format_input(model_input)
        response_format = self.type_adapter.format_output_type(output_type)

        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        # Only the call that opens the stream is guarded; errors raised
        # while iterating over the stream propagate unchanged.
        try:
            stream = self.client.chat.stream(
                messages=messages,
                response_format=response_format,
                **inference_kwargs
            )
        except Exception as e:
            raise self._translate_client_error(e) from e

        for chunk in stream:
            # Skip chunks without data, without choices, or whose first
            # choice carries no text.
            if (
                hasattr(chunk, "data")
                and chunk.data.choices
                and chunk.data.choices[0].delta.content is not None
            ):
                yield chunk.data.choices[0].delta.content


class AsyncMistral(AsyncModel):
    """Async thin wrapper around the `mistralai.Mistral` client.

    Converts input and output types to arguments for the `mistralai.Mistral`
    client's async methods (`chat.complete_async` or `chat.stream_async`).

    """

    def __init__(
        self, client: "MistralClient", model_name: Optional[str] = None
    ):
        """
        Parameters
        ----------
        client : MistralClient
            A mistralai.Mistral client instance.
        model_name : Optional[str]
            The name of the model to use.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = MistralTypeAdapter()

    async def generate(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Union[str, list[str]]:
        """Generate a response from the model asynchronously.

        Parameters
        ----------
        model_input : Union[Chat, list, str]
            The prompt or chat messages to generate a response from.
        output_type : Optional[Any]
            The desired format of the response (e.g., JSON schema).
        **inference_kwargs : Any
            Additional keyword arguments to pass to the client. If `model` is
            absent and the instance has a `model_name`, that name is used.

        Returns
        -------
        Union[str, list[str]]
            The response generated by the model as text: a single string
            when the result holds one choice, a list of strings otherwise.

        Raises
        ------
        TypeError
            If the client raises an error whose message mentions "schema".
        RuntimeError
            If the client raises any other error.

        """
        messages = self.type_adapter.format_input(model_input)
        response_format = self.type_adapter.format_output_type(output_type)

        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        try:
            result = await self.client.chat.complete_async(
                messages=messages,
                response_format=response_format,
                stream=False,
                **inference_kwargs,
            )
        except Exception as e:
            # "json_schema" contains "schema", so a single substring test
            # covers both spellings.
            if "schema" in str(e).lower():
                raise TypeError(
                    f"Mistral does not support your schema: {e}. "
                    "Try a local model or dottxt instead."
                ) from e
            raise RuntimeError(f"Mistral API error: {e}") from e

        outputs = [choice.message for choice in result.choices]

        if len(outputs) == 1:
            return outputs[0].content
        return [message.content for message in outputs]

    async def generate_batch(
        self,
        model_input,
        output_type=None,
        **inference_kwargs,
    ):
        """Batch inference is not available; always raises.

        Raises
        ------
        NotImplementedError
            Always.

        """
        raise NotImplementedError(
            "The mistralai library does not support batch inference."
        )

    async def generate_stream(
        self,
        model_input,
        output_type=None,
        **inference_kwargs,
    ):
        """Generate text from the model as an async stream of chunks.

        This is an async generator: the request to the client is only issued
        once iteration starts.

        Parameters
        ----------
        model_input
            str, list, or chat input to generate from.
        output_type
            Optional type for structured output.
        **inference_kwargs
            Extra kwargs like "model" name. If `model` is absent and the
            instance has a `model_name`, that name is used.

        Yields
        ------
        str
            Chunks of text as they are streamed.

        Raises
        ------
        TypeError
            If the client raises an error whose message mentions "schema".
        RuntimeError
            If the client raises any other error when opening the stream.

        """
        messages = self.type_adapter.format_input(model_input)
        response_format = self.type_adapter.format_output_type(output_type)

        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        try:
            response = await self.client.chat.stream_async(
                messages=messages,
                response_format=response_format,
                **inference_kwargs,
            )
        except Exception as e:
            # "json_schema" contains "schema", so a single substring test
            # covers both spellings.
            if "schema" in str(e).lower():
                raise TypeError(
                    f"Mistral does not support your schema: {e}. "
                    "Try a local model or dottxt instead."
                ) from e
            raise RuntimeError(f"Mistral API error: {e}") from e

        async for chunk in response:
            # Skip events that carry no text (no `data`, no choices, no
            # `delta`, or an empty delta).
            if (
                hasattr(chunk, "data")
                and chunk.data.choices
                and hasattr(chunk.data.choices[0], "delta")
                and chunk.data.choices[0].delta.content is not None
            ):
                yield chunk.data.choices[0].delta.content


def from_mistral(
    client: "MistralClient",
    model_name: Optional[str] = None,
    async_client: bool = False,
) -> Union[Mistral, AsyncMistral]:
    """Wrap a `mistralai.Mistral` client in an Outlines model.

    Parameters
    ----------
    client : MistralClient
        The `mistralai.Mistral` client instance to wrap.
    model_name : Optional[str]
        Name of the model the wrapper should use.
    async_client : bool
        Selects the wrapper class: `AsyncMistral` when true, `Mistral`
        otherwise.

    Returns
    -------
    Union[Mistral, AsyncMistral]
        The Outlines wrapper built around `client`.

    Raises
    ------
    ValueError
        If `client` is not an instance of `mistralai.Mistral`.

    """
    from mistralai import Mistral as MistralClient

    if isinstance(client, MistralClient):
        wrapper_cls = AsyncMistral if async_client else Mistral
        return wrapper_cls(client, model_name)

    raise ValueError(
        "Invalid client type. The client must be an instance of "
        "`mistralai.Mistral`."
    )


================================================
FILE: outlines/models/mlxlm.py
================================================
"""Integration with the `mlx_lm` library."""

from functools import singledispatchmethod
from typing import TYPE_CHECKING, Iterator, List, Optional

from outlines.inputs import Chat
from outlines.models.base import Model, ModelTypeAdapter
from outlines.models.tokenizer import _check_hf_chat_template
from outlines.models.transformers import TransformerTokenizer
from outlines.processors import OutlinesLogitsProcessor

if TYPE_CHECKING:
    import mlx.nn as nn
    from transformers import PreTrainedTokenizer

__all__ = ["MLXLM", "from_mlxlm"]


class MLXLMTypeAdapter(ModelTypeAdapter):
    """Type adapter for the `MLXLM` model."""

    def __init__(
        self,
        tokenizer: "PreTrainedTokenizer",
        has_chat_template: bool = False,
    ):
        """
        Parameters
        ----------
        tokenizer
            The tokenizer whose `apply_chat_template` method is used to
            render chat inputs.
        has_chat_template
            Whether plain string prompts should be wrapped in a single user
            message and rendered through the chat template.

        """
        self.tokenizer = tokenizer
        self.has_chat_template = has_chat_template

    @singledispatchmethod
    def format_input(self, model_input):
        """Generate the prompt argument to pass to the model.

        Parameters
        ----------
        model_input
            The input provided by the user.

        Returns
        -------
        str
            The formatted input to be passed to the model.

        Raises
        ------
        NotImplementedError
            If no handler is registered for the type of `model_input`.

        """
        raise NotImplementedError(
            f"The input type {type(model_input)} is not available with "
            "mlx-lm. The available types are `str` and `Chat`."
        )

    @format_input.register(str)
    def format_str_input(self, model_input: str) -> str:
        """Return the prompt, rendered through the chat template as a single
        user message when `has_chat_template` is set.

        """
        if self.has_chat_template:
            return self.format_chat_input(
                Chat([{"role": "user", "content": model_input}])
            )
        return model_input

    @format_input.register(Chat)
    def format_chat_input(self, model_input: Chat) -> str:
        """Render the chat messages with the tokenizer's chat template.

        Raises
        ------
        ValueError
            If the content of any message is not a string.

        """
        if not all(
            isinstance(message["content"], str)
            for message in model_input.messages
        ):
            # The two sentences previously ran together without a space.
            raise ValueError(
                "mlx-lm does not support multi-modal messages. "
                "The content of each message must be a string."
            )

        return self.tokenizer.apply_chat_template(
            model_input.messages,
            tokenize=False,
            add_generation_prompt=True,
        )

    def format_output_type(
        self, output_type: Optional[OutlinesLogitsProcessor] = None,
    ) -> Optional[List[OutlinesLogitsProcessor]]:
        """Generate the logits processor argument to pass to the model.

        Parameters
        ----------
        output_type
            The logits processor provided.

        Returns
        -------
        Optional[list[OutlinesLogitsProcessor]]
            A one-element list holding the logits processor, or `None` when
            `output_type` is falsy.

        """
        if not output_type:
            return None
        return [output_type]


class MLXLM(Model):
    """Thin wrapper around an `mlx_lm` model.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `mlx_lm` library.

    """

    tensor_library_name = "mlx"

    def __init__(
        self,
        model: "nn.Module",
        tokenizer: "PreTrainedTokenizer",
    ):
        """
        Parameters
        ----------
        model
            An instance of an `mlx_lm` model.
        tokenizer
            An instance of an `mlx_lm` tokenizer or of a compatible
            `transformers` tokenizer.

        """
        self.model = model
        # self.mlx_tokenizer is used by the mlx-lm in its generate function
        self.mlx_tokenizer = tokenizer
        # self.tokenizer is used by the logits processor
        self.tokenizer = TransformerTokenizer(tokenizer._tokenizer)
        self.type_adapter = MLXLMTypeAdapter(
            tokenizer=tokenizer,
            has_chat_template=_check_hf_chat_template(tokenizer)
        )

    def generate(
        self,
        model_input: str,
        output_type: Optional[OutlinesLogitsProcessor] = None,
        **kwargs,
    ) -> str:
        """Generate text using `mlx-lm`.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The logits processor the model will use to constrain the format of
            the generated text.
        kwargs
            Additional keyword arguments to pass to the `mlx-lm` library.

        Returns
        -------
        str
            The text generated by the model.

        """
        from mlx_lm import generate

        return generate(
            self.model,
            self.mlx_tokenizer,
            self.type_adapter.format_input(model_input),
            logits_processors=self.type_adapter.format_output_type(output_type),
            **kwargs,
        )

    def generate_batch(
        self,
        model_input: list[str],
        output_type: Optional[OutlinesLogitsProcessor] = None,
        **kwargs,
    ) -> list[str]:
        """Generate a batch of text using `mlx-lm`.

        Parameters
        ----------
        model_input
            The list of prompts based on which the model will generate a response.
        output_type
            Must be left unset; constrained generation is not available with
            this method.
        kwargs
            Additional keyword arguments to pass to the `mlx-lm` library.

        Returns
        -------
        list[str]
            The list of text generated by the model.

        Raises
        ------
        NotImplementedError
            If an `output_type` is provided.

        """
        from mlx_lm import batch_generate

        if output_type:
            # The two sentences previously ran together without a space.
            raise NotImplementedError(
                "mlx-lm does not support constrained generation with "
                "batching. You cannot provide an `output_type` with this "
                "method."
            )

        prompts = [
            self.type_adapter.format_input(item) for item in model_input
        ]

        # Contrarily to the other generate methods, batch_generate requires
        # tokenized prompts. Special tokens are only added when the formatted
        # prompt does not already start with the BOS token.
        bos_token = self.mlx_tokenizer.bos_token
        tokenized_prompts = [
            self.mlx_tokenizer.encode(
                prompt,
                add_special_tokens=(
                    bos_token is None or not prompt.startswith(bos_token)
                ),
            )
            for prompt in prompts
        ]

        response = batch_generate(
            self.model,
            self.mlx_tokenizer,
            tokenized_prompts,
            **kwargs,
        )

        return response.texts

    def generate_stream(
        self,
        model_input: str,
        output_type: Optional[OutlinesLogitsProcessor] = None,
        **kwargs,
    ) -> Iterator[str]:
        """Stream text using `mlx-lm`.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The logits processor the model will use to constrain the format of
            the generated text.
        kwargs
            Additional keyword arguments to pass to the `mlx-lm` library.

        Returns
        -------
        Iterator[str]
            An iterator that yields the text generated by the model.

        """
        from mlx_lm import stream_generate

        for gen_response in stream_generate(
            self.model,
            self.mlx_tokenizer,
            self.type_adapter.format_input(model_input),
            logits_processors=self.type_adapter.format_output_type(output_type),
            **kwargs,
        ):
            yield gen_response.text


def from_mlxlm(model: "nn.Module", tokenizer: "PreTrainedTokenizer") -> MLXLM:
    """Build an Outlines `MLXLM` wrapper around an `mlx_lm` model and its
    tokenizer.

    Parameters
    ----------
    model
        The `mlx_lm` model instance to wrap.
    tokenizer
        The `mlx_lm` tokenizer, or a compatible transformers tokenizer, that
        goes with the model.

    Returns
    -------
    MLXLM
        The Outlines `MLXLM` instance wrapping `model` and `tokenizer`.

    """
    return MLXLM(model=model, tokenizer=tokenizer)


================================================
FILE: outlines/models/ollama.py
================================================
"""Integration with the `ollama` library."""

from functools import singledispatchmethod
from typing import (
    TYPE_CHECKING,
    Any,
    AsyncIterator,
    Iterator,
    Optional,
    Union,
    cast,
)

from outlines.inputs import Chat, Image
from outlines.models.base import AsyncModel, Model, ModelTypeAdapter
from outlines.types import CFG, JsonSchema, Regex

if TYPE_CHECKING:
    from ollama import Client
    from ollama import AsyncClient

__all__ = ["AsyncOllama", "Ollama", "from_ollama"]


class OllamaTypeAdapter(ModelTypeAdapter):
    """Type adapter for the `Ollama` model."""

    @singledispatchmethod
    def format_input(self, model_input):
        """Generate the value of the `messages` argument to pass to the client.

        Parameters
        ----------
        model_input
            The input provided by the user.

        Returns
        -------
        list
            The formatted value of the `messages` argument to be passed to
            the client.

        Raises
        ------
        TypeError
            If no handler is registered for the type of `model_input`.

        """
        raise TypeError(
            f"The input type {type(model_input)} is not available with "
            "Ollama. The only available types are `str`, `list` and `Chat`."
        )

    @format_input.register(str)
    def format_str_model_input(self, model_input: str) -> list:
        """Generate the value of the `messages` argument to pass to the
        client when the user only passes a prompt.

        """
        return [
            self._create_message("user", model_input)
        ]

    @format_input.register(list)
    def format_list_model_input(self, model_input: list) -> list:
        """Generate the value of the `messages` argument to pass to the
        client when the user passes a prompt and images.

        """
        return [
            self._create_message("user", model_input)
        ]

    @format_input.register(Chat)
    def format_chat_model_input(self, model_input: Chat) -> list:
        """Generate the value of the `messages` argument to pass to the
        client when the user passes a Chat instance.

        """
        return [
            self._create_message(message["role"], message["content"])
            for message in model_input.messages
        ]

    def _create_message(self, role: str, content: str | list) -> dict:
        """Create a message.

        Parameters
        ----------
        role
            The role stored under the message's `role` key.
        content
            Either the text of the message, or a list whose first element is
            the prompt and whose remaining elements are `Image` instances.

        Returns
        -------
        dict
            A message with `role` and `content` keys, plus an `images` key
            when `content` is a list.

        Raises
        ------
        ValueError
            If `content` is an empty list, if an element after the first is
            not an `Image`, or if `content` is neither a string nor a list.

        """
        if isinstance(content, str):
            return {
                "role": role,
                "content": content,
            }

        elif isinstance(content, list):
            # Fail with an explicit error instead of an IndexError on
            # `content[0]`.
            if not content:
                raise ValueError(
                    "The content list is empty. It must contain a prompt "
                    "string optionally followed by images."
                )

            prompt = content[0]
            images = content[1:]

            if not all(isinstance(image, Image) for image in images):
                raise ValueError("All assets provided must be of type Image")

            return {
                "role": role,
                "content": prompt,
                "images": [image.image_str for image in images],
            }

        else:
            raise ValueError(
                f"Invalid content type: {type(content)}. "
                "The content must be a string or a list containing a string "
                "and a list of images."
            )

    def format_output_type(
        self, output_type: Optional[Any] = None
    ) -> Optional[dict]:
        """Format the output type to pass to the client.

        Parameters
        ----------
        output_type
            The output type provided by the user.

        Returns
        -------
        Optional[dict]
            The JSON schema, as a dict, to be passed to the model, or `None`
            when no output type is provided.

        Raises
        ------
        TypeError
            If `output_type` is a `Regex`, a `CFG`, or any other type that is
            not recognized as a JSON schema.

        """
        if output_type is None:
            return None
        elif isinstance(output_type, Regex):
            raise TypeError(
                "Regex-based structured outputs are not supported by Ollama. "
                "Use an open source model in the meantime."
            )
        elif isinstance(output_type, CFG):
            raise TypeError(
                "CFG-based structured outputs are not supported by Ollama. "
                "Use an open source model in the meantime."
            )
        elif JsonSchema.is_json_schema(output_type):
            return cast(dict, JsonSchema.convert_to(output_type, ["dict"]))
        else:
            type_name = getattr(output_type, "__name__", output_type)
            raise TypeError(
                f"The type `{type_name}` is not supported by Ollama. "
                "Consider using a local model instead."
            )


class Ollama(Model):
    """Thin wrapper around the `ollama.Client` client.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `ollama.Client` client.

    """

    def __init__(self, client: "Client", model_name: Optional[str] = None):
        """
        Parameters
        ----------
        client
            The `ollama.Client` client.
        model_name
            The name of the model to use.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = OllamaTypeAdapter()

    def generate(self,
        model_input: Chat | str | list,
        output_type: Optional[Any] = None,
        **kwargs: Any,
    ) -> str:
        """Generate text using Ollama.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema.
        **kwargs
            Additional keyword arguments to pass to the client. If `model` is
            absent and the instance has a `model_name`, that name is used.

        Returns
        -------
        str
            The text generated by the model.

        """
        if "model" not in kwargs and self.model_name is not None:
            kwargs["model"] = self.model_name

        # A leftover debug `print` of the formatted input used to sit here;
        # it wrote the prompt to stdout and formatted the input twice.
        response = self.client.chat(
            messages=self.type_adapter.format_input(model_input),
            format=self.type_adapter.format_output_type(output_type),
            **kwargs,
        )
        return response.message.content

    def generate_batch(
        self,
        model_input,
        output_type = None,
        **kwargs,
    ):
        """Batch inference is not available; always raises.

        Raises
        ------
        NotImplementedError
            Always.

        """
        raise NotImplementedError(
            "The `ollama` library does not support batch inference."
        )

    def generate_stream(
        self,
        model_input: Chat | str | list,
        output_type: Optional[Any] = None,
        **kwargs: Any,
    ) -> Iterator[str]:
        """Stream text using Ollama.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema.
        **kwargs
            Additional keyword arguments to pass to the client. If `model` is
            absent and the instance has a `model_name`, that name is used.

        Returns
        -------
        Iterator[str]
            An iterator that yields the text generated by the model.

        """
        if "model" not in kwargs and self.model_name is not None:
            kwargs["model"] = self.model_name

        response = self.client.chat(
            messages=self.type_adapter.format_input(model_input),
            format=self.type_adapter.format_output_type(output_type),
            stream=True,
            **kwargs,
        )
        for chunk in response:
            yield chunk.message.content


class AsyncOllama(AsyncModel):
    """Asynchronous counterpart of the Ollama wrapper, built on
    `ollama.AsyncClient`.

    It translates the input and output types given by the user into the
    `messages` and `format` arguments of the client's `chat` coroutine.

    """

    def __init__(
        self, client: "AsyncClient", model_name: Optional[str] = None,
    ):
        """
        Parameters
        ----------
        client
            The `ollama.AsyncClient` instance used to reach the model.
        model_name
            Name of the model used when a call does not specify one.

        """
        self.type_adapter = OllamaTypeAdapter()
        self.model_name = model_name
        self.client = client

    async def generate(self,
        model_input: Chat | str | list,
        output_type: Optional[Any] = None,
        **kwargs: Any,
    ) -> str:
        """Request a single completion from Ollama.

        Parameters
        ----------
        model_input
            The prompt from which the model produces its response.
        output_type
            The format the response should follow. It must be convertible to
            a JSON schema.
        **kwargs
            Extra keyword arguments forwarded to the client. The instance's
            `model_name` fills in `model` when that key is absent.

        Returns
        -------
        str
            The content of the message returned by the client.

        """
        if self.model_name is not None:
            kwargs.setdefault("model", self.model_name)

        messages = self.type_adapter.format_input(model_input)
        response_format = self.type_adapter.format_output_type(output_type)

        reply = await self.client.chat(
            messages=messages,
            format=response_format,
            **kwargs,
        )
        return reply.message.content

    async def generate_batch(
        self,
        model_input,
        output_type = None,
        **kwargs,
    ):
        """Batch inference is unavailable; this method always raises
        `NotImplementedError`.

        """
        raise NotImplementedError(
            "The `ollama` library does not support batch inference."
        )

    async def generate_stream( # type: ignore
        self,
        model_input: Chat | str | list,
        output_type: Optional[Any] = None,
        **kwargs: Any,
    ) -> AsyncIterator[str]:
        """Request a streamed completion from Ollama.

        Parameters
        ----------
        model_input
            The prompt from which the model produces its response.
        output_type
            The format the response should follow. It must be convertible to
            a JSON schema.
        **kwargs
            Extra keyword arguments forwarded to the client. The instance's
            `model_name` fills in `model` when that key is absent.

        Returns
        -------
        AsyncIterator[str]
            An async iterator yielding the message content of each chunk.

        """
        if self.model_name is not None:
            kwargs.setdefault("model", self.model_name)

        messages = self.type_adapter.format_input(model_input)
        response_format = self.type_adapter.format_output_type(output_type)

        chunks = await self.client.chat(
            messages=messages,
            format=response_format,
            stream=True,
            **kwargs,
        )
        async for part in chunks:
            yield part.message.content


def from_ollama(
    client: Union["Client", "AsyncClient"], model_name: Optional[str] = None
) -> Union[Ollama, AsyncOllama]:
    """Wrap an `ollama.Client` or `ollama.AsyncClient` instance in the
    matching Outlines model.

    Parameters
    ----------
    client
        The `ollama.Client` or `ollama.AsyncClient` instance to wrap.
    model_name
        Name of the model the wrapper should use.

    Returns
    -------
    Union[Ollama, AsyncOllama]
        An `Ollama` instance for a `Client`, an `AsyncOllama` instance for
        an `AsyncClient`.

    Raises
    ------
    ValueError
        If `client` is an instance of neither class.

    """
    from ollama import AsyncClient, Client

    # Checked in order: the synchronous client first, then the async one.
    wrappers = ((Client, Ollama), (AsyncClient, AsyncOllama))
    for client_cls, wrapper_cls in wrappers:
        if isinstance(client, client_cls):
            return wrapper_cls(client, model_name)

    raise ValueError(
        "Invalid client type, the client must be an instance of "
        "`ollama.Client` or `ollama.AsyncClient`."
    )


================================================
FILE: outlines/models/openai.py
================================================
"""Integration with OpenAI's API."""

from typing import (
    TYPE_CHECKING,
    Any,
    AsyncIterator,
    Iterator,
    Optional,
    Union,
    cast,
)
from functools import singledispatchmethod

from pydantic import BaseModel

from outlines.inputs import Chat, Image
from outlines.models.base import AsyncModel, Model, ModelTypeAdapter
from outlines.models.utils import set_additional_properties_false_json_schema
from outlines.types import JsonSchema, Regex, CFG
from outlines.types.utils import is_native_dict

if TYPE_CHECKING:
    from openai import (
        OpenAI as OpenAIClient,
        AsyncOpenAI as AsyncOpenAIClient,
        AzureOpenAI as AzureOpenAIClient,
        AsyncAzureOpenAI as AsyncAzureOpenAIClient,
    )

__all__ = ["AsyncOpenAI", "OpenAI", "from_openai"]


class OpenAITypeAdapter(ModelTypeAdapter):
    """Type adapter for the `OpenAI` model.

    `OpenAITypeAdapter` is responsible for preparing the arguments to OpenAI's
    `completions.create` methods: the input (prompt and possibly image), as
    well as the output type (only JSON).

    """

    @singledispatchmethod
    def format_input(self, model_input):
        """Generate the `messages` argument to pass to the client.

        Parameters
        ----------
        model_input
            The input provided by the user.

        Returns
        -------
        list
            The formatted messages to be passed to the client.

        Raises
        ------
        TypeError
            If no handler is registered for the type of `model_input`.

        """
        raise TypeError(
            f"The input type {type(model_input)} is not available with "
            "OpenAI. The only available types are `str`, `list` and `Chat`."
        )

    @format_input.register(str)
    def format_str_model_input(self, model_input: str) -> list:
        """Generate the value of the `messages` argument to pass to the
        client when the user only passes a prompt.

        """
        return [
            self._create_message("user", model_input)
        ]

    @format_input.register(list)
    def format_list_model_input(self, model_input: list) -> list:
        """Generate the value of the `messages` argument to pass to the
        client when the user passes a prompt and images.

        """
        return [
            self._create_message("user", model_input)
        ]

    @format_input.register(Chat)
    def format_chat_model_input(self, model_input: Chat) -> list:
        """Generate the value of the `messages` argument to pass to the
        client when the user passes a Chat instance.

        """
        return [
            self._create_message(message["role"], message["content"])
            for message in model_input.messages
        ]

    def _create_message(self, role: str, content: str | list) -> dict:
        """Create a message.

        Parameters
        ----------
        role
            The role stored under the message's `role` key.
        content
            Either the text of the message, or a list whose first element is
            the prompt and whose remaining elements are `Image` instances.

        Returns
        -------
        dict
            A message with `role` and `content` keys. When `content` is a
            list, the message content is a list made of one text part
            followed by one image part per image.

        Raises
        ------
        ValueError
            If `content` is an empty list, if an element after the first is
            not an `Image`, or if `content` is neither a string nor a list.

        """
        if isinstance(content, str):
            return {
                "role": role,
                "content": content,
            }

        elif isinstance(content, list):
            # Fail with an explicit error instead of an IndexError on
            # `content[0]`.
            if not content:
                raise ValueError(
                    "The content list is empty. It must contain a prompt "
                    "string optionally followed by images."
                )

            prompt = content[0]
            images = content[1:]

            if not all(isinstance(image, Image) for image in images):
                raise ValueError("All assets provided must be of type Image")

            image_parts = [
                self._create_img_content(image)
                for image in images
            ]

            return {
                "role": role,
                "content": [
                    {"type": "text", "text": prompt},
                    *image_parts,
                ],
            }

        else:
            raise ValueError(
                f"Invalid content type: {type(content)}. "
                "The content must be a string or a list containing a string "
                "and a list of images."
            )

    def _create_img_content(self, image: Image) -> dict:
        """Create the content for an image input.

        The image is embedded as a base64 data URL built from the image's
        `image_format` and `image_str` attributes.

        """
        return {
            "type": "image_url",
            "image_url": {
                "url": f"data:{image.image_format};base64,{image.image_str}"  # noqa: E702
            },
        }

    def format_output_type(self, output_type: Optional[Any] = None) -> dict:
        """Generate the `response_format` argument to the client based on the
        output type specified by the user.

        Parameters
        ----------
        output_type
            The output type provided by the user.

        Returns
        -------
        dict
            Keyword arguments to be passed to the client: an empty dict when
            no output type is given, otherwise a dict with a single
            `response_format` key.

        Raises
        ------
        TypeError
            If `output_type` is a `Regex`, a `CFG`, or any other type that is
            neither a native dict (per `is_native_dict`) nor recognized as a
            JSON schema.

        """
        # Unsupported languages
        if isinstance(output_type, Regex):
            raise TypeError(
                "Neither regex-based structured outputs nor the `pattern` keyword "
                "in Json Schema are available with OpenAI. Use an open source "
                "model or dottxt instead."
            )
        elif isinstance(output_type, CFG):
            raise TypeError(
                "CFG-based structured outputs are not available with OpenAI. "
                "Use an open source model or dottxt instead."
            )

        if output_type is None:
            return {}
        elif is_native_dict(output_type):
            return self.format_json_mode_type()
        elif JsonSchema.is_json_schema(output_type):
            return self.format_json_output_type(
                cast(dict, JsonSchema.convert_to(output_type, ["dict"]))
            )
        else:
            type_name = getattr(output_type, "__name__", output_type)
            raise TypeError(
                f"The type `{type_name}` is not available with OpenAI. "
                "Use an open source model or dottxt instead."
            )

    def format_json_output_type(self, schema: dict) -> dict:
        """Generate the `response_format` argument to the client when the user
        specified a `Json` output type.

        Parameters
        ----------
        schema
            The JSON schema, as a dict, that the response must follow.

        Returns
        -------
        dict
            A dict whose `response_format` key holds a strict `json_schema`
            specification named "default".

        """
        # OpenAI requires `additionalProperties` to be set to False
        schema = set_additional_properties_false_json_schema(schema)

        return {
            "response_format": {
                "type": "json_schema",
                "json_schema": {
                    "name": "default",
                    "strict": True,
                    "schema": schema,
                },
            }
        }

    def format_json_mode_type(self) -> dict:
        """Generate the `response_format` argument to the client when the user
        specified the output type should be a JSON but without specifying the
        schema (also called "JSON mode").

        """
        return {"response_format": {"type": "json_object"}}


class OpenAI(Model):
    """Thin wrapper around the `openai.OpenAI` client.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `openai.OpenAI` client.

    """

    def __init__(
        self,
        client: Union["OpenAIClient", "AzureOpenAIClient"],
        model_name: Optional[str] = None,
    ):
        """
        Parameters
        ----------
        client
            The `openai.OpenAI` or `openai.AzureOpenAI` client.
        model_name
            The name of the model to use.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = OpenAITypeAdapter()

    @staticmethod
    def _invalid_schema_message(error: Exception) -> Optional[str]:
        """Return the API error message if it reports an invalid schema.

        The `body` attribute of the error is inspected defensively: it may be
        missing, `None`, or something other than a dictionary holding a string
        `message`. Indexing it blindly would raise an unrelated exception that
        hides the original API error.

        Parameters
        ----------
        error
            The exception raised by the client.

        Returns
        -------
        Optional[str]
            The error message when it starts with "Invalid schema", `None`
            otherwise.

        """
        body = getattr(error, "body", None)
        message = body.get("message") if isinstance(body, dict) else None
        if isinstance(message, str) and message.startswith("Invalid schema"):
            return message
        return None

    def generate(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Union[type[BaseModel], str]] = None,
        **inference_kwargs: Any,
    ) -> Union[str, list[str]]:
        """Generate text using OpenAI.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema or an empty dictionary.
        **inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Union[str, list[str]]
            The text generated by the model. A single string is returned when
            the response holds one choice, a list of strings otherwise.

        Raises
        ------
        TypeError
            If the API rejects the request because of an invalid schema.
        ValueError
            If the model refused to answer the request.

        """
        import openai

        messages = self.type_adapter.format_input(model_input)
        response_format = self.type_adapter.format_output_type(output_type)

        # A model name given at call time takes precedence over the default
        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        try:
            result = self.client.chat.completions.create(
                messages=messages,
                **response_format,
                **inference_kwargs,
            )
        except openai.BadRequestError as e:
            schema_message = self._invalid_schema_message(e)
            if schema_message is None:
                # Not a schema problem: let the original API error through
                raise
            raise TypeError(
                f"OpenAI does not support your schema: {schema_message}. "
                "Try a local model or dottxt instead."
            ) from e

        messages = [choice.message for choice in result.choices]
        for message in messages:
            if message.refusal is not None:
                raise ValueError(
                    f"OpenAI refused to answer the request: {message.refusal}"
                )

        if len(messages) == 1:
            return messages[0].content
        else:
            return [message.content for message in messages]

    def generate_batch(
        self,
        model_input,
        output_type = None,
        **inference_kwargs,
    ):
        """Batch inference is not available for this model.

        Raises
        ------
        NotImplementedError
            Always.

        """
        raise NotImplementedError(
            "The `openai` library does not support batch inference."
        )

    def generate_stream(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Union[type[BaseModel], str]] = None,
        **inference_kwargs,
    ) -> Iterator[str]:
        """Stream text using OpenAI.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema or an empty dictionary.
        **inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Iterator[str]
            An iterator that yields the text generated by the model.

        Raises
        ------
        TypeError
            If the API rejects the request because of an invalid schema.

        """
        import openai

        messages = self.type_adapter.format_input(model_input)
        response_format = self.type_adapter.format_output_type(output_type)

        # A model name given at call time takes precedence over the default
        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        try:
            stream = self.client.chat.completions.create(
                stream=True,
                messages=messages,
                **response_format,
                **inference_kwargs
            )
        except openai.BadRequestError as e:
            schema_message = self._invalid_schema_message(e)
            if schema_message is None:
                # Not a schema problem: let the original API error through
                raise
            raise TypeError(
                f"OpenAI does not support your schema: {schema_message}. "
                "Try a local model or dottxt instead."
            ) from e

        for chunk in stream:
            # Only the first choice is streamed; chunks without choices or
            # without text content are skipped
            if chunk.choices and chunk.choices[0].delta.content is not None:
                yield chunk.choices[0].delta.content


class AsyncOpenAI(AsyncModel):
    """Thin wrapper around the `openai.AsyncOpenAI` client.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `openai.AsyncOpenAI` client.

    """

    def __init__(
        self,
        client: Union["AsyncOpenAIClient", "AsyncAzureOpenAIClient"],
        model_name: Optional[str] = None,
    ):
        """
        Parameters
        ----------
        client
            The `openai.AsyncOpenAI` or `openai.AsyncAzureOpenAI` client.
        model_name
            The name of the model to use.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = OpenAITypeAdapter()

    @staticmethod
    def _invalid_schema_message(error: Exception) -> Optional[str]:
        """Return the API error message if it reports an invalid schema.

        The `body` attribute of the error is inspected defensively: it may be
        missing, `None`, or something other than a dictionary holding a string
        `message`. Indexing it blindly would raise an unrelated exception that
        hides the original API error.

        Parameters
        ----------
        error
            The exception raised by the client.

        Returns
        -------
        Optional[str]
            The error message when it starts with "Invalid schema", `None`
            otherwise.

        """
        body = getattr(error, "body", None)
        message = body.get("message") if isinstance(body, dict) else None
        if isinstance(message, str) and message.startswith("Invalid schema"):
            return message
        return None

    async def generate(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Union[type[BaseModel], str]] = None,
        **inference_kwargs: Any,
    ) -> Union[str, list[str]]:
        """Generate text using OpenAI.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema or an empty dictionary.
        **inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Union[str, list[str]]
            The text generated by the model. A single string is returned when
            the response holds one choice, a list of strings otherwise.

        Raises
        ------
        TypeError
            If the API rejects the request because of an invalid schema.
        ValueError
            If the model refused to answer the request.

        """
        import openai

        messages = self.type_adapter.format_input(model_input)
        response_format = self.type_adapter.format_output_type(output_type)

        # A model name given at call time takes precedence over the default
        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        try:
            result = await self.client.chat.completions.create(
                messages=messages,
                **response_format,
                **inference_kwargs,
            )
        except openai.BadRequestError as e:
            schema_message = self._invalid_schema_message(e)
            if schema_message is None:
                # Not a schema problem: let the original API error through
                raise
            raise TypeError(
                f"OpenAI does not support your schema: {schema_message}. "
                "Try a local model or dottxt instead."
            ) from e

        messages = [choice.message for choice in result.choices]
        for message in messages:
            if message.refusal is not None:
                raise ValueError(
                    f"OpenAI refused to answer the request: {message.refusal}"
                )

        if len(messages) == 1:
            return messages[0].content
        else:
            return [message.content for message in messages]

    async def generate_batch(
        self,
        model_input,
        output_type = None,
        **inference_kwargs,
    ):
        """Batch inference is not available for this model.

        Raises
        ------
        NotImplementedError
            Always.

        """
        raise NotImplementedError(
            "The `openai` library does not support batch inference."
        )

    async def generate_stream( # type: ignore
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Union[type[BaseModel], str]] = None,
        **inference_kwargs,
    ) -> AsyncIterator[str]:
        """Stream text using OpenAI.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. The
            output type must be of a type that can be converted to a JSON
            schema or an empty dictionary.
        **inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        AsyncIterator[str]
            An async iterator that yields the text generated by the model.

        Raises
        ------
        TypeError
            If the API rejects the request because of an invalid schema.

        """
        import openai

        messages = self.type_adapter.format_input(model_input)
        response_format = self.type_adapter.format_output_type(output_type)

        # A model name given at call time takes precedence over the default
        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        try:
            stream = await self.client.chat.completions.create(
                stream=True,
                messages=messages,
                **response_format,
                **inference_kwargs
            )
        except openai.BadRequestError as e:
            schema_message = self._invalid_schema_message(e)
            if schema_message is None:
                # Not a schema problem: let the original API error through
                raise
            raise TypeError(
                f"OpenAI does not support your schema: {schema_message}. "
                "Try a local model or dottxt instead."
            ) from e

        async for chunk in stream:
            # Only the first choice is streamed; chunks without choices or
            # without text content are skipped
            if chunk.choices and chunk.choices[0].delta.content is not None:
                yield chunk.choices[0].delta.content


def from_openai(
    client: Union[
        "OpenAIClient",
        "AsyncOpenAIClient",
        "AzureOpenAIClient",
        "AsyncAzureOpenAIClient",
    ],
    model_name: Optional[str] = None,
) -> Union[OpenAI, AsyncOpenAI]:
    """Create an Outlines `OpenAI` or `AsyncOpenAI` model instance from an
    `openai.OpenAI` or `openai.AsyncOpenAI` client.

    Parameters
    ----------
    client
        An `openai.OpenAI`, `openai.AsyncOpenAI`, `openai.AzureOpenAI` or
        `openai.AsyncAzureOpenAI` client instance.
    model_name
        The name of the model to use.

    Returns
    -------
    Union[OpenAI, AsyncOpenAI]
        An Outlines `OpenAI` or `AsyncOpenAI` model instance.

    Raises
    ------
    ValueError
        If the client is neither an `openai.OpenAI` nor an
        `openai.AsyncOpenAI` instance.

    """
    import openai

    if isinstance(client, openai.OpenAI):
        return OpenAI(client, model_name)
    elif isinstance(client, openai.AsyncOpenAI):
        return AsyncOpenAI(client, model_name)
    else:
        # The message used to contain a stray "+ " left over from a string
        # concatenation
        raise ValueError(
            "Invalid client type. The client must be an instance of "
            "`openai.OpenAI` or `openai.AsyncOpenAI`."
        )


================================================
FILE: outlines/models/sglang.py
================================================
"""Integration with an SGLang server."""

import json
import warnings
from typing import (
    TYPE_CHECKING, Any, AsyncIterator, Iterator, Optional, Union
)

from outlines.inputs import Chat
from outlines.models.base import AsyncModel, Model, ModelTypeAdapter
from outlines.models.openai import OpenAITypeAdapter
from outlines.types.dsl import (
    CFG,
    JsonSchema,
    python_types_to_terms,
    to_regex,
)

if TYPE_CHECKING:
    from openai import AsyncOpenAI, OpenAI

__all__ = ["AsyncSGLang", "SGLang", "from_sglang"]


class SGLangTypeAdapter(ModelTypeAdapter):
    """Type adapter for the `SGLang` and `AsyncSGLang` models."""

    def format_input(self, model_input: Union[Chat, list, str]) -> list:
        """Build the value of the `messages` argument given to the client.

        The formatting is delegated to the `OpenAITypeAdapter` since the
        sglang server expects its input in the same format as OpenAI.

        Parameters
        ----------
        model_input
            The input passed by the user.

        Returns
        -------
        list
            The formatted input to be passed to the client.

        """
        openai_adapter = OpenAITypeAdapter()
        return openai_adapter.format_input(model_input)

    def format_output_type(self, output_type: Optional[Any] = None) -> dict:
        """Build the structured output argument given to the client.

        Parameters
        ----------
        output_type
            The structured output type provided.

        Returns
        -------
        dict
            The formatted output type to be passed to the client. It is empty
            when no output type is provided.

        """
        if output_type is None:
            return {}

        term = python_types_to_terms(output_type)

        # Grammars are forwarded untouched through the `ebnf` field
        if isinstance(term, CFG):
            warnings.warn(
                "SGLang grammar-based structured outputs expects an EBNF "
                "grammar instead of a Lark grammar as is generally used in "
                "Outlines. The grammar cannot be used as a structured output "
                "type with an outlines backend, it is only compatible with "
                "the sglang and llguidance backends."
            )
            return {"extra_body": {"ebnf": term.definition}}

        # JSON schemas reuse the OpenAI `response_format` argument
        if isinstance(term, JsonSchema):
            schema_as_dict = json.loads(term.schema)
            return OpenAITypeAdapter().format_json_output_type(schema_as_dict)

        # Every other term is turned into a regular expression
        return {"extra_body": {"regex": to_regex(term)}}


class SGLang(Model):
    """Thin wrapper around the `openai.OpenAI` client used to communicate with
    an SGLang server.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `openai.OpenAI` client for the
    SGLang server.

    """

    def __init__(self, client, model_name: Optional[str] = None):
        """
        Parameters
        ----------
        client
            An `openai.OpenAI` client instance.
        model_name
            The name of the model to use.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = SGLangTypeAdapter()

    def generate(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Union[str, list[str]]:
        """Generate text using SGLang.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types available in Outlines are supported provided your
            server uses a structured generation backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Union[str, list[str]]
            The text generated by the model. A single string is returned when
            the response holds one choice, a list of strings otherwise.

        Raises
        ------
        ValueError
            If the server refused to answer the request.

        """
        client_args = self._build_client_args(
            model_input,
            output_type,
            **inference_kwargs,
        )

        response = self.client.chat.completions.create(**client_args)

        messages = [choice.message for choice in response.choices]
        for message in messages:
            if message.refusal is not None:  # pragma: no cover
                raise ValueError(
                    f"The SGLang server refused to answer the request: "
                    f"{message.refusal}"
                )

        if len(messages) == 1:
            return messages[0].content
        else:
            return [message.content for message in messages]

    def generate_batch(
        self,
        model_input,
        output_type = None,
        **inference_kwargs,
    ):
        """Batch inference is not available for this model.

        Raises
        ------
        NotImplementedError
            Always.

        """
        raise NotImplementedError(
            "SGLang does not support batch inference."
        )

    def generate_stream(
        self,
        model_input: Union[Chat, list, str],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Iterator[str]:
        """Stream text using SGLang.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types available in Outlines are supported provided your
            server uses a structured generation backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Iterator[str]
            An iterator that yields the text generated by the model.

        """
        client_args = self._build_client_args(
            model_input, output_type, **inference_kwargs,
        )

        stream = self.client.chat.completions.create(
            **client_args, stream=True,
        )

        for chunk in stream:  # pragma: no cover
            # Only the first choice is streamed; chunks without choices or
            # without text content are skipped
            if chunk.choices and chunk.choices[0].delta.content is not None:
                yield chunk.choices[0].delta.content

    def _build_client_args(
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> dict:
        """Build the arguments to pass to the SGLang client.

        The arguments derived from the output type take precedence over the
        inference arguments given by the user. The `extra_body` entries are
        merged rather than replaced so that the user's entries are not lost
        when the output type is sent through `extra_body` as well.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        dict
            The keyword arguments for `client.chat.completions.create`.

        """
        messages = self.type_adapter.format_input(model_input)
        output_type_args = self.type_adapter.format_output_type(output_type)

        user_extra_body = inference_kwargs.get("extra_body")
        output_extra_body = output_type_args.get("extra_body")
        inference_kwargs.update(output_type_args)
        if (
            isinstance(user_extra_body, dict)
            and isinstance(output_extra_body, dict)
        ):
            # Build a new dictionary so the caller's one is left untouched
            inference_kwargs["extra_body"] = {
                **user_extra_body,
                **output_extra_body,
            }

        # A model name given at call time takes precedence over the default
        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        client_args = {
            "messages": messages,
            **inference_kwargs,
        }

        return client_args


class AsyncSGLang(AsyncModel):
    """Thin async wrapper around the `openai.AsyncOpenAI` client used to
    communicate with an SGLang server.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `openai.AsyncOpenAI` client for
    the SGLang server.

    """

    def __init__(self, client, model_name: Optional[str] = None):
        """
        Parameters
        ----------
        client
            An `openai.AsyncOpenAI` client instance.
        model_name
            The name of the model to use.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = SGLangTypeAdapter()

    async def generate(
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Union[str, list[str]]:
        """Generate text using `sglang`.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types available in Outlines are supported provided your
            server uses a structured generation backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Union[str, list[str]]
            The text generated by the model. A single string is returned when
            the response holds one choice, a list of strings otherwise.

        Raises
        ------
        ValueError
            If the server refused to answer the request.

        """
        client_args = self._build_client_args(
            model_input, output_type, **inference_kwargs,
        )

        response = await self.client.chat.completions.create(**client_args)

        messages = [choice.message for choice in response.choices]
        for message in messages:
            if message.refusal is not None:  # pragma: no cover
                raise ValueError(
                    f"The sglang server refused to answer the request: "
                    f"{message.refusal}"
                )

        if len(messages) == 1:
            return messages[0].content
        else:
            return [message.content for message in messages]

    async def generate_batch(
        self,
        model_input,
        output_type = None,
        **inference_kwargs,
    ):
        """Batch inference is not available for this model.

        Raises
        ------
        NotImplementedError
            Always.

        """
        raise NotImplementedError(
            "SGLang does not support batch inference."
        )

    async def generate_stream( # type: ignore
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> AsyncIterator[str]:
        """Stream text using `sglang`.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types available in Outlines are supported provided your
            server uses a structured generation backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        AsyncIterator[str]
            An async iterator that yields the text generated by the model.

        """
        client_args = self._build_client_args(
            model_input, output_type, **inference_kwargs,
        )

        stream = await self.client.chat.completions.create(
            **client_args,
            stream=True,
        )

        async for chunk in stream:  # pragma: no cover
            # Only the first choice is streamed; chunks without choices or
            # without text content are skipped
            if chunk.choices and chunk.choices[0].delta.content is not None:
                yield chunk.choices[0].delta.content

    def _build_client_args(
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> dict:
        """Build the arguments to pass to the SGLang client.

        The arguments derived from the output type take precedence over the
        inference arguments given by the user. The `extra_body` entries are
        merged rather than replaced so that the user's entries are not lost
        when the output type is sent through `extra_body` as well.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        dict
            The keyword arguments for `client.chat.completions.create`.

        """
        messages = self.type_adapter.format_input(model_input)
        output_type_args = self.type_adapter.format_output_type(output_type)

        user_extra_body = inference_kwargs.get("extra_body")
        output_extra_body = output_type_args.get("extra_body")
        inference_kwargs.update(output_type_args)
        if (
            isinstance(user_extra_body, dict)
            and isinstance(output_extra_body, dict)
        ):
            # Build a new dictionary so the caller's one is left untouched
            inference_kwargs["extra_body"] = {
                **user_extra_body,
                **output_extra_body,
            }

        # A model name given at call time takes precedence over the default
        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        client_args = {
            "messages": messages,
            **inference_kwargs,
        }

        return client_args


def from_sglang(
    client: Union["OpenAI", "AsyncOpenAI"],
    model_name: Optional[str] = None,
) -> Union[SGLang, AsyncSGLang]:
    """Wrap an `openai.OpenAI` or `openai.AsyncOpenAI` client into the
    matching Outlines `SGLang` or `AsyncSGLang` model.

    Parameters
    ----------
    client
        An `openai.OpenAI` or `openai.AsyncOpenAI` instance.
    model_name
        The name of the model to use.

    Returns
    -------
    Union[SGLang, AsyncSGLang]
        An Outlines `SGLang` or `AsyncSGLang` model instance.

    Raises
    ------
    ValueError
        If the client is neither an `OpenAI` nor an `AsyncOpenAI` instance.

    """
    from openai import AsyncOpenAI, OpenAI

    # The synchronous client is checked first, then the asynchronous one
    if isinstance(client, OpenAI):
        return SGLang(client, model_name)
    if isinstance(client, AsyncOpenAI):
        return AsyncSGLang(client, model_name)

    error_message = (
        f"Unsupported client type: {type(client)}.\n"
        "Please provide an OpenAI or AsyncOpenAI instance."
    )
    raise ValueError(error_message)


================================================
FILE: outlines/models/tgi.py
================================================
"""Integration with a TGI server."""

import json
from functools import singledispatchmethod
from typing import (
    TYPE_CHECKING,
    Any,
    AsyncIterator,
    Iterator,
    Optional,
    Union,
)

from outlines.models.base import AsyncModel,Model, ModelTypeAdapter
from outlines.types.dsl import python_types_to_terms, to_regex, JsonSchema, CFG

if TYPE_CHECKING:
    from huggingface_hub import AsyncInferenceClient, InferenceClient

__all__ = ["AsyncTGI", "TGI", "from_tgi"]


class TGITypeAdapter(ModelTypeAdapter):
    """Type adapter for the `TGI` and `AsyncTGI` models."""

    @singledispatchmethod
    def format_input(self, model_input):
        """Generate the prompt argument to pass to the client.

        Parameters
        ----------
        model_input
            The input passed by the user.

        Returns
        -------
        str
            The formatted input to be passed to the model.

        Raises
        ------
        NotImplementedError
            If no formatter is registered for the type of the input. Only
            `str` is registered.

        """
        # Report the type of the value received, not the `input` builtin
        raise NotImplementedError(
            f"The input type {type(model_input)} is not available with TGI. "
            + "The only available type is `str`."
        )

    @format_input.register(str)
    def format_str_input(self, model_input: str) -> str:
        """Return the string prompt unchanged."""
        return model_input

    def format_output_type(self, output_type: Optional[Any] = None) -> dict:
        """Generate the structured output argument to pass to the client.

        Parameters
        ----------
        output_type
            The structured output type provided.

        Returns
        -------
        dict
            The structured output argument to pass to the client. It is empty
            when no output type is provided.

        Raises
        ------
        NotImplementedError
            If the output type is a context-free grammar.

        """
        if output_type is None:
            return {}

        term = python_types_to_terms(output_type)
        if isinstance(term, CFG):
            raise NotImplementedError(
                "TGI does not support CFG-based structured outputs."
            )
        elif isinstance(term, JsonSchema):
            return {
                "grammar": {
                    "type": "json",
                    "value": json.loads(term.schema),
                }
            }
        else:
            # Every other term is turned into a regular expression
            return {
                "grammar": {
                    "type": "regex",
                    "value": to_regex(term),
                }
            }


class TGI(Model):
    """Synchronous Outlines model backed by a `TGI` server.

    It wraps a `huggingface_hub.InferenceClient` and translates the input and
    output types given by the user into arguments for that client.

    """

    def __init__(self, client):
        """
        Parameters
        ----------
        client
            A huggingface `InferenceClient` client instance.

        """
        self.type_adapter = TGITypeAdapter()
        self.client = client

    def generate(
        self,
        model_input: str,
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> str:
        """Generate text using TGI.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types except `CFG` are supported provided your server uses
            a backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        str
            The text generated by the model.

        """
        request_args = self._build_client_args(
            model_input, output_type, **inference_kwargs
        )
        generated_text = self.client.text_generation(**request_args)
        return generated_text

    def generate_batch(
        self,
        model_input,
        output_type = None,
        **inference_kwargs,
    ):
        """Batch inference is not available for this model.

        Raises
        ------
        NotImplementedError
            Always.

        """
        raise NotImplementedError("TGI does not support batch inference.")

    def generate_stream(
        self,
        model_input: str,
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Iterator[str]:
        """Stream text using TGI.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types except `CFG` are supported provided your server uses
            a backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Iterator[str]
            An iterator that yields the text generated by the model.

        """
        request_args = self._build_client_args(
            model_input, output_type, **inference_kwargs
        )
        token_stream = self.client.text_generation(**request_args, stream=True)

        for piece in token_stream:  # pragma: no cover
            yield piece

    def _build_client_args(
        self,
        model_input: str,
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> dict:
        """Assemble the keyword arguments given to the TGI client.

        The arguments derived from the output type take precedence over the
        inference arguments given by the user.

        """
        adapter = self.type_adapter
        formatted_prompt = adapter.format_input(model_input)
        structure_args = adapter.format_output_type(output_type)

        return {
            "prompt": formatted_prompt,
            **inference_kwargs,
            **structure_args,
        }


class AsyncTGI(AsyncModel):
    """Asynchronous Outlines model backed by a `TGI` server.

    It wraps a `huggingface_hub.AsyncInferenceClient` and translates the input
    and output types given by the user into arguments for that client.

    """

    def __init__(self, client):
        """
        Parameters
        ----------
        client
            A huggingface `AsyncInferenceClient` client instance.

        """
        self.type_adapter = TGITypeAdapter()
        self.client = client

    async def generate(
        self,
        model_input: str,
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> str:
        """Generate text using TGI.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types except `CFG` are supported provided your server uses
            a backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        str
            The text generated by the model.

        """
        request_args = self._build_client_args(
            model_input, output_type, **inference_kwargs
        )
        return await self.client.text_generation(**request_args)

    async def generate_batch(
        self,
        model_input,
        output_type = None,
        **inference_kwargs,
    ):
        """Batch inference is not available for this model.

        Raises
        ------
        NotImplementedError
            Always.

        """
        raise NotImplementedError("TGI does not support batch inference.")

    async def generate_stream( # type: ignore
        self,
        model_input: str,
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> AsyncIterator[str]:
        """Stream text using TGI.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types except `CFG` are supported provided your server uses
            a backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        AsyncIterator[str]
            An async iterator that yields the text generated by the model.

        """
        request_args = self._build_client_args(
            model_input, output_type, **inference_kwargs
        )
        token_stream = await self.client.text_generation(
            **request_args, stream=True
        )

        async for piece in token_stream:  # pragma: no cover
            yield piece

    def _build_client_args(
        self,
        model_input: str,
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> dict:
        """Assemble the keyword arguments given to the TGI client.

        The arguments derived from the output type take precedence over the
        inference arguments given by the user.

        """
        adapter = self.type_adapter
        formatted_prompt = adapter.format_input(model_input)
        structure_args = adapter.format_output_type(output_type)

        return {
            "prompt": formatted_prompt,
            **inference_kwargs,
            **structure_args,
        }


def from_tgi(
    client: Union["InferenceClient", "AsyncInferenceClient"],
) -> Union[TGI, AsyncTGI]:
    """Wrap a `huggingface_hub` inference client in the matching Outlines
    model class.

    Parameters
    ----------
    client
        An `huggingface_hub.InferenceClient` or
        `huggingface_hub.AsyncInferenceClient` instance.

    Returns
    -------
    Union[TGI, AsyncTGI]
        A `TGI` instance for a synchronous client, an `AsyncTGI` instance for
        an asynchronous one.

    Raises
    ------
    ValueError
        If `client` is an instance of neither client class.

    """
    from huggingface_hub import AsyncInferenceClient, InferenceClient

    # The synchronous client is tested first, then the asynchronous one.
    wrappers = (
        (InferenceClient, TGI),
        (AsyncInferenceClient, AsyncTGI),
    )
    for client_class, model_class in wrappers:
        if isinstance(client, client_class):
            return model_class(client)

    raise ValueError(
        f"Unsupported client type: {type(client)}.\n"
        + "Please provide an HuggingFace InferenceClient "
        + "or AsyncInferenceClient instance."
    )


================================================
FILE: outlines/models/tokenizer.py
================================================
from typing import Dict, Hashable, List, Protocol, Set, Tuple, Union, TYPE_CHECKING


if TYPE_CHECKING:
    import numpy as np
    from numpy.typing import NDArray
    from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast


class Tokenizer(Hashable, Protocol):
    """Structural interface for tokenizers.

    Implementations must be hashable and expose the attributes and methods
    declared below.

    """

    # End-of-sequence token, as text and as id.
    eos_token: str
    eos_token_id: int
    # Id of the token used for padding.
    pad_token_id: int
    # Mapping from token string to token id.
    vocabulary: Dict[str, int]
    # Set of special token strings.
    special_tokens: Set[str]

    def encode(
        self, prompt: Union[str, List[str]]
    ) -> "Tuple['NDArray[np.int64]', 'NDArray[np.int64]']":
        """Translate the input prompts into arrays of token ids and attention mask.

        Parameters
        ----------
        prompt
            A single prompt or a list of prompts.

        Returns
        -------
        Tuple[NDArray[np.int64], NDArray[np.int64]]
            The token ids and the attention mask, in that order.

        """
        ...

    def decode(self, token_ids: "NDArray[np.int64]") -> List[str]:
        """Translate an array of token ids back into a list of strings."""
        ...

    def convert_token_to_string(self, token: str) -> str:
        """Convert a token to its equivalent string.

        This is for instance useful for BPE tokenizers where whitespaces are
        represented by the special character `Ġ`. This prevents matching a raw
        token that includes `Ġ` with a string.
        """
        ...


def _check_hf_chat_template(tokenizer: "PreTrainedTokenizer | PreTrainedTokenizerFast") -> bool:
    """Tell whether the HuggingFace tokenizer has a chat template.

    The tokenizer is probed by calling its `get_chat_template()` method; a
    `ValueError` raised by that call is read as "no template". Any other
    exception propagates to the caller.
    """
    has_template = True
    try:
        tokenizer.get_chat_template()
    except ValueError:
        has_template = False
    return has_template


================================================
FILE: outlines/models/transformers.py
================================================
"""Integration with the `transformers` library. """

import warnings

from collections import defaultdict
from functools import singledispatchmethod
from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union

from outlines.inputs import Audio, Chat, Image, Video
from outlines.models.base import Model, ModelTypeAdapter
from outlines.models.tokenizer import Tokenizer, _check_hf_chat_template
from outlines.processors import OutlinesLogitsProcessor

if TYPE_CHECKING:
    import torch
    from transformers import (
        PreTrainedTokenizer,
        PreTrainedModel,
        ProcessorMixin,
        LogitsProcessorList,
    )

__all__ = ["Transformers", "TransformersMultiModal", "from_transformers"]


def get_llama_tokenizer_types():
    """Collect the Llama tokenizer classes that need work-arounds.

    Each of the four classes is imported independently. When one of them
    cannot be imported, an empty stand-in class with the same name takes its
    place, so the returned tuple always has four entries and can be passed
    to `isinstance`.

    Returns
    -------
    tuple
        `(LlamaTokenizer, LlamaTokenizerFast, CodeLlamaTokenizer,
        CodeLlamaTokenizerFast)`, real or stand-in.

    """
    # --- Llama, slow tokenizer
    try:
        from transformers.models.llama import LlamaTokenizer
    except ImportError:  # pragma: no cover

        class LlamaTokenizer:  # type: ignore
            pass

    # --- Llama, fast tokenizer
    try:
        from transformers.models.llama import LlamaTokenizerFast
    except ImportError:  # pragma: no cover

        class LlamaTokenizerFast:  # type: ignore
            pass

    # --- Code Llama, slow tokenizer
    try:
        from transformers.models.code_llama import CodeLlamaTokenizer
    except ImportError:  # pragma: no cover

        class CodeLlamaTokenizer:  # type: ignore
            pass

    # --- Code Llama, fast tokenizer
    try:
        from transformers.models.code_llama import CodeLlamaTokenizerFast
    except ImportError:  # pragma: no cover

        class CodeLlamaTokenizerFast:  # type: ignore
            pass

    llama_types = (
        LlamaTokenizer,
        LlamaTokenizerFast,
        CodeLlamaTokenizer,
        CodeLlamaTokenizerFast,
    )
    return llama_types


class TransformerTokenizer(Tokenizer):
    """Represents a tokenizer for models in the `transformers` library.

    The wrapped HuggingFace tokenizer stays available as `self.tokenizer`;
    this class copies a handful of its attributes and adapts `encode` /
    `decode` to the `Tokenizer` interface.

    """

    def __init__(self, tokenizer: "PreTrainedTokenizer", **kwargs):
        """
        Parameters
        ----------
        tokenizer
            The HuggingFace tokenizer to wrap. Its `pad_token_id` is set to
            its `eos_token_id` when no padding token is configured.
        kwargs
            Accepted but not used.

        """
        self.tokenizer = tokenizer
        self.eos_token_id = self.tokenizer.eos_token_id
        self.eos_token = self.tokenizer.eos_token
        self.get_vocab = self.tokenizer.get_vocab

        if self.tokenizer.pad_token_id is None:
            # No padding token configured: pad with the EOS token instead.
            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
            self.pad_token_id = self.eos_token_id
            # Keep `pad_token` defined in this branch too, so the attribute
            # exists regardless of how the tokenizer was configured.
            self.pad_token = self.eos_token
        else:
            self.pad_token_id = self.tokenizer.pad_token_id
            self.pad_token = self.tokenizer.pad_token

        self.special_tokens = set(self.tokenizer.all_special_tokens)

        self.vocabulary = self.tokenizer.get_vocab()
        self.is_llama = isinstance(self.tokenizer, get_llama_tokenizer_types())

    def encode(
        self, prompt: Union[str, List[str]], **kwargs
    ) -> Tuple["torch.LongTensor", "torch.LongTensor"]:
        """Tokenize one or several prompts.

        `padding=True` and `return_tensors="pt"` are always forced, overriding
        any value passed for them in `kwargs`.

        Returns
        -------
        Tuple[torch.LongTensor, torch.LongTensor]
            The `input_ids` and `attention_mask` entries of the tokenizer
            output, in that order.

        """
        kwargs["padding"] = True
        kwargs["return_tensors"] = "pt"
        output = self.tokenizer(prompt, **kwargs)
        return output["input_ids"], output["attention_mask"]

    def decode(self, token_ids: "torch.LongTensor") -> List[str]:
        """Decode a batch of token ids, dropping special tokens."""
        text = self.tokenizer.batch_decode(token_ids, skip_special_tokens=True)
        return text

    def convert_token_to_string(self, token: str) -> str:
        """Convert a single token to its string form.

        A leading space is re-inserted for tokens that start with the
        sentencepiece underline marker or that are the `<0x20>` byte token.

        """
        from transformers.file_utils import SPIECE_UNDERLINE

        string = self.tokenizer.convert_tokens_to_string([token])

        if token.startswith(SPIECE_UNDERLINE) or token == "<0x20>":
            return " " + string

        return string

    def __eq__(self, other):
        """Compare with another tokenizer wrapper of the same type.

        When this instance carries both `model_name` and `kwargs` attributes
        they are compared; otherwise the wrapped tokenizers are compared.
        Objects of other types yield `NotImplemented`.

        """
        if isinstance(other, type(self)):
            if hasattr(self, "model_name") and hasattr(self, "kwargs"):
                return (
                    other.model_name == self.model_name and other.kwargs == self.kwargs
                )
            else:
                return other.tokenizer == self.tokenizer
        return NotImplemented

    def __hash__(self):
        """Hash the wrapped tokenizer through the `datasets` fingerprint hasher."""
        from datasets.fingerprint import Hasher

        return hash(Hasher.hash(self.tokenizer))

    def __getstate__(self):
        """Pickle only the wrapped tokenizer; the rest is rebuilt on load."""
        state = {"tokenizer": self.tokenizer}
        return state

    def __setstate__(self, state):
        """Rebuild every derived attribute by re-running `__init__`."""
        self.__init__(state["tokenizer"])


class TransformersTypeAdapter(ModelTypeAdapter):
    """Type adapter for the `Transformers` model."""

    def __init__(self, tokenizer: "PreTrainedTokenizer", has_chat_template: bool = False):
        """
        Parameters
        ----------
        tokenizer
            The tokenizer whose `apply_chat_template` method is used to
            render `Chat` inputs.
        has_chat_template
            Whether plain string inputs should be wrapped in a single user
            message and rendered through the chat template.

        """
        self.tokenizer = tokenizer
        self.has_chat_template = has_chat_template

    @singledispatchmethod
    def format_input(self, model_input):
        """Generate the prompt argument to pass to the model.

        Parameters
        ----------
        model_input
            The input passed by the user.

        Returns
        -------
        str
            The formatted input to be passed to the model.

        Raises
        ------
        TypeError
            If the input type has no registered formatter.

        """
        # The trailing space keeps the two sentences apart in the message.
        raise TypeError(
            f"The input type {type(model_input)} is not available. "
            "The only available types are `str` and `Chat`."
        )

    @format_input.register(str)
    def format_str_input(self, model_input: str) -> str:
        """Return the string as is, or rendered as a one-message user chat
        when a chat template is available."""
        if self.has_chat_template:
            return self.format_chat_input(Chat([{"role": "user", "content": model_input}]))
        return model_input

    @format_input.register(Chat)
    def format_chat_input(self, model_input: Chat) -> str:
        """Render the chat messages to text with the tokenizer's chat
        template, appending the generation prompt."""
        return self.tokenizer.apply_chat_template(
            model_input.messages,
            tokenize=False,
            add_generation_prompt=True,
        )

    def format_output_type(
        self,
        output_type: Optional[OutlinesLogitsProcessor] = None,
    ) -> Optional["LogitsProcessorList"]:
        """Generate the logits processor argument to pass to the model.

        Parameters
        ----------
        output_type
            The logits processor provided.

        Returns
        -------
        Optional[LogitsProcessorList]
            A one-element `LogitsProcessorList` wrapping the processor, or
            `None` when no processor was provided.

        """
        from transformers import LogitsProcessorList

        if output_type is not None:
            return LogitsProcessorList([output_type])
        return None


class Transformers(Model):
    """Thin wrapper around a `transformers` model and a `transformers`
    tokenizer.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `transformers` model and
    tokenizer.

    """

    def __init__(
        self,
        model: "PreTrainedModel",
        tokenizer: "PreTrainedTokenizer",
        *,
        device_dtype: Optional["torch.dtype"] = None,
    ):
        """
        Parameters
        ----------
        model
            A `PreTrainedModel`, or any model that is compatible with the
            `transformers` API for models.
        tokenizer
            A `PreTrainedTokenizer`, or any tokenizer that is compatible with
            the `transformers` API for tokenizers. Its `padding_side` is set
            to `"left"`.
        device_dtype
            The dtype to use for the model. If not provided, the model will use
            the default dtype.

        """
        # We need to handle the cases in which jax/flax or tensorflow
        # is not available in the environment.
        try:
            from transformers import FlaxPreTrainedModel
        except ImportError:  # pragma: no cover
            FlaxPreTrainedModel = None

        try:
            from transformers import TFPreTrainedModel
        except ImportError:  # pragma: no cover
            TFPreTrainedModel = None

        tokenizer.padding_side = "left"
        self.model = model
        self.hf_tokenizer = tokenizer
        self.tokenizer = TransformerTokenizer(tokenizer)
        self.device_dtype = device_dtype
        self.type_adapter = TransformersTypeAdapter(
            tokenizer=tokenizer,
            has_chat_template=_check_hf_chat_template(tokenizer)
        )

        if (
            FlaxPreTrainedModel is not None
            and isinstance(model, FlaxPreTrainedModel)
        ):  # pragma: no cover
            self.tensor_library_name = "jax"
            warnings.warn("""
                Support for `jax` has been deprecated and will be removed in
                version 1.4.0 of Outlines. Please use `torch` instead.
                Transformers models using `jax` do not support structured
                generation.
                """,
                DeprecationWarning,
                stacklevel=2,
            )
        elif (
            TFPreTrainedModel is not None
            and isinstance(model, TFPreTrainedModel)
        ):  # pragma: no cover
            self.tensor_library_name = "tensorflow"
            warnings.warn("""
                Support for `tensorflow` has been deprecated and will be removed in
                version 1.4.0 of Outlines. Please use `torch` instead.
                Transformers models using `tensorflow` do not support structured
                generation.
                """,
                DeprecationWarning,
                stacklevel=2,
            )
        else:
            self.tensor_library_name = "torch"

    def _prepare_model_inputs(
        self,
        model_input,
        is_batch: bool = False,
    ) -> Tuple[Union[str, List[str]], dict]:
        """Turn the user input into arguments to pass to the model.

        Parameters
        ----------
        model_input
            A single input, or a list of inputs when `is_batch` is true.
        is_batch
            Whether `model_input` holds several inputs to format one by one.

        Returns
        -------
        Tuple[Union[str, List[str]], dict]
            The formatted prompt(s) and a dict holding the `input_ids` and
            `attention_mask` tensors moved to the model's device.

        """
        # Format validation
        if is_batch:
            prompts = [
                self.type_adapter.format_input(item)
                for item in model_input
            ]
        else:
            prompts = self.type_adapter.format_input(model_input)
        input_ids, attention_mask = self.tokenizer.encode(prompts)
        inputs = {
            "input_ids": input_ids.to(self.model.device),
            # Only the attention mask is cast to `device_dtype`; the token
            # ids keep their dtype.
            "attention_mask": (
                attention_mask.to(self.model.device, dtype=self.device_dtype)
                if self.device_dtype is not None
                else attention_mask.to(self.model.device)
            ),
        }

        return prompts, inputs

    def generate(
        self,
        model_input: Union[str, dict, Chat],
        output_type: Optional[OutlinesLogitsProcessor] = None,
        **inference_kwargs: Any,
    ) -> Union[str, List[str]]:
        """Generate text using `transformers`.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
            Which input types are accepted is decided by the type adapter
            attached to the model.
        output_type
            The logits processor the model will use to constrain the format of
            the generated text.
        inference_kwargs
            Additional keyword arguments to pass to the `generate` method
            of the `transformers` model.

        Returns
        -------
        Union[str, List[str]]
            The text generated by the model.

        """
        prompts, inputs = self._prepare_model_inputs(model_input, False)
        logits_processor = self.type_adapter.format_output_type(output_type)

        generated_ids = self._generate_output_seq(
            prompts,
            inputs,
            logits_processor=logits_processor,
            **inference_kwargs,
        )

        # required for multi-modal models that return a 2D tensor even when
        # num_return_sequences is 1
        num_samples = inference_kwargs.get("num_return_sequences", 1)
        if num_samples == 1 and len(generated_ids.shape) == 2:
            generated_ids = generated_ids.squeeze(0)

        return self._decode_generation(generated_ids)

    def generate_batch(
        self,
        model_input: List[Union[str, dict, Chat]],
        output_type: Optional[OutlinesLogitsProcessor] = None,
        **inference_kwargs: Any,
    ) -> List[Union[str, List[str]]]:
        """Generate text for several inputs at once using `transformers`.

        Parameters
        ----------
        model_input
            The list of prompts; each item is formatted on its own by the
            type adapter.
        output_type
            The logits processor the model will use to constrain the format of
            the generated text.
        inference_kwargs
            Additional keyword arguments to pass to the `generate` method
            of the `transformers` model.

        Returns
        -------
        List[Union[str, List[str]]]
            One entry per input. Each entry is a string, or a list of strings
            when `num_return_sequences` is greater than 1.

        """
        prompts, inputs = self._prepare_model_inputs(model_input, True) # type: ignore
        logits_processor = self.type_adapter.format_output_type(output_type)

        generated_ids = self._generate_output_seq(
            prompts, inputs, logits_processor=logits_processor, **inference_kwargs
        )

        # if there are multiple samples per input, convert generated_id to 3D
        num_samples = inference_kwargs.get("num_return_sequences", 1)
        if num_samples > 1:
            generated_ids = generated_ids.view(len(model_input), num_samples, -1)

        return self._decode_generation(generated_ids)

    def generate_stream(self, model_input, output_type=None, **inference_kwargs):
        """Not available for `transformers` models.

        `output_type` defaults to `None` like in the other generation methods.

        TODO: implement following completion of https://github.com/huggingface/transformers/issues/30810

        Raises
        ------
        NotImplementedError
            Always.

        """
        raise NotImplementedError(
            "Streaming is not implemented for Transformers models."
        )

    def _generate_output_seq(self, prompts, inputs, **inference_kwargs):
        """Run `model.generate` and return only the newly generated ids.

        `prompts` is accepted for signature compatibility and is not used
        here.

        """
        input_ids = inputs["input_ids"]

        output_ids = self.model.generate(
            **inputs,
            **inference_kwargs,
        )

        # encoder-decoder returns output_ids only, decoder-only returns full seq ids
        if self.model.config.is_encoder_decoder:
            generated_ids = output_ids
        else:
            # Drop the prompt part, whose length is the width of `input_ids`.
            generated_ids = output_ids[:, input_ids.shape[1] :]

        return generated_ids

    def _decode_generation(self, generated_ids: "torch.Tensor"):
        """Decode generated ids into text, following the tensor rank.

        A 1D tensor gives a single string, a 2D tensor a list of strings and
        a 3D tensor a list of lists of strings.

        Raises
        ------
        TypeError
            If the tensor is not 1D, 2D or 3D.

        """
        rank = len(generated_ids.shape)
        if rank == 1:
            return self.tokenizer.decode([generated_ids])[0]
        if rank == 2:
            return self.tokenizer.decode(generated_ids)
        if rank == 3:
            return [
                self.tokenizer.decode(sample_ids)
                for sample_ids in generated_ids
            ]
        raise TypeError(  # pragma: no cover
            "Generated outputs aren't 1D, 2D or 3D, but instead are "
            f"{generated_ids.shape}"
        )


class TransformersMultiModalTypeAdapter(ModelTypeAdapter):
    """Type adapter for `TransformersMultiModal` model."""

    def __init__(self, **kwargs):
        """Keep the `tokenizer` keyword argument (or `None` when it is not
        given); it is used to render chat templates."""
        self.tokenizer = kwargs.get("tokenizer")

    @singledispatchmethod
    def format_input(self, model_input):
        """Format the prompt arguments to pass to the model.

        Argument
        --------
        model_input
            The input passed by the user.

        Returns
        -------
        dict
            The formatted input.

        Raises
        ------
        TypeError
            If the input type has no registered formatter.

        """
        raise TypeError(
            f"The input type {type(model_input)} is not available. Please "
            + "provide a list containing a text prompt and assets "
            + "(`Image`, `Audio` or `Video` instances) supported by your "
            + "model or a `Chat` instance."
        )

    @format_input.register(Chat)
    def format_chat_input(self, model_input: Chat) -> dict:
        """Render a `Chat` through the chat template and gather its assets."""
        conversation = []
        assets = []

        # process each message, convert if needed to standardized multimodal chat template format
        # and collect assets for HF processor
        for message in model_input.messages:
            processed_message, message_assets = self._prepare_message(
                message["role"], message["content"]
            )
            conversation.append(processed_message)
            assets.extend(message_assets)

        formatted_prompt = self.tokenizer.apply_chat_template(
            conversation,
            tokenize=False,
            add_generation_prompt=True
        )
        # use the formatted prompt and the assets to format the input
        return self.format_list_input([formatted_prompt, *assets])

    def _prepare_message(self, role: str, content: str | list) -> tuple[dict, list]:
        """Create a message and return it together with the assets it holds.

        `content` may be a plain string, a list of dicts already in the HF
        multimodal chat-template format, or a list whose first element is the
        text and whose remaining elements are assets.

        Raises
        ------
        ValueError
            If `content` is neither a string nor a list.

        """
        if isinstance(content, str):
            return {"role": role, "content": content}, []

        elif isinstance(content, list):
            if all(isinstance(item, dict) for item in content): # HF multimodal chat template
                return {"role": role, "content": content}, self._extract_assets_from_content(content)
            else: # list of string + assets
                prompt = content[0]
                assets = content[1:]
                assets_dict = [self._format_asset_for_template(asset) for asset in assets]

                return {"role": role, "content": [
                    {"type": "text", "text": prompt},
                    *assets_dict
                ]}, assets
        else:
            raise ValueError(
                f"Invalid content type: {type(content)}. "
                + "The content must be a string or a list containing text and assets "
                + "or a list of dict items with explicit types."
            )

    def _extract_assets_from_content(self, content: list) -> list:
        """Validate a list of dict items and return the assets found in it.

        Raises
        ------
        ValueError
            If an item has more than two keys, lacks a `type` key, declares an
            unsupported type, lacks the key matching its type, or holds a
            value that is not an `Image`, `Video` or `Audio` instance.

        """
        assets = []

        for item in content:
            if len(item) > 2:
                # The example is a plain string, so braces are written once.
                raise ValueError(
                    f"Found item with multiple keys: {item}. "
                    + "Each item in the content list must be a dictionary with a 'type' key and a single asset key. "
                    + "To include multiple assets, use separate dictionary items. "
                    + "For example: [{'type': 'image', 'image': image1}, {'type': 'image', 'image': image2}]. "
                )

            if "type" not in item:
                raise ValueError(
                    "Each item in the content list must be a dictionary with a 'type' key. "
                    + "Valid types are 'text', 'image', 'video', or 'audio'. "
                    + "For instance {'type': 'text', 'text': 'your message'}. "
                    + f"Found item without 'type' key: {item}"
                )
            if item["type"] == "text":
                continue
            elif item["type"] in ["image", "video", "audio"]:
                # The asset is stored under a key named after its type.
                asset_key = item["type"]
                if asset_key not in item:
                    raise ValueError(
                        f"Item with type '{asset_key}' must contain a '{asset_key}' key. "
                        + f"Found item: {item}"
                    )
                if isinstance(item[asset_key], (Image, Video, Audio)):
                    assets.append(item[asset_key])
                else:
                    raise ValueError(
                        "Assets must be of type `Image`, `Video` or `Audio`. "
                        + f"Unsupported asset type: {type(item[asset_key])}"
                    )
            else:
                raise ValueError(
                    "Content must be 'text', 'image', 'video' or 'audio'. "
                    + f"Unsupported content type: {item['type']}")
        return assets

    def _format_asset_for_template(self, asset: Image | Video | Audio) -> dict:
        """Wrap an asset in the dict form used by the chat template.

        Raises
        ------
        ValueError
            If the asset is not an `Image`, `Video` or `Audio` instance.

        """
        if isinstance(asset, Image):
            return {"type": "image", "image": asset}
        elif isinstance(asset, Video):
            return {"type": "video", "video": asset}
        elif isinstance(asset, Audio):
            return {"type": "audio", "audio": asset}
        else:
            raise ValueError(
                "Assets must be of type `Image`, `Video` or `Audio`. "
                + f"Unsupported asset type: {type(asset)}"
            )

    @format_input.register(list)
    def format_list_input(self, model_input: list) -> dict:
        """Turn `[text, *assets]` into keyword arguments for the processor.

        Raises
        ------
        ValueError
            If the assets are of different types, or of an unsupported type.

        """
        prompt = model_input[0]
        assets = model_input[1:]

        if not assets:  # handle empty assets case
            return {"text": prompt}

        asset_types = set(type(asset) for asset in assets)
        if len(asset_types) > 1:
            raise ValueError(
                "All assets must be of the same type. "
                + f"Found types: {asset_types}"
            )
        asset_type = asset_types.pop()

        if asset_type == Image:
            return {
                "text": prompt,
                "images": [asset.image for asset in assets]
            }
        elif asset_type == Audio: # pragma: no cover
            return {
                "text": prompt,
                "audio": [asset.audio for asset in assets]
            }
        elif asset_type == Video: # pragma: no cover
            return {
                "text": prompt,
                "videos": [asset.video for asset in assets]
            }
        else:
            raise ValueError(f"Unsupported asset type: {asset_type}")

    def format_output_type(
        self,
        output_type: Optional[OutlinesLogitsProcessor] = None,
    ) -> Optional["LogitsProcessorList"]:
        """Generate the logits processor argument to pass to the model.

        Argument
        --------
        output_type
            The logits processor provided.

        Returns
        -------
        Optional[LogitsProcessorList]
            A one-element `LogitsProcessorList` wrapping the processor, or
            `None` when no processor was provided.

        """
        from transformers import LogitsProcessorList

        if output_type is not None:
            return LogitsProcessorList([output_type])
        return None


class TransformersMultiModal(Transformers):
    """Thin wrapper around a `transformers` model and a `transformers`
    processor.

    User-level inputs and output types are translated into arguments for the
    `transformers` model and for the processor.

    """

    def __init__(
        self,
        model: "PreTrainedModel",
        processor,
        *,
        device_dtype: Optional["torch.dtype"] = None,
    ):
        """Create a TransformersMultiModal model instance

        Most of the set-up is delegated to `Transformers.__init__`, called
        with the processor's tokenizer; the processor itself and the
        multimodal type adapter are then attached on top.

        Parameters
        ----------
        model
            A `PreTrainedModel`, or any model that is compatible with the
            `transformers` API for models.
        processor
            A `ProcessorMixin` instance. Its `padding_side` is set to
            `"left"` and its `pad_token` to `"[PAD]"`.
        device_dtype
            The dtype to use for the model. If not provided, the model will use
            the default dtype.

        """
        processor.padding_side = "left"
        processor.pad_token = "[PAD]"
        self.processor = processor

        hf_tokenizer: "PreTrainedTokenizer" = processor.tokenizer
        super().__init__(model, hf_tokenizer, device_dtype=device_dtype)

        # Replace the text-only adapter installed by the parent initializer.
        self.type_adapter = TransformersMultiModalTypeAdapter(
            tokenizer=hf_tokenizer
        )

    def _prepare_model_inputs(
        self,
        model_input,
        is_batch: bool = False,
    ) -> Tuple[Union[str, List[str]], dict]:
        """Turn the user input into arguments to pass to the model.

        Returns
        -------
        Tuple[Union[str, List[str]], dict]
            The text prompt(s) and the processor output moved to the model's
            device.

        """
        adapter = self.type_adapter

        if not is_batch:
            # A single input is already the dict the processor expects.
            processor_kwargs = adapter.format_input(model_input)  # type: ignore
        else:
            formatted_items = [adapter.format_input(item) for item in model_input]

            # The processor takes one dict: texts are collected one per item,
            # every other entry is flattened across the items.
            processor_kwargs = defaultdict(list)
            for formatted in formatted_items:
                for name, payload in formatted.items():
                    bucket = processor_kwargs[name]
                    if name == "text":
                        bucket.append(payload)
                    else:
                        bucket.extend(payload)

        encoded = self.processor(
            **processor_kwargs, padding=True, return_tensors="pt"
        )

        # Only pass a dtype along when one was requested.
        if self.device_dtype is None:
            encoded = encoded.to(self.model.device)
        else:
            encoded = encoded.to(self.model.device, dtype=self.device_dtype)

        return processor_kwargs["text"], encoded


def from_transformers(
    model: "PreTrainedModel",
    tokenizer_or_processor: Union["PreTrainedTokenizer", "ProcessorMixin"],
    *,
    device_dtype: Optional["torch.dtype"] = None,
) -> Union[Transformers, TransformersMultiModal]:
    """Create an Outlines `Transformers` or `TransformersMultiModal` model
    instance from a `PreTrainedModel` instance and a `PreTrainedTokenizer` or
    `ProcessorMixin` instance.

    `outlines` supports `PreTrainedModelForCausalLM`,
    `PreTrainedMambaForCausalLM`, `PreTrainedModelForSeq2Seq` and any model
    that implements the `transformers` model API.

    Parameters
    ----------
    model
        A `transformers.PreTrainedModel` instance.
    tokenizer_or_processor
        A `transformers.PreTrainedTokenizer` or
        `transformers.ProcessorMixin` instance.
    device_dtype
        The dtype to use for the model. If not provided, the model will use
        the default dtype.

    Returns
    -------
    Union[Transformers, TransformersMultiModal]
        An Outlines `Transformers` or `TransformersMultiModal` model instance.

    Raises
    ------
    ValueError
        If `tokenizer_or_processor` is neither a tokenizer nor a processor.

    """
    from transformers import (
        PreTrainedTokenizer, PreTrainedTokenizerFast, ProcessorMixin)

    # A tokenizer selects the text-only wrapper, a processor the multimodal one.
    if isinstance(
        tokenizer_or_processor, (PreTrainedTokenizer, PreTrainedTokenizerFast)
    ):
        tokenizer = tokenizer_or_processor
        return Transformers(model, tokenizer, device_dtype=device_dtype)
    elif isinstance(tokenizer_or_processor, ProcessorMixin):
        processor = tokenizer_or_processor
        return TransformersMultiModal(model, processor, device_dtype=device_dtype)
    else:
        raise ValueError(
            "We could not determine whether the model passed to "
            + "`from_transformers` is a text-2-text or a multi-modal model. "
            + "Please provide a transformers tokenizer or processor."
        )


================================================
FILE: outlines/models/utils.py
================================================
import jsonpath_ng


def set_additional_properties_false_json_schema(schema: dict) -> dict:
    """Set additionalProperties to False to all objects in the schema using jsonpath.

    Only dictionaries whose own `"type"` entry is `"object"` are touched. A
    field that merely holds the string `"object"` under another key (for
    instance `"const"`, `"default"` or `"title"`) does not turn its parent
    into an object schema. An `additionalProperties` entry that is already
    present is left as it is.

    Parameters
    ----------
    schema
        The JSON schema to modify. It is modified in place.

    Returns
    -------
    dict
        The modified schema with additionalProperties set to False
    """
    # Get all nodes
    matches = jsonpath_ng.parse('$..*').find(schema)

    for match in matches:
        if match.value != 'object':
            continue

        # `match.context.value` is the dictionary that holds the matched field.
        parent = match.context.value
        if parent.get('type') != 'object':
            continue

        parent.setdefault('additionalProperties', False)

    return schema


================================================
FILE: outlines/models/vllm.py
================================================
"""Integration with a vLLM server."""

import json
from typing import TYPE_CHECKING, Any, AsyncIterator, Iterator, Optional, Union

from outlines.inputs import Chat
from outlines.models.base import AsyncModel,Model, ModelTypeAdapter
from outlines.models.openai import OpenAITypeAdapter
from outlines.types.dsl import CFG, JsonSchema, python_types_to_terms, to_regex

if TYPE_CHECKING:
    from openai import AsyncOpenAI, OpenAI

__all__ = ["VLLM", "AsyncVLLM", "from_vllm"]


class VLLMTypeAdapter(ModelTypeAdapter):
    """Type adapter for the `VLLM` and `AsyncVLLM` models."""

    def format_input(self, model_input: Union[Chat, str, list]) -> list:
        """Build the value of the `messages` argument for the client.

        The work is delegated to a fresh `OpenAITypeAdapter`, as the vLLM
        server expects its input in the same format as OpenAI.

        Parameters
        ----------
        model_input
            The input passed by the user.

        Returns
        -------
        list
            The formatted input to be passed to the model.

        """
        openai_adapter = OpenAITypeAdapter()
        return openai_adapter.format_input(model_input)

    def format_output_type(self, output_type: Optional[Any] = None) -> dict:
        """Build the structured output argument for the client.

        Parameters
        ----------
        output_type
            The structured output type provided.

        Returns
        -------
        dict
            Empty when no output type is given; otherwise a `guided_grammar`,
            `guided_json` or `guided_regex` entry depending on the term the
            output type converts to.

        """
        if output_type is None:
            return {}

        term = python_types_to_terms(output_type)

        if isinstance(term, CFG):
            return {"guided_grammar": term.definition}

        if isinstance(term, JsonSchema):
            structured_args = {"guided_json": json.loads(term.schema)}
            # NOTE(review): vLLM appears to name this request field
            # `guided_whitespace_pattern` — confirm that the server honours
            # the bare `whitespace_pattern` key.
            if term.whitespace_pattern:
                structured_args["whitespace_pattern"] = term.whitespace_pattern
            return structured_args

        # Every other term is expressed as a regular expression.
        return {"guided_regex": to_regex(term)}


class VLLM(Model):
    """Thin wrapper around the `openai.OpenAI` client used to communicate with
    a `vllm` server.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `openai.OpenAI` client for the
    `vllm` server.
    """

    def __init__(
        self,
        client: "OpenAI",
        model_name: Optional[str] = None,
    ):
        """
        Parameters
        ----------
        client
            An `openai.OpenAI` client instance.
        model_name
            The name of the model to use. When set, it is sent as the `model`
            argument of every request that does not provide its own `model`.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = VLLMTypeAdapter()

    def generate(
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Union[str, list[str]]:
        """Generate text using vLLM.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types available in Outlines are supported provided your
            server uses a structured generation backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Union[str, list[str]]
            The text generated by the model. A single string when the
            response contains one choice, a list of strings otherwise.

        Raises
        ------
        ValueError
            If one of the returned messages carries a refusal.

        """
        client_args = self._build_client_args(
            model_input,
            output_type,
            **inference_kwargs,
        )

        response = self.client.chat.completions.create(**client_args)

        messages = [choice.message for choice in response.choices]
        for message in messages:
            if message.refusal is not None:  # pragma: no cover
                raise ValueError(
                    f"The vLLM server refused to answer the request: "
                    f"{message.refusal}"
                )

        if len(messages) == 1:
            return messages[0].content
        else:
            return [message.content for message in messages]

    def generate_batch(
        self,
        model_input,
        output_type=None,
        **inference_kwargs,
    ):
        """Not supported: always raises `NotImplementedError`."""
        raise NotImplementedError("VLLM does not support batch inference.")

    def generate_stream(
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Iterator[str]:
        """Stream text using vLLM.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types available in Outlines are supported provided your
            server uses a structured generation backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Iterator[str]
            An iterator that yields the text generated by the model.

        """
        client_args = self._build_client_args(
            model_input, output_type, **inference_kwargs,
        )

        stream = self.client.chat.completions.create(
            **client_args, stream=True,
        )

        # Only the first choice of each chunk is streamed; chunks without
        # choices or without text content are skipped.
        for chunk in stream:  # pragma: no cover
            if chunk.choices and chunk.choices[0].delta.content is not None:
                yield chunk.choices[0].delta.content

    def _build_client_args(
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> dict:
        """Build the arguments to pass to the OpenAI client.

        Parameters
        ----------
        model_input
            The input passed by the user, formatted into `messages`.
        output_type
            The structured output type, converted into entries of
            `extra_body`.
        inference_kwargs
            Additional keyword arguments forwarded to the client. An
            `extra_body` entry is merged with the structured output
            arguments, the latter taking precedence.

        Returns
        -------
        dict
            The keyword arguments for `chat.completions.create`.

        """
        messages = self.type_adapter.format_input(model_input)
        output_type_args = self.type_adapter.format_output_type(output_type)

        # Merge into a fresh dict: the `extra_body` value belongs to the
        # caller, who may reuse it for other calls, so it must not be
        # mutated. `None` is treated the same as an absent `extra_body`.
        user_extra_body = inference_kwargs.pop("extra_body", None) or {}
        extra_body = {**user_extra_body, **output_type_args}

        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        client_args = {
            "messages": messages,
            **inference_kwargs,
        }
        if extra_body:
            client_args["extra_body"] = extra_body

        return client_args


class AsyncVLLM(AsyncModel):
    """Thin async wrapper around the `openai.AsyncOpenAI` client used to
    communicate with a `vllm` server.

    This wrapper is used to convert the input and output types specified by the
    users at a higher level to arguments to the `openai.AsyncOpenAI` client for
    the `vllm` server.
    """

    def __init__(
        self,
        client: "AsyncOpenAI",
        model_name: Optional[str] = None,
    ):
        """
        Parameters
        ----------
        client
            An `openai.AsyncOpenAI` client instance.
        model_name
            The name of the model to use. When set, it is sent as the `model`
            argument of every request that does not provide its own `model`.

        """
        self.client = client
        self.model_name = model_name
        self.type_adapter = VLLMTypeAdapter()

    async def generate(
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Union[str, list[str]]:
        """Generate text using vLLM.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types available in Outlines are supported provided your
            server uses a structured generation backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        Union[str, list[str]]
            The text generated by the model. A single string when the
            response contains one choice, a list of strings otherwise.

        Raises
        ------
        ValueError
            If one of the returned messages carries a refusal.

        """
        client_args = self._build_client_args(
            model_input, output_type, **inference_kwargs,
        )

        response = await self.client.chat.completions.create(**client_args)

        messages = [choice.message for choice in response.choices]
        for message in messages:
            if message.refusal is not None:  # pragma: no cover
                raise ValueError(
                    f"The vLLM server refused to answer the request: "
                    f"{message.refusal}"
                )

        if len(messages) == 1:
            return messages[0].content
        else:
            return [message.content for message in messages]

    async def generate_batch(
        self,
        model_input,
        output_type=None,
        **inference_kwargs,
    ):
        """Not supported: always raises `NotImplementedError`."""
        raise NotImplementedError("VLLM does not support batch inference.")

    async def generate_stream( # type: ignore
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> AsyncIterator[str]:
        """Stream text using vLLM.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The desired format of the response generated by the model. All
            output types available in Outlines are supported provided your
            server uses a structured generation backend that supports them.
        inference_kwargs
            Additional keyword arguments to pass to the client.

        Returns
        -------
        AsyncIterator[str]
            An async iterator that yields the text generated by the model.
        """
        client_args = self._build_client_args(
            model_input, output_type, **inference_kwargs,
        )

        stream = await self.client.chat.completions.create(
            **client_args,
            stream=True,
        )

        # Only the first choice of each chunk is streamed; chunks without
        # choices or without text content are skipped.
        async for chunk in stream:  # pragma: no cover
            if chunk.choices and chunk.choices[0].delta.content is not None:
                yield chunk.choices[0].delta.content

    def _build_client_args(
        self,
        model_input: Union[Chat, str, list],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> dict:
        """Build the arguments to pass to the OpenAI client.

        Parameters
        ----------
        model_input
            The input passed by the user, formatted into `messages`.
        output_type
            The structured output type, converted into entries of
            `extra_body`.
        inference_kwargs
            Additional keyword arguments forwarded to the client. An
            `extra_body` entry is merged with the structured output
            arguments, the latter taking precedence.

        Returns
        -------
        dict
            The keyword arguments for `chat.completions.create`.

        """
        messages = self.type_adapter.format_input(model_input)
        output_type_args = self.type_adapter.format_output_type(output_type)

        # Merge into a fresh dict: the `extra_body` value belongs to the
        # caller, who may reuse it for other calls, so it must not be
        # mutated. `None` is treated the same as an absent `extra_body`.
        user_extra_body = inference_kwargs.pop("extra_body", None) or {}
        extra_body = {**user_extra_body, **output_type_args}

        if "model" not in inference_kwargs and self.model_name is not None:
            inference_kwargs["model"] = self.model_name

        client_args = {
            "messages": messages,
            **inference_kwargs,
        }
        if extra_body:
            client_args["extra_body"] = extra_body

        return client_args


def from_vllm(
    client: Union["OpenAI", "AsyncOpenAI"],
    model_name: Optional[str] = None,
) -> Union[VLLM, AsyncVLLM]:
    """Wrap an `openai.OpenAI` or `openai.AsyncOpenAI` client into the
    matching Outlines `VLLM` or `AsyncVLLM` model.

    Parameters
    ----------
    client
        An `openai.OpenAI` or `openai.AsyncOpenAI` instance.
    model_name
        The name of the model to use.

    Returns
    -------
    Union[VLLM, AsyncVLLM]
        An Outlines `VLLM` or `AsyncVLLM` model instance.

    Raises
    ------
    ValueError
        If `client` is neither an `OpenAI` nor an `AsyncOpenAI` instance.

    """
    from openai import AsyncOpenAI, OpenAI

    # Checked in order: the synchronous client first, then the async one.
    wrappers = ((OpenAI, VLLM), (AsyncOpenAI, AsyncVLLM))
    for client_class, model_class in wrappers:
        if isinstance(client, client_class):
            return model_class(client, model_name)

    raise ValueError(
        f"Unsupported client type: {type(client)}.\n"
        "Please provide an OpenAI or AsyncOpenAI instance."
    )


================================================
FILE: outlines/models/vllm_offline.py
================================================
"""Integration with the `vllm` library (offline mode)."""

import json
from functools import singledispatchmethod
from typing import TYPE_CHECKING, Any, List, Optional, Union

from outlines.inputs import Chat
from outlines.models.base import Model, ModelTypeAdapter
from outlines.models.openai import OpenAITypeAdapter
from outlines.types.dsl import CFG, JsonSchema, python_types_to_terms, to_regex

if TYPE_CHECKING:
    from vllm import LLM
    from vllm.sampling_params import SamplingParams

__all__ = ["VLLMOffline", "from_vllm_offline"]


class VLLMOfflineTypeAdapter(ModelTypeAdapter):
    """Type adapter for the `VLLMOffline` model."""

    def __init__(self, has_chat_template: bool = False):
        """
        Parameters
        ----------
        has_chat_template
            Whether the tokenizer of the model has a chat template. When
            `True`, `str` inputs are wrapped into a single user message and
            formatted like a `Chat` input.

        """
        self.has_chat_template = has_chat_template

    @singledispatchmethod
    def format_input(self, model_input):
        """Generate the prompt argument to pass to the model.

        Parameters
        ----------
        model_input
            The input passed by the user.

        Raises
        ------
        TypeError
            If no formatter is registered for the type of `model_input`.

        """
        raise TypeError(
            f"The input type {type(model_input)} is not available with "
            "VLLM offline. The only available types are `str` and "
            "`Chat` (containing a prompt and images)."
        )

    @format_input.register(str)
    def format_input_str(self, model_input: str) -> str | list:
        """Format a `str` input.

        Returns the string unchanged, unless the model has a chat template,
        in which case the string becomes the content of a single user
        message formatted as a chat.

        """
        if self.has_chat_template:
            return self.format_input_chat(Chat([{"role": "user", "content": model_input}]))
        return model_input

    @format_input.register(Chat)
    def format_input_chat(self, model_input: Chat) -> list:
        """Format a `Chat` input.

        Raises
        ------
        ValueError
            If the content of a message is a list rather than plain text.

        """
        for message in model_input.messages:
            content = message["content"]
            if isinstance(content, list):
                # The trailing space keeps the two sentences apart once the
                # adjacent string literals are concatenated.
                raise ValueError(
                    "Assets are not supported for vLLM offline. "
                    "Please only use text content in the `Chat` input."
                )
        return OpenAITypeAdapter().format_input(model_input)

    def format_output_type(self, output_type: Optional[Any] = None) -> dict:
        """Generate the structured output argument to pass to the model.

        The returned dictionary is unpacked by
        `VLLMOffline._build_generation_args` into the
        `StructuredOutputsParams` constructor, whose instance is provided as
        the `structured_outputs` field of the `SamplingParams` given to the
        model.

        Parameters
        ----------
        output_type
            The structured output type provided.

        Returns
        -------
        dict
            The arguments to provide to the `StructuredOutputsParams`
            constructor. Empty when no output type is provided.

        """
        if output_type is None:
            return {}

        term = python_types_to_terms(output_type)
        if isinstance(term, CFG):
            return {"grammar": term.definition}
        elif isinstance(term, JsonSchema):
            structured_outputs_args = {"json": json.loads(term.schema)}
            if term.whitespace_pattern:
                structured_outputs_args["whitespace_pattern"] = term.whitespace_pattern
            return structured_outputs_args
        else:
            return {"regex": to_regex(term)}


class VLLMOffline(Model):
    """Thin wrapper around a `vllm.LLM` model.

    It translates the input and output types specified by the users at a
    higher level into arguments for the `vllm.LLM` model.

    """

    def __init__(self, model: "LLM"):
        """Create a VLLM model instance.

        Parameters
        ----------
        model
            A `vllm.LLM` model instance.

        """
        self.model = model
        self.tokenizer = self.model.get_tokenizer()
        has_chat_template = self._check_chat_template()
        self.type_adapter = VLLMOfflineTypeAdapter(
            has_chat_template=has_chat_template
        )

    def _build_generation_args(
        self,
        inference_kwargs: dict,
        output_type: Optional[Any] = None,
    ) -> "SamplingParams":
        """Create the `SamplingParams` object to pass to the `vllm.LLM` model.

        The `sampling_params` entry, if any, is removed from
        `inference_kwargs` so that the remaining entries can be forwarded to
        the model separately.

        """
        from vllm.sampling_params import StructuredOutputsParams, SamplingParams

        sampling_params = inference_kwargs.pop("sampling_params", None)
        if sampling_params is None:
            sampling_params = SamplingParams()

        output_type_args = self.type_adapter.format_output_type(output_type)
        if not output_type_args:
            return sampling_params

        # Rebuild the sampling params from their fields, overriding only the
        # structured outputs definition.
        rebuilt_fields = {
            field_name: getattr(sampling_params, field_name)
            for field_name in sampling_params.__struct_fields__
        }
        rebuilt_fields["structured_outputs"] = StructuredOutputsParams(
            **output_type_args
        )
        return SamplingParams(**rebuilt_fields)

    def generate(
        self,
        model_input: Chat | str,
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Union[str, List[str]]:
        """Generate text using vLLM offline.

        Parameters
        ----------
        model_input
            The prompt based on which the model will generate a response.
        output_type
            The structured output type used to constrain the format of the
            generated text.
        inference_kwargs
            Additional keyword arguments to pass to the `chat` or `generate`
            method of the `vllm.LLM` model.

        Returns
        -------
        Union[str, List[str]]
            The text generated by the model. A single string when there is
            one completion, a list of strings otherwise.

        """
        sampling_params = self._build_generation_args(
            inference_kwargs, output_type
        )
        formatted_input = self.type_adapter.format_input(model_input)

        # A list is a chat conversation; anything else is a plain prompt.
        if isinstance(formatted_input, list):
            request_outputs = self.model.chat(
                messages=formatted_input,
                sampling_params=sampling_params,
                **inference_kwargs,
            )
        else:
            request_outputs = self.model.generate(
                prompts=formatted_input,
                sampling_params=sampling_params,
                **inference_kwargs,
            )

        texts = [completion.text for completion in request_outputs[0].outputs]
        return texts[0] if len(texts) == 1 else texts

    def generate_batch(
        self,
        model_input: List[Chat | str],
        output_type: Optional[Any] = None,
        **inference_kwargs: Any,
    ) -> Union[List[str], List[List[str]]]:
        """Generate a batch of completions using vLLM offline.

        Parameters
        ----------
        model_input
            The list of prompts based on which the model will generate a
            response.
        output_type
            The structured output type used to constrain the format of the
            generated text.
        inference_kwargs
            Additional keyword arguments to pass to the `chat` or `generate`
            method of the `vllm.LLM` model.

        Returns
        -------
        Union[List[str], List[List[str]]]
            The text generated by the model, one list of completions per
            element of the batch.

        """
        sampling_params = self._build_generation_args(
            inference_kwargs, output_type
        )
        formatted_inputs = [
            self.type_adapter.format_input(item) for item in model_input
        ]

        # The first element decides whether the batch is treated as chat
        # conversations or as plain prompts.
        use_chat = formatted_inputs and isinstance(formatted_inputs[0], list)
        if use_chat:
            request_outputs = self.model.chat(
                messages=formatted_inputs,
                sampling_params=sampling_params,
                **inference_kwargs,
            )
        else:
            request_outputs = self.model.generate(
                prompts=formatted_inputs,
                sampling_params=sampling_params,
                **inference_kwargs,
            )

        batch_texts = []
        for request_output in request_outputs:
            batch_texts.append(
                [sample.text for sample in request_output.outputs]
            )
        return batch_texts

    def generate_stream(self, model_input, output_type, **inference_kwargs):
        """Not available for `vllm.LLM`.

        TODO: Implement the streaming functionality ourselves.

        """
        raise NotImplementedError(
            "Streaming is not available for the vLLM offline integration."
        )

    def _check_chat_template(self) -> bool:
        """Check if the tokenizer has a chat template."""
        from vllm.transformers_utils.tokenizer import (
            PreTrainedTokenizer,
            PreTrainedTokenizerFast,
            TokenizerBase
        )
        from outlines.models.tokenizer import _check_hf_chat_template

        hf_tokenizer_types = (PreTrainedTokenizer, PreTrainedTokenizerFast)
        if isinstance(self.tokenizer, hf_tokenizer_types):
            return _check_hf_chat_template(self.tokenizer)

        if not isinstance(self.tokenizer, TokenizerBase):  # Never reached  # pragma: no cover
            return False

        # vLLM defines its own TokenizerBase class, and only provides
        # limited compatibility with HuggingFace tokenizers. So chat
        # template support is probed by applying the template to a dummy
        # conversation.
        try:
            self.tokenizer.apply_chat_template([{"role": "user", "content": "test"}])
        except Exception:
            return False
        return True

def from_vllm_offline(model: "LLM") -> VLLMOffline:
    """Wrap a `vllm.LLM` instance into an Outlines `VLLMOffline` model.

    Parameters
    ----------
    model
        A `vllm.LLM` instance.

    Returns
    -------
    VLLMOffline
        An Outlines `VLLMOffline` model instance.

    """
    outlines_model = VLLMOffline(model)
    return outlines_model


================================================
FILE: outlines/processors/__init__.py
================================================
"""Processors to control generation in steerable models."""

from .base_logits_processor import OutlinesLogitsProcessor

__all__ = [
    "OutlinesLogitsProcessor",
]


================================================
FILE: outlines/processors/base_logits_processor.py
================================================
"""Base class for logits processors."""

from abc import abstractmethod
from typing import TypeVar

from outlines.processors.tensor_adapters import (
    TensorAdapterImplementation,
    tensor_adapters,
)

TensorType = TypeVar('TensorType')


class OutlinesLogitsProcessor:
    """Base class for logits processors.

    This class implements a shared `__call__` method that is called by the
    models and returns the processed logits. It relies on the
    `process_logits` method that must be implemented by the subclasses to do
    the actual processing. The `tensor_adapter` attribute, created at
    initialization based on the tensor library name specified in the
    constructor, is used to manipulate the tensors using the appropriate
    library for the model (numpy, torch...).
    """
    tensor_adapter: TensorAdapterImplementation

    def __init__(self, tensor_library_name: str):
        """
        Parameters
        ----------
        tensor_library_name
            The name of the library to use to manipulate tensors. Possible
            values are "mlx", "numpy" and "torch". You must choose the library
            that your model is using.

        Raises
        ------
        NotImplementedError
            If no tensor adapter is registered under `tensor_library_name`.
        """
        # Temporary fix as torch raises a warning that can cause an error
        # with python 3.12.
        if tensor_library_name == "torch":
            import torch._dynamo

            torch._dynamo.config.suppress_errors = True

        adapter_class = tensor_adapters.get(tensor_library_name)
        if adapter_class is None:
            raise NotImplementedError(
                f"Library {tensor_library_name} is not available"
            )
        self.tensor_adapter = adapter_class()  # type: ignore

    def reset(self):
        """Reset the logits processor for a new generation.

        Only implement this method in subclasses if the logits processor
        needs to be reset for a new generation.

        """
        return None  # pragma: no cover

    @abstractmethod
    def process_logits(
        self, input_ids: TensorType, logits: TensorType
    ) -> TensorType:
        """Main method to implement for logits processors subclasses.

        This method applies a mask on the logits to bias the generation.
        It is called by the `__call__` method that standardizes the shape of
        `input_ids` and `logits` to ensure they are 2D tensors.

        Elements to keep in mind when designing universal logits processors:

        - logits processors are only used once and never re-applied for a new
          sequence generator
        - Some models only pass output_ids, some models such as llamacpp and
          transformers prefix with input_ids
        - Some sampling methods, such as beam search, result in unstable
          sequence ordering in models like vLLM

        Parameters
        ----------
        input_ids
            The ids of the tokens of the existing sequences in a 2D tensor.
        logits
            The logits for the current generation step in a 2D tensor.

        Returns
        -------
        TensorType
            The processed logits as a 2D tensor.
        """
        ...

    def __call__(
        self, input_ids: TensorType, logits: TensorType
    ) -> TensorType:
        """Entrypoint for logits processors, this is the method that is
        called by the model.

        Because different models use different structures to store the
        input_ids and logits, their format is standardized to 2D tensors
        before calling the `process_logits` method. After processing, the
        logits are brought back to their original (1D or 2D) shape before
        being returned.

        Parameters
        ----------
        input_ids
            The ids of the tokens of the existing sequences in a tensor.
        logits
            The logits for the current generation step in a tensor.

        Returns
        -------
        TensorType
            The processed logits as a tensor.

        Raises
        ------
        ValueError
            If `logits` is neither a 1D nor a 2D tensor.
        """
        adapter = self.tensor_adapter
        input_ids_ndim = len(adapter.shape(input_ids))
        logits_shape = adapter.shape(logits)
        logits_ndim = len(logits_shape)

        # If input_ids is 1D and logits is 2D with a single sequence,
        # reshape input_ids to 2D (needed for mlx-lm).
        if input_ids_ndim == 1 and logits_ndim == 2 and logits_shape[0] == 1:
            input_ids = adapter.unsqueeze(input_ids)

        # Both tensors must agree on every dimension but the last one.
        assert logits_shape[:-1] == adapter.shape(input_ids)[:-1]

        # Already 2D: process as is.
        if logits_ndim == 2:
            return self.process_logits(input_ids, logits)

        # 1D: add a leading dimension for processing, then convert back to
        # the original shape.
        if logits_ndim == 1:
            batched_logits = self.process_logits(
                adapter.unsqueeze(input_ids),
                adapter.unsqueeze(logits),
            )
            return adapter.squeeze(batched_logits)

        raise ValueError(
            f"Logits shape {adapter.shape(logits)} is not "
            + "supported"
        )


================================================
FILE: outlines/processors/tensor_adapters/__init__.py
================================================
"""Library specific objects to manipulate tensors."""

from typing import Union

from .mlx import MLXTensorAdapter
from .numpy import NumpyTensorAdapter
from .torch import TorchTensorAdapter

# Registry mapping a tensor library name to the adapter class handling it.
tensor_adapters = {
    "mlx": MLXTensorAdapter,
    "numpy": NumpyTensorAdapter,
    "torch": TorchTensorAdapter,
}

# Union of the concrete adapter classes, for use in type annotations.
TensorAdapterImplementation = Union[
    MLXTensorAdapter,
    NumpyTensorAdapter,
    TorchTensorAdapter,
]

__all__ = [
    "MLXTensorAdapter",
    "NumpyTensorAdapter",
    "TorchTensorAdapter",
    "tensor_adapters",
    "TensorAdapterImplementation",
]


================================================
FILE: outlines/processors/tensor_adapters/base.py
================================================
"""Base class for tensor adapters."""

from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, TypeVar, Any, Union

if TYPE_CHECKING:
    import torch

TensorType = TypeVar("TensorType")


class TensorAdapter(ABC):
    """Abstract interface for library-specific tensor manipulation.

    Each concrete adapter implements the operations below for one tensor
    library and exposes the name of that library through its `library_name`
    attribute.

    TODO: Update the version of outlines-core used to receive plain arrays
    instead of torch tensors. In the meantime, implementations of this class
    must make sure that their `full_like` and `concatenate` methods can
    handle torch tensors.

    """
    library_name: str

    @abstractmethod
    def shape(self, tensor: TensorType) -> list[int]:
        """Return the shape of a tensor.

        Parameters
        ----------
        tensor
            The tensor whose shape is requested.

        Returns
        -------
        list[int]
            The size of each dimension, one element per dimension of the
            tensor.

        """
        ...

    @abstractmethod
    def unsqueeze(self, tensor: TensorType) -> TensorType:
        """Insert a new dimension at axis 0.

        Parameters
        ----------
        tensor
            The tensor to which a dimension is added.

        Returns
        -------
        TensorType
            The tensor with one more dimension.

        """
        ...

    @abstractmethod
    def squeeze(self, tensor: TensorType) -> TensorType:
        """Drop the dimension at axis 0.

        Parameters
        ----------
        tensor
            The tensor from which a dimension is removed.

        Returns
        -------
        TensorType
            The tensor with one dimension less.

        """
        ...

    @abstractmethod
    def to_list(self, tensor: TensorType) -> list:
        """Turn a tensor into a Python list.

        Parameters
        ----------
        tensor
            The tensor to convert.

        Returns
        -------
        list
            The content of the tensor as a list.

        """
        ...

    @abstractmethod
    def to_scalar(self, tensor: TensorType) -> Any:
        """Extract the single element of a tensor.

        Parameters
        ----------
        tensor
            A tensor holding exactly one element.

        Returns
        -------
        Any
            The only element of the tensor.

        """
        ...

    @abstractmethod
    def full_like(self, tensor: "torch.Tensor", fill_value: Any) -> TensorType: # type: ignore
        """Build a tensor shaped like the input and filled with one value.

        ATTENTION: This method receives a torch tensor regardless of the
        library used.

        Parameters
        ----------
        tensor
            The tensor whose shape is reused.
        fill_value
            The scalar value every element of the new tensor takes.

        Returns
        -------
        TensorType
            A new tensor with the shape of the input tensor, filled with
            `fill_value`.

        """
        ...

    @abstractmethod
    def concatenate(
        self, tensors: list[Union["torch.Tensor", TensorType]]
    ) -> TensorType:
        """Join a list of tensors along axis 0.

        ATTENTION: This method can either receive a list of torch tensors or
        a list of tensors from the library used.

        Parameters
        ----------
        tensors
            The tensors to join.

        Returns
        -------
        TensorType
            The concatenated tensor.

        """
        ...

    @abstractmethod
    def get_device(self, tensor: TensorType) -> str:
        """Return the name of the device holding a tensor.

        Parameters
        ----------
        tensor
            The tensor whose device is requested.

        Returns
        -------
        str
            The name of the tensor's device.

        """
        ...

    @abstractmethod
    def to_device(self, tensor: TensorType, device: str) -> TensorType:
        """Move a tensor to the given device.

        Parameters
        ----------
        tensor
            The tensor to move.
        device
            The name of the destination device.

        Returns
        -------
        TensorType
            The tensor located on the specified device.

        """
        ...

    @abstractmethod
    def boolean_ones_like(self, tensor: TensorType) -> TensorType:
        """Build a boolean tensor of ones shaped like the input.

        Parameters
        ----------
        tensor
            The tensor whose shape is reused.

        Returns
        -------
        TensorType
            A boolean ones tensor with the same shape as the input tensor.

        """
        ...

    @abstractmethod
    def apply_mask(
        self, tensor: TensorType, mask: TensorType, value: Any
    ) -> TensorType:
        """Set the elements selected by a mask to a given value.

        Parameters
        ----------
        tensor
            The tensor to fill.
        mask
            The mask; elements where it is True are replaced.
        value
            The replacement value.

        Returns
        -------
        TensorType
            The tensor with the mask applied.

        """
        ...

    @abstractmethod
    def argsort_descending(self, tensor: TensorType) -> TensorType:
        """Compute the indices sorting a tensor in descending order along
        axis -1.

        Parameters
        ----------
        tensor
            The tensor to sort.

        Returns
        -------
        TensorType
            The indices that would sort the tensor in descending order along
            axis -1.

        """
        ...


================================================
FILE: outlines/processors/tensor_adapters/mlx.py
================================================
"""Tensor adapter for the `mlx` library."""

from outlines.processors.tensor_adapters.base import TensorAdapter


class MLXTensorAdapter(TensorAdapter):
    """Tensor adapter that implements the tensor operations with `mlx.core`."""

    library_name = "mlx"

    def __init__(self):
        # Imported here so that `mlx` is only needed once an adapter is
        # actually instantiated.
        import mlx.core

        self.mlx = mlx.core

    def shape(self, tensor):
        """Return the `shape` attribute of the tensor."""
        return tensor.shape

    def unsqueeze(self, tensor):
        """Add a new axis in first position."""
        return self.mlx.expand_dims(tensor, 0)

    def squeeze(self, tensor):
        """Drop the first axis when its length is 1, otherwise return the
        tensor untouched."""
        if tensor.shape[0] != 1:
            return tensor
        return tensor[0]

    def to_list(self, tensor):
        """Convert the tensor with its `tolist` method."""
        return tensor.tolist()

    def to_scalar(self, tensor):
        """Convert the tensor with its `item` method."""
        return tensor.item()

    def full_like(self, tensor, fill_value):
        """Create an mlx array of `fill_value` with the shape of `tensor`.

        Only `tensor.shape` is read, which keeps this compatible with
        receiving a torch tensor.
        """
        target_shape = tensor.shape
        return self.mlx.full(target_shape, fill_value)

    def concatenate(self, tensors):
        """Concatenate the tensors along axis 0.

        Can handle both torch and mlx tensors: anything that is not already
        an mlx array is converted with `mlx.core.array` first.
        """
        arrays = []
        for item in tensors:
            if isinstance(item, self.mlx.array):
                arrays.append(item)
            else:
                arrays.append(self.mlx.array(item))
        return self.mlx.concatenate(arrays, axis=0)

    def get_device(self, tensor):
        """Always return None: this adapter does not report a device."""
        return None

    def to_device(self, tensor, device):
        """Return the tensor unchanged; `device` is ignored."""
        return tensor

    def boolean_ones_like(self, tensor):
        """Create a boolean array of ones with the shape of `tensor`."""
        return self.mlx.ones(tensor.shape, dtype=self.mlx.bool_)

    def apply_mask(self, tensor, mask, value):
        """Return an array holding `value` where `mask` is True and the
        elements of `tensor` elsewhere."""
        base = tensor.astype(tensor.dtype)
        fill = self.mlx.array(value)
        return self.mlx.where(mask, fill, base)

    def argsort_descending(self, tensor):
        """Return the indices sorting the tensor in descending order,
        obtained by argsorting the negated tensor."""
        negated = -tensor
        return self.mlx.argsort(negated)


================================================
FILE: outlines/processors/tensor_adapters/numpy.py
================================================
"""Tensor adapter for the `numpy` library."""

from outlines.processors.tensor_adapters.base import TensorAdapter


class NumpyTensorAdapter(TensorAdapter):
    """Tensor adapter that implements the tensor operations with `numpy`."""

    library_name = "numpy"

    def __init__(self):
        # Imported here so that `numpy` is only needed once an adapter is
        # actually instantiated.
        import numpy

        self.numpy = numpy

    def shape(self, tensor):
        """Return the `shape` attribute of the tensor."""
        return tensor.shape

    def unsqueeze(self, tensor):
        """Add a new axis in first position."""
        return self.numpy.expand_dims(tensor, axis=0)

    def squeeze(self, tensor):
        """Remove the first axis with `numpy.squeeze`."""
        return self.numpy.squeeze(tensor, axis=0)

    def to_list(self, tensor):
        """Convert the tensor with its `tolist` method."""
        return tensor.tolist()

    def to_scalar(self, tensor):
        """Convert the tensor with its `item` method."""
        return tensor.item()

    def full_like(self, tensor, fill_value):
        """Create an array like `tensor` filled with `fill_value`."""
        return self.numpy.full_like(tensor, fill_value)

    def concatenate(self, tensors):
        """Concatenate the tensors along axis 0."""
        return self.numpy.concatenate(tensors, axis=0)

    def get_device(self, tensor):
        """Always return None: this adapter does not report a device."""
        return None

    def to_device(self, tensor, device):
        """Return the tensor unchanged; `device` is ignored."""
        return tensor

    def boolean_ones_like(self, tensor):
        """Create a boolean array of ones like `tensor`."""
        return self.numpy.ones_like(tensor, dtype=bool)

    def apply_mask(self, tensor, mask, value):
        """Return a copy of `tensor` where the elements selected by `mask`
        are set to `value`."""
        # Work on a copy so the caller's array is left untouched.
        masked = tensor.copy()
        masked[mask] = value
        return masked

    def argsort_descending(self, tensor):
        """Return the indices sorting the tensor in descending order,
        obtained by argsorting the negated tensor."""
        negated = -tensor
        return self.numpy.argsort(negated)


================================================
FILE: outlines/processors/tensor_adapters/torch.py
================================================
"""Tensor adapter for the `torch` library."""

from outlines.processors.tensor_adapters.base import TensorAdapter


class TorchTensorAdapter(TensorAdapter):
    """Tensor adapter that implements the tensor operations with `torch`."""

    library_name = "torch"

    def __init__(self):
        # Imported here so that `torch` is only needed once an adapter is
        # actually instantiated.
        import torch

        self.torch = torch

    def shape(self, tensor):
        """Return the `shape` attribute of the tensor."""
        return tensor.shape

    def unsqueeze(self, tensor):
        """Add a new dimension in first position."""
        return tensor.unsqueeze(0)

    def squeeze(self, tensor):
        """Call the tensor's `squeeze` method on dimension 0."""
        return tensor.squeeze(0)

    def to_list(self, tensor):
        """Convert the tensor with its `tolist` method."""
        return tensor.tolist()

    def to_scalar(self, tensor):
        """Convert the tensor with its `item` method."""
        return tensor.item()

    def full_like(self, tensor, fill_value):
        """Create a tensor like `tensor` filled with `fill_value`."""
        return self.torch.full_like(tensor, fill_value)

    def concatenate(self, tensors):
        """Concatenate the tensors along dimension 0."""
        return self.torch.cat(tensors, dim=0)

    def get_device(self, tensor):
        """Return the `device` attribute of the tensor."""
        return tensor.device

    def to_device(self, tensor, device):
        """Return the result of the tensor's `to` method for `device`."""
        return tensor.to(device)

    def boolean_ones_like(self, tensor):
        """Create a boolean tensor of ones like `tensor`."""
        bool_dtype = self.torch.bool
        return self.torch.ones_like(tensor, dtype=bool_dtype)

    def apply_mask(self, tensor, mask, value):
        """Return the result of `masked_fill`: `value` where `mask` is True
        and the elements of `tensor` elsewhere."""
        return tensor.masked_fill(mask, value)

    def argsort_descending(self, tensor):
        """Return the indices sorting the tensor in descending order."""
        return self.torch.argsort(tensor, descending=True)


================================================
FILE: outlines/py.typed
================================================


================================================
FILE: outlines/release_note.md
================================================
# Release Note

### Why a new major version?

Version 1 aims to make Outlines more closely focused on constrained generation. To do so, we delegate a wider range of tasks to users and inference libraries. On top of making Outlines leaner, this design gives users more flexibility and lets them use interfaces they are already familiar with.

Our approach is inspired by the unix best practices — each element does one thing well, and we compose those functional elements.

As this new version deprecates some previously available features of Outlines, we have written a migration guide that gives detailed information on how to upgrade your v0 code to v1.

### Deprecated

All deprecated features listed below will be removed in version 1.1.0. Until then, a warning will be displayed with information on how to migrate your code to v1.

- The model loader functions from the `models` module (`transformers`, `openai`, etc.) have been deprecated. They are replaced by equivalent functions prefixed with `from_` such as `from_transformers`, `from_openai`, etc. The new loader functions accept different arguments compared to the old ones. They now typically require an instance of an engine/client from the associated inference library. This change was made to avoid duplicating inference library logic and to give users more control over inference engine/client initialization.
[Documentation](https://dottxt-ai.github.io/outlines/latest/features/models)

```python
# v0
from outlines import models
from transformers import BertForSequenceClassification, BertTokenizer

model = models.transformers(
    model_name="prajjwal1/bert-tiny",
    model_class=BertForSequenceClassification,
    tokenizer_class=BertTokenizer,
    model_kwargs={"use_cache": False},
    tokenizer_kwargs={"model_max_length": 512},
)

# v1
import outlines
from transformers import BertForSequenceClassification, BertTokenizer

hf_model = BertForSequenceClassification.from_pretrained("prajjwal1/bert-tiny", use_cache=False)
hf_tokenizer = BertTokenizer.from_pretrained("prajjwal1/bert-tiny", model_max_length=512)
model = outlines.from_transformers(hf_model, hf_tokenizer)
```

- The `generate` module and the associated functions (`json`, `choice`…) have been deprecated. They are replaced by the `Generator` constructor. While you had to select the right generate function for your output type, you can now provide any output type supported by Outlines to the unique `Generator` object.
[Documentation](https://dottxt-ai.github.io/outlines/latest/features/core/generator)


```python
# v0
from pydantic import BaseModel
from outlines import generate, models

class Character(BaseModel):
	name: str

model = models.openai("gpt-4o")
generator = generate.json(model, Character)

# v1
from openai import OpenAI
from pydantic import BaseModel
from outlines import Generator, from_openai

class Character(BaseModel):
	name: str

model = from_openai(OpenAI())
generator = Generator(model, Character)
```

- The `TransformersVision` model has been deprecated. It's replaced by `TransformersMultiModal`, which is more general as it supports additional input types beyond images, such as audio. When calling it, instead of providing the prompt and image assets separately, both should now be included in a single dictionary. The model is loaded with `from_transformers` just like the `Transformers` model, but the second argument must be a processor instead of a tokenizer.
[Documentation](https://dottxt-ai.github.io/outlines/latest/features/models/transformers_multimodal)


```python
# v0
from io import BytesIO
from urllib.request import urlopen
from PIL import Image
from transformers import LlavaForConditionalGeneration
from outlines import models, generate

def img_from_url(url):
    img_byte_stream = BytesIO(urlopen(url).read())
    return Image.open(img_byte_stream).convert("RGB")

model = models.transformers_vision(
    model_name="trl-internal-testing/tiny-LlavaForConditionalGeneration",
    model_class=LlavaForConditionalGeneration,
)
generator = generate.text(model)
result = generator(
    "Describe the image <image>",
    img_from_url("https://upload.wikimedia.org/wikipedia/commons/2/25/Siam_lilacpoint.jpg")
)

# v1
from io import BytesIO
from urllib.request import urlopen
from PIL import Image
from transformers import LlavaForConditionalGeneration, AutoProcessor
import outlines

def img_from_url(url):
    img_byte_stream = BytesIO(urlopen(url).read())
    return Image.open(img_byte_stream).convert("RGB")

model = outlines.from_transformers(
	LlavaForConditionalGeneration.from_pretrained("trl-internal-testing/tiny-LlavaForConditionalGeneration"),
	AutoProcessor.from_pretrained("trl-internal-testing/tiny-LlavaForConditionalGeneration")
)
image = img_from_url("https://upload.wikimedia.org/wikipedia/commons/2/25/Siam_lilacpoint.jpg")
result = model({"text": "Describe the image <image>", "images": image})
```

- The `Exllamav2` model has been deprecated without replacement because its interface is not fully compatible with Outlines. We had to implement cumbersome patching to make it work, so we decided to remove it entirely.

- The `function` module and the associated `Function` class have been deprecated. They are replaced by the `Application` class, which serves a similar purpose to `Function`. There are two notable differences: an `Application` is not initialized with a model (a model must be provided when calling the object), and template variables must be provided in a dictionary instead of as keyword arguments when calling the `Application`.
[Documentation](https://dottxt-ai.github.io/outlines/latest/features/utility/application)


```python
# v0
from pydantic import BaseModel
from outlines import Function, Template

class Character(BaseModel):
	name: str

template = Template.from_string("Create a {{ gender }} character.")
fn = Function(template, Character, "hf-internal-testing/tiny-random-GPTJForCausalLM")
response = fn(gender="female")

# v1
from pydantic import BaseModel
from outlines import Application, Template, from_transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

class Character(BaseModel):
	name: str

model = from_transformers(
    AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
    AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
)

template = Template.from_string("Create a {{ gender }} character.")
app = Application(template, Character)
response = app(model, {"gender": "female"})
```

- The `samplers` module and the associated objects (`multinomial`, `greedy`…) have been deprecated. You should now use the inference arguments specific to the inference library your model is based on to control the sampling.

```python
# v0
from outlines import generate, models, samplers

model = models.transformers("microsoft/Phi-3-mini-4k-instruct")
generator = generate.text(model, samplers.beam_search(2))
response = generator("Write a short story about a cat", max_tokens=10)

# v1
from outlines import Generator, from_transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

model = from_transformers(
    AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
    AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
)
response = model("Write a short story about a cat", num_beams=2)
```

- The `load_lora` methods on the `VLLM` and `LlamaCpp` models have been deprecated. For the `LlamaCpp` model, you should now load the LoRA adapter through the `Llama` instance provided when initializing the model. For the `VLLM` model, you should provide a LoRA request as a keyword argument when calling the model.

```python
# v0
from outlines import models
from vllm import LLM

model = models.vllm("erwanf/gpt2-mini")
model.load_lora("path/to/lora/file")
response = model("Write a short story about a cat.")

# v1
from outlines import from_vllm
from vllm import LLM
from vllm.lora.request import LoRARequest

model = from_vllm(
    LLM("microsoft/Phi-3-mini-4k-instruct")
)
lora_request = LoRARequest("path/to/lora/file", 1, "path/to/lora/file")
response = model("Write a short story about a cat.", lora_request=lora_request)
```

### Modified

Some objects are maintained, but their interface or behavior has been modified.

- The interface of `Model` classes (`Transformers`, `OpenAI`, etc.) has been significantly modified. Models can now be called directly with a prompt and an output type without having to create a generator first. Additionally, all models have a `stream` method that can be invoked directly by the user.
[Documentation](https://dottxt-ai.github.io/outlines/latest/features/models)


```python
# v0
from pydantic import BaseModel
from outlines import generate, models

class Character(BaseModel):
		name: str

model = models.openai("gpt-4o")
generator = generate.json(model, Character)
result = generator("Create a character")

# v1
from openai import OpenAI
from pydantic import BaseModel
from outlines import from_openai

class Character(BaseModel):
	name: str

model = from_openai(OpenAI(), "gpt-4o")
result = model("Create a character", Character)
```

- The interface of the `__init__` method of the `OpenAI` model class has been modified. While it previously accepted a client and an `OpenAIConfig` object instance, it now accepts a client and a model name. The inference arguments from the config object should now be specified when calling the model to more closely align with the OpenAI Python library's functionality. If you provide an `OpenAIConfig` instance when initializing the model, a deprecation warning will appear and your model will behave like a v0 model.
We recommend using the `from_openai` function instead of initializing models directly.
[Documentation](https://dottxt-ai.github.io/outlines/latest/features/models/openai)


```python
# v0
from outlines.models.openai import OpenAI, OpenAIConfig
from openai import OpenAI as OpenAIClient

model = OpenAI(
	OpenAIClient(),
	OpenAIConfig(model="gpt-4o", stop=["."])
)

# v1
import outlines
from openai import OpenAI

model = outlines.from_openai(OpenAI(), "gpt-4o")
```

- The return type of text generation is now consistently a string (or list/lists of strings for multiple samples or batching). In v0, Outlines automatically cast the inference result into the type provided by the user for constrained generation, but we have removed this behavior. This change was made to create more consistent behavior and to give users more freedom in deciding how to handle the generation result.
[Documentation](https://dottxt-ai.github.io/outlines/latest/features/models)

```python
# v0
from pydantic import BaseModel
from outlines import generate, models

class Character(BaseModel):
	name: str

model = models.openai("gpt-4o")
generator = generate.json(model, Character)
result = generator("Create a character")
print(result) # name='James'

# v1
from openai import OpenAI
from pydantic import BaseModel
from outlines import from_openai

class Character(BaseModel):
		name: str

model = from_openai(OpenAI())
result = model("Create a character", Character)
print(result) # {"name": "James"}
print(Character.model_validate_json(result)) # name='James'
```

- While Outlines was trying to standardize inference argument names across models in v0, we decided to stop doing so and to directly pass on the inference arguments provided by the user to the inference engine/client. Our objective is to let the user use all arguments they are accustomed to with their inference library instead of having to learn Outlines-defined arguments. The deprecation of the `samplers` mentioned above is a part of this change of approach.
[Documentation](https://dottxt-ai.github.io/outlines/latest/features/models)

```python
# v0
from outlines import generate, models

model = models.transformers("microsoft/Phi-3-mini-4k-instruct")
generator = generate.text(model)
result = generator("Create a character", max_tokens=256, stop_at=".")

# v1
from outlines import from_transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

model = from_transformers(
	AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
	AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
)
result = model("Create a character", max_new_tokens=256, stop_strings=".")
```

### Added features

- There are 8 additional models available. All of them are loaded with an associated `from_` function that accepts an inference engine/client instance.
[Documentation](https://dottxt-ai.github.io/outlines/latest/features/models)
    - `Dottxt`
    - `Anthropic`
    - `Gemini`
    - `Ollama`
    - `SGLang`
    - `TGI`
    - `TransformersMultiModal`
    - `VLLM`
- Some server-based models now have an async version. To create an async model, just provide an async client instance when using the loader function. The async models are the following.
[Documentation](https://dottxt-ai.github.io/outlines/latest/features/models)
    - `AsyncSGLang`
    - `AsyncTGI`
    - `AsyncVLLM`

```python
import outlines
from huggingface_hub import AsyncInferenceClient

async_model = outlines.from_tgi(AsyncInferenceClient("http://localhost:11434"))
```

- As explained previously, the `Generator` constructor has been added. It accepts a model and an output type as arguments and returns a generator object that can be used to generate text by providing a prompt and inference arguments. The interest of a generator is that it's reusable such that the user does not have to specify the output type they want each time and the output type compilation (when applicable) happens only once.
[Documentation](https://dottxt-ai.github.io/outlines/latest/features/core/generator)

```python
# direct model calling
from typing import Literal
from outlines import from_transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

model = from_transformers(
		AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
		AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
)
result = model("Pizza or burger", Literal["pizza", "burger"])

# using a generator
from outlines import Generator, from_transformers
from transformers import AutoModelForCausalLM, AutoTokenizer

model = from_transformers(
		AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
		AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
)
generator = Generator(model, Literal["pizza", "burger"])
result = generator("Pizza or burger")
```

- As explained previously, the `Application` class has been added. An `Application` is initialized with a prompt template and an output type. The application object returned can then be called with a model, a dictionary containing values for the template variables and inference arguments. The objective of this object is to let users easily switch from a model to another for a given set of prompt and output type.
[Documentation](https://dottxt-ai.github.io/outlines/latest/features/utility/application)

```python
from pydantic import BaseModel
from outlines import Application, Template

class Character(BaseModel):
	name: str

template = Template.from_string("Create a {{ gender }} character.")
app = Application(template, Character)
response = app(model, {"gender": "female"})
```

- The regex DSL and the associated `Term` classes and functions have been added. Terms (`Regex`, `String`…) can be used as output types to generate text with models or generators (they are turned into a regex). The term functions (`either`, `optional`, `at_least`…) are useful to build more complex regex patterns by combining terms. On top of the objects related to regex patterns, there are also 2 terms that are intended to be used by themselves as output types: `JsonSchema` and `CFG`.
[Documentation](https://dottxt-ai.github.io/outlines/latest/features/core/output_types)

```python
# term used directly as an output type
from outlines import from_transformers
from outlines.types import JsonSchema
from transformers import AutoModelForCausalLM, AutoTokenizer

model = from_transformers(
		AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
		AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
)
json_schema = '{"type": "object", "properties": {"answer": {"type": "number"}}}'
result = model("What's 2 + 2? Respond in a json", JsonSchema(json_schema))

# creating a complex regex pattern
from outlines import from_transformers
from outlines.types import at_least, either, integer, optional
from transformers import AutoModelForCausalLM, AutoTokenizer

model = from_transformers(
	AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct"),
	AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
)
regex_term = "I have " + integer + either("dog", "cat") + optional("s")
result = model("How many pets do you have", regex_term)
```


================================================
FILE: outlines/templates.py
================================================
"""Create templates to easily build prompts."""

import functools
import inspect
import json
import os
import re
import textwrap
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Callable, Dict, Optional, Type, cast
import warnings

import jinja2
from pydantic import BaseModel
from PIL import Image as PILImage

from outlines.inputs import Image


def Vision(prompt: str, image: PILImage.Image) -> list:
    """Pair a text prompt with an image as a model input.

    This factory function stands in for the deprecated `Vision` class until
    it is fully removed in outlines v1.2.0. Every call emits a
    `DeprecationWarning` before the result is built.

    Parameters
    ----------
    prompt
        The prompt to use to generate the response.
    image
        The image to use to generate the response.

    Returns
    -------
    list
        A two-element list: the prompt followed by the image wrapped in an
        `outlines.inputs.Image` instance.
    """
    deprecation_notice = """
        The Vision function is deprecated and will be removed in outlines 1.2.0.
        Instead of using Vision, please use a prompt along with an
        outlines.inputs.Image instance.
        For instance:
        ```python
        import openai
        from outlines import Image, from_openai
        model = from_openai("gpt-4o")
        response = model(
            ["A beautiful image of a cat", Image(my_image)],
            max_tokens=100
        )
        ```
        """
    # stacklevel=2 attributes the warning to the caller of `Vision`.
    warnings.warn(deprecation_notice, DeprecationWarning, stacklevel=2)

    wrapped_image = Image(image)
    return [prompt, wrapped_image]


@dataclass
class Template:
    """Represents a prompt template.

    We return a `Template` class instead of a simple function so the
    template can be accessed by callers.

    Attributes
    ----------
    template
        The Jinja template that is rendered when the instance is called.

    """
    template: jinja2.Template

    def __call__(self, *args, **kwargs) -> str:
        """Render and return the template.

        Parameters
        ----------
        *args
            Positional arguments are accepted but are not forwarded to the
            template.
        **kwargs
            The template variables, forwarded to the Jinja `render` call.

        Returns
        -------
        str
            The rendered template as a Python string.

        """
        return self.template.render(**kwargs)

    @classmethod
    def from_string(
        cls, content: str, filters: Optional[Dict[str, Callable]] = None
    ):
        """Create a `Template` instance from a string containing a Jinja
        template.

        Parameters
        ----------
        content : str
            The string content to be converted into a template.
        filters : Optional[Dict[str, Callable]]
            Additional Jinja filters, as a map between the filter's name and
            the corresponding function. Defaults to no additional filter.

        Returns
        -------
        Template
            An instance of the class with the provided content as a template.

        """
        # `None` replaces a mutable `{}` default; always hand a real dict to
        # the builder.
        filters = {} if filters is None else filters
        return cls(build_template_from_string(content, filters))

    @classmethod
    def from_file(
        cls, path: Path, filters: Optional[Dict[str, Callable]] = None
    ):
        """Create a `Template` instance from a file containing a Jinja
        template.

        Note: This method does not allow to include and inheritance to
        reference files that are outside the folder or subfolders of the file
        given to `from_file`.

        Parameters
        ----------
        path : Path
            The path to the file containing the Jinja template.
        filters : Optional[Dict[str, Callable]]
            Additional Jinja filters, as a map between the filter's name and
            the corresponding function. Defaults to no additional filter.

        Returns
        -------
        Template
            An instance of the Template class with the template loaded from the
            file.

        """
        # We don't use a `Signature` here because it seems not feasible to
        # infer one from a Jinja2 environment that is
        # split across multiple files (since e.g. we support features like
        # Jinja2 includes and template inheritance)
        filters = {} if filters is None else filters
        return cls(build_template_from_file(path, filters))


def build_template_from_string(
    content: str, filters: Optional[Dict[str, Callable]] = None
) -> jinja2.Template:
    """Build a Jinja template from a string after cleaning its whitespace.

    Parameters
    ----------
    content
        The string containing the Jinja template.
    filters
        Additional Jinja filters, as a map between the filter's name and the
        corresponding function. Defaults to no additional filter.

    Returns
    -------
    jinja2.Template
        The template created from the cleaned content, in an environment
        built by `create_jinja_env` without a loader.

    """
    # `None` replaces a mutable `{}` default.
    if filters is None:
        filters = {}

    # Dedent, and remove extra linebreak
    cleaned_template = inspect.cleandoc(content)

    # Add linebreak if there were any extra linebreaks that
    # `cleandoc` would have removed
    ends_with_linebreak = content.replace(" ", "").endswith("\n\n")
    if ends_with_linebreak:
        cleaned_template += "\n"

    # Remove extra whitespaces, except those that immediately follow a newline symbol.
    # This is necessary to avoid introducing whitespaces after backslash `\` characters
    # used to continue to the next line without linebreak.
    cleaned_template = re.sub(r"(?![\r\n])(\b\s+)", " ", cleaned_template)

    env = create_jinja_env(None, filters)

    return env.from_string(cleaned_template)


def build_template_from_file(
    path: Path, filters: Optional[Dict[str, Callable]] = None
) -> jinja2.Template:
    """Build a Jinja template from a file.

    The environment uses a `jinja2.FileSystemLoader` rooted at the directory
    that contains the file, and the template is looked up by its file name.

    Parameters
    ----------
    path
        The path to the file containing the Jinja template.
    filters
        Additional Jinja filters, as a map between the filter's name and the
        corresponding function. Defaults to no additional filter.

    Returns
    -------
    jinja2.Template
        The template loaded from the file.

    """
    # `None` replaces a mutable `{}` default.
    if filters is None:
        filters = {}

    file_directory = os.path.dirname(os.path.abspath(path))
    env = create_jinja_env(jinja2.FileSystemLoader(file_directory), filters)

    return env.get_template(os.path.basename(path))


def create_jinja_env(
    loader: Optional[jinja2.BaseLoader], filters: Dict[str, Callable]
) -> jinja2.Environment:
    """Build a fresh Jinja environment with the Outlines filters installed.

    The following filters are pre-defined in the environment:
    - `name`: get the name of a function
    - `description`: get a function's docstring
    - `source`: get a function's source code
    - `signature`: get a function's signature
    - `args`: get a function's arguments
    - `schema`: display a JSON Schema

    Filters supplied by the caller are installed afterwards, so they can
    extend the set above or replace any of its entries.

    Parameters
    ----------
    loader
       An optional `BaseLoader` instance
    filters
       A dictionary of filters, map between the filter's name and the
       corresponding function.

    Returns
    -------
    jinja2.Environment
        The configured environment.

    """
    environment = jinja2.Environment(
        loader=loader,
        trim_blocks=True,
        lstrip_blocks=True,
        keep_trailing_newline=True,
        undefined=jinja2.StrictUndefined,
    )

    predefined_filters = {
        "name": get_fn_name,
        "description": get_fn_description,
        "source": get_fn_source,
        "signature": get_fn_signature,
        "schema": get_schema,
        "args": get_fn_args,
    }
    environment.filters.update(predefined_filters)

    # User filters come last so that they win over the pre-defined ones.
    for filter_name, user_filter in filters.items():
        environment.filters[filter_name] = user_filter

    return environment


def get_fn_name(fn: Callable):
    """Return the name of a callable.

    The callable's `__name__` is used when it has one; otherwise the name of
    its type is returned.

    Raises
    ------
    TypeError
        If `fn` is not callable.

    """
    if not callable(fn):
        raise TypeError("The `name` filter only applies to callables.")

    if hasattr(fn, "__name__"):
        return fn.__name__
    return type(fn).__name__


def get_fn_args(fn: Callable):
    """Return the parameters of a callable as a comma-separated string.

    Each parameter is rendered as `inspect` prints it, i.e. with its
    annotation and default value if provided.

    Raises
    ------
    TypeError
        If `fn` is not callable.

    """
    if not callable(fn):
        raise TypeError("The `args` filter only applies to callables.")

    parameters = inspect.signature(fn).parameters
    return ", ".join(map(str, parameters.values()))


def get_fn_description(fn: Callable):
    """Return the first line of a callable's docstring.

    An empty string is returned when the callable has no docstring.

    Raises
    ------
    TypeError
        If `fn` is not callable.

    """
    if not callable(fn):
        raise TypeError("The `description` filter only applies to callables.")

    doc = inspect.getdoc(fn)
    if doc is None:
        return ""

    first_line = doc.split("\n")[0]
    return first_line.strip()


def get_fn_source(fn: Callable):
    """Return the source code of a callable, starting at its `def` keyword.

    The source is dedented first; anything that precedes the first `def`
    keyword is dropped.

    Raises
    ------
    TypeError
        If `fn` is not callable, or if no `def` keyword is found in its
        source code.

    """
    if not callable(fn):
        raise TypeError("The `source` filter only applies to callables.")

    dedented = textwrap.dedent(inspect.getsource(fn))
    # DOTALL lets `.*` run across newlines, up to the end of the source.
    from_def = re.compile(r"(\bdef\b.*)", re.DOTALL)
    match = from_def.search(dedented)
    if match is None:  # pragma: no cover
        raise TypeError("Could not read the function's source code")

    return match.group(0)


def get_fn_signature(fn: Callable):
    """Return the signature of a callable.

    The signature is read from the callable's source code: it is the text
    found between the first opening parenthesis and the next closing
    parenthesis, provided that text is not empty.

    Parameters
    ----------
    fn
        The callable to inspect.

    Returns
    -------
    str
        The text between the parentheses, or an empty string when the source
        code contains no such non-empty pair.

    Raises
    ------
    TypeError
        If `fn` is not callable.

    """
    if not callable(fn):
        # This message used to name the `source` filter (copy-paste slip).
        raise TypeError("The `signature` filter only applies to callables.")

    source = textwrap.dedent(inspect.getsource(fn))
    # NOTE: the match stops at the first `)`, so a parameter list that itself
    # contains parentheses is truncated.
    re_search = re.search(re.compile(r"\(([^)]+)\)"), source)
    if re_search is None:  # pragma: no cover
        signature = ""
    else:
        signature = re_search.group(1)

    return signature


@functools.singledispatch
def get_schema(model: Any):
    """Render the schema of `model`.

    This is the single-dispatch fallback, used for every type that has no
    registered implementation: it always raises `NotImplementedError`.
    """
    model_type = type(model)
    raise NotImplementedError(
        f"No schema rendering function defined for type {model_type}."
    )


@get_schema.register(dict)
def get_schema_dict(model: Dict):
    """Render a dictionary as a JSON string indented by two spaces."""
    pretty_printed = json.dumps(model, indent=2)
    return pretty_printed


@get_schema.register(type(BaseModel))
def get_schema_pydantic(model: Type[BaseModel]):
    """Render the simplified schema of a Pydantic model as a JSON string.

    The raw JSON schema of the model is reduced with `parse_pydantic_schema`
    and dumped with an indentation of two spaces.
    """
    # Models exposing `model_json_schema` store nested schemas under
    # "$defs"; the fallback `schema()` API (presumably older Pydantic
    # versions) stores them under "definitions".
    if not hasattr(model, "model_json_schema"):  # pragma: no cover
        raw_schema, def_key = model.schema(), "definitions"
    else:
        raw_schema, def_key = model.model_json_schema(), "$defs"

    nested_definitions = raw_schema.get(def_key)
    simplified = parse_pydantic_schema(raw_schema, nested_definitions)

    return json.dumps(simplified, indent=2)


def parse_pydantic_schema(raw_schema, definitions):
    """Parse the output of `BaseModel.[schema|model_json_schema]()`.

    This recursively follows the references to other schemas in case
    of nested models. Other schemas are stored under the "definitions"
    (or "$defs") key in the schema of the top-level model.

    Parameters
    ----------
    raw_schema
        A JSON schema dictionary with a "properties" entry.
    definitions
        The dictionary of referenced schemas, indexed by name. It is only
        read when a property holds a "$ref".

    Returns
    -------
    dict
        A dictionary that maps each property name to its description when it
        has one, to the parsed schema of the referenced model when the
        property is a reference to a schema with properties, and to the
        `<name>` placeholder otherwise.

    """
    simple_schema = {}
    for name, value in raw_schema["properties"].items():
        if "description" in value:
            simple_schema[name] = value["description"]
        elif "$ref" in value: # pragma: no cover
            # References look like "#/$defs/<Name>": the third path element
            # is the key in `definitions`.
            refs = value["$ref"].split("/")
            referenced = definitions[refs[2]]
            if "properties" in referenced:
                simple_schema[name] = parse_pydantic_schema(
                    referenced, definitions
                )
            else:
                # Referenced schemas without properties (e.g. an enum) cannot
                # be expanded; use the placeholder instead of raising a
                # `KeyError`.
                simple_schema[name] = f"<{name}>"
        else:
            simple_schema[name] = f"<{name}>"

    return simple_schema


================================================
FILE: outlines/types/__init__.py
================================================
"""Output types for structured generation and regex DSL."""

from outlines.types.dsl import (
    CFG,
    Choice,
    JsonSchema,
    Regex,
    at_least,
    at_most,
    between,
    cfg,
    either,
    exactly,
    json_schema,
    one_or_more,
    optional,
    regex,
    zero_or_more,
)

from . import locale

# `airports` is an optional submodule: when importing it raises an
# `ImportError`, the name `airports` is bound to a placeholder object whose
# attribute access raises an `ImportError` with installation instructions.
try:
    from . import airports
except ImportError:  # pragma: no cover
    class AirportImportError:
        """Dummy module that raises an error when accessed."""
        def __getattr__(self, name):
            raise ImportError(
                "The 'airportsdata' package is required to use airport types. "
                "Install it with: pip install 'outlines[airports]'"
            )

    airports = AirportImportError()  # type: ignore

# Same fallback mechanism for the optional `countries` submodule.
try:
    from . import countries
except ImportError:  # pragma: no cover
    class CountryImportError:
        """Dummy module that raises an error when accessed."""
        def __getattr__(self, name):
            raise ImportError(
                "The 'iso3166' package is required to use country types. "
                "Install it with: pip install 'outlines[countries]'"
            )

    countries = CountryImportError()  # type: ignore

__all__ = [
    # Submodules
    "airports",
    "countries",
    "locale",
    # DSL functions and classes
    "Regex",
    "CFG",
    "Choice",
    "JsonSchema",
    "regex",
    "cfg",
    "json_schema",
    "optional",
    "either",
    "exactly",
    "at_least",
    "at_most",
    "between",
    "zero_or_more",
    "one_or_more",
    # Python types
    "string",
    "integer",
    "boolean",
    "number",
    "date",
    "time",
    "datetime",
    # Basic regex types
    "digit",
    "char",
    "newline",
    "whitespace",
    "hex_str",
    "uuid4",
    "ipv4",
    # Document-specific types
    "sentence",
    "paragraph",
    "email",
    "isbn",
]


# Python types
# Double-quoted string that contains no double quote character.
string = Regex(r'"[^"]*"')
# Optionally signed integer without leading zeros.
integer = Regex(r"[+-]?(0|[1-9][0-9]*)")
# Capitalized boolean literals.
boolean = Regex("(True|False)")
# Integer with an optional fractional part and an optional exponent. Note
# that the sign of the exponent is mandatory.
number = Regex(rf"{integer.pattern}(\.[0-9]+)?([eE][+-][0-9]+)?")
# YYYY-MM-DD. The day is not validated against the month, and "00" is
# accepted as a day.
date = Regex(r"(\d{4})-(0[1-9]|1[0-2])-([0-2][0-9]|3[0-1])")
# HH:MM:SS on a 24-hour clock.
time = Regex(r"([0-1][0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])")
# A date and a time separated by a single whitespace character.
datetime = Regex(rf"({date.pattern})(\s)({time.pattern})")

# Basic regex types
digit = Regex(r"\d")
char = Regex(r"\w")
newline = Regex(r"(\r\n|\r|\n)")  # Matches newlines on Linux, Windows & MacOS
whitespace = Regex(r"\s")
# Hexadecimal digits with an optional "0x" prefix.
hex_str = Regex(r"(0x)?[a-fA-F0-9]+")
# 8-4-4-4-12 hexadecimal groups; the third group starts with "4" and the
# fourth group starts with one of 8, 9, a, b (either case).
uuid4 = Regex(
    r"[a-fA-F0-9]{8}-"
    r"[a-fA-F0-9]{4}-"
    r"4[a-fA-F0-9]{3}-"
    r"[89abAB][a-fA-F0-9]{3}-"
    r"[a-fA-F0-9]{12}"
)
# Four dot-separated groups, each in the range 0-255.
ipv4 = Regex(
    r"((25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})\.){3}"
    r"(25[0-5]|2[0-4][0-9]|1?[0-9]{1,2})"
)

# Document-specific types
# Starts with an uppercase ASCII letter and ends with ".", "!" or "?".
sentence = Regex(r"[A-Z].*\s*[.!?]")
# One or more whitespace-separated sentences followed by at least one "\n".
paragraph = Regex(rf"{sentence.pattern}(?:\s+{sentence.pattern})*\n+")


# The following regex is RFC 5322 compliant and was found at:
# https://emailregex.com/
email = Regex(
    r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
)

# Matches any ISBN number. Note that this is not completely correct as not all
# 10 or 13 digits numbers are valid ISBNs. See https://en.wikipedia.org/wiki/ISBN
# Taken from O'Reilly's Regular Expression Cookbook:
# https://www.oreilly.com/library/view/regular-expressions-cookbook/9781449327453/ch04s13.html
#
# TODO: The check digit can only be computed by calling a function to compute it dynamically
isbn = Regex(
    r"(?:ISBN(?:-1[03])?:? )?(?=[0-9X]{10}$|(?=(?:[0-9]+[- ]){3})[- 0-9X]{13}$|97[89][0-9]{10}$|(?=(?:[0-9]+[- ]){4})[- 0-9]{17}$)(?:97[89][- ]?)?[0-9]{1,5}[- ]?[0-9]+[- ]?[0-9]+[- ]?[0-9X]"
)


================================================
FILE: outlines/types/airports.py
================================================
"""Generate valid airport codes."""

from enum import Enum

import airportsdata

# (name, value) pairs for every record returned by `airportsdata.load()` that
# has a non-empty "iata" entry; the code is both the member name and value.
AIRPORT_IATA_LIST = [
    (v["iata"], v["iata"]) for v in airportsdata.load().values() if v["iata"]
]
# Enum built with the functional API from the pairs above.
IATA = Enum("Airport", AIRPORT_IATA_LIST)  # type:ignore


================================================
FILE: outlines/types/countries.py
================================================
"""Generate valid country codes and names."""

from enum import Enum

from iso3166 import countries


def get_country_flags():
    """Map every member name of the `Alpha2` enum to its Unicode flag.

    A flag is the concatenation of two regional indicator symbols, obtained
    by shifting each of the two letters of the code by the distance that
    separates "A" from the regional indicator symbol "🇦".

    Returns
    -------
    dict
        A dictionary that maps each alpha-2 code to its flag.

    """
    base = ord("🇦")
    flags = {}
    for code in Alpha2:
        first_letter, second_letter = code.name[0], code.name[1]
        first_symbol = chr(base + ord(first_letter) - ord("A"))
        second_symbol = chr(base + ord(second_letter) - ord("A"))
        flags[code.name] = first_symbol + second_symbol
    return flags


# Each enum below is built with the functional API from (name, value) pairs
# in which the name and the value are the same string.

# Two-letter country codes.
ALPHA_2_CODE = [(country.alpha2, country.alpha2) for country in countries]
Alpha2 = Enum("Alpha_2", ALPHA_2_CODE)  # type:ignore

# Three-letter country codes.
ALPHA_3_CODE = [(country.alpha3, country.alpha3) for country in countries]
Alpha3 = Enum("Alpha_3", ALPHA_3_CODE)  # type:ignore

# Numeric country codes, converted to strings.
NUMERIC_CODE = [(str(country.numeric), str(country.numeric)) for country in countries]
Numeric = Enum("Numeric_code", NUMERIC_CODE)  # type:ignore

# Country names.
NAME = [(country.name, country.name) for country in countries]
Name = Enum("Name", NAME)  # type:ignore

# Unicode flags derived from the members of `Alpha2`; this must run after
# `Alpha2` is defined since `get_country_flags` iterates over it.
flag_mapping = get_country_flags()
FLAG = [(flag, flag) for code, flag in flag_mapping.items()]
Flag = Enum("Flag", FLAG)  # type:ignore


================================================
FILE: outlines/types/dsl.py
================================================
"""Regular expression DSL and output types for structured generation.

This module contains elements related to three logical steps in the use of
output types for structured generation:

1. Definition of `Term` classes that contain output type definitions. That
   includes both terms intended to be used by themselves such as `JsonSchema`
   or `CFG` and terms that are part of the regular expression DSL such as
   `Alternatives` or `KleeneStar` (and the related functions).
2. Conversion of Python types into `Term` instances (`python_types_to_terms`).
3. Conversion of a `Term` instance into a regular expression (`to_regex`).

"""

import json
import re
import sys
import warnings
from dataclasses import dataclass
from enum import EnumMeta
from types import FunctionType
from typing import (
    Any,
    List,
    Literal,
    Optional as OptionalType,
    Union,
    get_args,
)
import jsonschema
from genson import SchemaBuilder
from pydantic import (
    BaseModel,
    GetCoreSchemaHandler,
    GetJsonSchemaHandler,
    TypeAdapter,
)
from pydantic.json_schema import JsonSchemaValue
from pydantic_core import core_schema as cs
from outlines_core.json_schema import build_regex_from_schema

import outlines.types as types
from outlines import grammars
from outlines.types.json_schema_utils import (
    json_schema_dict_to_pydantic,
    json_schema_dict_to_typeddict,
    json_schema_dict_to_dataclass,
)
from outlines.types.utils import (
    get_schema_from_signature,
    is_int,
    is_int_instance,
    is_float,
    is_float_instance,
    is_str,
    is_str_instance,
    is_bool,
    is_datetime,
    is_date,
    is_time,
    is_native_dict,
    is_dict_instance,
    is_dataclass,
    is_typed_dict,
    is_pydantic_model,
    is_genson_schema_builder,
    is_literal,
    is_union,
    is_enum,
    is_callable,
    is_typing_list,
    is_typing_tuple,
    is_typing_dict,
)

if sys.version_info >= (3, 12):  # pragma: no cover
    from typing import _TypedDictMeta  # type: ignore
else:  # pragma: no cover
    from typing_extensions import _TypedDictMeta  # type: ignore


class Term:
    """Base class of the nodes of the output type DSL.

    Terms can be combined with other terms or with plain strings using `+`
    (sequence) and `|` (alternatives); plain strings are converted to
    `String` terms.

    Term instances such as `Regex` can be used as a type in a Pydantic model
    definition. They are translated to JSON Schema as a "string" field with
    the "pattern" keyword set to the regular expression returned by
    `to_regex`. The class also handles validation against that expression.

    Examples
    --------

    >>> from outlines.types import Regex
    >>> from pydantic import BaseModel
    >>>
    >>> age_type = Regex("[0-9]+")
    >>>
    >>> class User(BaseModel):
    >>>     name: str
    >>>     age: age_type

    """

    def __add__(self: "Term", other: "Term") -> "Sequence":
        rhs = String(str(other)) if is_str_instance(other) else other
        return Sequence([self, rhs])

    def __radd__(self: "Term", other: "Term") -> "Sequence":
        lhs = String(str(other)) if is_str_instance(other) else other
        return Sequence([lhs, self])

    def __or__(self: "Term", other: "Term") -> "Alternatives":
        rhs = String(str(other)) if is_str_instance(other) else other
        return Alternatives([self, rhs])

    def __ror__(self: "Term", other: "Term") -> "Alternatives":
        lhs = String(str(other)) if is_str_instance(other) else other
        return Alternatives([lhs, self])

    def __get_validator__(self, _core_schema):
        """Return a one-argument callable that delegates to `validate`."""
        return lambda input_value: self.validate(input_value)

    def __get_pydantic_core_schema__(
        self, source_type: Any, handler: GetCoreSchemaHandler
    ) -> cs.CoreSchema:
        """Return a plain validator schema that delegates to `validate`."""
        return cs.no_info_plain_validator_function(
            lambda value: self.validate(value)
        )

    def __get_pydantic_json_schema__(
        self, core_schema: cs.CoreSchema, handler: GetJsonSchemaHandler
    ) -> JsonSchemaValue:
        """Describe the term as a string constrained by its regex."""
        json_schema_value = {"type": "string", "pattern": to_regex(self)}
        return json_schema_value

    def validate(self, value: str) -> str:
        """Return `value` unchanged if it fully matches the term's regex.

        Raises
        ------
        ValueError
            If the string representation of `value` is not fully matched.

        """
        pattern = to_regex(self)
        if re.compile(pattern).fullmatch(str(value)) is None:
            raise ValueError(
                f"Input should be in the language of the regular expression {pattern}"
            )
        return value

    def matches(self, value: str) -> bool:
        """Check that a given value is in the language defined by the Term.

        We make the assumption that the language defined by the term can
        be defined with a regular expression.

        """
        found = re.compile(to_regex(self)).fullmatch(str(value))
        return found is not None

    def display_ascii_tree(self, indent="", is_last=True) -> str:
        """Display the regex tree in ASCII format."""
        connector = "└── " if is_last else "├── "
        node_line = indent + connector + self._display_node() + "\n"

        # Children are drawn one level deeper; a vertical bar is kept when
        # this node still has siblings below it.
        child_indent = indent + ("    " if is_last else "│   ")

        # Subclasses with children override `_display_children`.
        return node_line + self._display_children(child_indent)

    def _display_node(self):
        raise NotImplementedError

    def _display_children(self, indent: str) -> str:
        """Display the children of this node. Override in subclasses with children."""
        return ""

    def __str__(self):
        return self.display_ascii_tree()

    def optional(self) -> "Optional":
        """Shortcut for `optional(self)`."""
        return optional(self)

    def exactly(self, count: int) -> "QuantifyExact":
        """Shortcut for `exactly(count, self)`."""
        return exactly(count, self)

    def at_least(self, count: int) -> "QuantifyMinimum":
        """Shortcut for `at_least(count, self)`."""
        return at_least(count, self)

    def at_most(self, count: int) -> "QuantifyMaximum":
        """Shortcut for `at_most(count, self)`."""
        return at_most(count, self)

    def between(self, min_count: int, max_count: int) -> "QuantifyBetween":
        """Shortcut for `between(min_count, max_count, self)`."""
        return between(min_count, max_count, self)

    def one_or_more(self) -> "KleenePlus":
        """Shortcut for `one_or_more(self)`."""
        return one_or_more(self)

    def zero_or_more(self) -> "KleeneStar":
        """Shortcut for `zero_or_more(self)`."""
        return zero_or_more(self)


@dataclass
class String(Term):
    """Class representing a literal string.

    Parameters
    ----------
    value
        The literal text; it is escaped when converted to a regular
        expression.

    """
    value: str

    def _display_node(self) -> str:
        return "String('{}')".format(self.value)

    def __repr__(self):
        return "String(value='{}')".format(self.value)


@dataclass
class Regex(Term):
    """Term defined directly by a regular expression.

    Parameters
    ----------
    pattern
        The regular expression as a string.

    """
    pattern: str

    def _display_node(self) -> str:
        return "Regex('{}')".format(self.pattern)

    def __repr__(self):
        return "Regex(pattern='{}')".format(self.pattern)


@dataclass
class CFG(Term):
    """Term defined by a context-free grammar.

    Two instances are equal when their definitions are equal.

    Parameters
    ----------
    definition
        The definition of the context-free grammar as a string.

    """
    definition: str

    def _display_node(self) -> str:
        return "CFG('{}')".format(self.definition)

    def __repr__(self):
        return "CFG(definition='{}')".format(self.definition)

    def __eq__(self, other):
        if isinstance(other, CFG):
            return self.definition == other.definition
        return False

    @classmethod
    def from_file(cls, path: str) -> "CFG":
        """Build a CFG instance from the content of a grammar file.

        Parameters
        ----------
        path : str
            The path to the file containing the CFG definition.
        Returns
        -------
        CFG
            A CFG instance.

        """
        with open(path, "r") as grammar_file:
            return cls(grammar_file.read())


class JsonSchema(Term):
    """Term defined by a JSON schema.

    The instance can be created from a dictionary, a string, a Pydantic
    model, a typed dict, a dataclass, or a genSON schema builder. Whatever
    the input, the schema is stored as a JSON string in `schema`.

    """
    schema: str
    whitespace_pattern: OptionalType[str]

    def __init__(
        self,
        schema: Union[
            dict, str, type[BaseModel], _TypedDictMeta, type, SchemaBuilder
        ],
        whitespace_pattern: OptionalType[str] = None,
        ensure_ascii: bool = True,
    ):
        """
        Parameters
        ----------
        schema
            The object containing the JSON schema.
        whitespace_pattern
            The pattern to use to match whitespace characters.
        ensure_ascii
            Whether to ensure the schema is ASCII-only.

        Raises
        ------
        ValueError
            If the type of `schema` is not one of the supported types.

        """
        serialized: str

        if is_dict_instance(schema):
            serialized = json.dumps(schema, ensure_ascii=ensure_ascii)
        elif is_str_instance(schema):
            serialized = str(schema)
        elif is_pydantic_model(schema):
            serialized = json.dumps(
                schema.model_json_schema(),  # type: ignore
                ensure_ascii=ensure_ascii,
            )
        elif is_typed_dict(schema) or is_dataclass(schema):
            # Typed dicts and dataclasses are both converted by Pydantic.
            serialized = json.dumps(
                TypeAdapter(schema).json_schema(), ensure_ascii=ensure_ascii
            )
        elif is_genson_schema_builder(schema):
            serialized = schema.to_json(ensure_ascii=ensure_ascii)  # type: ignore
        else:
            raise ValueError(
                f"Cannot parse schema {schema}. The schema must be either "
                + "a Pydantic class, typed dict, a dataclass, a genSON schema "
                + "builder or a string or dict that contains the JSON schema "
                + "specification"
            )

        # Reject invalid schemas before storing anything on the instance.
        jsonschema.Draft7Validator.check_schema(json.loads(serialized))
        self.schema = serialized
        self.whitespace_pattern = whitespace_pattern

    @classmethod
    def is_json_schema(cls, obj: Any) -> bool:
        """Tell whether the object provided is a JSON schema type.

        Parameters
        ----------
        obj: Any
            The object to check

        Returns
        -------
        bool
            True if the object is a JSON schema type, False otherwise

        """
        if isinstance(obj, cls):
            return True
        return (
            is_pydantic_model(obj)
            or is_typed_dict(obj)
            or is_dataclass(obj)
            or is_genson_schema_builder(obj)
        )

    @classmethod
    def convert_to(
        cls,
        schema: Union[
            "JsonSchema",
            type[BaseModel],
            _TypedDictMeta,
            type,
            SchemaBuilder,
        ],
        target_types: List[Literal[
            "str",
            "dict",
            "pydantic",
            "typeddict",
            "dataclass",
            "genson",
        ]],
    ) -> Union[str, dict, type[BaseModel], _TypedDictMeta, type, SchemaBuilder]:
        """Convert a JSON schema type to a different JSON schema type.

        If the schema provided is already of a type in the target_types, it
        is returned unchanged. Otherwise the target types are tried in the
        order given and the first successful conversion is returned; a
        failed conversion emits a warning.

        Parameters
        ----------
        schema: Union[JsonSchema, type[BaseModel], _TypedDictMeta, type, SchemaBuilder]
            The schema to convert
        target_types: List[Literal["str", "dict", "pydantic", "typeddict", "dataclass", "genson"]]
            The target types to convert to

        Raises
        ------
        ValueError
            If the schema could not be converted to any of the target types.

        """
        is_instance = isinstance(schema, cls)

        # Nothing to convert when the schema already has a requested type.
        if is_instance:
            if "str" in target_types:
                return schema.schema
            if "dict" in target_types:
                return json.loads(schema.schema)
        else:
            passthrough_checks = (
                (is_pydantic_model, "pydantic"),
                (is_typed_dict, "typeddict"),
                (is_dataclass, "dataclass"),
                (is_genson_schema_builder, "genson"),
            )
            for belongs_to, label in passthrough_checks:
                if belongs_to(schema) and label in target_types:
                    return schema

        # Every conversion goes through the JSON string/dict representation.
        schema_str = schema.schema if is_instance else cls(schema).schema
        schema_dict = json.loads(schema_str)

        for target_type in target_types:
            try:
                if target_type == "str":
                    return schema_str
                if target_type == "dict":
                    return schema_dict
                if target_type == "pydantic":
                    return json_schema_dict_to_pydantic(schema_dict)
                if target_type == "typeddict":
                    return json_schema_dict_to_typeddict(schema_dict)
                if target_type == "dataclass":
                    return json_schema_dict_to_dataclass(schema_dict)
                # No conversion available for genson
            except Exception as e:  # pragma: no cover
                warnings.warn(
                    f"Cannot convert schema type {type(schema)} to {target_type}: {e}"
                )
                continue

        raise ValueError(
            f"Cannot convert schema type {type(schema)} to any of the target "
            f"types {target_types}"
        )

    def _display_node(self) -> str:
        return "JsonSchema('{}')".format(self.schema)

    def __repr__(self):
        return "JsonSchema(schema='{}')".format(self.schema)

    def __eq__(self, other):
        """Compare the parsed schemas; `whitespace_pattern` is ignored."""
        if not isinstance(other, JsonSchema):
            return False
        try:
            return json.loads(self.schema) == json.loads(other.schema)
        except json.JSONDecodeError:  # pragma: no cover
            # Fall back to comparing the raw strings.
            return self.schema == other.schema

    @classmethod
    def from_file(cls, path: str) -> "JsonSchema":
        """Build a JsonSchema instance from a .json file containing a JSON
        schema.

        Parameters
        ----------
        path:
            The path to the file containing the JSON schema.
        Returns
        -------
        JsonSchema
            A JsonSchema instance.

        """
        with open(path, "r") as schema_file:
            loaded = json.load(schema_file)
        return cls(loaded)


@dataclass
class Choice(Term):
    """Term that matches exactly one item among a list of items.

    Parameters
    ----------
    items
        The items to choose from.

    """
    items: List[Any]

    def _display_node(self) -> str:
        return f"Choice({self.items!r})"

    def __repr__(self):
        return f"Choice(items={self.items!r})"


@dataclass
class KleeneStar(Term):
    """Zero or more repetitions of `term`."""
    term: Term

    def _display_node(self) -> str:
        return "KleeneStar(*)"

    def _display_children(self, indent: str) -> str:
        # The wrapped term is the only child, hence always the last one.
        return self.term.display_ascii_tree(indent, is_last=True)

    def __repr__(self):
        return f"KleeneStar(term={self.term!r})"


@dataclass
class KleenePlus(Term):
    """One or more repetitions of `term`."""
    term: Term

    def _display_node(self) -> str:
        return "KleenePlus(+)"

    def _display_children(self, indent: str) -> str:
        # The wrapped term is the only child, hence always the last one.
        return self.term.display_ascii_tree(indent, is_last=True)

    def __repr__(self):
        return f"KleenePlus(term={self.term!r})"


@dataclass
class Optional(Term):
    """Zero or one occurrence of `term`."""
    term: Term

    def _display_node(self) -> str:
        return "Optional(?)"

    def _display_children(self, indent: str) -> str:
        # The wrapped term is the only child, hence always the last one.
        return self.term.display_ascii_tree(indent, is_last=True)

    def __repr__(self):
        return f"Optional(term={self.term!r})"


@dataclass
class Alternatives(Term):
    """A choice between several terms."""
    terms: List[Term]

    def _display_node(self) -> str:
        return "Alternatives(|)"

    def _display_children(self, indent: str) -> str:
        last_position = len(self.terms) - 1
        rendered = []
        for position, child in enumerate(self.terms):
            rendered.append(
                child.display_ascii_tree(indent, position == last_position)
            )
        return "".join(rendered)

    def __repr__(self):
        return f"Alternatives(terms={self.terms!r})"


@dataclass
class Sequence(Term):
    """Several terms that follow one another."""
    terms: List[Term]

    def _display_node(self) -> str:
        return "Sequence"

    def _display_children(self, indent: str) -> str:
        last_position = len(self.terms) - 1
        rendered = []
        for position, child in enumerate(self.terms):
            rendered.append(
                child.display_ascii_tree(indent, position == last_position)
            )
        return "".join(rendered)

    def __repr__(self):
        return f"Sequence(terms={self.terms!r})"


@dataclass
class QuantifyExact(Term):
    """Exactly `count` repetitions of `term`."""
    term: Term
    count: int

    def _display_node(self) -> str:
        return "Quantify({{{}}})".format(self.count)

    def _display_children(self, indent: str) -> str:
        # The wrapped term is the only child, hence always the last one.
        return self.term.display_ascii_tree(indent, is_last=True)

    def __repr__(self):
        return f"QuantifyExact(term={self.term!r}, count={self.count!r})"


@dataclass
class QuantifyMinimum(Term):
    """At least `min_count` repetitions of `term`."""
    term: Term
    min_count: int

    def _display_node(self) -> str:
        return "Quantify({{{},}})".format(self.min_count)

    def _display_children(self, indent: str) -> str:
        # The wrapped term is the only child, hence always the last one.
        return self.term.display_ascii_tree(indent, is_last=True)

    def __repr__(self):
        return f"QuantifyMinimum(term={self.term!r}, min_count={self.min_count!r})"


@dataclass
class QuantifyMaximum(Term):
    """At most `max_count` repetitions of `term`."""
    term: Term
    max_count: int

    def _display_node(self) -> str:
        return "Quantify({{,{}}})".format(self.max_count)

    def _display_children(self, indent: str) -> str:
        # The wrapped term is the only child, hence always the last one.
        return self.term.display_ascii_tree(indent, is_last=True)

    def __repr__(self):
        return f"QuantifyMaximum(term={self.term!r}, max_count={self.max_count!r})"


@dataclass
class QuantifyBetween(Term):
    """Between `min_count` and `max_count` repetitions of `term`.

    Raises
    ------
    ValueError
        On instantiation, if `min_count` is greater than `max_count`.

    """
    term: Term
    min_count: int
    max_count: int

    def __post_init__(self):
        bounds_are_inverted = self.min_count > self.max_count
        if bounds_are_inverted:
            raise ValueError(
                "QuantifyBetween: `max_count` must be greater than `min_count`."
            )

    def _display_node(self) -> str:
        return "Quantify({{{},{}}})".format(self.min_count, self.max_count)

    def _display_children(self, indent: str) -> str:
        # The wrapped term is the only child, hence always the last one.
        return self.term.display_ascii_tree(indent, is_last=True)

    def __repr__(self):
        return (
            f"QuantifyBetween(term={self.term!r}, "
            f"min_count={self.min_count!r}, max_count={self.max_count!r})"
        )


def regex(pattern: str):
    """Create a `Regex` term from a regular expression string."""
    return Regex(pattern=pattern)


def cfg(definition: str):
    """Create a `CFG` term from a grammar definition string."""
    return CFG(definition=definition)


def json_schema(schema: Union[str, dict, type[BaseModel]]):
    """Create a `JsonSchema` term from the object containing the schema."""
    return JsonSchema(schema=schema)


def either(*terms: Union[str, Term]):
    """Build an `Alternatives` term out of the given terms or strings.

    String arguments are wrapped in `String` objects; every other argument
    is kept as is.

    """
    alternatives = []
    for arg in terms:
        alternatives.append(String(arg) if isinstance(arg, str) else arg)
    return Alternatives(alternatives)


def optional(term: Union[Term, str]) -> Optional:
    """Make the term optional (zero or one occurrence)."""
    if isinstance(term, str):
        term = String(term)
    return Optional(term=term)


def exactly(count: int, term: Union[Term, str]) -> QuantifyExact:
    """Repeat the term exactly `count` times.

    A string argument is wrapped in a `String` term.

    """
    if isinstance(term, str):
        term = String(term)
    return QuantifyExact(term=term, count=count)


def at_least(count: int, term: Union[Term, str]) -> QuantifyMinimum:
    """Repeat the term at least `count` times.

    A string argument is wrapped in a `String` term.

    """
    if isinstance(term, str):
        term = String(term)
    return QuantifyMinimum(term=term, min_count=count)


def at_most(count: int, term: Union[Term, str]) -> QuantifyMaximum:
    """Repeat the term at most `count` times.

    A string argument is wrapped in a `String` term.

    """
    if isinstance(term, str):
        term = String(term)
    return QuantifyMaximum(term=term, max_count=count)


def between(min_count: int, max_count: int, term: Union[Term, str]) -> QuantifyBetween:
    """Repeat the term between `min_count` and `max_count` times.

    A string argument is wrapped in a `String` term.

    """
    if isinstance(term, str):
        term = String(term)
    return QuantifyBetween(term=term, min_count=min_count, max_count=max_count)


def zero_or_more(term: Union[Term, str]) -> KleeneStar:
    """Repeat the term zero or more times.

    A string argument is wrapped in a `String` term.

    """
    if isinstance(term, str):
        term = String(term)
    return KleeneStar(term=term)


def one_or_more(term: Union[Term, str]) -> KleenePlus:
    """Repeat the term one or more times.

    A string argument is wrapped in a `String` term.

    """
    if isinstance(term, str):
        term = String(term)
    return KleenePlus(term=term)


def python_types_to_terms(ptype: Any, recursion_depth: int = 0) -> Term:
    """Translate a Python type into the Outlines DSL term that constrains
    the LLM output.

    The checks are applied in a fixed order and the first one that succeeds
    determines the result: `Term` instances, basic types, basic type
    instances, structured types, enums, `typing` generics and finally
    callables.

    Parameters
    ----------
    ptype
        The Python type to convert
    recursion_depth
        Current recursion depth to prevent infinite recursion

    Returns
    -------
    Term
        The corresponding DSL `Term` instance.

    Raises
    ------
    RecursionError
        If `recursion_depth` is greater than 10.
    TypeError
        If the type is not supported.

    """
    if recursion_depth > 10:
        raise RecursionError(
            f"Maximum recursion depth exceeded when converting {ptype}. "
            "This might be due to a recursive type definition."
        )

    # Terms are already in the target representation.
    if isinstance(ptype, Term):
        return ptype

    # Basic types
    if is_int(ptype):
        return types.integer
    if is_float(ptype):
        return types.number
    if is_bool(ptype):
        return types.boolean
    if is_str(ptype):
        return types.string
    if is_native_dict(ptype):
        return CFG(grammars.json)
    if is_time(ptype):
        return types.time
    if is_date(ptype):
        return types.date
    if is_datetime(ptype):
        return types.datetime

    # Basic type instances
    if is_str_instance(ptype):
        return String(ptype)
    if is_int_instance(ptype) or is_float_instance(ptype):
        return Regex(str(ptype))

    # Structured types
    if is_dataclass(ptype) or is_typed_dict(ptype) or is_pydantic_model(ptype):
        return JsonSchema(TypeAdapter(ptype).json_schema())
    if is_genson_schema_builder(ptype):
        return JsonSchema(ptype.to_json())

    if is_enum(ptype):
        member_terms = []
        for member in _get_enum_members(ptype):
            member_terms.append(
                python_types_to_terms(member, recursion_depth + 1)
            )
        return Alternatives(member_terms)

    # Parametrized `typing` constructs
    args = get_args(ptype)
    if is_literal(ptype):
        return _handle_literal(args)
    if is_union(ptype):
        return _handle_union(args, recursion_depth)
    if is_typing_list(ptype):
        return _handle_list(args, recursion_depth)
    if is_typing_tuple(ptype):
        return _handle_tuple(args, recursion_depth)
    if is_typing_dict(ptype):
        return _handle_dict(args, recursion_depth)

    if is_callable(ptype):
        return JsonSchema(get_schema_from_signature(ptype))

    type_name = getattr(ptype, "__name__", ptype)
    raise TypeError(
        f"Type {type_name} is currently not supported. Please open an issue: "
        "https://github.com/dottxt-ai/outlines/issues"
    )


def _get_enum_members(ptype: EnumMeta) -> List[Any]:
    regular_members = [member.value for member in ptype]  # type: ignore
    function_members = []
    for key, value in ptype.__dict__.items():
        if (
            isinstance(value, FunctionType)
            and not (key.startswith('__') and key.endswith('__'))
            and key != '_generate_next_value_'  # Skip this specific method that causes issues
        ):
            function_members.append(value)
    return regular_members + function_members


def _handle_literal(args: tuple) -> Alternatives:
    """Build the alternatives between the values of a `Literal` type."""
    return Alternatives(list(map(python_types_to_terms, args)))


def _ensure_json_quoted(term: Term) -> Term:
    """Surround bare ``String`` terms with double quotes.

    String literal values (from ``Literal`` or ``Enum``) used inside
    container types (``List``, ``Tuple``, ``Dict``) must be quoted so the
    generated regex matches valid JSON. ``Alternatives`` are processed
    recursively; every other term (e.g. the ``Regex``-based
    ``types.string``, which already includes its own quotes) is returned
    untouched.
    """
    if isinstance(term, String):
        return String('"{}"'.format(term.value))
    if not isinstance(term, Alternatives):
        return term
    return Alternatives(list(map(_ensure_json_quoted, term.terms)))


def _handle_union(args: tuple, recursion_depth: int) -> Alternatives:
    """Build the alternatives between the members of a `Union` type.

    `None` members (``Optional[T]``, ``Union[A, B, None]``, ...) cannot be
    converted like the other members. They are represented by a single
    ``String("None")`` placed after the terms of the other members,
    whatever the number of members of the union.

    Parameters
    ----------
    args
        The type arguments of the union.
    recursion_depth
        Current recursion depth, incremented for the conversion of members.

    Returns
    -------
    Alternatives
        One term per non-`None` member, followed by ``String("None")`` if
        the union accepts `None`.

    """
    none_markers = (type(None), None)
    member_terms = [
        python_types_to_terms(arg, recursion_depth + 1)
        for arg in args
        if arg not in none_markers
    ]
    # At least one member was filtered out: the union accepts `None`.
    if len(member_terms) != len(args):
        member_terms.append(String("None"))
    return Alternatives(member_terms)


def _handle_list(args: tuple, recursion_depth: int) -> Sequence:
    if args is None or len(args) != 1:
        raise TypeError(
            "Only homogeneous lists are supported. You should provide exactly "
            + "one argument to `List`, got {args}."
        )
    item_type = _ensure_json_quoted(python_types_to_terms(args[0], recursion_depth + 1))
    return Sequence(
        [
            String("["),
            item_type,
            KleeneStar(Sequence([String(", "), item_type])),
            String("]"),
        ]
    )


def _handle_tuple(args: tuple, recursion_depth: int) -> Union[Sequence, String]:
    """Build the term for a `Tuple` type.

    Three shapes are handled: the empty tuple, which gives the literal
    "()"; the variable-length form ``Tuple[T, ...]``; and the fixed-length
    form, in which each position has its own type. Items are separated by
    ", " and enclosed in parentheses, and bare string items are
    double-quoted.

    """
    if len(args) == 0 or args == ((),):
        return String("()")

    if len(args) == 2 and args[1] is Ellipsis:
        repeated = _ensure_json_quoted(
            python_types_to_terms(args[0], recursion_depth + 1)
        )
        return Sequence(
            [
                String("("),
                repeated,
                KleeneStar(Sequence([String(", "), repeated])),
                String(")"),
            ]
        )

    separator = String(", ")
    elements = []
    for arg in args:
        # A separator precedes every item but the first one.
        if elements:
            elements.append(separator)
        elements.append(
            _ensure_json_quoted(python_types_to_terms(arg, recursion_depth + 1))
        )
    return Sequence([String("("), *elements, String(")")])


def _handle_dict(args: tuple, recursion_depth: int) -> Sequence:
    if args is None or len(args) != 2:
        raise TypeError(f"Dict must have exactly two type arguments. Got {args}.")
    # Add dict support with key:value pairs
    key_type = _ensure_json_quoted(python_types_to_terms(args[0], recursion_depth + 1))
    value_type = _ensure_json_quoted(python_types_to_terms(args[1], recursion_depth + 1))
    return Sequence(
        [
            String("{"),
            Optional(
                Sequence(
                    [
                        key_type,
                        String(":"),
                        value_type,
                        KleeneStar(
                            Sequence([String(", "), key_type, String(":"), value_type])
                        ),
                    ]
                )
            ),
            String("}"),
        ]
    )


def to_regex(term: Term) -> str:
    """Translate a term into the equivalent regular expression string.

    Only self-contained terms are supported, i.e. terms that do not refer
    to another rule.

    Parameters
    ----------
    term
        The term to convert to a regular expression.

    Returns
    -------
    str
        The regular expression as a string.

    Raises
    ------
    TypeError
        If `term` is not one of the term types handled below.

    """
    # Leaf terms.
    if isinstance(term, String):
        return re.escape(term.value)
    if isinstance(term, Regex):
        return f"({term.pattern})"
    if isinstance(term, JsonSchema):
        schema_pattern = build_regex_from_schema(
            term.schema, term.whitespace_pattern
        )
        return f"({schema_pattern})"

    # Choice items are converted to terms before being turned into regexes.
    if isinstance(term, Choice):
        options = [to_regex(python_types_to_terms(item)) for item in term.items]
        return f"({'|'.join(options)})"

    # Unbounded / optional repetition of a single sub-term.
    if isinstance(term, KleeneStar):
        return f"({to_regex(term.term)})*"
    if isinstance(term, KleenePlus):
        return f"({to_regex(term.term)})+"
    if isinstance(term, Optional):
        return f"({to_regex(term.term)})?"

    # Combinations of several sub-terms.
    if isinstance(term, Alternatives):
        branches = [to_regex(subterm) for subterm in term.terms]
        return f"({'|'.join(branches)})"
    if isinstance(term, Sequence):
        pieces = [to_regex(subterm) for subterm in term.terms]
        return f"{''.join(pieces)}"

    # Counted repetition.
    if isinstance(term, QuantifyExact):
        return f"({to_regex(term.term)}){{{term.count}}}"
    if isinstance(term, QuantifyMinimum):
        return f"({to_regex(term.term)}){{{term.min_count},}}"
    if isinstance(term, QuantifyMaximum):
        return f"({to_regex(term.term)}){{,{term.max_count}}}"
    if isinstance(term, QuantifyBetween):
        return f"({to_regex(term.term)}){{{term.min_count},{term.max_count}}}"

    raise TypeError(
        f"Cannot convert object {repr(term)} to a regular expression."
    )


================================================
FILE: outlines/types/json_schema_utils.py
================================================
"""Convert JSON Schema dicts to Python types."""

import sys
from dataclasses import dataclass, field
from typing import Any, Dict, List, Literal, Optional

from pydantic import BaseModel, create_model

if sys.version_info >= (3, 12):  # pragma: no cover
    from typing import _TypedDictMeta, TypedDict  # type: ignore
else:  # pragma: no cover
    from typing_extensions import _TypedDictMeta, TypedDict  # type: ignore


def schema_type_to_python(
    schema: dict,
    caller_target_type: Literal["pydantic", "typeddict", "dataclass"]
) -> Any:
    """Map a JSON Schema dict to the corresponding Python type.

    Parameters
    ----------
    schema: dict
        The JSON Schema dict to convert to a Python type
    caller_target_type: Literal["pydantic", "typeddict", "dataclass"]
        The kind of container the caller is building; it selects which
        converter is used for nested `object` schemas

    Returns
    -------
    Any
        The Python type. `Any` is returned when the schema has no `enum`
        and its `type` is not one of the handled values.

    """
    # An `enum` takes precedence over `type` and becomes a `Literal`.
    if "enum" in schema:
        return Literal[tuple(schema["enum"])]

    json_type = schema.get("type")

    # Scalar JSON types map one-to-one onto Python builtins.
    scalar_types = (
        ("string", str),
        ("integer", int),
        ("number", float),
        ("boolean", bool),
    )
    for type_name, python_type in scalar_types:
        if json_type == type_name:
            return python_type

    if json_type == "array":
        item_schema = schema.get("items", {})
        # A missing or empty `items` schema means the items are unconstrained.
        item_type = (
            schema_type_to_python(item_schema, caller_target_type)
            if item_schema
            else Any
        )
        return List[item_type]  # type: ignore

    if json_type == "object":
        title = schema.get("title")
        if caller_target_type == "pydantic":
            return json_schema_dict_to_pydantic(schema, title)
        if caller_target_type == "typeddict":
            return json_schema_dict_to_typeddict(schema, title)
        if caller_target_type == "dataclass":
            return json_schema_dict_to_dataclass(schema, title)

    return Any


def json_schema_dict_to_typeddict(
    schema: dict,
    name: Optional[str] = None
) -> _TypedDictMeta:
    """Build a TypedDict class from a JSON Schema dict.

    Properties that are not listed under `required` are annotated as
    `Optional[...]`.

    Parameters
    ----------
    schema: dict
        The JSON Schema dict to convert to a TypedDict
    name: Optional[str]
        The name of the TypedDict; `"AnonymousTypedDict"` is used when no
        name is given

    Returns
    -------
    _TypedDictMeta
        The TypedDict class

    """
    required_names = set(schema.get("required", []))

    annotations: Dict[str, Any] = {}
    for prop_name, prop_schema in schema.get("properties", {}).items():
        python_type = schema_type_to_python(prop_schema, "typeddict")
        annotations[prop_name] = (
            python_type if prop_name in required_names else Optional[python_type]
        )

    typeddict_name = name or "AnonymousTypedDict"
    return TypedDict(typeddict_name, annotations)  # type: ignore


def json_schema_dict_to_pydantic(
    schema: dict,
    name: Optional[str] = None
) -> type[BaseModel]:
    """Build a Pydantic BaseModel class from a JSON Schema dict.

    Required properties become mandatory fields; every other property
    becomes an `Optional[...]` field defaulting to `None`.

    Parameters
    ----------
    schema: dict
        The JSON Schema dict to convert to a Pydantic BaseModel
    name: Optional[str]
        The name of the Pydantic BaseModel; `"AnonymousPydanticModel"` is
        used when no name is given

    Returns
    -------
    type[BaseModel]
        The Pydantic BaseModel class

    """
    required_names = set(schema.get("required", []))

    field_definitions: Dict[str, Any] = {}
    for prop_name, prop_schema in schema.get("properties", {}).items():
        python_type = schema_type_to_python(prop_schema, "pydantic")
        if prop_name in required_names:
            # `...` marks the field as required for `create_model`.
            definition = (python_type, ...)
        else:
            definition = (Optional[python_type], None)
        field_definitions[prop_name] = definition

    model_name = name or "AnonymousPydanticModel"
    return create_model(model_name, **field_definitions)


def json_schema_dict_to_dataclass(
    schema: dict,
    name: Optional[str] = None
) -> type:
    """Convert a JSON Schema dict into a dataclass.

    Required properties become fields without a default; every other
    property becomes a field defaulting to `None`.

    Dataclasses reject a field without a default that follows a field with
    one, so required fields are always declared first. Within the required
    group and within the optional group, the order of `properties` is kept,
    which means schemas that already list their required properties first
    produce the same field order as before.

    Parameters
    ----------
    schema: dict
        The JSON Schema dict to convert to a dataclass
    name: Optional[str]
        The name of the dataclass; `"AnonymousDataclass"` is used when no
        name is given

    Returns
    -------
    type
        The dataclass

    """
    required = set(schema.get("required", []))
    properties = schema.get("properties", {})

    required_annotations: Dict[str, Any] = {}
    optional_annotations: Dict[str, Any] = {}

    for prop_name, details in properties.items():
        typ = schema_type_to_python(details, "dataclass")
        if prop_name in required:
            required_annotations[prop_name] = typ
        else:
            optional_annotations[prop_name] = typ

    class_dict: Dict[str, Any] = {
        # Field order follows annotation order: required fields first.
        '__annotations__': {**required_annotations, **optional_annotations},
        '__module__': __name__,
    }

    for prop_name in optional_annotations:
        class_dict[prop_name] = field(default=None)

    cls = type(name or "AnonymousDataclass", (), class_dict)
    return dataclass(cls)


================================================
FILE: outlines/types/locale/__init__.py
================================================
"""Locale-specific regex patterns."""

from . import us

# Locale submodules exported by this package.
__all__ = [
    "us",
]


================================================
FILE: outlines/types/locale/us.py
================================================
"""Locale-specific regex patterns for the United States."""

from outlines.types.dsl import Regex

# Five digits, optionally followed by a hyphen and four more digits
# (e.g. `12345` or `12345-6789`).
zip_code = Regex(r"\d{5}(?:-\d{4})?")
# Ten digits written either as `(123) 456-7890` or as `123-456-7890`.
phone_number = Regex(r"(\([0-9]{3}\) |[0-9]{3}-)[0-9]{3}-[0-9]{4}")


================================================
FILE: outlines/types/utils.py
================================================
"""Utility functions for the types module."""

import dataclasses
import datetime
import inspect
import sys
import types
import warnings
from enum import Enum, EnumMeta
from typing import (
    Annotated,
    Any,
    Callable,
    Dict,
    Literal,
    List,
    NewType,
    Tuple,
    Union,
    get_args,
    get_origin,
)

from genson import SchemaBuilder
from pydantic import BaseModel, create_model

if sys.version_info >= (3, 12): # pragma: no cover
    from typing import _TypedDictMeta  # type: ignore
else: # pragma: no cover
    from typing_extensions import _TypedDictMeta  # type: ignore


# Type identification


def is_int(value: Any) -> bool:
    """Tell whether `value` denotes the `int` type.

    Accepts `int` itself, `Annotated[int, ...]` and objects whose
    `__supertype__` attribute is `int` (as set by `NewType`).
    """
    if value is int:
        return True
    origin = get_origin(value)
    if origin is int:
        return True
    if origin is Annotated and get_args(value)[0] is int:
        return True
    return getattr(value, "__supertype__", None) is int


def is_int_instance(value: Any) -> bool:
    """Tell whether `value` is an `int` instance, excluding booleans."""
    # `bool` is a subclass of `int`, so it has to be ruled out explicitly.
    if isinstance(value, bool):
        return False
    return isinstance(value, int)


def is_float(value: Any) -> bool:
    """Tell whether `value` denotes the `float` type.

    Accepts `float` itself, `Annotated[float, ...]` and objects whose
    `__supertype__` attribute is `float` (as set by `NewType`).
    """
    if value is float:
        return True
    origin = get_origin(value)
    if origin is float:
        return True
    if origin is Annotated and get_args(value)[0] is float:
        return True
    return getattr(value, "__supertype__", None) is float


def is_float_instance(value: Any) -> bool:
    """Tell whether `value` is a `float` instance."""
    matches = isinstance(value, float)
    return matches


def is_str(value: Any) -> bool:
    """Tell whether `value` denotes the `str` type.

    Accepts `str` itself, `Annotated[str, ...]` and objects whose
    `__supertype__` attribute is `str` (as set by `NewType`).
    """
    if value is str:
        return True
    origin = get_origin(value)
    if origin is str:
        return True
    if origin is Annotated and get_args(value)[0] is str:
        return True
    return getattr(value, "__supertype__", None) is str


def is_str_instance(value: Any) -> bool:
    """Tell whether `value` is a `str` instance."""
    matches = isinstance(value, str)
    return matches


def is_bool(value: Any) -> bool:
    """Tell whether `value` denotes the `bool` type.

    Accepts `bool` itself, `Annotated[bool, ...]` and objects whose
    `__supertype__` attribute is `bool`.
    """
    if value is bool:
        return True
    origin = get_origin(value)
    if origin is bool:
        return True
    if origin is Annotated and get_args(value)[0] is bool:
        return True
    return getattr(value, "__supertype__", None) is bool


def is_dict_instance(value: Any) -> bool:
    """Tell whether `value` is a `dict` instance."""
    matches = isinstance(value, dict)
    return matches


def is_datetime(value: Any) -> bool:
    """Tell whether `value` denotes the `datetime.datetime` type."""
    if value is datetime.datetime:
        return True
    return get_origin(value) is datetime.datetime


def is_date(value: Any) -> bool:
    """Tell whether `value` denotes the `datetime.date` type."""
    if value is datetime.date:
        return True
    return get_origin(value) is datetime.date


def is_time(value: Any) -> bool:
    """Tell whether `value` denotes the `datetime.time` type."""
    if value is datetime.time:
        return True
    return get_origin(value) is datetime.time


def is_native_dict(value: Any) -> bool:
    """Tell whether `value` is the builtin `dict` type itself."""
    is_builtin_dict = value is dict
    return is_builtin_dict


def is_typing_dict(value: Any) -> bool:
    """Tell whether `value` is a parameterized dict type such as `Dict[K, V]`."""
    origin = get_origin(value)
    return origin is dict


def is_typing_list(value: Any) -> bool:
    """Tell whether `value` is a parameterized list type such as `List[T]`."""
    origin = get_origin(value)
    return origin is list


def is_typing_tuple(value: Any) -> bool:
    """Tell whether `value` is a parameterized tuple type such as `Tuple[T, ...]`."""
    origin = get_origin(value)
    return origin is tuple


def is_union(value: Any) -> bool:
    return get_origin(value) is Union


def is_literal(value: Any) -> bool:
    """Tell whether `value` is a `Literal[...]` type."""
    origin = get_origin(value)
    return origin is Literal


def is_dataclass(value: Any) -> bool:
    """Tell whether `value` is a dataclass type (not a dataclass instance)."""
    # `dataclasses.is_dataclass` also accepts instances, so classes are
    # filtered first.
    if not isinstance(value, type):
        return False
    return dataclasses.is_dataclass(value)


def is_typed_dict(value: Any) -> bool:
    """Tell whether `value` is a TypedDict class."""
    matches = isinstance(value, _TypedDictMeta)
    return matches


def is_pydantic_model(value):
    """Tell whether `value` is a Pydantic `BaseModel` subclass."""
    # Parameterized generic types are rejected up front because they cannot
    # be used with `issubclass` on Python versions < 3.11.
    if get_origin(value) is not None:
        return False

    if not isinstance(value, type):
        return False
    return issubclass(value, BaseModel)


def is_genson_schema_builder(value: Any) -> bool:
    """Tell whether `value` is a genson `SchemaBuilder` instance."""
    matches = isinstance(value, SchemaBuilder)
    return matches


def is_enum(value: Any) -> bool:
    """Tell whether `value` is an enum class (not an enum member)."""
    matches = isinstance(value, EnumMeta)
    return matches


def is_callable(value: Any) -> bool:
    """Tell whether `value` is callable without being a class."""
    # Classes are callable too, but they are not treated as callables here.
    if isinstance(value, type):
        return False
    return callable(value)


# Type conversion


def get_enum_from_literal(value) -> Enum:
    return Enum(
        value.__name__,
        {str(arg): arg for arg in get_args(value)}
    )


def get_enum_from_choice(value) -> Enum:
    """Create an enum named `Choice` from the items of `value`.

    Each element of `value.items` becomes a member whose name is
    `str(item)` and whose value is the item itself.
    """
    members = {}
    for item in value.items:
        members[str(item)] = item
    return Enum('Choice', members)


def get_schema_from_signature(fn: Callable) -> dict:
    """Build a JSON schema from the signature of a function.

    Any JSON object that is valid for the returned schema can be passed to
    `fn` with the ** unpacking syntax.

    Raises
    ------
    ValueError
        If a parameter of `fn` has no type annotation.

    """
    parameters = inspect.signature(fn).parameters

    # Every parameter becomes a required field of its annotated type.
    arguments = {}
    for param_name, param in parameters.items():
        if param.annotation == inspect.Parameter.empty:
            raise ValueError("Each argument must have a type annotation")
        arguments[param_name] = (param.annotation, ...)

    # Some callables expose no `__name__`; fall back to a default model name.
    try:
        fn_name = fn.__name__
    except Exception as e:
        fn_name = "Arguments"
        warnings.warn(
            f"The function name could not be determined. Using default name 'Arguments' instead. For debugging, here is exact error:\n{e}",
            category=UserWarning,
        )

    signature_model = create_model(fn_name, **arguments)
    return signature_model.model_json_schema()


def get_schema_from_enum(myenum: type[Enum]) -> dict:
    """Build a `oneOf` JSON schema from the members of an enum.

    Members with a callable value contribute the schema of the signature of
    `value.func`; every other member contributes `{"const": value}`.

    Raises
    ------
    ValueError
        If the enum has no members.

    """
    if len(myenum) == 0:
        raise ValueError(
            f"Your enum class {myenum.__name__} has 0 members. If you are working with an enum of functions, do not forget to register them as callable (using `partial` for instance)"
        )

    choices = []
    for member in myenum:
        if callable(member.value):
            choices.append(get_schema_from_signature(member.value.func))
        else:
            choices.append({"const": member.value})

    return {"title": myenum.__name__, "oneOf": choices}


================================================
FILE: pyproject.toml
================================================
[build-system]
requires = ["setuptools>=45", "setuptools_scm[toml]>=6.2"]
build-backend = "setuptools.build_meta"

[project]
name = "outlines"
authors= [{name = "Outlines Developers"}]
description = "Probabilistic Generative Model Programming"
requires-python = ">=3.10,<3.14"
license = {text = "Apache-2.0"}
keywords=[
    "machine learning",
    "deep learning",
    "language models",
    "structured generation",
]
classifiers = [
    "Development Status :: 5 - Production/Stable",
    "Intended Audience :: Developers",
    "Intended Audience :: Information Technology",
    "Intended Audience :: Science/Research",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
]
dependencies = [
   "jinja2",
   "cloudpickle",
   "diskcache",
   "pydantic>=2.0",
   "jsonschema",
   "pillow",
   "typing_extensions",
   "outlines_core==0.2.14",
   "genson",
   "jsonpath_ng",
]
dynamic = ["version"]

[project.optional-dependencies]
anthropic = ["anthropic"]
dottxt = ["dottxt"]
gemini = ["google-genai"]
llamacpp = ["huggingface-hub", "llama-cpp-python", "numba"]
mlxlm = ["datasets", "mlx", "mlx-lm"]
lmstudio = ["lmstudio"]
ollama = ["ollama"]
openai = ["openai"]
mistral = ["mistralai"]
sglang = ["openai"]
tgi = ["huggingface_hub"]
transformers = ["accelerate", "datasets", "transformers", "setuptools", "sentencepiece"]
vllm = ["openai"]
xgrammar = ["xgrammar"]
llguidance = ["llguidance"]
airports = ["airportsdata"]
countries = ["iso3166"]
test = [
    "pre-commit",
    "pytest",
    "pytest-benchmark",
    "pytest-cov",
    "pytest-mock",
    "pytest-asyncio",
    "coverage[toml]>=5.1",
    "diff-cover",
    "accelerate",
    "beartype<0.16.0",
    "responses",
    "llama-cpp-python",
    "mlx-lm>=0.19.2; platform_machine == 'arm64' and sys_platform == 'darwin'",
    "huggingface_hub",
    "openai>=1.0.0",
    "datasets",
    "anthropic",
    "google-genai",
    "mistralai",
    "transformers",
    "pillow",
    "jax",
    "flax",
    "numpy>=2.0.0,<2.2.0",
    "numba",
    "torch>2.3.0",
    "tensorflow",
    "tf-keras",
    "ollama",
    "lmstudio",
    "dottxt",
    "sentencepiece",
    "mkdocs_gen_files",
    "llguidance",
    "xgrammar",
    "airportsdata",
    "iso3166",
    "requests",
]

[dependency-groups]
# Note: vllm is excluded from the lock file due to circular dependency with outlines-core.
# For GPU testing, install vllm manually: pip install vllm
test-gpu = ["outlines[test]"]

[project.urls]
homepage = "https://github.com/dottxt-ai/outlines"
documentation = "https://dottxt-ai.github.io/outlines/"
repository = "https://github.com/dottxt-ai/outlines"

[project.readme]
file="README.md"
content-type = "text/markdown"

[tool.setuptools.packages.find]
include = ["outlines*"]

[tool.setuptools.package-data]
"outlines" = ["py.typed", "grammars/*.lark"]

[tool.setuptools_scm]
write_to = "outlines/_version.py"

[tool.pytest.ini_options]
testpaths = ["tests"]
filterwarnings = [
    "error",
    "ignore::pydantic.warnings.PydanticDeprecatedSince20",
    "ignore::FutureWarning:transformers.*",
    "ignore::FutureWarning:huggingface_hub.*",
    "ignore::UserWarning",
    "ignore::DeprecationWarning:pyairports.*",
    "ignore::DeprecationWarning:jax.*",
    "ignore::DeprecationWarning:flax.*",
    "ignore::DeprecationWarning:torch.*",
]

[tool.mypy]
exclude=["examples"]
enable_incomplete_feature = ["Unpack"]

[[tool.mypy.overrides]]
module = [
    "jax",
    "jaxlib",
    "jax.numpy",
    "jinja2",
    "jsonschema.*",
    "anthropic.*",
    "google.*",
    "mistralai.*",
    "mamba_ssm.*",
    "mlx_lm.*",
    "mlx.*",
    "numpy.*",
    "cloudpickle.*",
    "diskcache.*",
    "pydantic.*",
    "pydantic_core.*",
    "pytest",
    "referencing.*",
    "torch.*",
    "transformers.*",
    "llama_cpp",
    "huggingface_hub",
    "datasets.*",
    "openai.*",
    "requests.*",
    "responses.*",
    "vllm.*",
    "iso3166.*",
    "airportsdata.*",
    "outlines_core.*",
    "genson",
    "lmstudio.*",
    "ollama.*",
    "dottxt.*",
    "tensorflow",
    "tensorflow.*",
    "tf-keras",
    "tf-keras.*",
    "mkdocs_gen_files.*",
    "jsonpath_ng.*",
    "llguidance.*",
    "xgrammar.*",
]
ignore_missing_imports = true

[tool.coverage.run]
# we omit the files that require a GPU or Apple Silicon
# as well as the models that make API calls
omit = [
    "outlines/_version.py",
    "outlines/models/anthropic.py",
    "outlines/models/dottxt.py",
    "outlines/models/gemini.py",
    "outlines/models/lmstudio.py",
    "outlines/models/mlxlm.py",
    "outlines/models/openai.py",
    "outlines/models/mistral.py",
    "outlines/models/vllm_offline.py",
    "outlines/processors/tensor_adapters/mlx.py",
    "tests/*",
]
branch = true
relative_files = true

[tool.coverage.report]
show_missing = true
exclude_lines = [
    "pragma: no cover",
    "if TYPE_CHECKING:",
    "\\.\\.\\.",
]

[tool.diff_cover]
compare_branch = "origin/main"
diff_range_notation = ".."

[tool.docformatter]
style = "numpy"
in-place = true

[tool.ruff.lint]
ignore = [ "E731", "F401" ]


================================================
FILE: requirements-doc.txt
================================================
mkdocs
mkdocs-material
mkdocs-material[imaging]
mkdocs-mermaid2-plugin
mkdocs-section-index
mkdocstrings[python]
mkdocs-git-committers-plugin-2
mkdocs-git-revision-date-localized-plugin
mkdocs-redirects
mkdocs-gen-files
mkdocs-literate-nav
mike


================================================
FILE: scripts/gen_ref_pages.py
================================================
"""Generate the API reference pages and navigation automatically.

This script is based on the `gen_ref_pages.py` script in the
[mkdocstrings](https://mkdocstrings.github.io/recipes/#automatic-code-reference-pages) project.

To exclude a file or module from being included in the generated API reference,
add a part of its path to the `EXCLUDED_FILES` list.
"""

from pathlib import Path

import mkdocs_gen_files

CODEBASE_DIR_NAME = "outlines"
OUTPUT_DIR_NAME = "api_reference"
EXCLUDED_FILES = ["_version"]


nav = mkdocs_gen_files.Nav()
root = Path(__file__).parent.parent
src = root / CODEBASE_DIR_NAME

for path in sorted(src.rglob("*.py")):
    relative_path = path.relative_to(src)
    parts = tuple(relative_path.with_suffix("").parts)

    # Skip any module whose path contains an excluded component.
    if any(part in EXCLUDED_FILES for part in parts):
        continue

    doc_path = relative_path.with_suffix(".md")

    # A package's `__init__.py` is documented on the package's `index.md`.
    if parts[-1] == "__init__":
        if len(parts) == 1:
            # Top-level `__init__.py`: this is the root package itself.
            doc_path = Path("index.md")
            parts = (CODEBASE_DIR_NAME,)
        else:
            parts = parts[:-1]
            doc_path = doc_path.with_name("index.md")

    full_doc_path = Path(OUTPUT_DIR_NAME, doc_path)
    nav[parts] = doc_path.as_posix()

    # For the root module the identifier is just the package name; every
    # other module is prefixed with it.
    if parts == (CODEBASE_DIR_NAME,):
        identifier = f"::: {CODEBASE_DIR_NAME}"
    else:
        identifier = f"::: {CODEBASE_DIR_NAME}.{'.'.join(parts)}"

    with mkdocs_gen_files.open(full_doc_path, "w") as fd:
        fd.write(identifier)

    mkdocs_gen_files.set_edit_path(full_doc_path, path.relative_to(root))

# Write the navigation summary covering every generated page.
with mkdocs_gen_files.open(f"{OUTPUT_DIR_NAME}/SUMMARY.md", "w") as nav_file:
    nav_file.writelines(nav.build_literate_nav())


================================================
FILE: setup.cfg
================================================
[flake8]
max-line-length = 88
select = C,E,F,W
ignore = E203,E231,E501,E741,W503,W504,C901,E731
per-file-ignores =
    **/__init__.py:F401,F403
exclude =
    normalai/_version.py


================================================
FILE: shell.nix
================================================
# Development shell definition.
#
# Builds an FHS-style environment named "dottxt-ai" with `buildFHSEnv`,
# containing a C/C++ toolchain, the CUDA toolkit, the NVIDIA driver package,
# Python 3.11 and uv, and runs bash inside it.
#
# `allowUnfree` is enabled in the nixpkgs config
# (presumably required by the CUDA/NVIDIA packages; confirm).
{ pkgs ? import <nixpkgs> { config = { allowUnfree = true; }; } }:

(pkgs.buildFHSEnv {
  name = "dottxt-ai";
  # Packages made available inside the environment.
  targetPkgs = pkgs:
    with pkgs; [
      autoconf
      binutils
      cmake
      cudatoolkit
      curl
      freeglut
      gcc13
      git
      gitRepo
      gnumake
      gnupg
      gperf
      libGL
      libGLU
      linuxPackages.nvidia_x11
      m4
      ncurses5
      procps
      python311
      stdenv.cc
      unzip
      util-linux
      uv
      xorg.libX11
      xorg.libXext
      xorg.libXi
      xorg.libXmu
      xorg.libXrandr
      xorg.libXv
      zlib
    ];

  multiPkgs = pkgs: with pkgs; [ zlib ];

  # Command started when entering the environment.
  runScript = "bash";

  # Shell snippet exporting the CUDA, compiler and linker variables below.
  profile = ''
    # CUDA paths
    export CUDA_HOME=${pkgs.cudatoolkit}
    export CUDA_PATH=${pkgs.cudatoolkit}

    # Ensure proper binary paths are included
    export PATH=${pkgs.gcc13}/bin:${pkgs.cudatoolkit}/bin:$PATH

    # Set library paths, including additional directories for CUPTI
    export LD_LIBRARY_PATH=${pkgs.cudatoolkit}/lib64:${pkgs.cudatoolkit}/extras/CUPTI/lib64:${pkgs.linuxPackages.nvidia_x11}/lib:$LD_LIBRARY_PATH

    # Add static library paths to EXTRA_LDFLAGS for the linker
    export EXTRA_LDFLAGS="-L${pkgs.cudatoolkit}/lib64 -L${pkgs.cudatoolkit}/extras/CUPTI/lib64 -L${pkgs.linuxPackages.nvidia_x11}/lib -L${pkgs.cudatoolkit}/libdevice $EXTRA_LDFLAGS"
    export EXTRA_CCFLAGS="-I${pkgs.cudatoolkit}/include $EXTRA_CCFLAGS"

    # Set CMake paths
    export CMAKE_PREFIX_PATH=${pkgs.cudatoolkit}:${pkgs.linuxPackages.nvidia_x11}:$CMAKE_PREFIX_PATH

    # C++ and CC flags
    export CXXFLAGS="--std=c++17 $EXTRA_CCFLAGS"
    export CC=${pkgs.gcc13}/bin/gcc
    export CXX=${pkgs.gcc13}/bin/g++

    # NVCC flags to use the right compiler
    export NVCC_FLAGS="-ccbin ${pkgs.gcc13}/bin/gcc"
  '';

  # Overrides the stdenv C compiler with gcc13.
  # NOTE(review): verify that `structuredAttrs__` is an attribute that
  # `buildFHSEnv` actually consumes; the trailing underscores look unusual.
  structuredAttrs__ = {
    stdenv = pkgs.stdenv.overrideCC pkgs.stdenv.cc pkgs.gcc13;
  };
}).env


================================================
FILE: tests/__init__.py
================================================


================================================
FILE: tests/backends/test_backends.py
================================================
import outlines
import pytest
import transformers

from outlines.backends import (
    _get_backend,
    get_json_schema_logits_processor,
    get_regex_logits_processor,
    get_cfg_logits_processor,
)
from outlines.backends.outlines_core import (
    OutlinesCoreBackend,
    OutlinesCoreLogitsProcessor,
)
from outlines.backends.llguidance import (
    LLGuidanceBackend,
    LLGuidanceLogitsProcessor
)
from outlines.backends.xgrammar import XGrammarBackend, XGrammarLogitsProcessor


@pytest.fixture
def model():
    """Outlines Transformers model built from the `erwanf/gpt2-mini` checkpoint."""
    checkpoint = "erwanf/gpt2-mini"
    wrap = outlines.from_transformers
    hf_model = transformers.AutoModelForCausalLM.from_pretrained(checkpoint)
    hf_tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
    return wrap(hf_model, hf_tokenizer)

@pytest.fixture
def json_schema():
    """JSON schema string for an object with required `name` and `age` fields."""
    schema = (
        '{"type": "object", "properties": {"name": {"type": "string"}, '
        '"age": {"type": "integer"}}, "required": ["name", "age"], '
        '"additionalProperties": false}'
    )
    return schema

@pytest.fixture
def regex():
    """Regular expression matching exactly three digits."""
    three_digits = r"[0-9]{3}"
    return three_digits

@pytest.fixture
def cfg_lark():
    """Arithmetic-expression grammar written in Lark syntax."""
    grammar = """
?start: sum

?sum: product
| sum "+" product   -> add
| sum "-" product   -> sub

?product: atom
| product "*" atom  -> mul
| product "/" atom  -> div

?atom: NUMBER           -> number
| "-" atom         -> neg
| "(" sum ")"

%import common.NUMBER
%import common.WS_INLINE

%ignore WS_INLINE
"""
    return grammar

@pytest.fixture
def cfg_ebnf():
    """Yes/no grammar written in EBNF syntax."""
    grammar = """
root ::= answer
answer ::= "yes" | "no"
"""
    return grammar


def test_get_backend(model):
    """Each supported backend name yields its backend class; others are rejected."""
    expected_backends = (
        ("outlines_core", OutlinesCoreBackend),
        ("xgrammar", XGrammarBackend),
        ("llguidance", LLGuidanceBackend),
    )
    for backend_name, backend_cls in expected_backends:
        backend = _get_backend(backend_name, model)
        assert isinstance(backend, backend_cls)

    with pytest.raises(ValueError, match="not supported"):
        _get_backend("not_supported", model)


def test_get_json_schema_logits_processor(model, json_schema):
    """Each backend returns its own logits processor type for a JSON schema."""
    expected_processors = (
        ("outlines_core", OutlinesCoreLogitsProcessor),
        ("llguidance", LLGuidanceLogitsProcessor),
        ("xgrammar", XGrammarLogitsProcessor),
    )
    for backend_name, processor_cls in expected_processors:
        processor = get_json_schema_logits_processor(backend_name, model, json_schema)
        assert isinstance(processor, processor_cls)


def test_get_regex_logits_processor(model, regex):
    """Each backend returns its own logits processor type for a regex."""
    expected_processors = (
        ("outlines_core", OutlinesCoreLogitsProcessor),
        ("llguidance", LLGuidanceLogitsProcessor),
        ("xgrammar", XGrammarLogitsProcessor),
    )
    for backend_name, processor_cls in expected_processors:
        processor = get_regex_logits_processor(backend_name, model, regex)
        assert isinstance(processor, processor_cls)


def test_get_cfg_logits_processor(model, cfg_lark, cfg_ebnf):
    """CFG processors: rejected by outlines_core, built by llguidance and xgrammar."""
    unsupported_message = "Outlines Core does not support context-free grammar."
    with pytest.raises(NotImplementedError, match=unsupported_message):
        get_cfg_logits_processor("outlines_core", model, cfg_lark)

    # llguidance is given the Lark grammar, xgrammar the EBNF one.
    expected_processors = (
        ("llguidance", cfg_lark, LLGuidanceLogitsProcessor),
        ("xgrammar", cfg_ebnf, XGrammarLogitsProcessor),
    )
    for backend_name, grammar, processor_cls in expected_processors:
        processor = get_cfg_logits_processor(backend_name, model, grammar)
        assert isinstance(processor, processor_cls)


================================================
FILE: tests/backends/test_backends_utils.py
================================================
import torch
import numpy as np


def simulate_model_calling_processor(processor, tensor_library_name, vocabulary_size, eos_token_id, batch_size):
    """Drive a logits processor the way a model's generation loop would.

    Starting from a random prompt of 10 tokens per sequence, the processor
    is repeatedly called on random logits and the argmax of its output is
    appended to the input ids. The loop stops once the last token of every
    sequence is `eos_token_id`, or after the iteration counter exceeds 20.

    Parameters
    ----------
    processor
        The logits processor under test; it is reset before the loop and
        then called as `processor(input_ids, logits)`.
    tensor_library_name
        One of "torch", "numpy" or "mlx".
    vocabulary_size
        Size of the last dimension of the random logits.
    eos_token_id
        Token id that marks a finished sequence.
    batch_size
        Number of sequences generated in parallel.

    Returns
    -------
    The generated token ids, without the 10 prompt tokens.

    Raises
    ------
    ValueError
        If `tensor_library_name` is not one of the supported names.

    """
    if tensor_library_name == "torch":
        tensor_adapter = TorchTensorAdapter()
    elif tensor_library_name == "numpy":
        tensor_adapter = NumpyTensorAdapter()
    elif tensor_library_name == "mlx":
        tensor_adapter = MLXTensorAdapter()
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError(
            f"Unsupported tensor library: {tensor_library_name!r}. "
            "Expected 'torch', 'numpy' or 'mlx'."
        )

    processor.reset()
    i = 0
    input_ids = tensor_adapter.randint(0, vocabulary_size, (batch_size, 10))
    while True:
        i += 1
        logits = tensor_adapter.randn((batch_size, vocabulary_size))
        output = processor(input_ids, logits)
        assert output.shape == (batch_size, vocabulary_size)
        # Stop as soon as every sequence has just produced the EOS token.
        if all(input_ids[:, -1] == eos_token_id):
            break
        input_ids = tensor_adapter.add_token_inputs_ids(input_ids, output)
        print(input_ids)
        # Safety bound so that a processor that never allows EOS terminates.
        if i > 20:
            break
    # Drop the 10 random prompt tokens.
    return input_ids[:, 10:]

class TorchTensorAdapter():
    """Tensor helpers backed by PyTorch for the simulated generation loop."""

    def randn(self, shape):
        """Return a tensor of the given shape filled with standard-normal samples."""
        return torch.randn(*shape)

    def randint(self, low, high, size):
        """Return a tensor of shape `size` with random integers in `[low, high)`."""
        return torch.randint(low, high, size)

    def add_token_inputs_ids(self, input_ids, logits):
        """Append the argmax token of each row of `logits` to `input_ids`."""
        # `keepdim=True` keeps the trailing axis so the result can be
        # concatenated directly.
        next_token_column = torch.argmax(logits, dim=-1, keepdim=True)
        return torch.cat([input_ids, next_token_column], dim=-1)


class NumpyTensorAdapter():
    """Tensor helpers backed by NumPy for the simulated generation loop."""

    def randn(self, shape):
        """Return an array of the given shape filled with standard-normal samples."""
        return np.random.randn(*shape)

    def randint(self, low, high, size):
        """Return an array of shape `size` with random integers in `[low, high)`."""
        return np.random.randint(low, high, size)

    def add_token_inputs_ids(self, input_ids, logits):
        """Append the argmax token of each row of `logits` to `input_ids`."""
        next_token_ids = np.argmax(logits, axis=-1)
        print("next_token_ids",next_token_ids)
        # Add a trailing axis so the new tokens form one extra column.
        next_token_column = np.expand_dims(next_token_ids, axis=-1)
        return np.concatenate([input_ids, next_token_column], axis=-1)


class MLXTensorAdapter():
    """Tensor helpers backed by MLX for the simulated generation loop."""

    def __init__(self):
        # The array API (`random`, `argmax`, `concatenate`) lives in
        # `mlx.core`, not in the top-level `mlx` package. The import is
        # local so that the module can be imported without MLX installed.
        import mlx.core as mx
        self.mlx = mx

    def randn(self, shape):
        """Return an array of the given shape filled with standard-normal samples."""
        # MLX has no `randn`; `random.normal` samples from N(0, 1) by default.
        return self.mlx.random.normal(shape=shape)

    def randint(self, low, high, size):
        """Return an array of shape `size` with random integers in `[low, high)`."""
        return self.mlx.random.randint(low, high, size)

    def add_token_inputs_ids(self, input_ids, logits):
        """Append the argmax token of each row of `logits` to `input_ids`."""
        next_token_ids = self.mlx.argmax(logits, axis=-1)
        input_ids = self.mlx.concatenate([input_ids, next_token_ids[..., None]], axis=-1)
        return input_ids


================================================
FILE: tests/backends/test_llguidance.py
================================================
import re

import llama_cpp
import llguidance
import pytest
import transformers
from llguidance import LLTokenizer

import outlines
from outlines.backends.llguidance import (
    LLGuidanceBackend,
    LLGuidanceLogitsProcessor
)
from tests.backends.test_backends_utils import simulate_model_calling_processor

try:
    import mlx_lm
    HAS_MLX = True
except ImportError:
    HAS_MLX = False


def model_transformers():
    """Build an Outlines Transformers model from the `erwanf/gpt2-mini` checkpoint."""
    checkpoint = "erwanf/gpt2-mini"
    wrap = outlines.from_transformers
    hf_model = transformers.AutoModelForCausalLM.from_pretrained(checkpoint)
    hf_tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
    return wrap(hf_model, hf_tokenizer)

def model_llamacpp():
    """Build an Outlines llama.cpp model from the TinyMistral GGUF checkpoint."""
    wrap = outlines.from_llamacpp
    llama = llama_cpp.Llama.from_pretrained(
        repo_id="M4-ai/TinyMistral-248M-v2-Instruct-GGUF",
        filename="TinyMistral-248M-v2-Instruct.Q4_K_M.gguf",
        chat_format="qwen",
    )
    return wrap(llama)

def model_mlxlm():
    """Build an Outlines MLX-LM model from the SmolLM 135M 4-bit checkpoint."""
    wrap = outlines.from_mlxlm
    # Whatever `mlx_lm.load` returns is unpacked as positional arguments.
    loaded = mlx_lm.load("mlx-community/SmolLM-135M-Instruct-4bit")
    return wrap(*loaded)

@pytest.fixture
def json_schema():
    """JSON schema string for an object with required `name` and `age` fields."""
    schema = (
        '{"type": "object", "properties": {"name": {"type": "string"}, '
        '"age": {"type": "integer"}}, "required": ["name", "age"], '
        '"additionalProperties": false}'
    )
    return schema

@pytest.fixture
def regex():
    """Regular expression matching exactly three digits."""
    three_digits = r"[0-9]{3}"
    return three_digits

@pytest.fixture
def cfg_lark():
    """Arithmetic-expression grammar written in Lark syntax."""
    grammar = """
?start: sum

?sum: product
| sum "+" product   -> add
| sum "-" product   -> sub

?product: atom
| product "*" atom  -> mul
| product "/" atom  -> div

?atom: NUMBER           -> number
| "-" atom         -> neg
| "(" sum ")"

%import common.NUMBER
%import common.WS_INLINE

%ignore WS_INLINE
"""
    return grammar

@pytest.fixture
def cfg_ebnf():
    """Yes/no grammar written in EBNF syntax."""
    grammar = """
root ::= answer
answer ::= "yes" | "no"
"""
    return grammar


def test_llguidance_processor_torch(regex):
    """The torch processor constrains simulated generation to the regex."""
    model = model_transformers()
    tokenizer, hf_tokenizer = model.tokenizer, model.hf_tokenizer
    llg_tokenizer = LLGuidanceBackend(model).llg_tokenizer
    processor = LLGuidanceLogitsProcessor(
        llguidance.grammar_from("regex", regex), llg_tokenizer, "torch"
    )
    batch_size = 2
    # Two rounds with the same processor instance.
    for _ in range(2):
        generated = simulate_model_calling_processor(
            processor,
            "torch",
            len(tokenizer.get_vocab()),
            tokenizer.eos_token_id,
            batch_size,
        )
        for row in range(batch_size):
            assert re.match(regex, hf_tokenizer.decode(generated[row]))


def test_llguidance_processor_numpy(regex):
    """The numpy processor constrains simulated generation to the regex."""
    model = model_llamacpp()
    tokenizer = model.tokenizer
    llg_tokenizer = LLGuidanceBackend(model).llg_tokenizer
    processor = LLGuidanceLogitsProcessor(
        llguidance.grammar_from("regex", regex), llg_tokenizer, "numpy"
    )
    batch_size = 2
    # Two rounds with the same processor instance.
    for _ in range(2):
        generated = simulate_model_calling_processor(
            processor,
            "numpy",
            len(tokenizer.vocabulary),
            tokenizer.eos_token_id,
            batch_size,
        )
        for row in range(batch_size):
            # The first element of the decode result is the text checked here.
            assert re.match(regex, tokenizer.decode(generated[row])[0])


@pytest.mark.skipif(not HAS_MLX, reason="MLX tests require Apple Silicon")
def test_llguidance_processor_mlx(regex):
    """The mlx processor constrains simulated generation to the regex."""
    model = model_mlxlm()
    tokenizer = model.mlx_tokenizer
    llg_tokenizer = LLGuidanceBackend(model).llg_tokenizer
    processor = LLGuidanceLogitsProcessor(
        llguidance.grammar_from("regex", regex), llg_tokenizer, "mlx"
    )
    batch_size = 2
    # Two rounds with the same processor instance.
    for _ in range(2):
        generated = simulate_model_calling_processor(
            processor,
            "mlx",
            len(tokenizer.vocabulary),
            tokenizer.eos_token_id,
            batch_size,
        )
        for row in range(batch_size):
            assert re.match(regex, tokenizer.decode(generated[row]))


# (model, tensor library name) pairs fed to the parametrized backend test.
# The MLX entry is only added when mlx_lm could be imported.
models = [
    (model_transformers(), "torch"),
    (model_llamacpp(), "numpy"),
]
if HAS_MLX:
    models += [(model_mlxlm(), "mlx")]

@pytest.mark.parametrize("model, tensor_library_name", models)
def test_llguidance_backend(model, tensor_library_name, json_schema, regex, cfg_lark, cfg_ebnf):
    """Exercise ``LLGuidanceBackend`` end to end for one parametrized model.

    Covers backend construction, then the JSON-schema, regex and CFG (Lark
    and EBNF fixtures) logits processors, and finally re-uses one
    JSON-schema generator for two consecutive runs (batched for torch,
    single-prompt otherwise).
    """
    # initialization
    backend = LLGuidanceBackend(model)
    assert isinstance(backend.llg_tokenizer, LLTokenizer)
    assert backend.tensor_library_name == tensor_library_name

    # json schema
    processor = backend.get_json_schema_logits_processor(json_schema)
    assert isinstance(processor, LLGuidanceLogitsProcessor)
    generator = outlines.Generator(model, backend="llguidance", processor=processor)
    response = generator("Hello, how are you?")
    assert response[0] == "{"

    # regex
    processor = backend.get_regex_logits_processor(regex)
    assert isinstance(processor, LLGuidanceLogitsProcessor)
    generator = outlines.Generator(model, backend="llguidance", processor=processor)
    response = generator("Hello, how are you?")
    # Check the text against the pattern itself. A truthiness check on
    # int(response) would wrongly fail for the valid output "000".
    assert re.fullmatch(regex, response)

    # cfg lark
    processor = backend.get_cfg_logits_processor(cfg_lark)
    assert isinstance(processor, LLGuidanceLogitsProcessor)
    generator = outlines.Generator(model, backend="llguidance", processor=processor)
    response = generator("Hello, how are you?")
    assert (
        "+" in response
        or "-" in response
        or "*" in response
        or "/" in response
        # Only require that the text parses as a number; relying on the
        # truthiness of the parsed value would reject a valid "0".
        or isinstance(float(response.strip()), float)
    )

    # cfg ebnf
    processor = backend.get_cfg_logits_processor(cfg_ebnf)
    assert isinstance(processor, LLGuidanceLogitsProcessor)
    generator = outlines.Generator(model, backend="llguidance", processor=processor)
    response = generator("Hello, how are you?")
    assert response == "yes" or response == "no"

    # batch + multiple generations
    processor = backend.get_json_schema_logits_processor(json_schema)
    generator = outlines.Generator(model, backend="llguidance", processor=processor)
    for _ in range(2):
        if tensor_library_name == "torch":
            response = generator.batch(["Create a character", "Hello, how are you?"], max_new_tokens=200)
            assert len(response) == 2
            for r in response:
                assert r[0] == "{"
        else:
            response = generator("Create a character", max_tokens=20)
            assert response[0] == "{"


================================================
FILE: tests/backends/test_outlines_core.py
================================================
import re

import llama_cpp
import pytest
import transformers
from outlines_core import Index, Vocabulary

import outlines
from outlines.backends.outlines_core import (
    OutlinesCoreBackend,
    OutlinesCoreLogitsProcessor,
)
from tests.backends.test_backends_utils import simulate_model_calling_processor

try:
    import mlx_lm

    HAS_MLX = True
except ImportError:
    HAS_MLX = False


def model_transformers():
    """Load ``erwanf/gpt2-mini`` and wrap it with ``outlines.from_transformers``."""
    checkpoint = "erwanf/gpt2-mini"
    hf_model = transformers.AutoModelForCausalLM.from_pretrained(checkpoint)
    hf_tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
    return outlines.from_transformers(hf_model, hf_tokenizer)


def model_llamacpp():
    """Load the TinyMistral GGUF file and wrap it with ``outlines.from_llamacpp``."""
    llama = llama_cpp.Llama.from_pretrained(
        repo_id="M4-ai/TinyMistral-248M-v2-Instruct-GGUF",
        filename="TinyMistral-248M-v2-Instruct.Q4_K_M.gguf",
        chat_format="qwen",
    )
    return outlines.from_llamacpp(llama)


def model_mlxlm():
    """Load the SmolLM 4-bit checkpoint via ``mlx_lm`` and wrap it with ``outlines.from_mlxlm``."""
    loaded = mlx_lm.load("mlx-community/SmolLM-135M-Instruct-4bit")
    # Whatever mlx_lm.load returns is unpacked into positional arguments.
    return outlines.from_mlxlm(*loaded)


@pytest.fixture
def json_schema():
    """JSON Schema string for an object with required ``name`` (string) and ``age`` (integer)."""
    fragments = (
        '{"type": "object", "properties": {"name": {"type": "string"}, ',
        '"age": {"type": "integer"}}, "required": ["name", "age"], ',
        '"additionalProperties": false}',
    )
    return "".join(fragments)


@pytest.fixture
def regex():
    """Pattern describing three consecutive ASCII digits."""
    three_digits = r"[0-9]{3}"
    return three_digits


@pytest.fixture
def cfg():
    """Arithmetic-expression grammar (sums, products, atoms) in Lark notation."""
    grammar = """
?start: sum

?sum: product
| sum "+" product   -> add
| sum "-" product   -> sub

?product: atom
| product "*" atom  -> mul
| product "/" atom  -> div

?atom: NUMBER           -> number
| "-" atom         -> neg
| "(" sum ")"

%import common.NUMBER
%import common.WS_INLINE

%ignore WS_INLINE
"""
    return grammar


def test_outlines_core_processor_torch(regex):
    """Run an Outlines Core regex processor against torch tensors.

    One processor instance is driven through two simulated runs; after each
    run the first two decoded sequences must begin with a match for
    ``regex``.
    """
    transformers_model = model_transformers()
    tokenizer = transformers_model.tokenizer
    hf_tokenizer = transformers_model.hf_tokenizer
    vocabulary = OutlinesCoreBackend(transformers_model).vocabulary
    processor = OutlinesCoreLogitsProcessor(Index(regex, vocabulary), "torch")

    # Two passes with the same processor object, so reuse is covered too.
    for _run in range(2):
        input_ids = simulate_model_calling_processor(
            processor,
            "torch",
            len(tokenizer.get_vocab()),
            tokenizer.eos_token_id,
            2,
        )
        for row in (0, 1):
            assert re.match(regex, hf_tokenizer.decode(input_ids[row]))


def test_outlines_core_processor_numpy(regex):
    """Run an Outlines Core regex processor against numpy arrays.

    One processor instance is driven through two simulated runs; after each
    run the first element of each of the first two decoded sequences must
    begin with a match for ``regex``.
    """
    llamacpp_model = model_llamacpp()
    tokenizer = llamacpp_model.tokenizer
    vocabulary = OutlinesCoreBackend(llamacpp_model).vocabulary
    processor = OutlinesCoreLogitsProcessor(Index(regex, vocabulary), "numpy")

    # Two passes with the same processor object, so reuse is covered too.
    for _run in range(2):
        input_ids = simulate_model_calling_processor(
            processor,
            "numpy",
            len(tokenizer.vocabulary),
            tokenizer.eos_token_id,
            2,
        )
        for row in (0, 1):
            # This tokenizer's decode() result is indexed with [0] to get text.
            assert re.match(regex, tokenizer.decode(input_ids[row])[0])


@pytest.mark.skipif(not HAS_MLX, reason="MLX tests require Apple Silicon")
def test_outlines_core_processor_mlx(regex):
    """Run an Outlines Core regex processor against MLX arrays.

    Skipped when ``HAS_MLX`` is false. One processor instance is driven
    through two simulated runs; after each run the first two decoded
    sequences must begin with a match for ``regex``.

    The ``regex`` fixture is requested explicitly: without it the name
    ``regex`` inside the body resolves to the module-level fixture object,
    which ``re.match`` cannot use as a pattern.
    """
    model = model_mlxlm()
    tokenizer = model.mlx_tokenizer
    backend = OutlinesCoreBackend(model)
    # Build the index from the same pattern the assertions check against.
    index = Index(regex, backend.vocabulary)
    processor = OutlinesCoreLogitsProcessor(index, "mlx")
    for _ in range(2):
        input_ids = simulate_model_calling_processor(
            processor, "mlx", len(tokenizer.vocabulary), tokenizer.eos_token_id, 2
        )
        assert re.match(regex, tokenizer.decode(input_ids[0]))
        assert re.match(regex, tokenizer.decode(input_ids[1]))


def test_create_vocabulary_preserves_duplicate_token_ids():
    """Check the vocabulary size when two tokens map to the same string.

    Both ``"<0x20>"`` and ``"▁"`` are converted to a single space by the
    ``token_to_str`` callback; the resulting vocabulary is expected to have
    a length of 4.
    """
    space_aliases = ("<0x20>", "▁")

    def token_to_str(token):
        # Collapse both space spellings onto the same string.
        return " " if token in space_aliases else token

    vocabulary = OutlinesCoreBackend.create_outlines_core_vocabulary(
        vocab={"hello": 1, "world": 2, "<0x20>": 3, "▁": 4},
        eos_token_id=0,
        eos_token="hello",
        token_to_str=token_to_str,
    )

    # 4 original IDs - 1 popped (hello) + 1 EOS added by Vocabulary = 4
    expected_size = 4
    assert len(vocabulary) == expected_size


# (model, tensor library name) pairs fed to the parametrized backend test.
# The MLX entry is only added when mlx_lm could be imported.
models = [
    (model_transformers(), "torch"),
    (model_llamacpp(), "numpy"),
]
if HAS_MLX:
    models += [(model_mlxlm(), "mlx")]


@pytest.mark.parametrize("model, tensor_library_name", models)
def test_outlines_core_backend(model, tensor_library_name, json_schema, regex, cfg):
    """Exercise ``OutlinesCoreBackend`` end to end for one parametrized model.

    Covers backend construction, the JSON-schema and regex logits
    processors, the ``NotImplementedError`` raised for context-free
    grammars, and finally re-uses one JSON-schema generator for two
    consecutive runs (batched for torch, single-prompt otherwise).
    """
    # initialization
    backend = OutlinesCoreBackend(model)
    assert isinstance(backend.vocabulary, Vocabulary)
    assert backend.tensor_library_name == tensor_library_name

    # json schema
    processor = backend.get_json_schema_logits_processor(json_schema)
    assert isinstance(processor, OutlinesCoreLogitsProcessor)
    generator = outlines.Generator(model, backend="outlines_core", processor=processor)
    response = generator("Hello, how are you?")
    assert "name" in response

    # regex
    processor = backend.get_regex_logits_processor(regex)
    assert isinstance(processor, OutlinesCoreLogitsProcessor)
    generator = outlines.Generator(model, backend="outlines_core", processor=processor)
    response = generator("Hello, how are you?")
    # Check the text against the pattern itself. A truthiness check on
    # int(response) would wrongly fail for the valid output "000".
    assert re.fullmatch(regex, response)

    # cfg
    with pytest.raises(
        NotImplementedError,
        match="Outlines Core does not support context-free grammar.",
    ):
        backend.get_cfg_logits_processor(cfg)

    # batch + multiple generations
    processor = backend.get_json_schema_logits_processor(json_schema)
    generator = outlines.Generator(model, backend="outlines_core", processor=processor)
    for _ in range(2):
        if tensor_library_name == "torch":
            response = generator.batch(
                ["Create a character", "Hello, how are you?"], max_new_tokens=200
            )
            assert len(response) == 2
            for r in response:
                assert r[0] == "{"
                assert "name" in r
        else:
            response = generator("Create a character", max_tokens=20)
            assert response[0] == "{"
            assert "name" in response


================================================
FILE: tests/backends/test_xgrammar.py
================================================
import re

import llama_cpp
import outlines
import pytest
import transformers
from xgrammar import GrammarCompiler, TokenizerInfo

from outlines.backends.xgrammar import XGrammarBackend, XGrammarLogitsProcessor
from tests.backends.test_backends_utils import simulate_model_calling_processor

try:
    import mlx_lm
    HAS_MLX = True
except ImportError:
    HAS_MLX = False


def model_transformers():
    """Load ``erwanf/gpt2-mini`` and wrap it with ``outlines.from_transformers``."""
    checkpoint = "erwanf/gpt2-mini"
    hf_model = transformers.AutoModelForCausalLM.from_pretrained(checkpoint)
    hf_tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
    return outlines.from_transformers(hf_model, hf_tokenizer)

def model_llamacpp():
    """Load the TinyMistral GGUF file and wrap it with ``outlines.from_llamacpp``."""
    llama = llama_cpp.Llama.from_pretrained(
        repo_id="M4-ai/TinyMistral-248M-v2-Instruct-GGUF",
        filename="TinyMistral-248M-v2-Instruct.Q4_K_M.gguf",
        chat_format="qwen",
    )
    return outlines.from_llamacpp(llama)

def model_mlxlm():
    """Load the SmolLM 4-bit checkpoint via ``mlx_lm`` and wrap it with ``outlines.from_mlxlm``."""
    loaded = mlx_lm.load("mlx-community/SmolLM-135M-Instruct-4bit")
    # Whatever mlx_lm.load returns is unpacked into positional arguments.
    return outlines.from_mlxlm(*loaded)

@pytest.fixture
def tokenizer_info():
    """``TokenizerInfo`` built from the Transformers test model's HF tokenizer."""
    hf_tokenizer = model_transformers().hf_tokenizer
    # The vocabulary size is taken from the tokenizer's own vocab mapping.
    vocab_size = len(hf_tokenizer.get_vocab())
    return TokenizerInfo.from_huggingface(hf_tokenizer, vocab_size=vocab_size)

@pytest.fixture
def json_schema():
    """JSON Schema string for an object with required ``name`` (string) and ``age`` (integer)."""
    fragments = (
        '{"type": "object", "properties": {"name": {"type": "string"}, ',
        '"age": {"type": "integer"}}, "required": ["name", "age"], ',
        '"additionalProperties": false}',
    )
    return "".join(fragments)

@pytest.fixture
def regex():
    """Pattern describing three consecutive ASCII digits."""
    three_digits = r"[0-9]{3}"
    return three_digits

@pytest.fixture
def cfg():
    """Grammar in ``::=`` notation whose only sentences are "yes" and "no"."""
    grammar = """
root ::= answer
answer ::= "yes" | "no"
"""
    return grammar


def test_xgr_processor_torch(regex):
    """Run an XGrammar regex processor against torch tensors.

    The regex is compiled with a ``GrammarCompiler`` built from the HF
    tokenizer. One processor instance is then driven through two simulated
    runs; after each run the first two decoded sequences must begin with a
    match for ``regex``.
    """
    transformers_model = model_transformers()
    tokenizer = transformers_model.tokenizer
    hf_tokenizer = transformers_model.hf_tokenizer

    vocab_size = len(hf_tokenizer.get_vocab())
    tokenizer_info = TokenizerInfo.from_huggingface(hf_tokenizer, vocab_size=vocab_size)
    compiled_grammar = GrammarCompiler(tokenizer_info).compile_regex(regex)
    processor = XGrammarLogitsProcessor(compiled_grammar, "torch")

    # Two passes with the same processor object, so reuse is covered too.
    for _run in range(2):
        input_ids = simulate_model_calling_processor(
            processor,
            "torch",
            len(tokenizer.get_vocab()),
            tokenizer.eos_token_id,
            2,
        )
        for row in (0, 1):
            assert re.match(regex, hf_tokenizer.decode(input_ids[row]))


@pytest.mark.skipif(not HAS_MLX, reason="MLX tests require Apple Silicon")
def test_xgr_processor_mlx(regex):
    """Run an XGrammar regex processor against MLX arrays.

    Skipped when ``HAS_MLX`` is false. The regex is compiled with a
    ``GrammarCompiler`` built from the MLX model's tokenizer. One processor
    instance is then driven through two simulated runs; after each run the
    first two decoded sequences must begin with a match for ``regex``.

    The ``regex`` fixture is requested explicitly: without it the name
    ``regex`` inside the body resolves to the module-level fixture object
    rather than the pattern string. The ``tokenizer_info`` fixture is not
    requested because the test builds its own ``TokenizerInfo`` from the
    MLX tokenizer.
    """
    model = model_mlxlm()
    tokenizer = model.mlx_tokenizer
    tokenizer_info = TokenizerInfo.from_huggingface(
        tokenizer,
        vocab_size=len(tokenizer.get_vocab())
    )
    grammar_compiler = GrammarCompiler(tokenizer_info)
    compiled_grammar = grammar_compiler.compile_regex(regex)
    processor = XGrammarLogitsProcessor(compiled_grammar, "mlx")
    for _ in range(2):
        input_ids = simulate_model_calling_processor(
            processor,
            "mlx",
            len(tokenizer.get_vocab()),
            tokenizer.eos_token_id,
            2
        )
        assert re.match(regex, tokenizer.decode(input_ids[0]))
        assert re.match(regex, tokenizer.decode(input_ids[1]))


# (model, tensor library name) pairs fed to the parametrized backend test.
# The MLX entry is only added when mlx_lm could be imported.
models = [(model_transformers(), "torch")]
if HAS_MLX:
    models += [(model_mlxlm(), "mlx")]

@pytest.mark.parametrize("model, tensor_library_name", models)
def test_xgrammar_backend(model, tensor_library_name, json_schema, regex, cfg):
    """Exercise ``XGrammarBackend`` end to end for one parametrized model.

    Covers backend construction, then the JSON-schema, regex and CFG logits
    processors, and finally re-uses one JSON-schema generator for two
    consecutive runs (batched for torch, single-prompt otherwise).
    """
    # initialization
    backend = XGrammarBackend(model)
    assert isinstance(backend.grammar_compiler, GrammarCompiler)

    # json schema
    processor = backend.get_json_schema_logits_processor(json_schema)
    assert isinstance(processor, XGrammarLogitsProcessor)
    generator = outlines.Generator(model, backend="xgrammar", processor=processor)
    response = generator("Hello, how are you?")
    assert response[0] == "{"
    assert "name" in response

    # regex
    processor = backend.get_regex_logits_processor(regex)
    assert isinstance(processor, XGrammarLogitsProcessor)
    generator = outlines.Generator(model, backend="xgrammar", processor=processor)
    response = generator("Hello, how are you?")
    # Check the text against the pattern itself. A truthiness check on
    # int(response) would wrongly fail for the valid output "000".
    assert re.fullmatch(regex, response)

    # cfg
    processor = backend.get_cfg_logits_processor(cfg)
    assert isinstance(processor, XGrammarLogitsProcessor)
    generator = outlines.Generator(model, backend="xgrammar", processor=processor)
    response = generator("Hello, how are you?")
    assert response == "yes" or response == "no"

    # batch + multiple generations
    processor = backend.get_json_schema_logits_processor(json_schema)
    generator = outlines.Generator(model, backend="xgrammar", processor=processor)
    for _ in range(2):
        if tensor_library_name == "torch":
            response = generator.batch(["Create a character", "Hello, how are you?"], max_new_tokens=200)
            assert len(response) == 2
            for r in response:
                assert r[0] == "{"
                assert "name" in r
        else:
            response = generator("Create a character", max_tokens=20)
            assert response[0] == "{"
            assert "name" in response


def test_xgrammar_backend_invalid_model():
    """Building ``XGrammarBackend`` from the llama.cpp model must raise ``ValueError``."""
    expected_message = "The xgrammar backend only supports Transformers and MLXLM models"
    with pytest.raises(ValueError, match=expected_message):
        XGrammarBackend(model_llamacpp())


================================================
FILE: tests/cfg_samples/arithmetic/lots_of_ops.arithmetic.test
================================================
5+1+1+1+1+1+1+1+1+1+1+1+1+1+1+1+1+1


================================================
FILE: tests/cfg_samples/arithmetic/simple_math.arithmetic.test
================================================
(1 * 2) - (0.1 * 2 * 9.42)


================================================
FILE: tests/cfg_samples/json/outlines.generate.samplers.mypy.json.test
================================================
{
    ".class": "MypyFile",
    "_fullname": "outlines.generate.samplers",
    "future_import_flags": [],
    "is_partial_stub_package": false,
    "is_stub": false,
    "names": {
        ".class": "SymbolTable",
        "Protocol": {
            ".class": "SymbolTableNode",
            "cross_ref": "typing.Protocol",
            "kind": "Gdef"
        },
        "Sampler": {
            ".class": "SymbolTableNode",
            "kind": "Gdef",
            "node": {
                ".class": "TypeInfo",
                "_promote": [],
                "abstract_attributes": [
                    [
                        "__call__",
                        2
                    ]
                ],
                "alt_promote": null,
                "bases": [
                    "builtins.object"
                ],
                "dataclass_transform_spec": null,
                "declared_metaclass": null,
                "defn": {
                    ".class": "ClassDef",
                    "fullname": "outlines.generate.samplers.Sampler",
                    "name": "Sampler",
                    "type_vars": []
                },
                "deletable_attributes": [],
                "flags": [
                    "is_abstract",
                    "is_protocol"
                ],
                "fullname": "outlines.generate.samplers.Sampler",
                "has_param_spec_type": false,
                "metaclass_type": "abc.ABCMeta",
                "metadata": {},
                "module_name": "outlines.generate.samplers",
                "mro": [
                    "outlines.generate.samplers.Sampler",
                    "builtins.object"
                ],
                "names": {
                    ".class": "SymbolTable",
                    "__call__": {
                        ".class": "SymbolTableNode",
                        "kind": "Mdef",
                        "node": {
                            ".class": "FuncDef",
                            "abstract_status": 2,
                            "arg_kinds": [
                                0,
                                0,
                                0,
                                0
                            ],
                            "arg_names": [
                                "self",
                                "logits",
                                "samples",
                                "rng"
                            ],
                            "dataclass_transform_spec": null,
                            "flags": [
                                "is_trivial_body"
                            ],
                            "fullname": "outlines.generate.samplers.Sampler.__call__",
                            "name": "__call__",
                            "type": {
                                ".class": "CallableType",
                                "arg_kinds": [
                                    0,
                                    0,
                                    0,
                                    0
                                ],
                                "arg_names": [
                                    "self",
                                    "logits",
                                    "samples",
                                    "rng"
                                ],
                                "arg_types": [
                                    "outlines.generate.samplers.Sampler",
                                    {
                                        ".class": "AnyType",
                                        "missing_import_name": "outlines.generate.samplers.torch",
                                        "source_any": null,
                                        "type_of_any": 3
                                    },
                                    "builtins.int",
                                    {
                                        ".class": "AnyType",
                                        "missing_import_name": "outlines.generate.samplers.torch",
                                        "source_any": null,
                                        "type_of_any": 3
                                    }
                                ],
                                "bound_args": [],
                                "def_extras": {
                                    "first_arg": "self"
                                },
                                "fallback": "builtins.function",
                                "from_concatenate": false,
                                "implicit": false,
                                "is_ellipsis_args": false,
                                "name": "__call__ of Sampler",
                                "ret_type": {
                                    ".class": "AnyType",
                                    "missing_import_name": "outlines.generate.samplers.torch",
                                    "source_any": null,
                                    "type_of_any": 3
                                },
                                "type_guard": null,
                                "unpack_kwargs": false,
                                "variables": []
                            }
                        }
                    }
                },
                "self_type": null,
                "slots": null,
                "tuple_type": null,
                "type_vars": [],
                "typeddict_type": null
            }
        },
        "__annotations__": {
            ".class": "SymbolTableNode",
            "kind": "Gdef",
            "node": {
                ".class": "Var",
                "flags": [
                    "is_ready"
                ],
                "fullname": "outlines.generate.samplers.__annotations__",
                "name": "__annotations__",
                "type": {
                    ".class": "Instance",
                    "args": [
                        "builtins.str",
                        {
                            ".class": "AnyType",
                            "missing_import_name": null,
                            "source_any": null,
                            "type_of_any": 6
                        }
                    ],
                    "type_ref": "builtins.dict"
                }
            }
        },
        "__doc__": {
            ".class": "SymbolTableNode",
            "kind": "Gdef",
            "node": {
                ".class": "Var",
                "flags": [
                    "is_ready"
                ],
                "fullname": "outlines.generate.samplers.__doc__",
                "name": "__doc__",
                "type": "builtins.str"
            }
        },
        "__file__": {
            ".class": "SymbolTableNode",
            "kind": "Gdef",
            "node": {
                ".class": "Var",
                "flags": [
                    "is_ready"
                ],
                "fullname": "outlines.generate.samplers.__file__",
                "name": "__file__",
                "type": "builtins.str"
            }
        },
        "__name__": {
            ".class": "SymbolTableNode",
            "kind": "Gdef",
            "node": {
                ".class": "Var",
                "flags": [
                    "is_ready"
                ],
                "fullname": "outlines.generate.samplers.__name__",
                "name": "__name__",
                "type": "builtins.str"
            }
        },
        "__package__": {
            ".class": "SymbolTableNode",
            "kind": "Gdef",
            "node": {
                ".class": "Var",
                "flags": [
                    "is_ready"
                ],
                "fullname": "outlines.generate.samplers.__package__",
                "name": "__package__",
                "type": "builtins.str"
            }
        },
        "greedy": {
            ".class": "SymbolTableNode",
            "kind": "Gdef",
            "node": {
                ".class": "FuncDef",
                "abstract_status": 0,
                "arg_kinds": [
                    0,
                    0,
                    2
                ],
                "arg_names": [
                    "logits",
                    "samples",
                    "_"
                ],
                "dataclass_transform_spec": null,
                "flags": [],
                "fullname": "outlines.generate.samplers.greedy",
                "name": "greedy",
                "type": {
                    ".class": "CallableType",
                    "arg_kinds": [
                        0,
                        0,
                        2
                    ],
                    "arg_names": [
                        "logits",
                        "samples",
                        "_"
                    ],
                    "arg_types": [
                        {
                            ".class": "AnyType",
                            "missing_import_name": "outlines.generate.samplers.torch",
                            "source_any": null,
                            "type_of_any": 3
                        },
                        "builtins.int",
                        {
                            ".class": "AnyType",
                            "missing_import_name": null,
                            "source_any": null,
                            "type_of_any": 1
                        }
                    ],
                    "bound_args": [],
                    "def_extras": {
                        "first_arg": null
                    },
                    "fallback": "builtins.function",
                    "from_concatenate": false,
                    "implicit": false,
                    "is_ellipsis_args": false,
                    "name": "greedy",
                    "ret_type": {
                        ".class": "AnyType",
                        "missing_import_name": "outlines.generate.samplers.torch",
                        "source_any": null,
                        "type_of_any": 3
                    },
                    "type_guard": null,
                    "unpack_kwargs": false,
                    "variables": []
                }
            }
        },
        "multinomial": {
            ".class": "SymbolTableNode",
            "kind": "Gdef",
            "node": {
                ".class": "FuncDef",
                "abstract_status": 0,
                "arg_kinds": [
                    0,
                    0,
                    0
                ],
                "arg_names": [
                    "logits",
                    "samples",
                    "rng"
                ],
                "dataclass_transform_spec": null,
                "flags": [],
                "fullname": "outlines.generate.samplers.multinomial",
                "name": "multinomial",
                "type": {
                    ".class": "CallableType",
                    "arg_kinds": [
                        0,
                        0,
                        0
                    ],
                    "arg_names": [
                        "logits",
                        "samples",
                        "rng"
                    ],
                    "arg_types": [
                        {
                            ".class": "AnyType",
                            "missing_import_name": "outlines.generate.samplers.torch",
                            "source_any": null,
                            "type_of_any": 3
                        },
                        "builtins.int",
                        {
                            ".class": "AnyType",
                            "missing_import_name": "outlines.generate.samplers.torch",
                            "source_any": null,
                            "type_of_any": 3
                        }
                    ],
                    "bound_args": [],
                    "def_extras": {
                        "first_arg": null
                    },
                    "fallback": "builtins.function",
                    "from_concatenate": false,
                    "implicit": false,
                    "is_ellipsis_args": false,
                    "name": "multinomial",
                    "ret_type": {
                        ".class": "AnyType",
                        "missing_import_name": "outlines.generate.samplers.torch",
                        "source_any": null,
                        "type_of_any": 3
                    },
                    "type_guard": null,
                    "unpack_kwargs": false,
                    "variables": []
                }
            }
        },
        "torch": {
            ".class": "SymbolTableNode",
            "kind": "Gdef",
            "node": {
                ".class": "Var",
                "flags": [
                    "is_suppressed_import",
                    "is_ready",
                    "is_inferred"
                ],
                "fullname": "outlines.generate.samplers.torch",
                "name": "torch",
                "type": {
                    ".class": "AnyType",
                    "missing_import_name": "outlines.generate.samplers.torch",
                    "source_any": null,
                    "type_of_any": 3
                }
            }
        }
    },
    "path": "/home/andrew/p/outlines/outlines/generate/samplers.py"
}


================================================
FILE: tests/cfg_samples/json/simple_fruit.json.test
================================================
[
    {
        "ID": "1",
        "Name": "Andrew \"The Escaper\" Lapp",
        "Age": "30",
        "FavFruit": "Banana"
    },
    {
        "ID": "2",
        "Name": "Mohammad",
        "Age": "40",
        "FavFruit": "\"Any Fruit As Long as It's In Quotes!\""
    },
    {
        "ID": "3",
        "Name": "Alice",
        "Age": "61",
        "FavFruit": "Peaches, but only \n newline separated peaches"
    }
]


================================================
FILE: tests/cfg_samples/json/simple_fruit_no_indent.json.test
================================================
[{"ID": "1", "Name": "Andrew", "Age": "30", "FavFruit": "Banana"}, {"ID": "2", "Name": "Mohammad", "Age": "40", "FavFruit": "Apple"}, {"ID": "3", "Name": "Alice", "Age": "61", "FavFruit": "Peach"}]


================================================
FILE: tests/conftest.py
================================================
import sys

import pytest


def pytest_collection_modifyitems(config, items):
    """Mark vLLM integration tests as skipped when not running on Linux.

    On any platform other than ``"linux"`` every collected item whose node
    id contains ``test_integration_vllm`` receives a skip marker. A warning
    is printed when no ``-k`` keyword was given or when the keyword itself
    mentions ``test_integration_vllm``.
    """
    if sys.platform == "linux":
        return

    keyword = config.option.keyword
    if not keyword or "test_integration_vllm" in keyword:
        print(
            "WARNING: test_integration_vllm tests are skipped because vLLM only supports Linux platform (including WSL)."
        )

    skip_vllm = pytest.mark.skip(reason="vLLM models can only be run on Linux.")
    vllm_items = (item for item in items if "test_integration_vllm" in item.nodeid)
    for item in vllm_items:
        item.add_marker(skip_vllm)


================================================
FILE: tests/models/test_anthopic_type_adapter.py
================================================
import io
import pytest
from dataclasses import dataclass

from PIL import Image as PILImage
from outlines.inputs import Chat, Image
from outlines.models.anthropic import AnthropicTypeAdapter


@pytest.fixture
def image():
    """1x1 white RGB image, re-opened from an in-memory PNG encoding."""
    blank = PILImage.new("RGB", (1, 1), (255, 255, 255))

    # Encode to PNG in memory, rewind, and hand back the image opened from
    # those bytes rather than the freshly created one.
    png_bytes = io.BytesIO()
    blank.save(png_bytes, format="PNG")
    png_bytes.seek(0)
    return PILImage.open(png_bytes)


@pytest.fixture
def adapter():
    """A newly constructed ``AnthropicTypeAdapter``."""
    type_adapter = AnthropicTypeAdapter()
    return type_adapter


def test_anthropic_type_adapter_input_text(adapter):
    """A plain string prompt is formatted as a single user message."""
    prompt = "prompt"
    formatted = adapter.format_input(prompt)
    expected = {"messages": [{"role": "user", "content": prompt}]}
    assert formatted == expected


def test_anthropic_type_adapter_input_vision(adapter, image):
    """A `[text, Image]` prompt becomes one user message.

    The expected content lists the base64 image block first and the text
    block second.
    """
    picture = Image(image)
    prompt = "hello"

    formatted = adapter.format_input([prompt, picture])

    image_block = {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": "image/png",
            "data": picture.image_str,
        },
    }
    text_block = {"type": "text", "text": prompt}
    user_message = {"role": "user", "content": [image_block, text_block]}
    assert formatted == {"messages": [user_message]}


def test_anthropic_type_adapter_input_chat(adapter, image):
    """A `Chat` input keeps its roles; a multimodal user turn is expanded
    into image and text content blocks."""
    picture = Image(image)
    conversation = Chat(
        messages=[
            {"role": "system", "content": "prompt"},
            {"role": "user", "content": ["hello", picture]},
            {"role": "assistant", "content": "response"},
        ]
    )

    formatted = adapter.format_input(conversation)

    image_block = {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": "image/png",
            "data": picture.image_str,
        },
    }
    expected_messages = [
        {"role": "system", "content": "prompt"},
        {
            "role": "user",
            "content": [image_block, {"type": "text", "text": "hello"}],
        },
        {"role": "assistant", "content": "response"},
    ]
    assert formatted == {"messages": expected_messages}


def test_anthropic_type_adapter_input_invalid(adapter):
    """Unsupported inputs are rejected with the expected error messages."""
    @dataclass
    class Audio:
        file: str

    # An arbitrary object is not a valid prompt
    with pytest.raises(TypeError, match="is not available with Anthropic"):
        adapter.format_input(Audio("file"))

    # Every asset following the text of a list prompt must be an Image
    with pytest.raises(ValueError, match="All assets provided must be of type Image"):
        adapter.format_input(["prompt", Audio("file")])

    # The content of a chat message must be a string or a list
    with pytest.raises(ValueError, match="The content must be a string or a list"):
        bad_chat = Chat(messages=[{"role": "user", "content": {"foo": "bar"}}])
        adapter.format_input(bad_chat)


def test_anthropic_type_adapter_output(adapter):
    """Requesting an output type raises `NotImplementedError`."""
    expected_message = "is not available with Anthropic"
    with pytest.raises(NotImplementedError, match=expected_message):
        adapter.format_output_type(str)


================================================
FILE: tests/models/test_anthropic.py
================================================
import io
from typing import Generator

from anthropic import Anthropic as AnthropicClient
from PIL import Image as PILImage
import pytest

import outlines
from outlines.inputs import Chat, Image, Video
from outlines.models.anthropic import Anthropic


# Anthropic model identifier used by the fixtures and tests of this module.
MODEL_NAME = "claude-3-haiku-20240307"


@pytest.fixture(scope="session")
def model():
    """Provide a session-scoped `Anthropic` model bound to `MODEL_NAME`."""
    client = AnthropicClient()
    return Anthropic(client, MODEL_NAME)


@pytest.fixture(scope="session")
def model_no_model_name():
    """Provide a session-scoped `Anthropic` model created without a model name."""
    client = AnthropicClient()
    return Anthropic(client)


@pytest.fixture
def image():
    """Provide a 1x1 white RGB image re-opened from an in-memory PNG."""
    size = (1, 1)
    white = (255, 255, 255)
    in_memory_png = io.BytesIO()

    # Write the pixel as PNG bytes, rewind, then read it back as a PNG image
    PILImage.new("RGB", size, white).save(in_memory_png, format="PNG")
    in_memory_png.seek(0)
    return PILImage.open(in_memory_png)


def test_init_from_client():
    """`outlines.from_anthropic` wraps a client with or without a model name."""
    client = AnthropicClient()

    # With model name
    named = outlines.from_anthropic(client, MODEL_NAME)
    assert isinstance(named, Anthropic)
    assert named.client == client
    assert named.model_name == MODEL_NAME

    # Without model name
    unnamed = outlines.from_anthropic(client)
    assert isinstance(unnamed, Anthropic)
    assert unnamed.client == client
    assert unnamed.model_name is None


def test_anthropic_wrong_inference_parameters():
    """An unknown inference keyword makes `generate` raise a `TypeError`."""
    # Build the model outside the `raises` block so that only the `generate`
    # call can satisfy the expected exception; a `TypeError` raised while
    # constructing the model must not let the test pass.
    model = Anthropic(AnthropicClient(), MODEL_NAME)

    with pytest.raises(TypeError, match="got an unexpected"):
        model.generate("prompt", foo=10, max_tokens=1024)


def test_anthropic_wrong_input_type(image):
    """Unsupported prompt objects and non-image assets are rejected by `generate`."""
    class Foo:
        def __init__(self, foo):
            self.foo = foo

    # Create the model once, outside the `raises` blocks: the second check no
    # longer relies on a variable bound inside the first block, and an error
    # raised by the constructor cannot be mistaken for the expected one.
    model = Anthropic(AnthropicClient(), MODEL_NAME)

    with pytest.raises(TypeError, match="is not available"):
        model.generate(Foo("prompt"))

    with pytest.raises(ValueError, match="All assets provided must be of type Image"):
        model.generate(["foo?", Image(image), Video("")])


def test_anthropic_wrong_output_type():
    """Passing an output type to `generate` raises `NotImplementedError`."""
    class Foo:
        def __init__(self, foo):
            self.foo = foo

    # Keep the setup outside the `raises` block so that only the `generate`
    # call is checked against the expected exception.
    model = Anthropic(AnthropicClient(), MODEL_NAME)

    with pytest.raises(NotImplementedError, match="is not available"):
        model.generate("prompt", Foo(1))


@pytest.mark.api_call
def test_anthropic_simple_call(model):
    """A text prompt produces a string completion."""
    completion = model.generate(
        "Respond with one word. Not more.",
        max_tokens=1024,
    )
    assert isinstance(completion, str)


@pytest.mark.xfail(reason="Anthropic requires the `max_tokens` parameter to be set")
@pytest.mark.api_call
def test_anthropic_direct_call(model_no_model_name):
    """Call the model object directly, supplying the model name per call."""
    call_kwargs = {"model_name": MODEL_NAME, "max_tokens": 1024}
    completion = model_no_model_name(
        "Respond with one word. Not more.",
        **call_kwargs,
    )
    assert isinstance(completion, str)


@pytest.mark.api_call
def test_anthropic_simple_vision(model, image):
    """A `[text, Image]` prompt produces a string completion."""
    prompt = ["What does this logo represent?", Image(image)]
    completion = model.generate(prompt, max_tokens=1024)
    assert isinstance(completion, str)


@pytest.mark.api_call
def test_anthropic_chat(model, image):
    """A `Chat` input containing an image produces a string completion."""
    assistant_turn = {"role": "assistant", "content": "How can I help you today?"}
    user_turn = {
        "role": "user",
        "content": ["What does this logo represent?", Image(image)],
    }
    conversation = Chat(messages=[assistant_turn, user_turn])

    completion = model.generate(conversation, max_tokens=10)
    assert isinstance(completion, str)


@pytest.mark.api_call
def test_anthopic_streaming(model):
    """`stream` returns a generator whose first chunk is a string."""
    chunks = model.stream("Respond with one word. Not more.", max_tokens=1024)
    assert isinstance(chunks, Generator)
    first_chunk = next(chunks)
    assert isinstance(first_chunk, str)


def test_anthropic_batch(model):
    """Batch generation raises `NotImplementedError`."""
    prompts = ["Respond with one word.", "Respond with one word."]
    with pytest.raises(NotImplementedError, match="does not support"):
        model.batch(prompts, max_tokens=1024)


================================================
FILE: tests/models/test_dottxt.py
================================================
import json
import os

import pytest
from dottxt.client import Dottxt as DottxtClient
from pydantic import BaseModel

import outlines
from outlines import Generator
from outlines.models.dottxt import Dottxt


# Name and revision of a Dottxt model.
# NOTE(review): no fixture or test in this module references these constants;
# the model name and revision are fetched through `client.list_models()`.
MODEL_NAME = "dottxt/dottxt-v1-alpha"
MODEL_REVISION = "d06c86726aadd8dadb92c5b9b9e3ce8ef246c471"


# Pydantic model passed as the output type in the tests of this module.
# Documented with a comment rather than a docstring; presumably a docstring
# would surface in the generated JSON schema (verify before adding one).
class User(BaseModel):
    first_name: str
    last_name: str
    user_id: int


@pytest.fixture(scope="session")
def api_key():
    """Return the Dottxt API key, or a placeholder when none is configured.

    The key is read from the `DOTTXT_API_KEY` environment variable. When the
    variable is unset or empty, `"MOCK_API_KEY"` is returned instead, which is
    enough for tests that only need to initialize the Dottxt client without
    making actual API calls.

    """
    return os.getenv("DOTTXT_API_KEY") or "MOCK_API_KEY"


@pytest.fixture(scope="session")
def model_name_and_revision(api_key):
    """Return `(name, revision)` of the first model listed by the Dottxt client."""
    available_models = DottxtClient(api_key=api_key).list_models()
    first_model = available_models[0]
    return (first_model.name, first_model.revision)


@pytest.fixture(scope="session")
def model(api_key, model_name_and_revision):
    """Provide a session-scoped `Dottxt` model with an explicit name and revision."""
    name, revision = model_name_and_revision[0], model_name_and_revision[1]
    client = DottxtClient(api_key=api_key)
    return Dottxt(client, name, revision)


@pytest.fixture(scope="session")
def model_no_model_name(api_key):
    """Provide a session-scoped `Dottxt` model created without a model name."""
    return Dottxt(DottxtClient(api_key=api_key))


@pytest.mark.api_call
def test_dottxt_init_from_client(api_key, model_name_and_revision):
    """`outlines.from_dottxt` wraps a client with or without name and revision."""
    client = DottxtClient(api_key=api_key)

    # Without model name
    unnamed = outlines.from_dottxt(client)
    assert isinstance(unnamed, Dottxt)
    assert unnamed.client == client
    assert unnamed.model_name is None

    # With model name
    name = model_name_and_revision[0]
    revision = model_name_and_revision[1]
    named = outlines.from_dottxt(client, name, revision)
    assert isinstance(named, Dottxt)
    assert named.client == client
    assert named.model_name == name
    assert named.model_revision == revision


def test_dottxt_wrong_output_type(model_no_model_name):
    """Calling the model without an output type raises a `TypeError`."""
    expected_message = "You must provide an output type"
    with pytest.raises(TypeError, match=expected_message):
        model_no_model_name("prompt")


def test_dottxt_wrong_input_type(model_no_model_name):
    """A list prompt raises a `TypeError`."""
    list_prompt = ["prompt"]
    with pytest.raises(TypeError, match="is not available"):
        model_no_model_name(list_prompt, User)


@pytest.mark.api_call
def test_dottxt_wrong_inference_parameters(model_no_model_name):
    """An unknown inference keyword raises a `TypeError`."""
    unknown_kwargs = {"foo": 10}
    with pytest.raises(TypeError, match="got an unexpected"):
        model_no_model_name("prompt", User, **unknown_kwargs)


@pytest.mark.api_call
def test_dottxt_direct_pydantic_call(model_no_model_name):
    """A direct call with a pydantic output type returns JSON with `first_name`."""
    raw = model_no_model_name("Create a user", User)
    payload = json.loads(raw)
    assert "first_name" in payload


@pytest.mark.api_call
def test_dottxt_direct_jsonschema_call(
    model_no_model_name, model_name_and_revision
):
    """A direct call with a per-call model name and revision returns JSON
    containing `first_name`."""
    name = model_name_and_revision[0]
    revision = model_name_and_revision[1]
    raw = model_no_model_name(
        "Create a user",
        User,
        model_name=name,
        model_revision=revision,
    )
    payload = json.loads(raw)
    assert "first_name" in payload


@pytest.mark.api_call
def test_dottxt_generator_pydantic_call(model):
    """A `Generator` built from the model and `User` returns JSON with `first_name`."""
    create_user = Generator(model, User)
    payload = json.loads(create_user("Create a user"))
    assert "first_name" in payload


@pytest.mark.api_call
def test_dottxt_streaming(model):
    """Streaming raises `NotImplementedError`."""
    expected_message = "Dottxt does not support streaming"
    with pytest.raises(NotImplementedError, match=expected_message):
        model.stream("Create a user", User)


@pytest.mark.api_call
def test_dottxt_batch(model):
    """Batch generation raises `NotImplementedError`."""
    prompts = ["Respond with one word.", "Respond with one word."]
    with pytest.raises(NotImplementedError, match="does not support"):
        model.batch(prompts)


================================================
FILE: tests/models/test_dottxt_type_adapter.py
================================================
import io
import json
import pytest
import sys
from dataclasses import dataclass

from PIL import Image as PILImage
from genson import SchemaBuilder
from pydantic import BaseModel

from outlines.inputs import Image
from outlines.models.dottxt import DottxtTypeAdapter
from outlines.types import cfg, json_schema, regex

if sys.version_info >= (3, 12):
    from typing import TypedDict
else:
    from typing_extensions import TypedDict


@pytest.fixture
def schema():
    """Provide the JSON schema dictionary of a `User` object.

    Key order is significant: the tests compare against `json.dumps(schema)`.
    """
    user_id_property = {"title": "User Id", "type": "integer"}
    name_property = {"title": "Name", "type": "string"}
    return {
        "properties": {"user_id": user_id_property, "name": name_property},
        "required": ["user_id", "name"],
        "title": "User",
        "type": "object",
    }


@pytest.fixture
def image():
    """Provide a 1x1 white RGB image re-opened from an in-memory PNG."""
    pixel = PILImage.new("RGB", (1, 1), (255, 255, 255))

    # Serialize to PNG in memory, rewind, and read the bytes back as an image
    stream = io.BytesIO()
    pixel.save(stream, format="PNG")
    stream.seek(0)
    return PILImage.open(stream)


@pytest.fixture
def adapter():
    """Provide a `DottxtTypeAdapter` instance to the tests."""
    type_adapter = DottxtTypeAdapter()
    return type_adapter


def test_dottxt_type_adapter_input_text(adapter):
    """A string prompt is returned unchanged."""
    prompt = "prompt"
    assert adapter.format_input(prompt) == prompt


def test_dottxt_type_adapter_input_invalid(adapter, image):
    """A list prompt raises a `TypeError`."""
    list_prompt = ["prompt", image]
    with pytest.raises(TypeError, match="The input type"):
        adapter.format_input(list_prompt)


def test_dottxt_type_adapter_output_invalid(adapter):
    """Missing or unsupported output types raise a `TypeError`."""
    # An output type is mandatory
    with pytest.raises(TypeError, match="You must provide an output type"):
        adapter.format_output_type(None)

    # Plain builtin types are rejected
    unsupported_builtins = (
        (str, "The type `str` is not supported"),
        (int, "The type `int` is not supported"),
    )
    for builtin_type, expected_message in unsupported_builtins:
        with pytest.raises(TypeError, match=expected_message):
            adapter.format_output_type(builtin_type)

    # Regex and CFG output types are rejected
    with pytest.raises(
        TypeError,
        match="Regex-based structured outputs will soon be",
    ):
        adapter.format_output_type(regex("[0-9]"))

    with pytest.raises(
        TypeError,
        match="CFG-based structured outputs will soon be",
    ):
        adapter.format_output_type(cfg(""))


def test_dottxt_type_adapter_output_dataclass(adapter, schema):
    """A dataclass output type is formatted as the serialized `schema`."""
    @dataclass
    class User:
        user_id: int
        name: str

    expected = json.dumps(schema)
    assert adapter.format_output_type(User) == expected


def test_dottxt_type_adapter_output_typed_dict(adapter, schema):
    """A `TypedDict` output type is formatted as the serialized `schema`."""
    class User(TypedDict):
        user_id: int
        name: str

    expected = json.dumps(schema)
    assert adapter.format_output_type(User) == expected


def test_dottxt_type_adapter_output_pydantic(adapter, schema):
    """A pydantic model output type is formatted as the serialized `schema`."""
    class User(BaseModel):
        user_id: int
        name: str

    expected = json.dumps(schema)
    assert adapter.format_output_type(User) == expected


def test_dottxt_type_adapter_output_genson_schema_builder(adapter, schema):
    """A genson `SchemaBuilder` is formatted as the JSON schema it has inferred."""
    builder = SchemaBuilder()
    builder.add_schema({"type": "object", "properties": {}})
    # The two objects give `hi` a string value and an integer value
    for sample in ({"hi": "there"}, {"hi": 5}):
        builder.add_object(sample)

    decoded = json.loads(adapter.format_output_type(builder))

    assert isinstance(decoded, dict)
    assert decoded == {
        "$schema": "http://json-schema.org/schema#",
        "type": "object",
        "properties": {"hi": {"type": ["integer", "string"]}},
        "required": ["hi"],
    }


def test_dottxt_type_adapter_json_schema_str(adapter, schema):
    """A `json_schema` built from a string is formatted as the serialized `schema`."""
    serialized = json.dumps(schema)
    formatted = adapter.format_output_type(json_schema(serialized))
    assert formatted == json.dumps(schema)


def test_dottxt_type_adapter_json_schema_dict(adapter, schema):
    """A `json_schema` built from a dict is formatted as the serialized `schema`."""
    output_type = json_schema(schema)
    formatted = adapter.format_output_type(output_type)
    assert formatted == json.dumps(schema)


================================================
FILE: tests/models/test_gemini.py
================================================
import io
import json
import sys
from dataclasses import dataclass
from enum import Enum
from typing import Generator, Literal

import pytest
from PIL import Image as PILImage
from google.genai import Client
from pydantic import BaseModel, ValidationError

import outlines
from outlines.inputs import Chat, Image, Video
from outlines.models.gemini import Gemini
from outlines.types import Choice

if sys.version_info >= (3, 12):
    from typing import TypedDict
else:
    from typing_extensions import TypedDict

# Gemini model identifier used by the fixtures and tests of this module.
MODEL_NAME = "gemini-1.5-flash-latest"


@pytest.fixture(scope="session")
def model():
    """Provide a session-scoped `Gemini` model bound to `MODEL_NAME`."""
    client = Client()
    return Gemini(client, MODEL_NAME)


@pytest.fixture(scope="session")
def model_no_model_name():
    """Provide a session-scoped `Gemini` model created without a model name."""
    client = Client()
    return Gemini(client)


@pytest.fixture
def image():
    """Provide a 1x1 white RGB image re-opened from an in-memory PNG."""
    dimensions = (1, 1)
    white = (255, 255, 255)
    source = PILImage.new("RGB", dimensions, white)

    # Save to an in-memory buffer, rewind it, and load the PNG back
    png_io = io.BytesIO()
    source.save(png_io, format="PNG")
    png_io.seek(0)
    return PILImage.open(png_io)


@pytest.mark.api_call
def test_gemini_init_from_client():
    """`outlines.from_gemini` wraps a client with or without a model name."""
    client = Client()

    # Without model name
    unnamed = outlines.from_gemini(client)
    assert isinstance(unnamed, Gemini)
    assert unnamed.client == client
    assert unnamed.model_name is None

    # With model name
    named = outlines.from_gemini(client, MODEL_NAME)
    assert isinstance(named, Gemini)
    assert named.client == client
    assert named.model_name == MODEL_NAME


@pytest.mark.api_call
def test_gemini_wrong_inference_parameters(model):
    """An unknown inference keyword raises a pydantic `ValidationError`."""
    unknown_kwargs = {"foo": 10}
    with pytest.raises(ValidationError):
        model.generate("prompt", **unknown_kwargs)


@pytest.mark.api_call
def test_gemini_wrong_input_type(model, image):
    """A list prompt containing a non-`Image` asset raises a `ValueError`."""
    expected_message = "All assets provided must be of type Image"
    with pytest.raises(ValueError, match=expected_message):
        model.generate(["foo?", Image(image), Video("")])


@pytest.mark.api_call
def test_gemini_simple_call(model):
    """A text prompt produces a string completion."""
    completion = model.generate("Respond with one word. Not more.")
    assert isinstance(completion, str)


@pytest.mark.api_call
def test_gemini_direct_call(model_no_model_name):
    """Call the model object directly, supplying the model name per call."""
    prompt = "Respond with one word. Not more."
    completion = model_no_model_name(prompt, model=MODEL_NAME)
    assert isinstance(completion, str)


@pytest.mark.api_call
def test_gemini_simple_vision(model, image):
    """A `[text, Image]` prompt produces a string completion."""
    prompt = ["What does this logo represent?", Image(image)]
    completion = model.generate(prompt)
    assert isinstance(completion, str)


@pytest.mark.api_call
def test_gemini_chat(model, image):
    """A `Chat` input containing an image produces a string completion."""
    assistant_turn = {"role": "assistant", "content": "How can I help you today?"}
    user_turn = {
        "role": "user",
        "content": ["What does this logo represent?", Image(image)],
    }
    conversation = Chat(messages=[assistant_turn, user_turn])

    completion = model.generate(conversation)
    assert isinstance(completion, str)


@pytest.mark.api_call
def test_gemini_simple_pydantic(model):
    """A pydantic output type yields a JSON string containing its field."""
    class Foo(BaseModel):
        bar: int

    raw = model.generate("foo?", Foo)
    assert isinstance(raw, str)
    payload = json.loads(raw)
    assert "bar" in payload


@pytest.mark.api_call
def test_gemini_simple_vision_pydantic(model, image):
    """A multimodal prompt with a pydantic output type yields matching JSON."""
    class Logo(BaseModel):
        name: int

    prompt = ["What does this logo represent?", Image(image)]
    raw = model.generate(prompt, Logo)
    assert isinstance(raw, str)
    payload = json.loads(raw)
    assert "name" in payload


@pytest.mark.api_call
def test_gemini_nested_pydantic(model):
    """A nested pydantic output type yields JSON with outer and inner fields."""
    class Bar(BaseModel):
        fu: str

    class Foo(BaseModel):
        sna: int
        bar: Bar

    raw = model.generate("foo?", Foo)
    assert isinstance(raw, str)

    payload = json.loads(raw)
    assert "sna" in payload
    assert "bar" in payload
    assert "fu" in payload["bar"]


@pytest.mark.xfail(
    reason="The Gemini SDK's serialization method does not support Json Schema strings."
)
@pytest.mark.api_call
def test_gemini_simple_json_schema_string(model):
    """Expected failure: a schema given as a string is used as the output type."""
    schema = "{'properties': {'bar': {'title': 'Bar', 'type': 'integer'}}, 'required': ['bar'], 'title': 'Foo', 'type': 'object'}"
    raw = model.generate("foo?", schema)
    assert isinstance(raw, str)
    payload = json.loads(raw)
    assert "bar" in payload


@pytest.mark.xfail(
    reason="The Gemini SDK's serialization method does not support Json Schema dictionaries."
)
@pytest.mark.api_call
def test_gemini_simple_json_schema_dict(model):
    """Expected failure: a schema given as a dict is used as the output type."""
    bar_property = {"type": "integer"}
    schema = {
        "properties": {"bar": bar_property},
        "required": ["bar"],
        "type": "object",
    }
    raw = model.generate("foo?", schema)
    assert isinstance(raw, str)
    payload = json.loads(raw)
    assert "bar" in payload


@pytest.mark.api_call
def test_gemini_simple_typed_dict(model):
    """A `TypedDict` output type yields a JSON string containing its key."""
    class Foo(TypedDict):
        bar: int

    raw = model.generate("foo?", Foo)
    assert isinstance(raw, str)
    payload = json.loads(raw)
    assert "bar" in payload


@pytest.mark.api_call
def test_gemini_simple_dataclass(model):
    """A dataclass output type yields a JSON string containing its field."""
    @dataclass
    class Foo:
        bar: int

    raw = model.generate("foo?", Foo)
    assert isinstance(raw, str)
    payload = json.loads(raw)
    assert "bar" in payload


@pytest.mark.api_call
def test_gemini_simple_choice_enum(model):
    """An `Enum` output type restricts the answer to one of its values."""
    class Foo(Enum):
        bar = "Bar"
        foor = "Foo"

    answer = model.generate("foo?", Foo)
    assert isinstance(answer, str)
    assert answer in ("Foo", "Bar")


@pytest.mark.api_call
def test_gemini_simple_choice_choice(model):
    """A `Choice` output type restricts the answer to one of its options."""
    options = Choice(["Foo", "Bar"])
    answer = model.generate("foo?", options)
    assert isinstance(answer, str)
    assert answer in ("Foo", "Bar")


@pytest.mark.api_call
def test_gemini_sample_choice_literal(model):
    """A `Literal` output type restricts the answer to one of its values."""
    options = Literal["Foo", "Bar"]
    answer = model.generate("foo?", options)
    assert isinstance(answer, str)
    assert answer in ("Foo", "Bar")


@pytest.mark.xfail(
    reason="Gemini supports lists for choices but we do not as it is semantically incorrect."
)
@pytest.mark.api_call
def test_gemini_simple_choice_list(model):
    """Expected failure: a plain list of strings is used as the output type."""
    options = ["Foo", "Bar"]
    answer = model.generate("foo?", options)
    assert isinstance(answer, str)
    assert answer in ("Foo", "Bar")


@pytest.mark.api_call
def test_gemini_simple_list_pydantic(model):
    """A `list[...]` of a pydantic model yields a JSON array of objects."""
    class Foo(BaseModel):
        bar: int

    raw = model.generate("foo?", list[Foo])
    items = json.loads(raw)
    assert isinstance(items, list)
    first_item = items[0]
    assert isinstance(first_item, dict)
    assert "bar" in first_item


@pytest.mark.api_call
def test_gemini_streaming(model):
    """`stream` returns a generator whose first chunk is a string."""
    chunks = model.stream("Respond with one word. Not more.")
    assert isinstance(chunks, Generator)
    first_chunk = next(chunks)
    assert isinstance(first_chunk, str)


@pytest.mark.api_call
def test_gemini_batch(model):
    """Batch generation raises `NotImplementedError`."""
    prompts = ["Respond with one word.", "Respond with one word."]
    with pytest.raises(NotImplementedError, match="does not support"):
        model.batch(prompts)


================================================
FILE: tests/models/test_gemini_type_adapter.py
================================================
import io
import pytest
import sys
from dataclasses import dataclass
from enum import Enum, EnumMeta
from typing import Literal, get_args

from PIL import Image as PILImage
from genson import SchemaBuilder
from pydantic import BaseModel

from outlines import cfg, json_schema, regex
from outlines.inputs import Chat, Image
from outlines.models.gemini import GeminiTypeAdapter
from outlines.types.utils import is_dataclass

if sys.version_info >= (3, 12):
    from typing import TypedDict
else:
    from typing_extensions import TypedDict


@pytest.fixture
def schema():
    """Provide the JSON schema dictionary of a `User` object."""
    properties = {
        "user_id": {"title": "User Id", "type": "integer"},
        "name": {"title": "Name", "type": "string"},
    }
    return {
        "properties": properties,
        "required": ["user_id", "name"],
        "title": "User",
        "type": "object",
    }


@pytest.fixture
def image():
    """Provide a 1x1 white RGB image re-opened from an in-memory PNG."""
    white_pixel = PILImage.new("RGB", (1, 1), (255, 255, 255))

    # Encode as PNG in memory, then decode the same bytes back into an image
    encoded = io.BytesIO()
    white_pixel.save(encoded, format="PNG")
    encoded.seek(0)
    return PILImage.open(encoded)


@pytest.fixture
def adapter():
    """Provide a `GeminiTypeAdapter` instance to the tests."""
    type_adapter = GeminiTypeAdapter()
    return type_adapter


def test_gemini_type_adapter_input_text(adapter):
    """A string prompt is wrapped in a single text part under `contents`."""
    prompt = "prompt"
    formatted = adapter.format_input(prompt)
    expected = {"contents": [{"text": prompt}]}
    assert formatted == expected


def test_gemini_type_adapter_input_vision(adapter, image):
    """A `[text, Image]` prompt becomes one user entry.

    The expected parts list the text first and the inline PNG data second.
    """
    picture = Image(image)
    prompt = "hello"

    formatted = adapter.format_input([prompt, picture])

    text_part = {"text": prompt}
    image_part = {
        "inline_data": {
            "mime_type": "image/png",
            "data": picture.image_str,
        },
    }
    user_entry = {"role": "user", "parts": [text_part, image_part]}
    assert formatted == {"contents": [user_entry]}


def test_gemini_type_adapter_input_chat(adapter, image):
    """A `Chat` input maps the assistant role to `model` and expands the
    multimodal user turn into text and inline-data parts."""
    picture = Image(image)
    conversation = Chat(
        messages=[
            {"role": "assistant", "content": "How can I help you today?"},
            {
                "role": "user",
                "content": ["What does this logo represent?", picture],
            },
        ]
    )

    formatted = adapter.format_input(conversation)

    model_entry = {"role": "model", "parts": [{"text": "How can I help you today?"}]}
    image_part = {
        "inline_data": {
            "mime_type": "image/png",
            "data": picture.image_str,
        },
    }
    user_entry = {
        "role": "user",
        "parts": [{"text": "What does this logo represent?"}, image_part],
    }
    assert formatted == {"contents": [model_entry, user_entry]}


def test_gemini_type_adapter_input_invalid(adapter):
    """An arbitrary object used as prompt raises a `TypeError`."""
    @dataclass
    class Audio:
        file: str

    unsupported_prompt = Audio("file")
    with pytest.raises(TypeError, match="The input type"):
        adapter.format_input(unsupported_prompt)


def test_gemini_type_adapter_output_invalid(adapter):
    """Unsupported output types raise a `TypeError`."""
    # Plain builtin types are rejected
    unsupported_builtins = (
        (str, "The type `str` is not supported"),
        (int, "The type `int` is not supported"),
    )
    for builtin_type, expected_message in unsupported_builtins:
        with pytest.raises(TypeError, match=expected_message):
            adapter.format_output_type(builtin_type)

    # Regex and CFG output types are rejected
    with pytest.raises(TypeError, match="Neither regex-based"):
        adapter.format_output_type(regex("[0-9]"))

    with pytest.raises(TypeError, match="CFG-based structured outputs"):
        adapter.format_output_type(cfg(""))


def test_gemini_type_adapter_output_none(adapter):
    """No output type is formatted as an empty dictionary."""
    formatted = adapter.format_output_type(None)
    assert formatted == {}


def test_gemini_type_adapter_output_json_schema(adapter, schema):
    """A `json_schema` output type is formatted as a JSON response whose
    schema is a dataclass."""
    formatted = adapter.format_output_type(json_schema(schema))
    assert isinstance(formatted, dict)
    assert formatted["response_mime_type"] == "application/json"
    response_schema = formatted["response_schema"]
    assert is_dataclass(response_schema)


def test_gemini_type_adapter_output_list_json_schema(adapter, schema):
    """A list of `json_schema` is formatted as a JSON response whose schema
    has a single dataclass type argument."""
    formatted = adapter.format_output_type(list[json_schema(schema)])
    assert isinstance(formatted, dict)
    assert formatted["response_mime_type"] == "application/json"

    item_types = get_args(formatted["response_schema"])
    assert len(item_types) == 1
    assert is_dataclass(item_types[0])


def test_gemini_type_adapter_output_dataclass(adapter):
    """A dataclass output type is passed through as the response schema."""
    @dataclass
    class User:
        user_id: int
        name: str

    formatted = adapter.format_output_type(User)
    expected = {
        "response_mime_type": "application/json",
        "response_schema": User,
    }
    assert formatted == expected


def test_gemini_type_adapter_output_list_dataclass(adapter):
    """A `list[...]` of a dataclass is passed through as the response schema."""
    # Declare an actual dataclass: the previous body used a pydantic model,
    # which duplicated `test_gemini_type_adapter_output_list_pydantic` and
    # left the dataclass case untested.
    @dataclass
    class User:
        user_id: int
        name: str

    result = adapter.format_output_type(list[User])
    assert result == {
        "response_mime_type": "application/json",
        "response_schema": list[User],
    }


def test_gemini_type_adapter_output_typed_dict(adapter):
    """A `TypedDict` output type is passed through as the response schema."""
    class User(TypedDict):
        user_id: int
        name: str

    formatted = adapter.format_output_type(User)
    expected = {
        "response_mime_type": "application/json",
        "response_schema": User,
    }
    assert formatted == expected


def test_gemini_type_adapter_output_list_typed_dict(adapter):
    """A `list[...]` of a `TypedDict` is passed through as the response schema."""
    # Declare an actual `TypedDict`: the previous body used a pydantic model,
    # which duplicated `test_gemini_type_adapter_output_list_pydantic` and
    # left the `TypedDict` case untested.
    class User(TypedDict):
        user_id: int
        name: str

    result = adapter.format_output_type(list[User])
    assert result == {
        "response_mime_type": "application/json",
        "response_schema": list[User],
    }


def test_gemini_type_adapter_output_pydantic(adapter):
    """A pydantic model output type is passed through as the response schema."""
    class User(BaseModel):
        user_id: int
        name: str

    formatted = adapter.format_output_type(User)
    expected = {
        "response_mime_type": "application/json",
        "response_schema": User,
    }
    assert formatted == expected


def test_gemini_type_adapter_output_list_pydantic(adapter):
    """A `list[...]` of a pydantic model is passed through as the response schema."""
    class User(BaseModel):
        user_id: int
        name: str

    output_type = list[User]
    formatted = adapter.format_output_type(output_type)
    expected = {
        "response_mime_type": "application/json",
        "response_schema": list[User],
    }
    assert formatted == expected


def test_gemini_type_adapter_output_genson_schema_builder(adapter):
    """A genson `SchemaBuilder` is formatted as a JSON response whose schema
    is a dataclass."""
    builder = SchemaBuilder()
    builder.add_schema({"type": "object", "properties": {"foo": {"type": "string"}, "bar": {"type": "integer"}}, "required": ["foo"]})

    formatted = adapter.format_output_type(builder)

    assert isinstance(formatted, dict)
    assert formatted["response_mime_type"] == "application/json"
    response_schema = formatted["response_schema"]
    assert is_dataclass(response_schema)


def test_gemini_type_adapter_output_list_genson_schema_builder(adapter):
    """A list of genson `SchemaBuilder` is formatted as a JSON response whose
    schema has a single dataclass type argument."""
    builder = SchemaBuilder()
    builder.add_schema({"type": "object", "properties": {"foo": {"type": "string"}, "bar": {"type": "integer"}}, "required": ["foo"]})

    formatted = adapter.format_output_type(list[builder])

    assert isinstance(formatted, dict)
    assert formatted["response_mime_type"] == "application/json"
    item_types = get_args(formatted["response_schema"])
    assert len(item_types) == 1
    assert is_dataclass(item_types[0])


def test_gemini_type_adapter_output_enum(adapter):
    """An `Enum` output type is passed through with the enum MIME type."""
    class Foo(Enum):
        Bar = "bar"
        Fuzz = "fuzz"

    formatted = adapter.format_output_type(Foo)
    expected = {
        "response_mime_type": "text/x.enum",
        "response_schema": Foo,
    }
    assert formatted == expected


def test_gemini_type_adapter_output_literal(adapter):
    """A `Literal` output type is converted to an enum with one member per value."""
    Foo = Literal["bar", "fuzz"]
    formatted = adapter.format_output_type(Foo)

    assert isinstance(formatted, dict)
    assert len(formatted) == 2
    assert formatted["response_mime_type"] == "text/x.enum"

    # The response schema is an enum class with the literal values as members
    enum_cls = formatted["response_schema"]
    assert isinstance(enum_cls, EnumMeta)
    assert len(enum_cls.__members__) == 2
    assert enum_cls.bar.value == "bar"
    assert enum_cls.fuzz.value == "fuzz"


================================================
FILE: tests/models/test_llamacpp.py
================================================
import json
from enum import Enum

import pytest
from llama_cpp import Llama
from pydantic import BaseModel

from outlines.inputs import Chat
from outlines.models.llamacpp import (
    LlamaCpp,
    LlamaCppTokenizer,
    LlamaCppTypeAdapter,
    from_llamacpp
)
from outlines.types.dsl import Regex, CFG


def test_load_model():
    """`from_llamacpp` returns a `LlamaCpp` wrapper with the expected
    model, tokenizer, type adapter and tensor library name."""
    llama = Llama.from_pretrained(
        repo_id="M4-ai/TinyMistral-248M-v2-Instruct-GGUF",
        filename="TinyMistral-248M-v2-Instruct.Q4_K_M.gguf",
        chat_format="qwen",
    )
    model = from_llamacpp(llama)

    assert isinstance(model, LlamaCpp)
    expected_attribute_types = (
        (model.model, Llama),
        (model.tokenizer, LlamaCppTokenizer),
        (model.type_adapter, LlamaCppTypeAdapter),
    )
    for attribute, expected_type in expected_attribute_types:
        assert isinstance(attribute, expected_type)
    assert model.tensor_library_name == "numpy"


@pytest.fixture(scope="session")
def model(tmp_path_factory):
    """Provide a session-scoped `LlamaCpp` model loaded with the `qwen` chat format."""
    llama = Llama.from_pretrained(
        repo_id="M4-ai/TinyMistral-248M-v2-Instruct-GGUF",
        filename="TinyMistral-248M-v2-Instruct.Q4_K_M.gguf",
        chat_format="qwen",
    )
    return LlamaCpp(llama)

@pytest.fixture(scope="session")
def model_no_chat(tmp_path_factory):
    """Provide a session-scoped `LlamaCpp` model created with `chat_mode=False`."""
    llama = Llama.from_pretrained(
        repo_id="tensorblock/Llama3-1B-Base-GGUF",
        filename="Llama3-1B-Base-Q2_K.gguf",
    )
    return LlamaCpp(llama, chat_mode=False)

@pytest.fixture
def lark_grammar():
    """Provide an arithmetic expression grammar written in Lark syntax."""
    grammar = """
?start: sum

?sum: product
| sum "+" product   -> add
| sum "-" product   -> sub

?product: atom
| product "*" atom  -> mul
| product "/" atom  -> div

?atom: NUMBER           -> number
| "-" atom         -> neg
| "(" sum ")"

%import common.NUMBER
%import common.WS_INLINE

%ignore WS_INLINE
"""
    return grammar

@pytest.fixture
def ebnf_grammar():
    """Provide an EBNF grammar whose only valid outputs are "yes" and "no"."""
    grammar = """
root ::= answer
answer ::= "yes" | "no"
"""
    return grammar


def test_llamacpp_simple(model):
    """Unconstrained generation returns a string."""
    completion = model.generate("Respond with one word. Not more.", None)
    assert isinstance(completion, str)


def test_llamacpp_chat(model):
    """A `Chat` input returns a string."""
    conversation = Chat(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Respond with one word. Not more."},
        ]
    )
    completion = model.generate(conversation, max_tokens=10)
    assert isinstance(completion, str)


def test_llamacpp_regex(model):
    """Regex-constrained generation yields exactly one ASCII digit."""
    result = model("Respond with one word. Not more.", Regex(r"[0-9]"))
    assert isinstance(result, str)
    assert len(result) == 1
    # Check membership in the digit characters rather than `assert int(result)`:
    # the pattern allows "0", and `int("0")` is falsy, so the former assertion
    # rejected a perfectly valid output.
    assert result in "0123456789"


def test_llamacpp_json(model):
    """A pydantic output type yields a JSON string containing its field."""
    class Foo(BaseModel):
        bar: str

    raw = model("foo? Respond with one word.", Foo, max_tokens=100)
    assert isinstance(raw, str)
    payload = json.loads(raw)
    assert "bar" in payload


def test_llamacpp_choice(model):
    """Enum-constrained generation returns one of the enum values."""
    class Foo(Enum):
        bar = "Bar"
        foor = "Foo"

    answer = model("foo?", Foo)
    assert answer in ("Foo", "Bar")


def test_llamacpp_cfg(model, ebnf_grammar):
    """CFG-constrained generation only produces sentences of the grammar."""
    grammar = CFG(ebnf_grammar)
    answer = model("Respond with one word. Not more.", grammar)
    assert answer == "yes" or answer == "no"


def test_llamacpp_cfg_outlines_core(model, lark_grammar):
    """Requesting a CFG through the ``outlines_core`` backend raises
    ``NotImplementedError``."""
    expected_message = "Outlines Core does not support context-free grammar."
    grammar = CFG(lark_grammar)
    with pytest.raises(NotImplementedError, match=expected_message):
        model(
            "Respond with one word. Not more.",
            grammar,
            backend="outlines_core",
        )


def test_llamacpp_text_stop(model):
    """The stop sequence never appears in the returned text."""
    completion = model.generate(
        "Write the letter a.", None, stop="a", max_tokens=100
    )
    assert "a" not in completion


def test_llamacpp_stream_simple(model):
    """Every chunk streamed for a plain string prompt is a string."""
    chunks = model.stream("Respond with one word. Not more.", None)
    assert all(isinstance(chunk, str) for chunk in chunks)


def test_llamacpp_stream_chat(model):
    """Every chunk streamed for a ``Chat`` prompt is a string."""
    conversation = Chat(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Respond with one word. Not more."},
        ]
    )
    chunks = model.stream(conversation, max_tokens=10)
    assert all(isinstance(chunk, str) for chunk in chunks)


def test_llamacpp_stream_regex(model):
    """The first chunk of a regex-constrained stream is a string."""
    pattern = Regex(r"[0-9]")
    chunks = model.stream("Respond with one word. Not more.", pattern)

    first_chunk = next(chunks)
    assert isinstance(first_chunk, str)


def test_llamacpp_stream_json(model):
    """The first non-empty chunk of a JSON-constrained stream opens an
    object."""
    class Foo(BaseModel):
        bar: int

    chunks = model.stream("foo?", Foo)

    # NOTE: leading chunks can be empty (role info, control tokens, finish
    # chunks), so skip them before checking the opening brace.
    # Relevant issue: https://github.com/abetlen/llama-cpp-python/issues/372
    opening = next(chunk for chunk in chunks if chunk)
    assert opening == "{"


def test_llamacpp_stream_cfg(model, ebnf_grammar):
    """Concatenating a CFG-constrained stream gives a sentence of the
    grammar."""
    chunks = model.stream(
        "Respond with one word. Not more.", CFG(ebnf_grammar)
    )
    answer = "".join(chunks)
    assert answer in ["yes", "no"]


def test_llamacpp_stream_cfg_outlines_core(model, lark_grammar):
    """Streaming a CFG through the ``outlines_core`` backend raises
    ``NotImplementedError``."""
    expected_message = "Outlines Core does not support context-free grammar."
    with pytest.raises(NotImplementedError, match=expected_message):
        # The stream is created and consumed inside the context manager so
        # the error is caught whether it is raised on creation or iteration.
        for _ in model.stream(
            "Respond with one word. Not more.",
            CFG(lark_grammar),
            backend="outlines_core",
        ):
            pass


def test_llamacpp_stream_choice(model):
    """The first non-empty chunk of an enum-constrained stream starts like
    one of the enum values."""
    class Foo(Enum):
        bar = "Bar"
        foor = "Foo"

    chunks = model.stream("foo?", Foo)

    opening = next(chunk for chunk in chunks if chunk)
    assert opening[0] in ("B", "F")


def test_llamacpp_stream_text_stop(model):
    """With a stop sequence set, the first streamed chunk is a string
    different from that sequence."""
    chunks = model.stream(
        "Write the letter a.", None, stop="a", max_tokens=100
    )

    first_chunk = next(chunks)
    assert isinstance(first_chunk, str)
    assert first_chunk != "a"


def test_llamacpp_batch(model):
    """Batch generation is rejected with ``NotImplementedError``."""
    prompts = ["Respond with one word.", "Respond with one word."]
    with pytest.raises(NotImplementedError, match="does not support"):
        model.batch(prompts)

def test_llamacpp_no_chat(model_no_chat):
    """With chat mode disabled, ``generate`` and ``stream`` both produce
    strings."""
    prompt = "Respond with one word. Not more."

    completion = model_no_chat.generate(prompt, None)
    assert isinstance(completion, str)

    chunks = model_no_chat.stream(prompt, None)
    assert all(isinstance(chunk, str) for chunk in chunks)


================================================
FILE: tests/models/test_llamacpp_tokenizer.py
================================================
import ctypes

import pytest
import sys
from unittest.mock import MagicMock, patch

import llama_cpp
import transformers

from outlines.models.llamacpp import LlamaCppTokenizer


@pytest.fixture
def model():
    """Llama model whose ``tokenizer_`` carries an ``hf_tokenizer``
    attribute set to a small GPT-2 Hugging Face tokenizer."""
    llama = llama_cpp.Llama.from_pretrained(
        repo_id="M4-ai/TinyMistral-248M-v2-Instruct-GGUF",
        filename="TinyMistral-248M-v2-Instruct.Q4_K_M.gguf",
        chat_format="qwen",
    )
    hf_tokenizer = transformers.AutoTokenizer.from_pretrained("erwanf/gpt2-mini")
    llama.tokenizer_.hf_tokenizer = hf_tokenizer
    return llama


@pytest.fixture
def model_no_hf_tokenizer():
    """Llama model with its ``tokenizer_`` attribute deleted."""
    llama = llama_cpp.Llama.from_pretrained(
        repo_id="M4-ai/TinyMistral-248M-v2-Instruct-GGUF",
        filename="TinyMistral-248M-v2-Instruct.Q4_K_M.gguf",
        chat_format="qwen",
    )
    delattr(llama, "tokenizer_")
    return llama


@pytest.fixture
def different_model():
    """A second Llama model (phi-2) used where a distinct model is
    needed for comparisons."""
    return llama_cpp.Llama.from_pretrained(
        "TheBloke/phi-2-GGUF",
        "phi-2.Q4_K_M.gguf",
    )


@pytest.fixture
def tokenizer(model):
    """``LlamaCppTokenizer`` built from the ``model`` fixture."""
    wrapped = LlamaCppTokenizer(model)
    return wrapped


@pytest.fixture
def another_tokenizer(model):
    """A second ``LlamaCppTokenizer`` built from the same ``model``
    fixture, for equality and hash comparisons."""
    wrapped = LlamaCppTokenizer(model)
    return wrapped


@pytest.fixture
def tokenizer_no_hf_tokenizer(model_no_hf_tokenizer):
    """``LlamaCppTokenizer`` built from the model lacking ``tokenizer_``."""
    wrapped = LlamaCppTokenizer(model_no_hf_tokenizer)
    return wrapped


@pytest.fixture
def different_tokenizer(different_model):
    """``LlamaCppTokenizer`` built from the ``different_model`` fixture."""
    wrapped = LlamaCppTokenizer(different_model)
    return wrapped


def test_llama_cpp_tokenizer_init(tokenizer, tokenizer_no_hf_tokenizer):
    """Both construction paths populate the EOS id, the pad id and a
    dict vocabulary."""
    # First the regular case, then the tokenizer without hf_tokenizer.
    for candidate in (tokenizer, tokenizer_no_hf_tokenizer):
        assert candidate.eos_token_id is not None
        assert candidate.pad_token_id is not None
        assert isinstance(candidate.vocabulary, dict)


def test_llama_cpp_tokenizer_encode(tokenizer):
    """Encoding a list is rejected; encoding a string returns ids and a
    mask of equal length."""
    # Batch input is not implemented.
    with pytest.raises(NotImplementedError):
        tokenizer.encode(["foo", "bar"])

    # Single prompt.
    ids, mask = tokenizer.encode("Hello, world!")
    assert ids is not None
    assert mask is not None
    assert len(ids) == len(mask)


def test_llama_cpp_tokenizer_decode(tokenizer):
    """Decoding encoded ids returns a list whose joined, stripped text is
    the original prompt."""
    prompt = "Hello, world!"
    ids, _ = tokenizer.encode(prompt)
    pieces = tokenizer.decode(ids)
    assert isinstance(pieces, list)
    assert "".join(pieces).strip() == "Hello, world!"


def test_llama_cpp_tokenizer_convert_token_to_string(
    tokenizer,
    tokenizer_no_hf_tokenizer
):
    """``convert_token_to_string`` returns a string with and without an
    underlying HF tokenizer."""
    # Order: with self._hf_tokenizer, then without it.
    for candidate in (tokenizer, tokenizer_no_hf_tokenizer):
        converted = candidate.convert_token_to_string("<0x20>")
        assert isinstance(converted, str)


def test_llama_cpp_tokenizer_eq(tokenizer, another_tokenizer, different_tokenizer):
    """Equality holds between tokenizers of the same model only."""
    # Comparison against an unrelated type goes through ``==`` on purpose.
    unrelated = 1
    assert not tokenizer == unrelated
    assert tokenizer == another_tokenizer
    assert tokenizer != different_tokenizer


def test_llama_cpp_tokenizer_hash(tokenizer, another_tokenizer, different_tokenizer):
    """Hashes are ints, match for the same model and differ across models."""
    assert isinstance(hash(tokenizer), int)
    assert hash(another_tokenizer) == hash(tokenizer)
    assert hash(different_tokenizer) != hash(tokenizer)


def test_llama_cpp_tokenizer_getstate(tokenizer):
    """``__getstate__`` returns a 5-tuple of (dict, int, str, int, list)."""
    expected_types = (dict, int, str, int, list)

    state = tokenizer.__getstate__()
    assert isinstance(state, tuple)
    assert len(state) == 5
    for item, expected in zip(state, expected_types):
        assert isinstance(item, expected)


def test_llama_cpp_tokenizer_setstate(tokenizer):
    """``__setstate__`` is not implemented and raises accordingly."""
    state = None
    with pytest.raises(NotImplementedError):
        tokenizer.__setstate__(state)


def _make_mock_model(n_vocab, eos_id, pieces):
    """Build a mock Llama model without a ``tokenizer_`` attribute.

    Parameters
    ----------
    n_vocab : int
        Value returned by the mock's ``n_vocab()``.
    eos_id : int
        Value returned by the mock's ``token_eos()``.
    pieces : dict[int, bytes]
        Mapping from token id to the raw bytes of the token piece. It is
        accepted for call-site symmetry but not read here; callers supply
        the pieces through their own fake ``llama_token_to_piece``.
    """
    mock_llama = MagicMock()
    # Deleting the attribute makes ``hasattr(mock, "tokenizer_")`` false, so
    # the code under test falls into the C-API branch.
    del mock_llama.tokenizer_
    mock_llama.model = MagicMock()
    mock_llama.n_vocab.return_value = n_vocab
    mock_llama.token_eos.return_value = eos_id
    return mock_llama


def test_vocab_truncation_retry_path():
    """Tokens whose piece length exceeds the 32-byte buffer must trigger the
    retry path with a larger buffer so their text is not collapsed.

    The tokenizer is built from a mock model while ``llama_cpp`` is replaced
    in ``sys.modules`` by a fake exposing ``llama_model_get_vocab`` and
    ``llama_token_to_piece``; the vocabulary and EOS token are then checked
    against the fake pieces.
    """
    long_piece = b"x" * 40  # 40 > 32 → triggers the retry branch
    short_piece = b"hi"
    eos_piece = b"</s>"

    # Token id -> raw bytes served by the fake C function below.
    pieces = {0: short_piece, 1: long_piece, 2: eos_piece}
    model = _make_mock_model(n_vocab=3, eos_id=2, pieces=pieces)

    def fake_llama_token_to_piece(vocab, token_id, buf, buf_size, *args):
        # Always reports the full piece length, but leaves the buffer
        # untouched when the piece does not fit.
        data = pieces[token_id]
        n = len(data)
        # Only write into the buffer when it is large enough
        if buf_size >= n:
            ctypes.memmove(buf, data, n)
        return n

    # ``create=True`` lets these patches succeed even when the names are not
    # attributes of ``outlines.models.llamacpp``.
    with patch(
        "outlines.models.llamacpp.llama_model_get_vocab",
        return_value=MagicMock(),
        create=True,
    ), patch(
        "outlines.models.llamacpp.llama_token_to_piece",
        side_effect=fake_llama_token_to_piece,
        create=True,
    ):
        # Patch the imports inside the __init__ else-branch
        with patch.dict(
            "sys.modules",
            {
                "llama_cpp": MagicMock(
                    llama_model_get_vocab=MagicMock(return_value=MagicMock()),
                    llama_token_to_piece=fake_llama_token_to_piece,
                ),
            },
        ):
            tok = LlamaCppTokenizer.__new__(LlamaCppTokenizer)
            # Re-import inside the else-branch uses llama_cpp module
            tok.__init__(model)

    # The 40-byte piece must be present in full, mapped to its token id.
    assert tok.vocabulary[long_piece.decode()] == 1
    assert tok.vocabulary[short_piece.decode()] == 0
    assert tok.eos_token == eos_piece.decode()


def test_attention_mask_all_ones_even_with_eos():
    """Encoding marks every token with 1 in the attention mask, the EOS
    token included."""
    eos_piece = b"</s>"
    pieces = {0: b"hello", 1: eos_piece}
    model = _make_mock_model(n_vocab=2, eos_id=1, pieces=pieces)

    def fake_llama_token_to_piece(vocab, token_id, buf, buf_size, *args):
        # Report the piece length; copy the bytes only when they fit.
        piece = pieces[token_id]
        size = len(piece)
        if size <= buf_size:
            ctypes.memmove(buf, piece, size)
        return size

    fake_llama_cpp = MagicMock(
        llama_model_get_vocab=MagicMock(return_value=MagicMock()),
        llama_token_to_piece=fake_llama_token_to_piece,
    )
    with patch.dict("sys.modules", {"llama_cpp": fake_llama_cpp}):
        tok = LlamaCppTokenizer.__new__(LlamaCppTokenizer)
        tok.__init__(model)

    # Swap in a tokenizer whose output contains the EOS id (token 1).
    stub_tokenizer = MagicMock()
    stub_tokenizer.tokenize.return_value = [0, 1]
    tok.tokenizer = stub_tokenizer

    token_ids, attention_mask = tok.encode("hello</s>")

    assert token_ids == [0, 1]
    assert attention_mask == [1, 1]


def test_negative_n_skips_invalid_token():
    """A token for which ``llama_token_to_piece`` returns a negative value
    (error code) is left out of the vocabulary rather than added as
    garbage."""
    eos_piece = b"</s>"
    # Token 1 has no piece: the fake reports an error for it.
    pieces = {0: b"ok", 1: None, 2: eos_piece}
    model = _make_mock_model(n_vocab=3, eos_id=2, pieces=pieces)

    def fake_llama_token_to_piece(vocab, token_id, buf, buf_size, *args):
        piece = pieces[token_id]
        if piece is None:
            # error return
            return -1
        size = len(piece)
        if size <= buf_size:
            ctypes.memmove(buf, piece, size)
        return size

    fake_llama_cpp = MagicMock(
        llama_model_get_vocab=MagicMock(return_value=MagicMock()),
        llama_token_to_piece=fake_llama_token_to_piece,
    )
    with patch.dict("sys.modules", {"llama_cpp": fake_llama_cpp}):
        tok = LlamaCppTokenizer.__new__(LlamaCppTokenizer)
        tok.__init__(model)

    # The failing token id must be absent; the valid ones must be present.
    assert 1 not in tok.vocabulary.values()
    assert tok.vocabulary["ok"] == 0
    assert tok.eos_token == eos_piece.decode()


================================================
FILE: tests/models/test_llamacpp_type_adapter.py
================================================
import pytest
import io

from llama_cpp import LogitsProcessorList
from PIL import Image as PILImage
from outlines_core import Index, Vocabulary

from outlines.backends.outlines_core import OutlinesCoreLogitsProcessor
from outlines.inputs import Chat, Image
from outlines.models.llamacpp import LlamaCppTypeAdapter


@pytest.fixture
def adapter():
    """``LlamaCppTypeAdapter`` created with its default arguments."""
    default_adapter = LlamaCppTypeAdapter()
    return default_adapter


@pytest.fixture
def logits_processor():
    """Numpy ``OutlinesCoreLogitsProcessor`` for a three-digit regex over
    the GPT-2 vocabulary."""
    gpt2_vocabulary = Vocabulary.from_pretrained("openai-community/gpt2")
    three_digit_index = Index(r"[0-9]{3}", gpt2_vocabulary)
    return OutlinesCoreLogitsProcessor(three_digit_index, "numpy")


@pytest.fixture
def image():
    """1x1 white image round-tripped through an in-memory PNG."""
    blank = PILImage.new("RGB", (1, 1), (255, 255, 255))

    # Write the pixels out as PNG, rewind, and reopen from the buffer.
    png_bytes = io.BytesIO()
    blank.save(png_bytes, format="PNG")
    png_bytes.seek(0)

    return PILImage.open(png_bytes)


def test_llamacpp_type_adapter_format_input(adapter, image):
    """``format_input`` passes strings through, unwraps ``Chat`` messages,
    and rejects other types and multi-modal content."""
    # Anything other than a string/Chat is not implemented.
    with pytest.raises(NotImplementedError):
        adapter.format_input(["Hello, world!"])

    # A string comes back unchanged.
    formatted_text = adapter.format_input("Hello, world!")
    assert formatted_text == "Hello, world!"

    # A Chat comes back as its list of messages.
    messages = [
        {"role": "user", "content": "Hello, world!"},
        {"role": "assistant", "content": "Hello, world!"},
    ]
    formatted_chat = adapter.format_input(Chat(messages=messages))
    assert formatted_chat == messages

    # Multi-modal content is invalid for this adapter.
    multimodal_chat_messages = [
        {"role": "user", "content": ["prompt", Image(image)]},
    ]
    with pytest.raises(
        ValueError,
        match="LlamaCpp does not support multi-modal messages."
    ):
        adapter.format_input(Chat(messages=multimodal_chat_messages))


def test_llamacpp_type_adapter_format_input_with_chat_template():
    """With a chat template, a string prompt is wrapped in a single user
    message."""
    templated_adapter = LlamaCppTypeAdapter(has_chat_template=True)
    formatted = templated_adapter.format_input("prompt")

    assert formatted == [{"role": "user", "content": "prompt"}]


def test_llamacpp_type_adapter_format_input_without_chat_template():
    """Without a chat template, a string prompt is returned unchanged."""
    plain_adapter = LlamaCppTypeAdapter(has_chat_template=False)
    formatted = plain_adapter.format_input("prompt")

    assert formatted == "prompt"


def test_llamacpp_type_adapter_format_output_type(adapter, logits_processor):
    """``format_output_type`` wraps the processor in a
    ``LogitsProcessorList`` whose first element keeps its index and tensor
    library name."""
    processors = adapter.format_output_type(logits_processor)
    assert isinstance(processors, LogitsProcessorList)

    wrapped = processors[0]
    assert wrapped.index == logits_processor.index
    assert wrapped.tensor_library_name == logits_processor.tensor_library_name


================================================
FILE: tests/models/test_lmstudio.py
================================================
import io
import json
import os
import warnings
from enum import Enum
from typing import Annotated, AsyncGenerator, Generator

import lmstudio
import pytest
from PIL import Image as PILImage
from pydantic import BaseModel, Field

import outlines
from outlines.inputs import Chat, Image, Video
from outlines.models import AsyncLMStudio, LMStudio
from outlines.models.lmstudio import LMStudioTypeAdapter
from tests.test_utils.mock_lmstudio_client import (
    MockLMStudioClient,
    MockAsyncLMStudioClient,
)


# The real LMStudio server is used when the LMSTUDIO_SERVER_URL environment
# variable is set; otherwise the tests run against the mock clients.
lmstudio_server_url = os.getenv("LMSTUDIO_SERVER_URL")
lmstudio_model_name = os.getenv("LMSTUDIO_MODEL_NAME", "openai/gpt-oss-20b")

# The test image is only built when a server is available, because
# lms.prepare_image requires it.
image_input = None

if lmstudio_server_url:
    width, height = 1, 1
    white_background = (255, 255, 255)
    image = PILImage.new("RGB", (width, height), white_background)
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    buffer.seek(0)
    image = PILImage.open(buffer)
    image_input = Image(image)

    lmstudio_client = lmstudio.Client(lmstudio_server_url)
    async_lmstudio_client = lmstudio.AsyncClient(lmstudio_server_url)
else:
    warnings.warn("No LMStudio server URL provided, using mock server")
    lmstudio_client = MockLMStudioClient()
    async_lmstudio_client = MockAsyncLMStudioClient()


# Output type shared by the JSON tests and by the `mock_responses` entries
# built with `type_adapter.format_output_type(Foo)`.
# NOTE(review): documented with `#` comments on purpose — presumably a class
# docstring would surface in the generated schema; verify before adding one.
class Foo(BaseModel):
    foo: Annotated[str, Field(max_length=10)]  # string capped at 10 characters


# Module-level adapter used to build the request payloads listed in
# `mock_responses`.
type_adapter = LMStudioTypeAdapter()

# Mock responses for non-image tests (image tests require a running server
# because lms.prepare_image() needs to connect to LM Studio)
# Each entry is a `(request kwargs, response)` pair. The response is either a
# single string or a list of strings (presumably replayed chunk by chunk for
# streaming calls — confirm against the mock client implementation).
mock_responses = [
    (
        {
            "messages": type_adapter.format_input("Respond with one word. Not more."),
        },
        "foo"
    ),
    (
        {
            "messages": type_adapter.format_input(
                "Create a character with a name in the foo field."
            ),
            "response_format": type_adapter.format_output_type(Foo),
        },
        '{"foo": "bar"}'
    ),
    (
        {
            "messages": type_adapter.format_input("Write a sentence about a cat."),
        },
        ["The ", "cat ", "sat."]
    ),
    (
        {
            "messages": type_adapter.format_input("Create a character."),
            "response_format": type_adapter.format_output_type(Foo),
        },
        ['{"foo":', ' "bar"}']
    ),
]


# If the LMSTUDIO_SERVER_URL environment variable is not set, add the mock
# responses to the mock clients
# (both the sync and the async mock client receive the same list)
if not lmstudio_server_url:
    lmstudio_client.add_mock_responses(mock_responses)
    async_lmstudio_client.add_mock_responses(mock_responses)


# Marker that skips a test unless a real LM Studio server is configured
# (used by the image tests).
requires_lmstudio_server = pytest.mark.skipif(
    not lmstudio_server_url,
    reason=(
        "Image tests require a running LM Studio server (lms.prepare_image "
        "needs connection)"
    ),
)


@pytest.fixture
def model():
    """Sync ``LMStudio`` model bound to the configured model name."""
    named_model = LMStudio(lmstudio_client, lmstudio_model_name)
    return named_model


@pytest.fixture
def model_no_model_name():
    """Sync ``LMStudio`` model created without a model name."""
    unnamed_model = LMStudio(lmstudio_client)
    return unnamed_model


@pytest.fixture
def async_model():
    """``AsyncLMStudio`` model bound to the configured model name.

    A fresh ``lmstudio.AsyncClient`` is created per test when a real server
    is configured; otherwise the shared mock client is used.
    """
    if not lmstudio_server_url:
        return AsyncLMStudio(async_lmstudio_client, lmstudio_model_name)
    # We need to create a new lmstudio client
    fresh_client = lmstudio.AsyncClient(lmstudio_server_url)
    return AsyncLMStudio(fresh_client, lmstudio_model_name)


@pytest.fixture
def async_model_no_model_name():
    """``AsyncLMStudio`` model created without a model name.

    A fresh ``lmstudio.AsyncClient`` is created per test when a real server
    is configured; otherwise the shared mock client is used.
    """
    if not lmstudio_server_url:
        return AsyncLMStudio(async_lmstudio_client)
    # We need to create a new lmstudio client
    fresh_client = lmstudio.AsyncClient(lmstudio_server_url)
    return AsyncLMStudio(fresh_client)


def test_lmstudio_init_from_client():
    """A sync model can be built with or without a model name, and an
    invalid client is rejected by ``outlines.from_lmstudio``."""
    if not lmstudio_server_url:
        # With mock client, test direct instantiation
        client = MockLMStudioClient()
        client.add_mock_responses(mock_responses)

        named = LMStudio(client, lmstudio_model_name)
        assert named.client == client
        assert named.model_name == lmstudio_model_name

        unnamed = LMStudio(client)
        assert unnamed.client == client
        assert unnamed.model_name is None
    else:
        client = lmstudio.Client(lmstudio_server_url)

        # With model name
        named = outlines.from_lmstudio(client, lmstudio_model_name)
        assert isinstance(named, LMStudio)
        assert named.client == client
        assert named.model_name == lmstudio_model_name

        # Without model name
        unnamed = outlines.from_lmstudio(client)
        assert isinstance(unnamed, LMStudio)
        assert unnamed.client == client
        assert unnamed.model_name is None

    # With invalid client
    invalid_client = object()
    with pytest.raises(ValueError, match="Invalid client type"):
        outlines.from_lmstudio(invalid_client)


def test_lmstudio_simple(model):
    """Unconstrained generation from a string prompt returns a string."""
    completion = model.generate("Respond with one word. Not more.", None)
    assert isinstance(completion, str)


def test_lmstudio_direct(model_no_model_name):
    """Calling the model directly with a per-call ``model`` name returns a
    string."""
    prompt = "Respond with one word. Not more."
    completion = model_no_model_name(prompt, None, model=lmstudio_model_name)
    assert isinstance(completion, str)


@requires_lmstudio_server
def test_lmstudio_simple_vision(model):
    """A text + image prompt returns a string (real server only)."""
    prompt = ["What does this logo represent?", image_input]
    completion = model.generate(prompt, model=lmstudio_model_name)
    assert isinstance(completion, str)


@requires_lmstudio_server
def test_lmstudio_chat(model):
    """A ``Chat`` with a system message and a text + image user message
    returns a string (real server only)."""
    user_content = ["What does this logo represent?", image_input]
    conversation = Chat(
        [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": user_content},
        ]
    )
    completion = model.generate(conversation, model=lmstudio_model_name)
    assert isinstance(completion, str)


def test_lmstudio_json(model):
    """Generation constrained by ``Foo`` returns JSON text with a ``foo``
    key."""
    raw = model("Create a character with a name in the foo field.", Foo)
    assert isinstance(raw, str)
    parsed = json.loads(raw)
    assert "foo" in parsed


def test_lmstudio_wrong_output_type(model):
    """An ``Enum`` output type is rejected with ``TypeError``."""
    class BadFoo(Enum):
        bar = "Bar"
        foo = "Foo"

    unsupported_type = BadFoo
    with pytest.raises(TypeError, match="is not supported"):
        model.generate("foo?", unsupported_type)


def test_lmstudio_wrong_input_type(model):
    """A dict prompt raises ``TypeError``; a list containing a ``Video``
    raises ``ValueError``."""
    dict_prompt = {"foo?": "bar?"}
    with pytest.raises(TypeError, match="is not available"):
        model.generate(dict_prompt, None)

    prompt_with_video = ["foo?", image_input, Video("")]
    with pytest.raises(ValueError, match="All assets provided must be of type Image"):
        model.generate(prompt_with_video, None)


def test_lmstudio_stream(model):
    """``stream`` returns a generator whose first item is a string."""
    chunks = model.stream("Write a sentence about a cat.")
    assert isinstance(chunks, Generator)
    first_chunk = next(chunks)
    assert isinstance(first_chunk, str)


def test_lmstudio_stream_json(model_no_model_name):
    """Joining the chunks of a ``Foo``-constrained stream gives JSON with a
    ``foo`` key."""
    chunks = model_no_model_name.stream(
        "Create a character.", Foo, model=lmstudio_model_name
    )
    full_text = "".join(chunks)
    assert "foo" in json.loads(full_text)


def test_lmstudio_batch(model):
    """Batch generation is rejected with ``NotImplementedError``."""
    prompts = ["Respond with one word.", "Respond with one word."]
    with pytest.raises(NotImplementedError, match="does not support"):
        model.batch(prompts)


def test_lmstudio_async_init_from_client():
    """An async model can be built with or without a model name."""
    if not lmstudio_server_url:
        # With mock client, test direct instantiation
        client = MockAsyncLMStudioClient()
        client.add_mock_responses(mock_responses)

        named = AsyncLMStudio(client, lmstudio_model_name)
        assert named.client == client
        assert named.model_name == lmstudio_model_name

        unnamed = AsyncLMStudio(client)
        assert unnamed.client == client
        assert unnamed.model_name is None
    else:
        client = lmstudio.AsyncClient(lmstudio_server_url)

        # With model name
        named = outlines.from_lmstudio(client, lmstudio_model_name)
        assert isinstance(named, AsyncLMStudio)
        assert named.client == client
        assert named.model_name == lmstudio_model_name

        # Without model name
        unnamed = outlines.from_lmstudio(client)
        assert isinstance(unnamed, AsyncLMStudio)
        assert unnamed.client == client
        assert unnamed.model_name is None


@pytest.mark.asyncio
async def test_lmstudio_async_simple(async_model):
    """Async unconstrained generation from a string prompt returns a
    string."""
    completion = await async_model.generate(
        "Respond with one word. Not more.", None
    )
    assert isinstance(completion, str)


@pytest.mark.asyncio
async def test_lmstudio_async_direct(async_model_no_model_name):
    """Awaiting the async model directly with a per-call ``model`` name
    returns a string."""
    prompt = "Respond with one word. Not more."
    completion = await async_model_no_model_name(
        prompt, None, model=lmstudio_model_name
    )
    assert isinstance(completion, str)


@requires_lmstudio_server
@pytest.mark.asyncio
async def test_lmstudio_async_simple_vision(async_model):
    """An async text + image prompt returns a string (real server only)."""
    prompt = ["What does this logo represent?", image_input]
    completion = await async_model.generate(prompt, model=lmstudio_model_name)
    assert isinstance(completion, str)


@requires_lmstudio_server
@pytest.mark.asyncio
async def test_lmstudio_async_chat(async_model):
    """An async ``Chat`` with a system message and a text + image user
    message returns a string (real server only)."""
    user_content = ["What does this logo represent?", image_input]
    conversation = Chat(
        [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": user_content},
        ]
    )
    completion = await async_model.generate(
        conversation, model=lmstudio_model_name
    )
    assert isinstance(completion, str)


@pytest.mark.asyncio
async def test_lmstudio_async_json(async_model):
    """Async generation constrained by ``Foo`` returns JSON text with a
    ``foo`` key."""
    raw = await async_model(
        "Create a character with a name in the foo field.", Foo
    )
    assert isinstance(raw, str)
    parsed = json.loads(raw)
    assert "foo" in parsed


@pytest.mark.asyncio
async def test_lmstudio_async_wrong_output_type(async_model):
    """An ``Enum`` output type is rejected with ``TypeError`` on the async
    model."""
    class BadFoo(Enum):
        bar = "Bar"
        foo = "Foo"

    unsupported_type = BadFoo
    with pytest.raises(TypeError, match="is not supported"):
        await async_model.generate("foo?", unsupported_type)


@pytest.mark.asyncio
async def test_lmstudio_async_wrong_input_type(async_model):
    """On the async model, a dict prompt raises ``TypeError`` and a list
    containing a ``Video`` raises ``ValueError``."""
    dict_prompt = {"foo?": "bar?"}
    with pytest.raises(TypeError, match="is not available"):
        await async_model.generate(dict_prompt, None)

    prompt_with_video = ["foo?", image_input, Video("")]
    with pytest.raises(ValueError, match="All assets provided must be of type Image"):
        await async_model.generate(prompt_with_video, None)


@pytest.mark.asyncio
async def test_lmstudio_async_stream(async_model):
    """``stream`` returns an async generator whose first item is a string."""
    chunks = async_model.stream("Write a sentence about a cat.")
    assert isinstance(chunks, AsyncGenerator)
    first_chunk = await chunks.__anext__()
    assert isinstance(first_chunk, str)


@pytest.mark.asyncio
async def test_lmstudio_async_stream_json(async_model_no_model_name):
    """Joining the chunks of an async ``Foo``-constrained stream gives JSON
    with a ``foo`` key."""
    chunks = async_model_no_model_name.stream(
        "Create a character.", Foo, model=lmstudio_model_name
    )
    collected = [chunk async for chunk in chunks]
    assert "foo" in json.loads("".join(collected))


@pytest.mark.asyncio
async def test_lmstudio_async_batch(async_model):
    """Async batch generation is rejected with ``NotImplementedError``."""
    prompts = ["Respond with one word.", "Respond with one word."]
    with pytest.raises(NotImplementedError, match="does not support"):
        await async_model.batch(prompts)


================================================
FILE: tests/models/test_lmstudio_type_adapter.py
================================================
import io
import json
import os
import sys
from dataclasses import dataclass

import pytest
from genson import SchemaBuilder
from PIL import Image as PILImage
from pydantic import BaseModel

from outlines.inputs import Chat, Image
from outlines.models.lmstudio import LMStudioTypeAdapter
from outlines.types import cfg, json_schema, regex

if sys.version_info >= (3, 12):
    from typing import TypedDict
else:
    from typing_extensions import TypedDict


# Marker that skips a test unless LMSTUDIO_SERVER_URL is set (used by the
# image tests, which need a running LM Studio server).
requires_lmstudio_server = pytest.mark.skipif(
    not os.environ.get("LMSTUDIO_SERVER_URL"),
    reason=(
        "Image tests require a running LM Studio server (lms.prepare_image "
        "needs connection)"
    ),
)


@pytest.fixture
def schema():
    """JSON schema expected for the ``User`` types defined in the tests."""
    properties = {
        "user_id": {"title": "User Id", "type": "integer"},
        "name": {"title": "Name", "type": "string"},
    }
    return {
        "properties": properties,
        "required": ["user_id", "name"],
        "title": "User",
        "type": "object",
    }


@pytest.fixture
def adapter():
    """Fresh ``LMStudioTypeAdapter`` for each test."""
    type_adapter = LMStudioTypeAdapter()
    return type_adapter


@pytest.fixture
def image():
    """1x1 white image round-tripped through an in-memory PNG."""
    blank = PILImage.new("RGB", (1, 1), (255, 255, 255))

    # Write the pixels out as PNG, rewind, and reopen from the buffer.
    png_bytes = io.BytesIO()
    blank.save(png_bytes, format="PNG")
    png_bytes.seek(0)

    return PILImage.open(png_bytes)


def test_lmstudio_type_adapter_input_text(adapter):
    """A string prompt is returned unchanged, still as a string."""
    prompt = "prompt"
    formatted = adapter.format_input(prompt)
    assert isinstance(formatted, str)
    assert formatted == prompt


@requires_lmstudio_server
def test_lmstudio_type_adapter_input_vision(adapter, image):
    """A text + image list is formatted as an ``lmstudio.Chat`` (real
    server only)."""
    import lmstudio as lms

    prompt = ["prompt", Image(image)]
    formatted = adapter.format_input(prompt)
    assert isinstance(formatted, lms.Chat)


def test_lmstudio_type_adapter_input_chat(adapter):
    """A multi-turn ``Chat`` with a system message is formatted as an
    ``lmstudio.Chat``."""
    import lmstudio as lms

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": "Hi there!"},
        {"role": "user", "content": "How are you?"},
    ]
    formatted = adapter.format_input(Chat(messages=messages))

    # Should return an lmstudio.Chat object
    assert isinstance(formatted, lms.Chat)


def test_lmstudio_type_adapter_input_chat_no_system(adapter):
    """A ``Chat`` without a system message is formatted as an
    ``lmstudio.Chat``."""
    import lmstudio as lms

    messages = [
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": "Hi!"},
    ]
    formatted = adapter.format_input(Chat(messages=messages))

    assert isinstance(formatted, lms.Chat)


@requires_lmstudio_server
def test_lmstudio_type_adapter_input_chat_with_image(adapter, image):
    """A ``Chat`` whose user message mixes text and an image is formatted
    as an ``lmstudio.Chat`` (real server only)."""
    import lmstudio as lms

    user_content = ["What is in this image?", Image(image)]
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": user_content},
        {"role": "assistant", "content": "response"},
    ]
    formatted = adapter.format_input(Chat(messages=messages))
    assert isinstance(formatted, lms.Chat)


def test_lmstudio_type_adapter_input_invalid(adapter):
    """A dict prompt is rejected with ``TypeError``."""
    unsupported_prompt = {"foo": "bar"}
    with pytest.raises(TypeError, match="The input type"):
        adapter.format_input(unsupported_prompt)


def test_lmstudio_type_adapter_input_chat_invalid_content(adapter):
    """A chat message whose content is a dict is rejected with
    ``ValueError``."""
    bad_message = {"role": "user", "content": {"foo": "bar"}}
    chat_input = Chat(messages=[bad_message])
    with pytest.raises(ValueError, match="Invalid content type"):
        adapter.format_input(chat_input)


def test_lmstudio_type_adapter_input_chat_invalid_role(adapter):
    """A chat message with an unknown role is rejected with
    ``ValueError``."""
    bad_message = {"role": "unknown", "content": "hello"}
    chat_input = Chat(messages=[bad_message])
    with pytest.raises(ValueError, match="Unsupported role"):
        adapter.format_input(chat_input)


def test_lmstudio_type_adapter_output_none(adapter):
    """``None`` as output type is passed through as ``None``."""
    formatted = adapter.format_output_type(None)
    assert formatted is None


def test_lmstudio_type_adapter_output_invalid(adapter):
    """Plain ``str``/``int`` types and regex/CFG terms are rejected with
    ``TypeError``."""
    # (output type, expected message fragment), checked in this order.
    str_case = (str, "The type `str` is not supported")
    int_case = (int, "The type `int` is not supported")
    regex_case = (regex("[0-9]"), "Regex-based structured outputs are not")
    cfg_case = (cfg(""), "CFG-based structured outputs are not")

    for output_type, message in (str_case, int_case, regex_case, cfg_case):
        with pytest.raises(TypeError, match=message):
            adapter.format_output_type(output_type)


def test_lmstudio_type_adapter_output_dataclass(adapter, schema):
    """A dataclass output type is converted to the expected JSON schema."""
    @dataclass
    class User:
        user_id: int
        name: str

    formatted = adapter.format_output_type(User)
    assert formatted == schema


def test_lmstudio_type_adapter_output_typed_dict(adapter, schema):
    """A ``TypedDict`` output type is converted to the expected JSON
    schema."""
    class User(TypedDict):
        user_id: int
        name: str

    formatted = adapter.format_output_type(User)
    assert formatted == schema


def test_lmstudio_type_adapter_output_pydantic(adapter, schema):
    """A pydantic model output type is converted to the expected JSON
    schema."""
    class User(BaseModel):
        user_id: int
        name: str

    formatted = adapter.format_output_type(User)
    assert formatted == schema


def test_lmstudio_type_adapter_output_genson_schema_builder(adapter):
    """A genson ``SchemaBuilder`` is converted to the schema it has
    accumulated."""
    builder = SchemaBuilder()
    builder.add_schema({"type": "object", "properties": {}})
    # Two samples for the same key with different value types.
    builder.add_object({"hi": "there"})
    builder.add_object({"hi": 5})

    expected = {
        "$schema": "http://json-schema.org/schema#",
        "type": "object",
        "properties": {"hi": {"type": ["integer", "string"]}},
        "required": ["hi"],
    }
    formatted = adapter.format_output_type(builder)
    assert formatted == expected


def test_lmstudio_type_adapter_json_schema_str(adapter, schema):
    """A `json_schema` term built from a JSON string formats to the schema dict."""
    output_type = json_schema(json.dumps(schema))
    assert adapter.format_output_type(output_type) == schema


def test_lmstudio_type_adapter_json_schema_dict(adapter, schema):
    """A `json_schema` term built from a dict formats to that same schema dict."""
    output_type = json_schema(schema)
    assert adapter.format_output_type(output_type) == schema


================================================
FILE: tests/models/test_mistral.py
================================================
import io
import json
import os
from typing import Annotated, Generator, AsyncGenerator

import pytest
from PIL import Image as PILImage
from mistralai import Mistral as MistralClient
from pydantic import BaseModel, Field

import outlines
from outlines.inputs import Chat, Image, Video
from outlines.models.mistral import AsyncMistral, Mistral
from outlines.types import JsonSchema, Regex


MODEL_NAME = "mistral-large-latest"
VISION_MODEL = "pixtral-large-latest"


@pytest.fixture(scope="session")
def api_key():
    """Return the Mistral API key from the environment, or a placeholder.

    Tests that only need to initialize the Mistral client, without making
    actual API calls, can rely on the "MOCK_VALUE" fallback that is used when
    `MISTRAL_API_KEY` is unset or empty.

    """
    return os.getenv("MISTRAL_API_KEY") or "MOCK_VALUE"


@pytest.fixture(scope="session")
def image():
    """Return a 1x1 white RGB image re-opened from an in-memory PNG buffer."""
    png_buffer = io.BytesIO()
    PILImage.new("RGB", (1, 1), (255, 255, 255)).save(png_buffer, format="PNG")
    # Rewind so the PNG data can be read back from the start of the buffer.
    png_buffer.seek(0)
    return PILImage.open(png_buffer)


@pytest.fixture(scope="session")
def model(api_key):
    """Session-wide sync `Mistral` model configured with `MODEL_NAME`."""
    client = MistralClient(api_key=api_key)
    return Mistral(client, MODEL_NAME)


@pytest.fixture(scope="session")
def vision_model(api_key):
    """Session-wide sync `Mistral` model configured with `VISION_MODEL`."""
    client = MistralClient(api_key=api_key)
    return Mistral(client, VISION_MODEL)


@pytest.fixture(scope="session")
def async_model(api_key):
    """Session-wide `AsyncMistral` model configured with `MODEL_NAME`."""
    client = MistralClient(api_key=api_key)
    return AsyncMistral(client, MODEL_NAME)


@pytest.fixture(scope="session")
def async_vision_model(api_key):
    """Session-wide `AsyncMistral` model configured with `VISION_MODEL`."""
    client = MistralClient(api_key=api_key)
    return AsyncMistral(client, VISION_MODEL)


@pytest.fixture(scope="session")
def model_no_model_name(api_key):
    """Session-wide sync `Mistral` model created without a model name."""
    client = MistralClient(api_key=api_key)
    return Mistral(client)


@pytest.fixture(scope="session")
def async_model_no_model_name(api_key):
    """Session-wide `AsyncMistral` model created without a model name."""
    client = MistralClient(api_key=api_key)
    return AsyncMistral(client)


def test_mistral_init_from_client(api_key):
    """`outlines.from_mistral` wraps a client in a sync `Mistral` model."""
    client = MistralClient(api_key=api_key)

    # Model name supplied explicitly
    named = outlines.from_mistral(client, MODEL_NAME)
    assert isinstance(named, Mistral)
    assert named.client == client
    assert named.model_name == MODEL_NAME

    # Model name omitted
    unnamed = outlines.from_mistral(client)
    assert isinstance(unnamed, Mistral)
    assert unnamed.client == client
    assert unnamed.model_name is None


def test_mistral_wrong_inference_parameters(model):
    """An unknown inference keyword argument surfaces as a RuntimeError."""
    unknown_kwargs = {"foo": 10}
    with pytest.raises(RuntimeError, match="got an unexpected"):
        model("prompt", **unknown_kwargs)


def test_mistral_wrong_input_type(model):
    """An integer prompt is rejected with a TypeError."""
    not_a_prompt = 123
    with pytest.raises(TypeError, match="is not available"):
        model(not_a_prompt)


def test_mistral_wrong_output_type(model):
    """A regex output type is rejected with a TypeError."""
    expected_message = "Regex-based structured outputs are not available with Mistral."
    with pytest.raises(TypeError, match=expected_message):
        model("prompt", Regex("^.*$"))


@pytest.mark.api_call
def test_mistral_call(model):
    """Calling the model with a plain prompt returns a string."""
    prompt = "Respond with one word. Not more."
    assert isinstance(model(prompt), str)


@pytest.mark.api_call
def test_mistral_call_model_name(model_no_model_name):
    """The model name can be supplied at call time through the `model` kwarg."""
    prompt = "Respond with one word. Not more."
    answer = model_no_model_name(prompt, model=MODEL_NAME)
    assert isinstance(answer, str)


@pytest.mark.api_call
def test_mistral_multiple_samples(model):
    """Requesting `n=2` returns a list of two strings."""
    samples = model("Respond with one word. Not more.", n=2)
    assert isinstance(samples, list)
    assert len(samples) == 2
    first, second = samples
    assert isinstance(first, str)
    assert isinstance(second, str)


@pytest.mark.api_call
def test_mistral_vision(image, vision_model):
    """A text-plus-image prompt list returns a string."""
    prompt = ["What does this logo represent?", Image(image)]
    answer = vision_model(prompt)
    assert isinstance(answer, str)


@pytest.mark.api_call
def test_mistral_chat(image, vision_model):
    """A `Chat` with a system message and a multimodal user message returns a string."""
    system_message = {"role": "system", "content": "You are a helpful assistant."}
    user_message = {
        "role": "user",
        "content": ["What does this logo represent?", Image(image)]
    }
    conversation = Chat(messages=[system_message, user_message])
    answer = vision_model(conversation, max_tokens=10)
    assert isinstance(answer, str)


@pytest.mark.api_call
def test_mistral_pydantic(model):
    """A pydantic output type yields a JSON string containing the model's field."""
    class Foo(BaseModel):
        bar: int

    raw = model("foo?", Foo)
    assert isinstance(raw, str)
    parsed = json.loads(raw)
    assert "bar" in parsed


@pytest.mark.api_call
def test_mistral_pydantic_refusal(model):
    """The call is expected to raise a TypeError saying Mistral does not support the schema."""
    class Foo(BaseModel):
        # NOTE(review): `int` is passed as the first positional argument of
        # `Field`, which pydantic appears to treat as the field's default
        # value — confirm this is intentional and not a leftover.
        bar: Annotated[str, Field(int, pattern=r"^\d+$")]

    with pytest.raises(TypeError, match="Mistral does not support your schema"):
        _ = model("foo?", Foo)


@pytest.mark.api_call
def test_mistral_vision_pydantic(vision_model, image):
    """A multimodal prompt with a pydantic output type yields JSON with the field."""
    class Logo(BaseModel):
        name: int

    prompt = ["What does this logo represent?", Image(image)]
    raw = vision_model(prompt, Logo)
    assert isinstance(raw, str)
    parsed = json.loads(raw)
    assert "name" in parsed


@pytest.mark.api_call
def test_mistral_json_schema(model):
    """A `JsonSchema` built from a serialized pydantic schema yields matching JSON."""
    class Foo(BaseModel):
        bar: int

    output_type = JsonSchema(json.dumps(Foo.model_json_schema()))

    raw = model("foo?", output_type)
    assert isinstance(raw, str)
    parsed = json.loads(raw)
    assert "bar" in parsed


@pytest.mark.api_call
def test_mistral_streaming(model):
    """`stream` returns a generator whose first item is a string."""
    stream = model.stream("Respond with one word. Not more.")
    assert isinstance(stream, Generator)
    first_chunk = next(stream)
    assert isinstance(first_chunk, str)


def test_mistral_batch(model):
    """`batch` is expected to raise NotImplementedError."""
    prompts = ["Respond with one word.", "Respond with one word."]
    with pytest.raises(NotImplementedError, match="does not support"):
        model.batch(prompts)


def test_mistral_async_init_from_client(api_key):
    """`outlines.from_mistral(..., async_client=True)` builds an `AsyncMistral`."""
    client = MistralClient(api_key=api_key)

    # Model name supplied explicitly
    named = outlines.from_mistral(client, MODEL_NAME, async_client=True)
    assert isinstance(named, AsyncMistral)
    assert named.client == client
    assert named.model_name == MODEL_NAME

    # Model name omitted
    unnamed = outlines.from_mistral(client, async_client=True)
    assert isinstance(unnamed, AsyncMistral)
    assert unnamed.client == client
    assert unnamed.model_name is None


@pytest.mark.asyncio
async def test_mistral_async_wrong_inference_parameters(async_model):
    """An unknown inference keyword argument surfaces as a RuntimeError."""
    unknown_kwargs = {"foo": 10}
    with pytest.raises(RuntimeError, match="got an unexpected"):
        await async_model("prompt", **unknown_kwargs)


@pytest.mark.asyncio
async def test_mistral_async_wrong_input_type(async_model):
    """An integer prompt is rejected with a TypeError."""
    not_a_prompt = 123
    with pytest.raises(TypeError, match="is not available"):
        await async_model(not_a_prompt)


@pytest.mark.asyncio
async def test_mistral_async_wrong_output_type(async_model):
    """A regex output type is rejected with a TypeError."""
    expected_message = "Regex-based structured outputs are not available with Mistral."
    with pytest.raises(TypeError, match=expected_message):
        await async_model("prompt", Regex("^.*$"))


@pytest.mark.asyncio
@pytest.mark.api_call
async def test_mistral_async_call(async_model):
    """Awaiting the model with a plain prompt returns a string."""
    prompt = "Respond with one word. Not more."
    answer = await async_model(prompt)
    assert isinstance(answer, str)


@pytest.mark.asyncio
@pytest.mark.api_call
async def test_mistral_async_call_model_name(async_model_no_model_name):
    """The model name can be supplied at call time through the `model` kwarg."""
    prompt = "Respond with one word. Not more."
    answer = await async_model_no_model_name(prompt, model=MODEL_NAME)
    assert isinstance(answer, str)


@pytest.mark.asyncio
@pytest.mark.api_call
async def test_mistral_async_multiple_samples(async_model):
    """Requesting `n=2` returns a list of two strings."""
    samples = await async_model("Respond with one word. Not more.", n=2)
    assert isinstance(samples, list)
    assert len(samples) == 2
    first, second = samples
    assert isinstance(first, str)
    assert isinstance(second, str)


@pytest.mark.asyncio
@pytest.mark.api_call
async def test_mistral_async_vision(async_vision_model, image):
    """A text-plus-image prompt list returns a string."""
    prompt = ["What does this logo represent?", Image(image)]
    answer = await async_vision_model(prompt)
    assert isinstance(answer, str)


@pytest.mark.asyncio
@pytest.mark.api_call
async def test_mistral_async_chat(async_vision_model, image):
    """A `Chat` with a system message and a multimodal user message returns a string."""
    system_message = {"role": "system", "content": "You are a helpful assistant."}
    user_message = {
        "role": "user",
        "content": ["What does this logo represent?", Image(image)]
    }
    conversation = Chat(messages=[system_message, user_message])
    answer = await async_vision_model(conversation, max_tokens=10)
    assert isinstance(answer, str)


@pytest.mark.asyncio
@pytest.mark.api_call
async def test_mistral_async_pydantic(async_model):
    """A pydantic output type yields a JSON string containing the model's field."""
    class Foo(BaseModel):
        bar: int

    raw = await async_model("foo?", Foo)
    assert isinstance(raw, str)
    parsed = json.loads(raw)
    assert "bar" in parsed


@pytest.mark.asyncio
@pytest.mark.api_call
async def test_mistral_async_pydantic_refusal(async_model):
    """The call is expected to raise a TypeError saying Mistral does not support the schema."""
    class Foo(BaseModel):
        # NOTE(review): `int` is passed as the first positional argument of
        # `Field`, which pydantic appears to treat as the field's default
        # value — confirm this is intentional and not a leftover.
        bar: Annotated[str, Field(int, pattern=r"^\d+$")]

    with pytest.raises(TypeError, match="Mistral does not support your schema"):
        _ = await async_model("foo?", Foo)


@pytest.mark.asyncio
@pytest.mark.api_call
async def test_mistral_async_vision_pydantic(async_vision_model, image):
    """A multimodal prompt with a pydantic output type yields JSON with the field."""
    class Logo(BaseModel):
        name: int

    prompt = ["What does this logo represent?", Image(image)]
    raw = await async_vision_model(prompt, Logo)
    assert isinstance(raw, str)
    parsed = json.loads(raw)
    assert "name" in parsed


@pytest.mark.asyncio
@pytest.mark.api_call
async def test_mistral_async_json_schema(async_model):
    """A `JsonSchema` built from a serialized pydantic schema yields matching JSON."""
    class Foo(BaseModel):
        bar: int

    output_type = JsonSchema(json.dumps(Foo.model_json_schema()))

    raw = await async_model("foo?", output_type)
    assert isinstance(raw, str)
    parsed = json.loads(raw)
    assert "bar" in parsed


@pytest.mark.asyncio
@pytest.mark.api_call
async def test_mistral_async_streaming(async_model):
    """`stream` returns an async generator that yields strings."""
    stream = async_model.stream("Respond with one word. Not more.")
    assert isinstance(stream, AsyncGenerator)
    # Only the first chunk is inspected; the loop exits right after it.
    async for first_chunk in stream:
        assert isinstance(first_chunk, str)
        break


@pytest.mark.asyncio
async def test_mistral_async_batch(async_model):
    """`batch` is expected to raise NotImplementedError."""
    prompts = ["Respond with one word.", "Respond with one word."]
    with pytest.raises(NotImplementedError, match="does not support"):
        await async_model.batch(prompts)


================================================
FILE: tests/models/test_mistral_type_adapter.py
================================================
import io
import json
import sys
from dataclasses import dataclass
from typing import Literal

import pytest
from PIL import Image as PILImage
from genson import SchemaBuilder
from mistralai import (
    AssistantMessage,
    SystemMessage,
    UserMessage,
)
from pydantic import BaseModel

from outlines.inputs import Chat, Image
from outlines.models.mistral import MistralTypeAdapter
from outlines.types import CFG, JsonSchema, Regex

if sys.version_info >= (3, 12):
    from typing import TypedDict
else:
    from typing_extensions import TypedDict


@pytest.fixture
def schema():
    """JSON schema dict that the `User` output types in this module are compared to."""
    properties = {
        "user_id": {"title": "User Id", "type": "integer"},
        "name": {"title": "Name", "type": "string"},
    }
    return {
        "properties": properties,
        "required": ["user_id", "name"],
        "title": "User",
        "type": "object",
        "additionalProperties": False,
    }

@pytest.fixture
def image():
    """Return a 1x1 white RGB image re-opened from an in-memory PNG buffer."""
    png_buffer = io.BytesIO()
    # Write the image as PNG into memory, then read it back from the start.
    PILImage.new("RGB", (1, 1), (255, 255, 255)).save(png_buffer, format="PNG")
    png_buffer.seek(0)
    return PILImage.open(png_buffer)

@pytest.fixture
def adapter():
    """Provide a fresh `MistralTypeAdapter` for each test."""
    type_adapter = MistralTypeAdapter()
    return type_adapter


def test_mistral_type_adapter_input_text(adapter):
    """A plain string becomes a single `UserMessage` carrying that string."""
    prompt = "Hello world"
    messages = adapter.format_input(prompt)
    assert len(messages) == 1
    user_message = messages[0]
    assert isinstance(user_message, UserMessage)
    assert user_message.content == prompt


def test_mistral_type_adapter_input_list(adapter, image):
    """A `[text, Image]` list becomes one `UserMessage` with text and image parts."""
    prompt = ["Hello world", Image(image)]
    messages = adapter.format_input(prompt)
    assert len(messages) == 1
    user_message = messages[0]
    assert isinstance(user_message, UserMessage)
    parts = user_message.content
    assert dict(parts[0]) == {"type": "text", "text": "Hello world"}
    image_part = parts[1]
    assert image_part.type == "image_url"
    assert hasattr(image_part, "image_url")


def test_mistral_type_adapter_input_chat(adapter, image):
    """Each chat role maps to its Mistral message class, in order."""
    conversation = Chat([
        {"role": "system", "content": "You are helpful"},
        {"role": "user", "content": ["Hello world", Image(image)]},
        {"role": "assistant", "content": "Hi there"},
    ])
    messages = adapter.format_input(conversation)
    assert len(messages) == 3

    system_message = messages[0]
    assert isinstance(system_message, SystemMessage)
    assert system_message.content == "You are helpful"

    user_message = messages[1]
    assert isinstance(user_message, UserMessage)
    assert dict(user_message.content[0]) == {"type": "text", "text": "Hello world"}
    assert user_message.content[1].type == "image_url"
    assert hasattr(user_message.content[1], "image_url")

    assistant_message = messages[2]
    assert isinstance(assistant_message, AssistantMessage)
    assert assistant_message.content == "Hi there"


def test_mistral_type_adapter_input_invalid(adapter, image):
    """Each kind of malformed input raises the expected error type and message."""
    @dataclass
    class Audio:
        file: str

    # Input that is neither a string, a list nor a Chat
    with pytest.raises(TypeError, match="is not available"):
        adapter.format_input(123)

    # Empty content list
    with pytest.raises(ValueError, match="Content list cannot be empty."):
        adapter.format_input([])

    # List that does not start with a string
    first_item_message = "The first item in the list should be a string."
    with pytest.raises(ValueError, match=first_item_message):
        adapter.format_input([Image(image)])

    # Non-Image asset after the leading string
    asset_message = "Expected Image objects after the first string"
    with pytest.raises(ValueError, match=asset_message):
        adapter.format_input(["hello", Audio("file")])

    # Chat message whose content is a dict
    with pytest.raises(TypeError, match="Invalid content type"):
        adapter.format_input(Chat([{"role": "user", "content": {}}]))

    # Chat message with an unknown role
    with pytest.raises(ValueError, match="Unsupported role"):
        adapter.format_input(Chat([{"role": "invalid", "content": "Hello"}]))


def test_mistral_type_adapter_output_none(adapter):
    """Formatting a `None` output type yields an empty dict."""
    assert adapter.format_output_type(None) == {}


def test_mistral_type_adapter_output_json_mode(adapter):
    """The `dict` output type is formatted as a `json_object` response format."""
    expected = {"type": "json_object"}
    assert adapter.format_output_type(dict) == expected


def test_mistral_type_adapter_dataclass(adapter, schema):
    """A dataclass output type is formatted as a strict JSON schema spec."""
    @dataclass
    class User:
        user_id: int
        name: str

    formatted = adapter.format_output_type(User)
    assert isinstance(formatted, dict)
    json_schema_spec = formatted["json_schema"]
    assert json_schema_spec["strict"] is True
    assert json_schema_spec["schema"] == schema


def test_mistral_type_adapter_typed_dict(adapter, schema):
    """A `TypedDict` output type is formatted as a strict JSON schema spec."""
    class User(TypedDict):
        user_id: int
        name: str

    formatted = adapter.format_output_type(User)
    assert isinstance(formatted, dict)
    json_schema_spec = formatted["json_schema"]
    assert json_schema_spec["strict"] is True
    assert json_schema_spec["schema"] == schema


def test_mistral_type_adapter_pydantic(adapter, schema):
    """A pydantic model output type is formatted as a strict JSON schema spec."""
    class User(BaseModel):
        user_id: int
        name: str

    formatted = adapter.format_output_type(User)
    assert isinstance(formatted, dict)
    json_schema_spec = formatted["json_schema"]
    assert json_schema_spec["strict"] is True
    assert json_schema_spec["schema"] == schema


def test_mistral_type_adapter_genson_schema_builder(adapter):
    """A genson `SchemaBuilder` is formatted as a strict JSON schema spec.

    The expected schema is spelled out inline, so the `schema` fixture is not
    requested by this test.
    """
    builder = SchemaBuilder()
    builder.add_schema({"type": "object", "properties": {}})
    builder.add_object({"hi": "there"})
    builder.add_object({"hi": 5})

    result = adapter.format_output_type(builder)
    assert isinstance(result, dict)
    assert result["json_schema"]["strict"] is True
    expected_schema = {
        "$schema": "http://json-schema.org/schema#",
        "type": "object",
        "properties": {"hi": {"type": ["integer", "string"]}},
        "required": ["hi"],
        "additionalProperties": False
    }
    assert result["json_schema"]["schema"] == expected_schema


def test_mistral_type_adapter_json_schema_str(adapter, schema):
    """A `JsonSchema` built from a JSON string is formatted as a strict schema spec."""
    output_type = JsonSchema(json.dumps(schema))
    formatted = adapter.format_output_type(output_type)
    assert isinstance(formatted, dict)
    json_schema_spec = formatted["json_schema"]
    assert json_schema_spec["strict"] is True
    assert json_schema_spec["schema"] == schema


def test_mistral_type_adapter_output_unsupported(adapter):
    """Regex, CFG and `Literal` output types are rejected with a TypeError."""
    regex_message = "Regex-based structured outputs are not available with Mistral."
    with pytest.raises(TypeError, match=regex_message):
        adapter.format_output_type(Regex("[0-9]"))

    cfg_message = "CFG-based structured outputs are not available with Mistral."
    with pytest.raises(TypeError, match=cfg_message):
        adapter.format_output_type(CFG(""))

    with pytest.raises(TypeError, match="is not available with Mistral."):
        adapter.format_output_type(Literal["foo", "bar"])


================================================
FILE: tests/models/test_mlxlm.py
================================================
import pytest
import re
from enum import Enum
from typing import Generator

import outlines
from outlines.types import Regex
from outlines.models.mlxlm import (
    MLXLM,
    MLXLMTypeAdapter,
    from_mlxlm
)
from outlines.models.transformers import TransformerTokenizer
from pydantic import BaseModel

try:
    import mlx_lm
    import mlx.core as mx

    HAS_MLX = mx.metal.is_available()
except ImportError:
    HAS_MLX = False


TEST_MODEL = "mlx-community/SmolLM-135M-Instruct-4bit"


@pytest.mark.skipif(not HAS_MLX, reason="MLX tests require Apple Silicon")
def test_mlxlm_model_initialization():
    """`from_mlxlm` builds an `MLXLM` whose attributes have the expected types."""
    mlx_model, mlx_tokenizer = mlx_lm.load(TEST_MODEL)
    wrapped = from_mlxlm(mlx_model, mlx_tokenizer)

    assert isinstance(wrapped, MLXLM)
    assert isinstance(wrapped.model, mlx_lm.models.llama.Model)
    tokenizer_wrapper_cls = mlx_lm.tokenizer_utils.TokenizerWrapper
    assert isinstance(wrapped.mlx_tokenizer, tokenizer_wrapper_cls)
    assert isinstance(wrapped.tokenizer, TransformerTokenizer)
    assert isinstance(wrapped.type_adapter, MLXLMTypeAdapter)
    assert wrapped.tensor_library_name == "mlx"


@pytest.fixture(scope="session")
def model():
    """Load `TEST_MODEL` with mlx-lm once per session and wrap it for Outlines.

    The previously requested `tmp_path_factory` fixture was never used, so it
    is no longer requested.

    Returns
    -------
    The object returned by `outlines.from_mlxlm` for the loaded model and
    tokenizer.
    """
    mlx_model, mlx_tokenizer = mlx_lm.load(TEST_MODEL)
    return outlines.from_mlxlm(mlx_model, mlx_tokenizer)


@pytest.mark.skipif(not HAS_MLX, reason="MLX tests require Apple Silicon")
def test_mlxlm_tokenizer(model):
    """The tokenizer's encoded ids for a single string convert to an `mx.array`."""
    encoded_ids, _ = model.tokenizer.encode("Hello, world!")
    as_array = mx.array(encoded_ids)
    assert isinstance(as_array, mx.array)


@pytest.mark.skipif(not HAS_MLX, reason="MLX tests require Apple Silicon")
def test_mlxlm_simple(model):
    """`generate` with no output type returns a string."""
    prompt = "Respond with one word. Not more."
    assert isinstance(model.generate(prompt, None), str)


@pytest.mark.skipif(not HAS_MLX, reason="MLX tests require Apple Silicon")
def test_mlxlm_call(model):
    """Calling the model directly with a prompt returns a string."""
    prompt = "Respond with one word. Not more."
    assert isinstance(model(prompt), str)


@pytest.mark.skipif(not HAS_MLX, reason="MLX tests require Apple Silicon")
def test_mlxlm_invalid_input_type(model):
    """A list prompt is rejected with NotImplementedError."""
    list_prompt = ["Respond with one word. Not more."]
    with pytest.raises(NotImplementedError, match="is not available"):
        model(list_prompt)


@pytest.mark.skipif(not HAS_MLX, reason="MLX tests require Apple Silicon")
def test_mlxlm_invalid_inference_kwargs(model):
    """An unknown inference keyword argument raises a TypeError."""
    unknown_kwargs = {"foo": "bar"}
    with pytest.raises(TypeError):
        model("Respond with one word. Not more.", **unknown_kwargs)


@pytest.mark.skipif(not HAS_MLX, reason="MLX tests require Apple Silicon")
def test_mlxlm_inference_kwargs(model):
    """With `max_tokens=2` the generated string stays under 20 characters."""
    story = model("Write a short story about a cat.", max_tokens=2)
    assert isinstance(story, str)
    assert len(story) < 20


@pytest.mark.skipif(not HAS_MLX, reason="MLX tests require Apple Silicon")
def test_mlxlm_regex(model):
    """With a digit `Regex` output type the result is a string starting with a digit."""
    digit_pattern = r"[0-9]"
    answer = model("Give a number between 0 and 9.", Regex(digit_pattern))
    assert isinstance(answer, str)
    assert re.match(digit_pattern, answer)


@pytest.mark.skipif(not HAS_MLX, reason="MLX tests require Apple Silicon")
def test_mlxlm_json_schema(model):
    """With a pydantic output type the result contains the field name."""
    class Character(BaseModel):
        name: str

    generated = model("Create a character with a name.", Character)
    assert "name" in generated


@pytest.mark.skipif(not HAS_MLX, reason="MLX tests require Apple Silicon")
def test_mlxlm_choice(model):
    """With an `Enum` output type the result is one of the enum values."""
    class Foo(Enum):
        cat = "cat"
        dog = "dog"

    choice = model("Cat or dog?", Foo)
    assert choice in ("cat", "dog")


@pytest.mark.skipif(not HAS_MLX, reason="MLX tests require Apple Silicon")
def test_mlxlm_stream_text_stop(model):
    """`stream` returns a generator whose first item is a string."""
    prompt = "Respond with one word. Not more."
    stream = model.stream(prompt, None, max_tokens=100)
    assert isinstance(stream, Generator)
    first_chunk = next(stream)
    assert isinstance(first_chunk, str)


@pytest.mark.skipif(not HAS_MLX, reason="MLX tests require Apple Silicon")
def test_mlxlm_batch(model):
    """`batch` with two prompts returns a list of two strings."""
    prompts = ["Respond with one word.", "Respond with one word."]
    answers = model.batch(prompts)
    assert isinstance(answers, list)
    assert len(answers) == 2
    first, second = answers
    assert isinstance(first, str)
    assert isinstance(second, str)


@pytest.mark.skipif(not HAS_MLX, reason="MLX tests require Apple Silicon")
def test_mlxlm_batch_output_type(model):
    """`batch` with an output type raises NotImplementedError."""
    prompts = ["Respond with one word.", "Respond with one word."]
    expected_message = "mlx-lm does not support constrained generation with batching."
    with pytest.raises(NotImplementedError, match=expected_message):
        model.batch(prompts, Regex(r"[0-9]"))


================================================
FILE: tests/models/test_mlxlm_type_adapter.py
================================================
import pytest
import io
from unittest.mock import MagicMock

from outlines_core import Index, Vocabulary
from PIL import Image as PILImage

from outlines.backends.outlines_core import OutlinesCoreLogitsProcessor
from outlines.inputs import Chat, Image
from outlines.models.mlxlm import MLXLMTypeAdapter

try:
    import mlx_lm
    import mlx.core as mx

    HAS_MLX = mx.metal.is_available()
except ImportError:
    HAS_MLX = False


MODEL_NAME = "mlx-community/SmolLM-135M-Instruct-4bit"


@pytest.fixture
def adapter():
    """Build an `MLXLMTypeAdapter` around the tokenizer loaded for `MODEL_NAME`."""
    _unused_model, mlx_tokenizer = mlx_lm.load(MODEL_NAME)
    return MLXLMTypeAdapter(tokenizer=mlx_tokenizer)


@pytest.fixture
def logits_processor():
    """Build an `OutlinesCoreLogitsProcessor` for a three-digit regex on "mlx"."""
    three_digit_index = Index(
        r"[0-9]{3}",
        Vocabulary.from_pretrained(MODEL_NAME),
    )
    return OutlinesCoreLogitsProcessor(three_digit_index, "mlx")


@pytest.fixture
def image():
    """Return a 1x1 white RGB image re-opened from an in-memory PNG buffer."""
    png_buffer = io.BytesIO()
    PILImage.new("RGB", (1, 1), (255, 255, 255)).save(png_buffer, format="PNG")
    # Rewind so the PNG data can be read back from the start of the buffer.
    png_buffer.seek(0)
    return PILImage.open(png_buffer)


def test_mlxlm_type_adapter_format_input_with_template():
    """With a chat template, a string prompt is passed through `apply_chat_template`."""
    mock_tokenizer = MagicMock(chat_template="some_template")
    mock_tokenizer.apply_chat_template.return_value = "formatted_prompt"
    adapter = MLXLMTypeAdapter(tokenizer=mock_tokenizer, has_chat_template=True)

    formatted = adapter.format_input("prompt")

    assert formatted == "formatted_prompt"
    # The prompt must have been wrapped as a single user message.
    expected_messages = [{"role": "user", "content": "prompt"}]
    mock_tokenizer.apply_chat_template.assert_called_once_with(
        expected_messages,
        tokenize=False,
        add_generation_prompt=True,
    )


def test_mlxlm_type_adapter_format_input_without_template():
    """Without a chat template, a string prompt is returned unchanged."""
    mock_tokenizer = MagicMock(chat_template=None)
    adapter = MLXLMTypeAdapter(tokenizer=mock_tokenizer, has_chat_template=False)

    assert adapter.format_input("prompt") == "prompt"


@pytest.mark.skipif(not HAS_MLX, reason="MLX tests require Apple Silicon")
def test_mlxlm_type_adapter_format_input(adapter, image):
    """`format_input` handles strings and chats and rejects lists and images."""
    # Anything other than a string/Chat (invalid)
    with pytest.raises(NotImplementedError):
        adapter.format_input(["Hello, world!"])

    # String
    plain_prompt = "Hello, world!"
    assert adapter.format_input(plain_prompt) == "Hello, world!"

    # Chat
    conversation = Chat(messages=[
        {"role": "user", "content": "Hello, world!"},
        {"role": "assistant", "content": "Hello, world!"},
    ])
    expected_prompt = (
        "<|im_start|>user\nHello, world!<|im_end|>\n<|im_start|>assistant\n"
        "Hello, world!<|im_end|>\n<|im_start|>assistant\n"
    )
    assert adapter.format_input(conversation) == expected_prompt

    # Multi-modal (invalid)
    multimodal_message = "mlx-lm does not support multi-modal messages."
    with pytest.raises(ValueError, match=multimodal_message):
        adapter.format_input(Chat(messages=[
            {"role": "user", "content": ["prompt", Image(image)]},
        ]))


@pytest.mark.skipif(not HAS_MLX, reason="MLX tests require Apple Silicon")
def test_mlxlm_type_adapter_format_output_type(adapter, logits_processor):
    """A logits processor is formatted as a one-element list holding a processor."""
    processors = adapter.format_output_type(logits_processor)
    assert isinstance(processors, list)
    assert len(processors) == 1
    only_processor = processors[0]
    assert isinstance(only_processor, OutlinesCoreLogitsProcessor)


================================================
FILE: tests/models/test_ollama.py
================================================
import io
import json
from enum import Enum
from typing import Annotated

import pytest
from PIL import Image as PILImage
from ollama import AsyncClient, Client
from pydantic import BaseModel, Field

import outlines
from outlines.inputs import Chat, Image, Video
from outlines.models import AsyncOllama, Ollama


MODEL_NAME = "tinyllama"


@pytest.fixture
def model():
    """Sync `Ollama` model built from a default `Client` and `MODEL_NAME`."""
    client = Client()
    return Ollama(client, MODEL_NAME)


@pytest.fixture
def model_no_model_name():
    """Sync `Ollama` model built from a default `Client` without a model name."""
    client = Client()
    return Ollama(client)


@pytest.fixture
def async_model():
    """`AsyncOllama` model built from a default `AsyncClient` and `MODEL_NAME`."""
    client = AsyncClient()
    return AsyncOllama(client, MODEL_NAME)


@pytest.fixture
def async_model_no_model_name():
    """`AsyncOllama` model built from a default `AsyncClient` without a model name."""
    client = AsyncClient()
    return AsyncOllama(client)


@pytest.fixture(scope="session")
def image():
    """Return a 1x1 white RGB image re-opened from an in-memory PNG buffer."""
    png_buffer = io.BytesIO()
    # Write the image as PNG into memory, then read it back from the start.
    PILImage.new("RGB", (1, 1), (255, 255, 255)).save(png_buffer, format="PNG")
    png_buffer.seek(0)
    return PILImage.open(png_buffer)


def test_ollama_init_from_client():
    """`outlines.from_ollama` wraps a `Client` and rejects other client objects."""
    client = Client()

    # Model name supplied explicitly
    named = outlines.from_ollama(client, MODEL_NAME)
    assert isinstance(named, Ollama)
    assert named.client == client
    assert named.model_name == MODEL_NAME

    # Model name omitted
    unnamed = outlines.from_ollama(client)
    assert isinstance(unnamed, Ollama)
    assert unnamed.client == client
    assert unnamed.model_name is None

    # An arbitrary object is not an accepted client
    with pytest.raises(ValueError, match="Invalid client type"):
        outlines.from_ollama(object())


def test_ollama_wrong_inference_parameters(model):
    """An unknown inference keyword argument raises a TypeError."""
    prompt = "Respond with one word. Not more."
    with pytest.raises(TypeError, match="got an unexpected"):
        model.generate(prompt, None, foo=10)


def test_ollama_simple(model):
    """`generate` with no output type returns a string."""
    prompt = "Respond with one word. Not more."
    assert isinstance(model.generate(prompt, None), str)


def test_ollama_direct(model_no_model_name):
    """The model name can be supplied at call time through the `model` kwarg."""
    prompt = "Respond with one word. Not more."
    answer = model_no_model_name(prompt, None, model=MODEL_NAME)
    assert isinstance(answer, str)


def test_ollama_simple_vision(image, model):
    """A text-plus-image prompt list is accepted and returns a string."""
    # The model under test is not a vision model, so it cannot actually
    # describe the image; only the input syntax is being exercised here.
    prompt = ["What does this logo represent?", Image(image)]
    answer = model.generate(prompt, model=MODEL_NAME)
    assert isinstance(answer, str)


def test_ollama_chat(image, model):
    """A `Chat` with a system message and a multimodal user message returns a string."""
    system_message = {"role": "system", "content": "You are a helpful assistant."}
    user_message = {"role": "user", "content": [
        "What does this logo represent?",
        Image(image)
    ]}
    conversation = Chat([system_message, user_message])
    answer = model.generate(conversation, model=MODEL_NAME)
    assert isinstance(answer, str)


def test_ollama_json(model):
    """A pydantic output type yields a JSON string containing the model's field."""
    class Foo(BaseModel):
        foo: Annotated[str, Field(max_length=1)]

    raw = model("Respond with one word. Not more.", Foo)
    assert isinstance(raw, str)
    parsed = json.loads(raw)
    assert "foo" in parsed


def test_ollama_wrong_output_type(model):
    """An `Enum` output type is rejected with a TypeError."""
    class Foo(Enum):
        bar = "Bar"
        foor = "Foo"

    prompt = "foo?"
    with pytest.raises(TypeError, match="is not supported"):
        model.generate(prompt, Foo)


def test_ollama_wrong_input_type(model, image):
    """A dict prompt and a non-Image asset are both rejected."""
    # A dict is not an accepted prompt type
    with pytest.raises(TypeError, match="is not available"):
        model.generate({"foo?": "bar?"}, None)

    # A Video among the assets is refused
    asset_message = "All assets provided must be of type Image"
    with pytest.raises(ValueError, match=asset_message):
        model.generate(["foo?", Image(image), Video("")], None)


def test_ollama_stream(model):
    """The first item produced by `stream` is a string."""
    stream = model.stream("Write a sentence about a cat.")
    first_chunk = next(stream)
    assert isinstance(first_chunk, str)


def test_ollama_stream_json(model_no_model_name):
    """Joined stream chunks form a JSON document containing the model's field."""
    class Foo(BaseModel):
        foo: Annotated[str, Field(max_length=2)]

    stream = model_no_model_name.stream("Create a character.", Foo, model=MODEL_NAME)
    # Consume the whole stream and concatenate the chunks in order.
    full_text = "".join(stream)
    assert "foo" in json.loads(full_text)


def test_ollama_batch(model):
    """`batch` is expected to raise NotImplementedError."""
    prompts = ["Respond with one word.", "Respond with one word."]
    with pytest.raises(NotImplementedError, match="does not support"):
        model.batch(prompts)


def test_ollama_async_init_from_client():
    """`outlines.from_ollama` wraps an `AsyncClient` in an `AsyncOllama` model."""
    client = AsyncClient()

    # Model name supplied explicitly
    named = outlines.from_ollama(client, MODEL_NAME)
    assert isinstance(named, AsyncOllama)
    assert named.client == client
    assert named.model_name == MODEL_NAME

    # Model name omitted
    unnamed = outlines.from_ollama(client)
    assert isinstance(unnamed, AsyncOllama)
    assert unnamed.client == client
    assert unnamed.model_name is None


@pytest.mark.asyncio
async def test_ollama_async_wrong_inference_parameters(async_model):
    """An unknown inference keyword argument raises a TypeError."""
    prompt = "Respond with one word. Not more."
    with pytest.raises(TypeError, match="got an unexpected"):
        await async_model.generate(prompt, None, foo=10)


@pytest.mark.asyncio
async def test_ollama_async_simple(async_model):
    """`generate` with no output type returns a string."""
    prompt = "Respond with one word. Not more."
    answer = await async_model.generate(prompt, None)
    assert isinstance(answer, str)


@pytest.mark.asyncio
async def test_ollama_async_direct(async_model_no_model_name):
    """The model name can be supplied at call time through the `model` kwarg."""
    prompt = "Respond with one word. Not more."
    answer = await async_model_no_model_name(prompt, None, model=MODEL_NAME)
    assert isinstance(answer, str)


@pytest.mark.asyncio
async def test_ollama_async_simple_vision(image, async_model):
    """A text-plus-image prompt list is accepted and returns a string."""
    # The model under test is not a vision model, so it cannot actually
    # describe the image; only the input syntax is being exercised here.
    prompt = ["What does this logo represent?", Image(image)]
    answer = await async_model.generate(prompt, model=MODEL_NAME)
    assert isinstance(answer, str)


@pytest.mark.asyncio
async def test_ollama_async_chat(image, async_model):
    """A `Chat` with a system message and a multimodal user message returns a string."""
    system_message = {"role": "system", "content": "You are a helpful assistant."}
    user_message = {"role": "user", "content": [
        "What does this logo represent?",
        Image(image)
    ]}
    conversation = Chat([system_message, user_message])
    answer = await async_model.generate(conversation, model=MODEL_NAME)
    assert isinstance(answer, str)


@pytest.mark.asyncio
async def test_ollama_async_json(async_model):
    """A pydantic output type yields a JSON string containing the model's field."""
    class Foo(BaseModel):
        foo: Annotated[str, Field(max_length=1)]

    raw = await async_model("Respond with one word. Not more.", Foo)
    assert isinstance(raw, str)
    parsed = json.loads(raw)
    assert "foo" in parsed


@pytest.mark.asyncio
async def test_ollama_async_wrong_output_type(async_model):
    """An `Enum` output type is rejected with a TypeError."""
    class Foo(Enum):
        bar = "Bar"
        foor = "Foo"

    prompt = "foo?"
    with pytest.raises(TypeError, match="is not supported"):
        await async_model.generate(prompt, Foo)


@pytest.mark.asyncio
async def test_ollama_async_wrong_input_type(async_model):
    """A dict prompt is expected to be rejected with a `TypeError`."""
    invalid_prompt = {"foo?": "bar?"}
    with pytest.raises(TypeError, match="is not available"):
        await async_model.generate(invalid_prompt, None)


@pytest.mark.asyncio
async def test_ollama_async_stream(async_model):
    """The first chunk produced by `stream` is a string."""
    chunks = async_model.stream("Write a sentence about a cat.")
    first_chunk = await chunks.__anext__()
    assert isinstance(first_chunk, str)


@pytest.mark.asyncio
async def test_ollama_async_stream_json(async_model_no_model_name):
    """Streamed chunks for a pydantic output type join into JSON with the field."""
    class Foo(BaseModel):
        foo: Annotated[str, Field(max_length=2)]

    stream = async_model_no_model_name.stream(
        "Create a character.", Foo, model=MODEL_NAME
    )
    # Drain the whole stream before decoding the concatenated text
    pieces = [piece async for piece in stream]
    assert "foo" in json.loads("".join(pieces))


@pytest.mark.asyncio
async def test_ollama_async_batch(async_model):
    """`batch` is expected to raise `NotImplementedError` for this model."""
    prompts = ["Respond with one word.", "Respond with one word."]
    with pytest.raises(NotImplementedError, match="does not support"):
        await async_model.batch(prompts)


================================================
FILE: tests/models/test_ollama_type_adapter.py
================================================
import io
import json
import pytest
import sys
from dataclasses import dataclass

from genson import SchemaBuilder
from PIL import Image as PILImage
from pydantic import BaseModel

from outlines.inputs import Chat, Image
from outlines.models.ollama import OllamaTypeAdapter
from outlines.types import cfg, json_schema, regex

if sys.version_info >= (3, 12):
    from typing import TypedDict
else:
    from typing_extensions import TypedDict


@pytest.fixture
def schema():
    """JSON schema the tests compare against for their `User` output types."""
    properties = {
        "user_id": {"title": "User Id", "type": "integer"},
        "name": {"title": "Name", "type": "string"},
    }
    return {
        "properties": properties,
        "required": ["user_id", "name"],
        "title": "User",
        "type": "object",
    }


@pytest.fixture
def image():
    """Return a 1x1 white RGB image reloaded from an in-memory PNG."""
    blank = PILImage.new("RGB", (1, 1), (255, 255, 255))

    # Write the image to an in-memory buffer as PNG, then reopen it from there
    png_buffer = io.BytesIO()
    blank.save(png_buffer, format="PNG")
    png_buffer.seek(0)
    return PILImage.open(png_buffer)


@pytest.fixture
def adapter():
    """Provide an `OllamaTypeAdapter` instance to the tests."""
    type_adapter = OllamaTypeAdapter()
    return type_adapter


def test_ollama_type_adapter_input_text(adapter):
    """A bare string is formatted as a single user message."""
    prompt = "prompt"
    messages = adapter.format_input(prompt)
    assert isinstance(messages, list)
    assert len(messages) == 1
    expected_message = {"role": "user", "content": prompt}
    assert messages[0] == expected_message


def test_ollama_type_adapter_input_vision(adapter, image):
    """A [text, image] prompt becomes one user message carrying an `images` list."""
    picture = Image(image)
    prompt = "prompt"
    messages = adapter.format_input([prompt, picture])
    assert isinstance(messages, list)
    assert len(messages) == 1
    expected_message = {
        "role": "user",
        "content": prompt,
        "images": [picture.image_str],
    }
    assert messages[0] == expected_message


def test_ollama_type_adapter_input_chat(adapter, image):
    """Each `Chat` turn maps to one message; images move to an `images` key."""
    picture = Image(image)
    conversation = Chat(messages=[
        {"role": "system", "content": "prompt"},
        {"role": "user", "content": ["hello", picture]},
        {"role": "assistant", "content": "response"},
    ])
    messages = adapter.format_input(conversation)
    assert isinstance(messages, list)
    assert len(messages) == 3
    assert messages[0] == {"role": "system", "content": "prompt"}
    assert messages[1] == {
        "role": "user",
        "content": "hello",
        "images": [picture.image_str],
    }
    assert messages[2] == {"role": "assistant", "content": "response"}


def test_ollama_type_adapter_input_invalid(adapter):
    """Unsupported prompt types and unsupported chat content both raise."""
    # A dict is not an accepted top-level input
    with pytest.raises(TypeError, match="The input type"):
        adapter.format_input({"foo": "bar"})

    # A dict is not an accepted message content inside a Chat either
    bad_chat = Chat(messages=[
        {"role": "user", "content": {"foo": "bar"}},
    ])
    with pytest.raises(ValueError, match="Invalid content type"):
        adapter.format_input(bad_chat)


def test_ollama_type_adapter_output_invalid(adapter):
    """Builtin, regex and CFG output types are all rejected with `TypeError`."""
    unsupported_builtins = (
        (str, "The type `str` is not supported"),
        (int, "The type `int` is not supported"),
    )
    for output_type, message in unsupported_builtins:
        with pytest.raises(TypeError, match=message):
            adapter.format_output_type(output_type)

    with pytest.raises(TypeError, match="Regex-based structured outputs are not"):
        adapter.format_output_type(regex("[0-9]"))

    with pytest.raises(TypeError, match="CFG-based structured outputs are not"):
        adapter.format_output_type(cfg(""))


def test_ollama_type_adapter_output_dataclass(adapter, schema):
    """A dataclass output type is converted to the expected JSON schema."""
    @dataclass
    class User:
        user_id: int
        name: str

    assert adapter.format_output_type(User) == schema


def test_ollama_type_adapter_output_typed_dict(adapter, schema):
    """A `TypedDict` output type is converted to the expected JSON schema."""
    class User(TypedDict):
        user_id: int
        name: str

    assert adapter.format_output_type(User) == schema


def test_ollama_type_adapter_output_pydantic(adapter, schema):
    """A pydantic model output type is converted to the expected JSON schema."""
    class User(BaseModel):
        user_id: int
        name: str

    assert adapter.format_output_type(User) == schema


def test_ollama_type_adapter_output_genson_schema_builder(adapter):
    """A genson `SchemaBuilder` is converted to the schema it has accumulated."""
    schema_builder = SchemaBuilder()
    schema_builder.add_schema({"type": "object", "properties": {}})
    # Two sample objects whose "hi" values have different types
    for sample in ({"hi": "there"}, {"hi": 5}):
        schema_builder.add_object(sample)

    expected = {
        "$schema": "http://json-schema.org/schema#",
        "type": "object",
        "properties": {"hi": {"type": ["integer", "string"]}},
        "required": ["hi"]
    }
    assert adapter.format_output_type(schema_builder) == expected


def test_ollama_type_adapter_json_schema_str(adapter, schema):
    """A `json_schema` built from a JSON string formats back to the schema dict."""
    output_type = json_schema(json.dumps(schema))
    formatted = adapter.format_output_type(output_type)
    assert formatted == schema


def test_ollama_type_adapter_json_schema_dict(adapter, schema):
    """A `json_schema` built from a dict formats back to the same schema dict."""
    output_type = json_schema(schema)
    formatted = adapter.format_output_type(output_type)
    assert formatted == schema


================================================
FILE: tests/models/test_openai.py
================================================
import io
import json
import os
from typing import Annotated, Generator, AsyncGenerator

import pytest
from PIL import Image as PILImage
from openai import AsyncOpenAI as AsyncOpenAIClient, OpenAI as OpenAIClient
from pydantic import BaseModel, Field

import outlines
from outlines.inputs import Chat, Image, Video
from outlines.models.openai import AsyncOpenAI, OpenAI
from outlines.types import json_schema

# OpenAI model identifier used by the model fixtures and the direct-call tests.
MODEL_NAME = "gpt-4o-mini-2024-07-18"


@pytest.fixture(scope="session")
def api_key():
    """Return the OpenAI API key from the environment, or a placeholder.

    When `OPENAI_API_KEY` is unset or empty, "MOCK_VALUE" is returned instead.
    This fixture is meant for tests that make no actual API call but still
    need to initialize the OpenAI client.

    """
    return os.getenv("OPENAI_API_KEY") or "MOCK_VALUE"


@pytest.fixture(scope="session")
def image():
    """Return a 1x1 white RGB image reloaded from an in-memory PNG."""
    blank = PILImage.new("RGB", (1, 1), (255, 255, 255))

    # Write the image to an in-memory buffer as PNG, then reopen it from there
    png_buffer = io.BytesIO()
    blank.save(png_buffer, format="PNG")
    png_buffer.seek(0)
    return PILImage.open(png_buffer)


@pytest.fixture(scope="session")
def model(api_key):
    """Session-scoped sync `OpenAI` model bound to `MODEL_NAME`."""
    client = OpenAIClient(api_key=api_key)
    return OpenAI(client, MODEL_NAME)


@pytest.fixture(scope="session")
def async_model(api_key):
    """Session-scoped `AsyncOpenAI` model bound to `MODEL_NAME`."""
    client = AsyncOpenAIClient(api_key=api_key)
    return AsyncOpenAI(client, MODEL_NAME)


@pytest.fixture(scope="session")
def model_no_model_name(api_key):
    """Session-scoped sync `OpenAI` model created without a model name."""
    client = OpenAIClient(api_key=api_key)
    return OpenAI(client)


@pytest.fixture(scope="session")
def async_model_no_model_name(api_key):
    """Session-scoped `AsyncOpenAI` model created without a model name."""
    client = AsyncOpenAIClient(api_key=api_key)
    return AsyncOpenAI(client)


def test_openai_init_from_client(api_key):
    """`from_openai` wraps a sync client in `OpenAI`, with or without a model name."""
    client = OpenAIClient(api_key=api_key)

    # An explicit model name is stored on the wrapper
    named = outlines.from_openai(client, "gpt-4o")
    assert isinstance(named, OpenAI)
    assert named.client == client
    assert named.model_name == "gpt-4o"

    # Leaving the model name out results in `None`
    unnamed = outlines.from_openai(client)
    assert isinstance(unnamed, OpenAI)
    assert unnamed.client == client
    assert unnamed.model_name is None


def test_openai_wrong_inference_parameters(model):
    """An unknown inference keyword is expected to surface as a `TypeError`."""
    unknown_kwargs = {"foo": 10}
    with pytest.raises(TypeError, match="got an unexpected"):
        model.generate("prompt", **unknown_kwargs)


def test_openai_wrong_input_type(model, image):
    """Arbitrary objects and non-image assets are rejected as model input."""
    class Foo:
        def __init__(self, foo):
            self.foo = foo

    # An instance of an unrelated class is not a valid prompt
    with pytest.raises(TypeError, match="is not available"):
        model.generate(Foo("prompt"))

    # A list prompt may only contain Image assets alongside the text
    expected_message = "All assets provided must be of type Image"
    with pytest.raises(ValueError, match=expected_message):
        model.generate(["foo?", Image(image), Video("")])


def test_openai_wrong_output_type(model):
    """An instance of an unrelated class is rejected as output type."""
    class Foo:
        def __init__(self, foo):
            self.foo = foo

    expected_failure = pytest.raises(TypeError, match="is not available")
    with expected_failure:
        model.generate("prompt", Foo(1))


@pytest.mark.api_call
def test_openai_simple_call(model):
    """Plain-text generation yields a string."""
    prompt = "Respond with one word. Not more."
    answer = model.generate(prompt)
    assert isinstance(answer, str)


@pytest.mark.api_call
def test_openai_simple_call_multiple_samples(model):
    """Requesting `n=2` yields a list of two strings."""
    samples = model.generate("Respond with one word. Not more.", n=2)
    assert isinstance(samples, list)
    assert len(samples) == 2
    # Length is pinned to 2 above, so this covers exactly both samples
    for sample in samples:
        assert isinstance(sample, str)


@pytest.mark.api_call
def test_openai_direct_call(model_no_model_name):
    """Calling the model object directly works when `model` is passed per call."""
    per_call_kwargs = {"model": MODEL_NAME}
    answer = model_no_model_name(
        "Respond with one word. Not more.", **per_call_kwargs
    )
    assert isinstance(answer, str)


@pytest.mark.api_call
def test_openai_simple_vision(image, model):
    """A [text, image] prompt yields a string."""
    prompt = ["What does this logo represent?", Image(image)]
    answer = model.generate(prompt)
    assert isinstance(answer, str)


@pytest.mark.api_call
def test_openai_chat(image, model):
    """A `Chat` with a system turn and a text+image user turn yields a string."""
    system_turn = {"role": "system", "content": "You are a helpful assistant."}
    user_turn = {
        "role": "user",
        "content": ["What does this logo represent?", Image(image)]
    }
    conversation = Chat(messages=[system_turn, user_turn])
    answer = model.generate(conversation, max_tokens=10)
    assert isinstance(answer, str)


@pytest.mark.api_call
def test_openai_simple_pydantic(model):
    """A pydantic output type yields a JSON string containing the declared field."""
    class Foo(BaseModel):
        bar: int

    raw = model.generate("foo?", Foo)
    assert isinstance(raw, str)
    decoded = json.loads(raw)
    assert "bar" in decoded


@pytest.mark.api_call
def test_openai_simple_pydantic_refusal(model):
    """A schema the API rejects is expected to be reported as a `TypeError`."""
    class Foo(BaseModel):
        # NOTE(review): `int` is passed as the field default here; the expected
        # refusal may depend on the schema this produces, so it is kept as is.
        bar: Annotated[str, Field(int, pattern=r"^\d+$")]

    expected_failure = pytest.raises(
        TypeError, match="OpenAI does not support your schema"
    )
    with expected_failure:
        model.generate("foo?", Foo)


@pytest.mark.api_call
def test_openai_simple_vision_pydantic(image, model):
    """A [text, image] prompt with a pydantic output type yields matching JSON."""
    class Logo(BaseModel):
        name: int

    prompt = ["What does this logo represent?", Image(image)]
    raw = model.generate(prompt, Logo)
    assert isinstance(raw, str)
    decoded = json.loads(raw)
    assert "name" in decoded


@pytest.mark.api_call
def test_openai_simple_json_schema(model):
    """A `json_schema` output type built from a JSON string yields matching JSON."""
    class Foo(BaseModel):
        bar: int

    # Serialize the pydantic-generated schema so it is passed as a string
    schema_text = json.dumps(Foo.model_json_schema())
    output_type = json_schema(schema_text)

    raw = model.generate("foo?", output_type)
    assert isinstance(raw, str)
    decoded = json.loads(raw)
    assert "bar" in decoded


@pytest.mark.api_call
def test_openai_streaming(model):
    """`stream` returns a generator whose first item is a string."""
    stream = model.stream("Respond with one word. Not more.")
    assert isinstance(stream, Generator)
    first_chunk = next(stream)
    assert isinstance(first_chunk, str)


def test_openai_batch(model):
    """`batch` is expected to raise `NotImplementedError` for this model."""
    prompts = ["Respond with one word.", "Respond with one word."]
    with pytest.raises(NotImplementedError, match="does not support"):
        model.batch(prompts)


def test_openai_async_init_from_client(api_key):
    """`from_openai` wraps an async client in `AsyncOpenAI`, with or without a name."""
    client = AsyncOpenAIClient(api_key=api_key)

    # An explicit model name is stored on the wrapper
    named = outlines.from_openai(client, "gpt-4o")
    assert isinstance(named, AsyncOpenAI)
    assert named.client == client
    assert named.model_name == "gpt-4o"

    # Leaving the model name out results in `None`
    unnamed = outlines.from_openai(client)
    assert isinstance(unnamed, AsyncOpenAI)
    assert unnamed.client == client
    assert unnamed.model_name is None


@pytest.mark.asyncio
async def test_openai_async_wrong_inference_parameters(async_model):
    """An unknown inference keyword is expected to surface as a `TypeError`."""
    unknown_kwargs = {"foo": 10}
    with pytest.raises(TypeError, match="got an unexpected"):
        await async_model.generate("prompt", **unknown_kwargs)


@pytest.mark.asyncio
async def test_openai_async_wrong_input_type(async_model, image):
    """Arbitrary objects and non-image assets are rejected as model input."""
    class Foo:
        def __init__(self, foo):
            self.foo = foo

    # An instance of an unrelated class is not a valid prompt
    with pytest.raises(TypeError, match="is not available"):
        await async_model.generate(Foo("prompt"))

    # A list prompt may only contain Image assets alongside the text
    expected_message = "All assets provided must be of type Image"
    with pytest.raises(ValueError, match=expected_message):
        await async_model.generate(["foo?", Image(image), Video("")])


@pytest.mark.asyncio
async def test_openai_async_wrong_output_type(async_model):
    """An instance of an unrelated class is rejected as output type."""
    class Foo:
        def __init__(self, foo):
            self.foo = foo

    expected_failure = pytest.raises(TypeError, match="is not available")
    with expected_failure:
        await async_model.generate("prompt", Foo(1))


@pytest.mark.asyncio
@pytest.mark.api_call
async def test_openai_async_simple_call(async_model):
    """Plain-text async generation yields a string."""
    prompt = "Respond with one word. Not more."
    answer = await async_model.generate(prompt)
    assert isinstance(answer, str)


@pytest.mark.asyncio
@pytest.mark.api_call
async def test_openai_async_simple_call_multiple_samples(async_model):
    """Requesting `n=2` yields a list of two strings."""
    samples = await async_model.generate("Respond with one word. Not more.", n=2)
    assert isinstance(samples, list)
    assert len(samples) == 2
    # Length is pinned to 2 above, so this covers exactly both samples
    for sample in samples:
        assert isinstance(sample, str)


@pytest.mark.asyncio
@pytest.mark.api_call
async def test_openai_async_direct_call(async_model_no_model_name):
    """Calling the async model directly works when `model` is passed per call."""
    per_call_kwargs = {"model": MODEL_NAME}
    answer = await async_model_no_model_name(
        "Respond with one word. Not more.", **per_call_kwargs
    )
    assert isinstance(answer, str)


@pytest.mark.asyncio
@pytest.mark.api_call
async def test_openai_async_simple_vision(image, async_model):
    """A [text, image] prompt yields a string."""
    prompt = ["What does this logo represent?", Image(image)]
    answer = await async_model.generate(prompt)
    assert isinstance(answer, str)


@pytest.mark.asyncio
@pytest.mark.api_call
async def test_openai_async_chat(image, async_model):
    """A `Chat` with a system turn and a text+image user turn yields a string."""
    system_turn = {"role": "system", "content": "You are a helpful assistant."}
    user_turn = {
        "role": "user",
        "content": ["What does this logo represent?", Image(image)]
    }
    conversation = Chat(messages=[system_turn, user_turn])
    answer = await async_model.generate(conversation, max_tokens=10)
    assert isinstance(answer, str)


@pytest.mark.asyncio
@pytest.mark.api_call
async def test_openai_async_simple_pydantic(async_model):
    """A pydantic output type yields a JSON string containing the declared field."""
    class Foo(BaseModel):
        bar: int

    raw = await async_model.generate("foo?", Foo)
    assert isinstance(raw, str)
    decoded = json.loads(raw)
    assert "bar" in decoded


@pytest.mark.asyncio
@pytest.mark.api_call
async def test_openai_async_simple_pydantic_refusal(async_model):
    """A schema the API rejects is expected to be reported as a `TypeError`."""
    class Foo(BaseModel):
        # NOTE(review): `int` is passed as the field default here; the expected
        # refusal may depend on the schema this produces, so it is kept as is.
        bar: Annotated[str, Field(int, pattern=r"^\d+$")]

    expected_failure = pytest.raises(
        TypeError, match="OpenAI does not support your schema"
    )
    with expected_failure:
        await async_model.generate("foo?", Foo)


@pytest.mark.asyncio
@pytest.mark.api_call
async def test_openai_async_simple_vision_pydantic(image, async_model):
    """A [text, image] prompt with a pydantic output type yields matching JSON."""
    class Logo(BaseModel):
        name: int

    prompt = ["What does this logo represent?", Image(image)]
    raw = await async_model.generate(prompt, Logo)
    assert isinstance(raw, str)
    decoded = json.loads(raw)
    assert "name" in decoded


@pytest.mark.asyncio
@pytest.mark.api_call
async def test_openai_async_simple_json_schema(async_model):
    """A `json_schema` output type built from a JSON string yields matching JSON."""
    class Foo(BaseModel):
        bar: int

    # Serialize the pydantic-generated schema so it is passed as a string
    schema_text = json.dumps(Foo.model_json_schema())
    output_type = json_schema(schema_text)

    raw = await async_model.generate("foo?", output_type)
    assert isinstance(raw, str)
    decoded = json.loads(raw)
    assert "bar" in decoded


@pytest.mark.asyncio
@pytest.mark.api_call
async def test_openai_async_streaming(async_model):
    """`stream` returns an async generator whose first item is a string."""
    stream = async_model.stream("Respond with a single word.")
    assert isinstance(stream, AsyncGenerator)
    async for piece in stream:
        assert isinstance(piece, str)
        # Only the first chunk is inspected
        break


@pytest.mark.asyncio
async def test_openai_async_batch(async_model):
    """`batch` is expected to raise `NotImplementedError` for this model."""
    prompts = ["Respond with one word.", "Respond with one word."]
    with pytest.raises(NotImplementedError, match="does not support"):
        await async_model.batch(prompts)


================================================
FILE: tests/models/test_openai_type_adapter.py
================================================
import io
import json
import pytest
import sys
from dataclasses import dataclass
from typing import Literal

from genson import SchemaBuilder
from PIL import Image as PILImage
from pydantic import BaseModel

from outlines import cfg, json_schema, regex
from outlines.inputs import Chat, Image
from outlines.models.openai import OpenAITypeAdapter

if sys.version_info >= (3, 12):
    from typing import TypedDict
else:
    from typing_extensions import TypedDict


@pytest.fixture
def schema():
    """JSON schema the tests compare against for their `User` output types."""
    properties = {
        "user_id": {"title": "User Id", "type": "integer"},
        "name": {"title": "Name", "type": "string"},
    }
    return {
        "properties": properties,
        "required": ["user_id", "name"],
        "title": "User",
        "type": "object",
        "additionalProperties": False,
    }


@pytest.fixture
def image():
    """Return a 1x1 white RGB image reloaded from an in-memory PNG."""
    blank = PILImage.new("RGB", (1, 1), (255, 255, 255))

    # Write the image to an in-memory buffer as PNG, then reopen it from there
    png_buffer = io.BytesIO()
    blank.save(png_buffer, format="PNG")
    png_buffer.seek(0)
    return PILImage.open(png_buffer)


@pytest.fixture
def adapter():
    """Provide an `OpenAITypeAdapter` instance to the tests."""
    type_adapter = OpenAITypeAdapter()
    return type_adapter


def test_openai_type_adapter_input_text(adapter):
    """A bare string is formatted as a single user message."""
    prompt = "prompt"
    formatted = adapter.format_input(prompt)
    expected = [{"role": "user", "content": prompt}]
    assert formatted == expected


def test_openai_type_adapter_input_vision(adapter, image):
    """A [text, image] prompt becomes one user message with text and image parts."""
    picture = Image(image)
    prompt = "hello"
    formatted = adapter.format_input([prompt, picture])

    text_part = {"type": "text", "text": prompt}
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/png;base64,{picture.image_str}"
        },
    }
    assert formatted == [
        {"role": "user", "content": [text_part, image_part]},
    ]


def test_openai_type_adapter_input_chat(adapter, image):
    """Chat turns keep their roles; list content is expanded into typed parts."""
    picture = Image(image)
    conversation = Chat(messages=[
        {"role": "system", "content": "prompt"},
        {"role": "user", "content": ["hello", picture]},
        {"role": "assistant", "content": "response"},
    ])
    formatted = adapter.format_input(conversation)

    expected_user_content = [
        {"type": "text", "text": "hello"},
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/png;base64,{picture.image_str}"
            },
        },
    ]
    assert formatted == [
        {"role": "system", "content": "prompt"},
        {"role": "user", "content": expected_user_content},
        {"role": "assistant", "content": "response"},
    ]


def test_openai_type_adapter_input_invalid(adapter):
    """Unknown input objects, non-image assets and dict content all raise."""
    @dataclass
    class Audio:
        file: str

    # An unrelated object as the whole prompt
    with pytest.raises(TypeError, match="is not available"):
        adapter.format_input(Audio("file"))

    # An unrelated object as an asset next to the text
    asset_error = "All assets provided must be of type Image"
    with pytest.raises(ValueError, match=asset_error):
        adapter.format_input(["prompt", Audio("file")])

    # A dict as message content inside a Chat
    content_error = "The content must be a string or a list"
    with pytest.raises(ValueError, match=content_error):
        adapter.format_input(
            Chat(messages=[{"role": "user", "content": {"foo": "bar"}}])
        )


def test_openai_type_adapter_output_invalid(adapter):
    """Builtin, Literal, regex, CFG and list output types raise `TypeError`."""
    unavailable_builtins = (
        (str, "The type `str` is not available"),
        (int, "The type `int` is not available"),
    )
    for output_type, message in unavailable_builtins:
        with pytest.raises(TypeError, match=message):
            adapter.format_output_type(output_type)

    with pytest.raises(TypeError, match="The type `Literal` is not available"):
        adapter.format_output_type(Literal[1, 2])

    with pytest.raises(TypeError, match="Neither regex-based"):
        adapter.format_output_type(regex("[0-9]"))

    with pytest.raises(TypeError, match="CFG-based structured outputs"):
        adapter.format_output_type(cfg(""))

    class Foo(BaseModel):
        bar: str

    # A list of pydantic models is reported under the `list` type name
    with pytest.raises(TypeError, match="The type `list` is not available"):
        adapter.format_output_type(list[Foo])


def test_openai_type_adapter_output_none(adapter):
    """No output type results in an empty dict of extra arguments."""
    formatted = adapter.format_output_type(None)
    assert formatted == {}


def test_openai_type_adapter_json_mode(adapter):
    """The `dict` output type selects the `json_object` response format."""
    formatted = adapter.format_output_type(dict)
    expected = {"response_format": {"type": "json_object"}}
    assert formatted == expected


def test_openai_type_adapter_dataclass(adapter, schema):
    """A dataclass output type produces a strict `json_schema` response format."""
    @dataclass
    class User:
        user_id: int
        name: str

    formatted = adapter.format_output_type(User)
    assert isinstance(formatted, dict)
    spec = formatted["response_format"]["json_schema"]
    assert spec["strict"] is True
    assert spec["schema"] == schema


def test_openai_type_adapter_typed_dict(adapter, schema):
    """A `TypedDict` output type produces a strict `json_schema` response format."""
    class User(TypedDict):
        user_id: int
        name: str

    formatted = adapter.format_output_type(User)
    assert isinstance(formatted, dict)
    spec = formatted["response_format"]["json_schema"]
    assert spec["strict"] is True
    assert spec["schema"] == schema


def test_openai_type_adapter_pydantic(adapter, schema):
    """A pydantic output type produces a strict `json_schema` response format."""
    class User(BaseModel):
        user_id: int
        name: str

    formatted = adapter.format_output_type(User)
    assert isinstance(formatted, dict)
    spec = formatted["response_format"]["json_schema"]
    assert spec["strict"] is True
    assert spec["schema"] == schema


def test_openai_type_adapter_genson_schema_builder(adapter, schema):
    """A genson `SchemaBuilder` produces a strict `json_schema` response format."""
    schema_builder = SchemaBuilder()
    schema_builder.add_schema({"type": "object", "properties": {}})
    # Two sample objects whose "hi" values have different types
    for sample in ({"hi": "there"}, {"hi": 5}):
        schema_builder.add_object(sample)

    formatted = adapter.format_output_type(schema_builder)
    assert isinstance(formatted, dict)
    spec = formatted["response_format"]["json_schema"]
    assert spec["strict"] is True
    assert spec["schema"] == {
        "$schema": "http://json-schema.org/schema#",
        "type": "object",
        "properties": {"hi": {"type": ["integer", "string"]}},
        "required": ["hi"],
        # Not produced by genson itself; expected in the adapter's output
        "additionalProperties": False
    }


def test_openai_type_adapter_json_schema_str(adapter, schema):
    """A `json_schema` built from a JSON string yields a strict response format."""
    output_type = json_schema(json.dumps(schema))
    formatted = adapter.format_output_type(output_type)
    assert isinstance(formatted, dict)
    spec = formatted["response_format"]["json_schema"]
    assert spec["strict"] is True
    assert spec["schema"] == schema


def test_openai_type_adapter_json_schema_dict(adapter, schema):
    """A `json_schema` built from a dict yields a strict response format."""
    output_type = json_schema(schema)
    formatted = adapter.format_output_type(output_type)
    assert isinstance(formatted, dict)
    spec = formatted["response_format"]["json_schema"]
    assert spec["strict"] is True
    assert spec["schema"] == schema


================================================
FILE: tests/models/test_sglang.py
================================================
# ATTENTION: When running this test with an actual SGLang server, use the
# llguidance backend (--grammar-backend llguidance)
# The outlines backend does not support the EBNF grammar. The xgrammar
# backend is slow and buggy.

import io
import os
import re
import warnings
from typing import AsyncGenerator, Generator

import pytest
from PIL import Image as PILImage
from openai import AsyncOpenAI, OpenAI

from outlines.inputs import Chat, Image
from outlines.models.sglang import SGLang, AsyncSGLang, from_sglang
from outlines.types.dsl import CFG, Regex, JsonSchema
from tests.test_utils.mock_openai_client import MockOpenAIClient, MockAsyncOpenAIClient


# EBNF grammar whose only valid outputs are "yes" and "no"; used by the CFG
# test and as the `ebnf` payload of the matching mock response.
EBNF_YES_NO_GRAMMAR = """
root ::= answer
answer ::= "yes" | "no"
"""

# Image for testing: a 1x1 white RGB image, written to an in-memory buffer as
# PNG and reopened from it, then wrapped in an outlines `Image`.
# `image_input` is shared by the vision tests and the mock responses.
width, height = 1, 1
white_background = (255, 255, 255)
image = PILImage.new("RGB", (width, height), white_background)
buffer = io.BytesIO()
image.save(buffer, format="PNG")
buffer.seek(0)
image = PILImage.open(buffer)
image_input = Image(image)


# If the SGLANG_SERVER_URL environment variable is set, the tests talk to a
# real SGLang server through OpenAI clients pointed at that URL.
# Otherwise, mock clients are used and a warning is emitted at import time.
sglang_server_url = os.environ.get("SGLANG_SERVER_URL")
# Model name sent with each request; overridable through SGLANG_MODEL_NAME
sglang_model_name = os.environ.get(
    "SGLANG_MODEL_NAME", "qwen/qwen2.5-0.5b-instruct"
)
if sglang_server_url:
    openai_client = OpenAI(base_url=sglang_server_url, api_key="foo")
    async_openai_client = AsyncOpenAI(base_url=sglang_server_url, api_key="foo")
else:
    warnings.warn("No SGLang server URL provided, using mock server")
    openai_client = MockOpenAIClient()
    async_openai_client = MockAsyncOpenAIClient()

# Pairs of (request keyword arguments, canned response) registered on the mock
# clients when no real server is configured. Each entry mirrors the request
# issued by one of the tests in this module.
mock_responses = [
    # Simple text call
    (
        {
            'messages': [
                {'role': "user", 'content': 'Respond with a single word.'}
            ],
            'model': sglang_model_name,
        },
        "foo"
    ),
    # Streaming call: the response is a list of chunks
    (
        {
            'messages': [
                {'role': "user", 'content': 'Respond with a single word.'}
            ],
            'model': sglang_model_name,
            'stream': True
        },
        ["foo", "bar"]
    ),
    # Multiple samples (n=2): the response is a list of completions
    (
        {
            'messages': [
                {'role': "user", 'content': 'Respond with a single word.'}
            ],
            'n': 2,
            'model': sglang_model_name,
        },
        ["foo", "bar"]
    ),
    # JSON schema structured output
    (
        {
            'messages': [{'role': "user", 'content': 'foo?'}],
            'model': sglang_model_name,
            'max_tokens': 10,
            'response_format': {
                'type': 'json_schema',
                'json_schema': {
                    'name': 'default',
                    'strict': True,
                    'schema': {
                        'type': 'object',
                        'properties': {'bar': {'type': 'string'}},
                        'additionalProperties': False
                    }
                }
            }
        },
        '{"foo": "bar"}'
    ),
    # Regex structured output, passed through `extra_body`
    (
        {
            'messages': [{'role': "user", 'content': 'foo?'}],
            'model': sglang_model_name,
            'max_tokens': 10,
            'extra_body': {
                'regex': '([0-9]{3})',
            },
        },
        "123"
    ),
    # EBNF grammar structured output, passed through `extra_body`
    (
        {
            'messages': [{'role': "user", 'content': 'foo?'}],
            'model': sglang_model_name,
            'max_tokens': 10,
            'extra_body': {
                'ebnf': EBNF_YES_NO_GRAMMAR,
            },
        },
        "yes"
    ),
    # Vision call: a single user message with text and image parts
    (
        {
            'messages': [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "hello"},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{image_input.image_str}"
                            },
                        },
                    ]
                }
            ],
            'model': sglang_model_name,
            'max_tokens': 10,
        },
        "foo"
    ),
    # Vision chat: system, user (text + image) and assistant messages
    (
        {
            'messages': [
                {"role": "system", "content": "prompt"},
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "hello"},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{image_input.image_str}"
                            },
                        },
                    ],
                },
                {"role": "assistant", "content": "response"},
            ],
            'model': sglang_model_name,
            'max_tokens': 10,
        },
        "foo"
    )
]


# If the SGLANG_SERVER_URL environment variable is not set, the clients are
# the mock ones created above: register the canned responses on both of them.
if not sglang_server_url:
    async_openai_client.add_mock_responses(mock_responses)
    openai_client.add_mock_responses(mock_responses)


@pytest.fixture
def sync_model():
    """Sync `SGLang` model bound to the configured model name."""
    model = SGLang(openai_client, model_name=sglang_model_name)
    return model


@pytest.fixture
def sync_model_no_model_name():
    """Sync `SGLang` model created without a model name."""
    model = SGLang(openai_client)
    return model


@pytest.fixture
def async_model():
    """`AsyncSGLang` model bound to the configured model name."""
    model = AsyncSGLang(async_openai_client, model_name=sglang_model_name)
    return model


@pytest.fixture
def async_model_no_model_name():
    """`AsyncSGLang` model created without a model name."""
    model = AsyncSGLang(async_openai_client)
    return model


def test_sglang_init():
    """`from_sglang` picks the wrapper class from the client type.

    Real `OpenAI` / `AsyncOpenAI` clients are built here rather than using
    the mock server, because the init function needs objects of those types.
    """
    sync_client = OpenAI(base_url="http://localhost:11434", api_key="foo")
    async_client = AsyncOpenAI(base_url="http://localhost:11434", api_key="foo")

    # Sync first, then async; each one with and without a model name
    cases = (
        (sync_client, SGLang),
        (async_client, AsyncSGLang),
    )
    for client, expected_class in cases:
        named = from_sglang(client, sglang_model_name)
        assert isinstance(named, expected_class)
        assert named.client == client
        assert named.model_name == sglang_model_name

        unnamed = from_sglang(client)
        assert isinstance(unnamed, expected_class)
        assert unnamed.client == client
        assert unnamed.model_name is None

    # Anything that is not a supported client is rejected
    with pytest.raises(ValueError, match="Unsupported client type"):
        from_sglang("foo")


def test_sglang_sync_simple_call(sync_model):
    """Calling the sync model with a text prompt yields a string."""
    prompt = "Respond with a single word."
    answer = sync_model(prompt)
    assert isinstance(answer, str)


def test_sglang_sync_streaming(sync_model_no_model_name):
    """`stream` returns a generator whose first item is a string."""
    stream = sync_model_no_model_name.stream(
        "Respond with a single word.", model=sglang_model_name
    )
    assert isinstance(stream, Generator)
    first_chunk = next(stream)
    assert isinstance(first_chunk, str)


def test_sglang_sync_batch(sync_model):
    """`batch` is expected to raise `NotImplementedError` for this model."""
    prompts = ["Respond with one word.", "Respond with one word."]
    with pytest.raises(NotImplementedError, match="does not support"):
        sync_model.batch(prompts)


def test_sglang_sync_vision(sync_model):
    """A [text, image] prompt yields a string."""
    prompt = ["hello", image_input]
    answer = sync_model(prompt, max_tokens=10)
    assert isinstance(answer, str)


def test_sglang_sync_vision_chat(sync_model):
    """A `Chat` mixing text-only turns and a text+image turn yields a string."""
    conversation = Chat(messages=[
        {"role": "system", "content": "prompt"},
        {"role": "user", "content": ["hello", image_input]},
        {"role": "assistant", "content": "response"},
    ])
    answer = sync_model(conversation, max_tokens=10)
    assert isinstance(answer, str)


def test_sglang_sync_multiple_samples(sync_model):
    """Requesting `n=2` yields a list of two strings."""
    samples = sync_model("Respond with a single word.", n=2)
    assert isinstance(samples, list)
    assert len(samples) == 2
    # Length is pinned to 2 above, so this covers exactly both samples
    for sample in samples:
        assert isinstance(sample, str)


def test_sglang_sync_json(sync_model):
    """A `JsonSchema` output type yields a string mentioning the `bar` key."""
    schema_text = '{"type": "object", "properties": {"bar": {"type": "string"}}}'
    output_type = JsonSchema(schema_text)
    answer = sync_model("foo?", output_type, max_tokens=10)
    assert isinstance(answer, str)
    assert "bar" in answer


def test_sglang_sync_regex(sync_model):
    """A `Regex` output type yields a string starting with three digits."""
    pattern = r"[0-9]{3}"
    output = sync_model("foo?", Regex(pattern), max_tokens=10)
    assert isinstance(output, str)
    assert re.match(pattern, output)


def test_sglang_sync_cfg(sync_model):
    """A `CFG` output type emits the EBNF warning and answers "yes"/"no"."""
    expected_warning = "SGLang grammar-based structured outputs expects an EBNF"
    with pytest.warns(UserWarning, match=expected_warning):
        output = sync_model("foo?", CFG(EBNF_YES_NO_GRAMMAR), max_tokens=10)
        assert isinstance(output, str)
        assert output in ["yes", "no"]


@pytest.mark.asyncio
async def test_sglang_async_simple_call(async_model):
    """A plain text prompt awaited on the async model yields a string."""
    prompt = "Respond with a single word."
    output = await async_model(prompt)
    assert isinstance(output, str)


@pytest.mark.asyncio
async def test_sglang_async_streaming(async_model_no_model_name):
    """Async streaming with a call-time model name yields str chunks."""
    stream = async_model_no_model_name.stream(
        "Respond with a single word.", model=sglang_model_name
    )
    assert isinstance(stream, AsyncGenerator)
    async for piece in stream:
        assert isinstance(piece, str)
        # Inspecting the first chunk is enough.
        break


@pytest.mark.asyncio
async def test_sglang_async_batch(async_model):
    """Awaiting `batch` on the async model raises `NotImplementedError`."""
    prompts = ["Respond with one word.", "Respond with one word."]
    with pytest.raises(NotImplementedError, match="does not support"):
        await async_model.batch(prompts)


@pytest.mark.asyncio
async def test_sglang_async_vision(async_model):
    """A [text, image] prompt list awaited on the async model gives a str."""
    prompt = ["hello", image_input]
    output = await async_model(prompt, max_tokens=10)
    assert isinstance(output, str)


@pytest.mark.asyncio
async def test_sglang_async_vision_chat(async_model):
    """A `Chat` mixing text and an image in the user turn returns a str."""
    messages = [
        {"role": "system", "content": "prompt"},
        {"role": "user", "content": ["hello", image_input]},
        {"role": "assistant", "content": "response"},
    ]
    output = await async_model(Chat(messages=messages), max_tokens=10)
    assert isinstance(output, str)


@pytest.mark.asyncio
async def test_sglang_async_multiple_samples(async_model):
    """Requesting `n=2` asynchronously returns a list of two strings."""
    samples = await async_model("Respond with a single word.", n=2)
    assert isinstance(samples, list)
    assert len(samples) == 2
    first, second = samples
    assert isinstance(first, str)
    assert isinstance(second, str)


@pytest.mark.asyncio
async def test_sglang_async_json(async_model):
    """A `JsonSchema` output type yields a string mentioning the `bar` key."""
    schema = '{"type": "object", "properties": {"bar": {"type": "string"}}}'
    output = await async_model("foo?", JsonSchema(schema), max_tokens=10)
    assert isinstance(output, str)
    assert "bar" in output


@pytest.mark.asyncio
async def test_sglang_async_regex(async_model):
    """A `Regex` output type yields a string starting with three digits."""
    pattern = r"[0-9]{3}"
    output = await async_model("foo?", Regex(pattern), max_tokens=10)
    assert isinstance(output, str)
    assert re.match(pattern, output)


@pytest.mark.asyncio
async def test_sglang_async_cfg(async_model):
    """A `CFG` output type warns about EBNF and answers "yes" or "no".

    The synchronous CFG test in this module asserts the `UserWarning`
    emitted for grammar-based outputs; this test asserts it as well so the
    warning is checked on the async path instead of leaking into the test
    run unverified.
    """
    with pytest.warns(
        UserWarning,
        match="SGLang grammar-based structured outputs expects an EBNF"
    ):
        result = await async_model(
            "foo?", CFG(EBNF_YES_NO_GRAMMAR), max_tokens=10
        )
    assert isinstance(result, str)
    assert result in ["yes", "no"]


================================================
FILE: tests/models/test_sglang_type_adapter.py
================================================
import io
import json
import pytest
from dataclasses import dataclass

from PIL import Image as PILImage

from outlines.inputs import Chat, Image
from outlines.models.sglang import SGLangTypeAdapter
from outlines.types import CFG, JsonSchema


# Grammar text wrapped in `CFG` by the `cfg_instance` fixture and compared
# verbatim against the adapter's `ebnf` output.
CFG_STRING = """
?start: expr
?expr: NUMBER
"""

# JSON schema text wrapped in `JsonSchema` by the json-schema fixtures; it is
# also parsed with `json.loads` to build the expected adapter output.
JSON_SCHEMA_STRING = """
{
    "type": "object",
    "properties": {
        "answer": {"type": "number"}
    }
}
"""


@pytest.fixture
def type_adapter():
    """Provide a freshly constructed `SGLangTypeAdapter`."""
    adapter = SGLangTypeAdapter()
    return adapter

@pytest.fixture
def cfg_instance():
    """Provide a `CFG` built from the module-level `CFG_STRING`."""
    grammar = CFG(CFG_STRING)
    return grammar

@pytest.fixture
def json_schema_instance():
    """Provide a `JsonSchema` built from `JSON_SCHEMA_STRING`."""
    schema = JsonSchema(JSON_SCHEMA_STRING)
    return schema

@pytest.fixture
def json_schema_whitespace_instance():
    """Provide a `JsonSchema` that also sets `whitespace_pattern` to a newline."""
    schema = JsonSchema(JSON_SCHEMA_STRING, whitespace_pattern="\n")
    return schema

@pytest.fixture
def image():
    """Provide a 1x1 white RGB image re-opened from in-memory PNG bytes."""
    size = (1, 1)
    white = (255, 255, 255)

    # Write the picture to a bytes buffer as PNG, then rewind and re-open it.
    png_buffer = io.BytesIO()
    PILImage.new("RGB", size, white).save(png_buffer, format="PNG")
    png_buffer.seek(0)

    return PILImage.open(png_buffer)


def test_sglang_type_adapter_input_text(type_adapter):
    """A bare string is formatted as a single user message."""
    prompt = "prompt"
    expected = [{"role": "user", "content": prompt}]
    formatted = type_adapter.format_input(prompt)
    assert formatted == expected


def test_sglang_type_adapter_input_vision(type_adapter, image):
    """A [text, Image] list becomes one user message with text + image parts."""
    image_input = Image(image)
    formatted = type_adapter.format_input(["hello", image_input])

    text_part = {"type": "text", "text": "hello"}
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/png;base64,{image_input.image_str}"
        },
    }
    expected = [{"role": "user", "content": [text_part, image_part]}]
    assert formatted == expected


def test_sglang_type_adapter_input_chat(type_adapter, image):
    """A `Chat` keeps its text turns and expands the mixed user turn."""
    image_input = Image(image)
    chat = Chat(messages=[
        {"role": "system", "content": "prompt"},
        {"role": "user", "content": ["hello", image_input]},
        {"role": "assistant", "content": "response"},
    ])
    formatted = type_adapter.format_input(chat)

    expected_user_content = [
        {"type": "text", "text": "hello"},
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/png;base64,{image_input.image_str}"
            },
        },
    ]
    expected = [
        {"role": "system", "content": "prompt"},
        {"role": "user", "content": expected_user_content},
        {"role": "assistant", "content": "response"},
    ]
    assert formatted == expected


def test_sglang_type_adapter_input_invalid(type_adapter):
    """An input of an unknown local type is rejected with `TypeError`."""
    @dataclass
    class Audio:
        file: str

    unsupported = Audio("file")
    with pytest.raises(TypeError, match="The input type"):
        type_adapter.format_input(unsupported)


def test_sglang_type_adapter_output_type(
    type_adapter,
    cfg_instance,
    json_schema_instance,
    json_schema_whitespace_instance,
):
    """Check `format_output_type` for None, CFG, JSON schema and `int`."""
    # Both JSON-schema fixtures are expected to produce this same payload.
    expected_json_args = {
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "name": "default",
                "strict": True,
                "schema": {
                    **json.loads(JSON_SCHEMA_STRING),
                    "additionalProperties": False,
                },
            },
        }
    }
    expected_warning = "SGLang grammar-based structured outputs expects an EBNF"

    assert type_adapter.format_output_type(None) == {}

    with pytest.warns(UserWarning, match=expected_warning):
        cfg_args = type_adapter.format_output_type(cfg_instance)
        assert cfg_args == {"extra_body": {"ebnf": CFG_STRING}}

    json_args = type_adapter.format_output_type(json_schema_instance)
    assert json_args == expected_json_args

    # whitespace pattern is ignored
    whitespace_args = type_adapter.format_output_type(
        json_schema_whitespace_instance
    )
    assert whitespace_args == expected_json_args

    int_args = type_adapter.format_output_type(int)
    assert int_args == {"extra_body": {"regex": "([+-]?(0|[1-9][0-9]*))"}}


================================================
FILE: tests/models/test_tgi.py
================================================
import os
import re
import warnings
from typing import AsyncGenerator, Generator

import pytest
from huggingface_hub import InferenceClient, AsyncInferenceClient

from outlines.models.tgi import TGI, AsyncTGI, from_tgi
from outlines.types.dsl import CFG, Regex, JsonSchema
from tests.test_utils.mock_tgi_client import MockTGIInferenceClient, MockAsyncTGIInferenceClient


# Grammar wrapped in `CFG` by the CFG tests in this module, which expect the
# TGI models to raise `NotImplementedError` for it.
YES_NO_GRAMMAR = """
?start: answer

answer: "yes" | "no"
"""

# If the TGI_SERVER_URL environment variable is set, use the real TGI server
# Otherwise, use the mock server
tgi_server_url = os.environ.get("TGI_SERVER_URL")
if tgi_server_url:
    tgi_client = InferenceClient(tgi_server_url)
    async_tgi_client = AsyncInferenceClient(tgi_server_url)
else:
    # Emitted at import time so the test run shows the mock clients are in use.
    warnings.warn("No TGI server URL provided, using mock server")
    tgi_client = MockTGIInferenceClient()
    async_tgi_client = MockAsyncTGIInferenceClient()

# Pairs of (request keyword arguments, canned response) registered on the
# mock clients below. NOTE(review): presumably the mock looks a response up
# by the exact arguments it receives and treats a list as stream chunks --
# confirm in tests/test_utils/mock_tgi_client.py.
mock_responses = [
    (
        {
            'prompt': 'Respond with a single word.',
            'max_new_tokens': 10,
        },
        "foo"
    ),
    (
        {
            'prompt': 'Respond with a single word.',
            'max_new_tokens': 10,
            'stream': True
        },
        ["foo", "bar"]
    ),
    (
        {
            'prompt': 'foo?',
            'max_new_tokens': 10,
            'grammar': {
                'type': 'json',
                'value': {
                    'type': 'object',
                    'properties': {
                        'bar': {'type': 'string'}
                    },
                    'required': ['bar']
                }
            }
        },
        '{"foo": "bar"}'
    ),
    (
        {
            'prompt': 'foo?',
            'max_new_tokens': 10,
            'grammar': {
                'type': 'regex',
                'value': '([0-9]{3})',
            },
        },
        "123"
    ),
]

# If the TGI_SERVER_URL environment variable is not set, add the mock
# responses to the mock clients
if not tgi_server_url:
    async_tgi_client.add_mock_responses(mock_responses)
    tgi_client.add_mock_responses(mock_responses)


@pytest.fixture
def sync_model():
    """Provide a `TGI` model wrapping the module-level sync client."""
    tgi_model = TGI(tgi_client)
    return tgi_model


@pytest.fixture
def async_model():
    """Provide an `AsyncTGI` model wrapping the module-level async client."""
    tgi_model = AsyncTGI(async_tgi_client)
    return tgi_model


def test_tgi_init():
    """`from_tgi` maps each client class to its model and rejects others."""
    server_url = "http://localhost:11434"

    sync_built = from_tgi(InferenceClient(server_url))
    assert isinstance(sync_built, TGI)

    async_built = from_tgi(AsyncInferenceClient(server_url))
    assert isinstance(async_built, AsyncTGI)

    # Anything that is not one of the two client classes is refused.
    with pytest.raises(ValueError, match="Unsupported client type"):
        from_tgi("foo")


def test_tgi_sync_simple_call(sync_model):
    """A plain text prompt sent through the sync model yields a string."""
    prompt = "Respond with a single word."
    output = sync_model(prompt, max_new_tokens=10)
    assert isinstance(output, str)


def test_tgi_sync_streaming(sync_model):
    """`stream` returns a generator whose first item is a string."""
    stream = sync_model.stream("Respond with a single word.", max_new_tokens=10)
    assert isinstance(stream, Generator)
    first_chunk = next(stream)
    assert isinstance(first_chunk, str)


def test_tgi_sync_batch(sync_model):
    """Calling `batch` on the sync model raises `NotImplementedError`."""
    prompts = ["Respond with one word.", "Respond with one word."]
    with pytest.raises(NotImplementedError, match="does not support"):
        sync_model.batch(prompts)


def test_tgi_sync_json(sync_model):
    """A `JsonSchema` output type yields a string containing "bar"."""
    schema = (
        '{"type": "object", "properties": {"bar": {"type": "string"}}, '
        '"required": ["bar"]}'
    )
    output = sync_model("foo?", JsonSchema(schema), max_new_tokens=10)
    assert isinstance(output, str)
    assert "bar" in output


def test_tgi_sync_regex(sync_model):
    """A `Regex` output type yields a string starting with three digits."""
    pattern = r"[0-9]{3}"
    output = sync_model("foo?", Regex(pattern), max_new_tokens=10)
    assert isinstance(output, str)
    assert re.match(pattern, output)


def test_tgi_sync_cfg(sync_model):
    """A `CFG` output type is refused with `NotImplementedError`."""
    expected_message = "TGI does not support CFG-based structured outputs"
    with pytest.raises(NotImplementedError, match=expected_message):
        sync_model("foo?", CFG(YES_NO_GRAMMAR), max_new_tokens=10)


@pytest.mark.asyncio
async def test_tgi_async_simple_call(async_model):
    """A plain text prompt awaited on the async model yields a string."""
    prompt = "Respond with a single word."
    output = await async_model(prompt, max_new_tokens=10)
    assert isinstance(output, str)


@pytest.mark.asyncio
async def test_tgi_async_streaming(async_model):
    """`stream` returns an async generator whose first item is a string."""
    stream = async_model.stream(
        "Respond with a single word.",
        max_new_tokens=10,
    )
    assert isinstance(stream, AsyncGenerator)
    async for piece in stream:
        assert isinstance(piece, str)
        # Inspecting the first chunk is enough.
        break


@pytest.mark.asyncio
async def test_tgi_async_batch(async_model):
    """Awaiting `batch` on the async model raises `NotImplementedError`."""
    prompts = ["Respond with one word.", "Respond with one word."]
    with pytest.raises(NotImplementedError, match="does not support"):
        await async_model.batch(prompts)


@pytest.mark.asyncio
async def test_tgi_async_json(async_model):
    """A `JsonSchema` output type yields a string containing "bar"."""
    schema = (
        '{"type": "object", "properties": {"bar": {"type": "string"}}, '
        '"required": ["bar"]}'
    )
    output = await async_model("foo?", JsonSchema(schema), max_new_tokens=10)
    assert isinstance(output, str)
    assert "bar" in output


@pytest.mark.asyncio
async def test_tgi_async_regex(async_model):
    """A `Regex` output type yields a string starting with three digits."""
    pattern = r"[0-9]{3}"
    output = await async_model("foo?", Regex(pattern), max_new_tokens=10)
    assert isinstance(output, str)
    assert re.match(pattern, output)


@pytest.mark.asyncio
async def test_tgi_async_cfg(async_model):
    """A `CFG` output type is refused with `NotImplementedError`."""
    expected_message = "TGI does not support CFG-based structured outputs"
    with pytest.raises(NotImplementedError, match=expected_message):
        await async_model("foo?", CFG(YES_NO_GRAMMAR), max_new_tokens=10)


================================================
FILE: tests/models/test_tgi_model_adapter.py
================================================
import json
import pytest

from outlines.models.tgi import TGITypeAdapter
from outlines.types import CFG, JsonSchema


# Grammar text wrapped in `CFG` by the `cfg_instance` fixture; the adapter is
# expected to reject it.
CFG_STRING = """
?start: expr
?expr: NUMBER
"""

# JSON schema text wrapped in `JsonSchema` by the json-schema fixtures; it is
# also parsed with `json.loads` to build the expected grammar value.
JSON_SCHEMA_STRING = """
{
    "type": "object",
    "properties": {
        "answer": {"type": "number"}
    }
}
"""


@pytest.fixture
def type_adapter():
    """Provide a freshly constructed `TGITypeAdapter`."""
    adapter = TGITypeAdapter()
    return adapter

@pytest.fixture
def cfg_instance():
    """Provide a `CFG` built from the module-level `CFG_STRING`."""
    grammar = CFG(CFG_STRING)
    return grammar

@pytest.fixture
def json_schema_instance():
    """Provide a `JsonSchema` built from `JSON_SCHEMA_STRING`."""
    schema = JsonSchema(JSON_SCHEMA_STRING)
    return schema

@pytest.fixture
def json_schema_whitespace_instance():
    """Provide a `JsonSchema` that also sets `whitespace_pattern` to a newline."""
    schema = JsonSchema(JSON_SCHEMA_STRING, whitespace_pattern="\n")
    return schema


def test_tgi_type_adapter_input_text(type_adapter):
    """A string prompt comes back from `format_input` equal to itself."""
    prompt = "prompt"
    formatted = type_adapter.format_input(prompt)
    assert prompt == formatted


def test_tgi_type_adapter_input_invalid(type_adapter):
    """A dict input is refused with `NotImplementedError`."""
    expected_message = "is not available with TGI"
    with pytest.raises(NotImplementedError, match=expected_message):
        type_adapter.format_input({"foo": "bar"})


def test_tgi_type_adapter_output_type(
    type_adapter,
    json_schema_instance,
    json_schema_whitespace_instance,
):
    """Check `format_output_type` for None, JSON schema and `int`."""
    # Both JSON-schema fixtures are expected to produce this same payload.
    expected_json_args = {
        "grammar": {
            "type": "json",
            "value": json.loads(JSON_SCHEMA_STRING),
        }
    }
    expected_int_args = {
        "grammar": {
            "type": "regex",
            "value": "([+-]?(0|[1-9][0-9]*))",
        }
    }

    assert type_adapter.format_output_type(None) == {}

    json_args = type_adapter.format_output_type(json_schema_instance)
    assert json_args == expected_json_args

    # whitespace_pattern is ignored
    whitespace_args = type_adapter.format_output_type(
        json_schema_whitespace_instance
    )
    assert whitespace_args == expected_json_args

    assert type_adapter.format_output_type(int) == expected_int_args


def test_tgi_type_adapter_output_type_invalid(
    type_adapter,
    cfg_instance,
):
    """A `CFG` output type is refused with `NotImplementedError`."""
    expected_message = "TGI does not support CFG-based structured outputs."
    with pytest.raises(NotImplementedError, match=expected_message):
        type_adapter.format_output_type(cfg_instance)


================================================
FILE: tests/models/test_tokenizer.py
================================================
import pytest

from outlines.models.tokenizer import Tokenizer, _check_hf_chat_template


def test_tokenizer():
    """Instantiating `Tokenizer` directly raises the abstract-class `TypeError`."""
    expected_message = "instantiate abstract"
    with pytest.raises(TypeError, match=expected_message):
        Tokenizer()

def test_check_hf_chat_template():
    """`_check_hf_chat_template` is False for gpt2 and True for Qwen3-0.6B."""
    from transformers import AutoTokenizer

    expectations = (
        ("openai-community/gpt2", False),
        ("Qwen/Qwen3-0.6B", True),
    )
    for checkpoint, has_template in expectations:
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        assert _check_hf_chat_template(tokenizer) is has_template


================================================
FILE: tests/models/test_transformers.py
================================================
import re
from enum import Enum

from pydantic import BaseModel
import pytest
import torch
import transformers

import outlines
from outlines.inputs import Chat
from outlines.models.transformers import (
    Transformers,
    TransformerTokenizer,
    TransformersTypeAdapter,
)
from outlines.types import Regex


# Checkpoint loaded for the causal-LM tests and the `model` fixture; its
# tokenizer is also paired with the Mamba checkpoint below.
TEST_MODEL = "erwanf/gpt2-mini"
# Checkpoint loaded through `MambaForCausalLM` in the Mamba instantiation test.
TEST_MODEL_MAMBA = "hf-internal-testing/tiny-random-MambaForCausalLM"
# Checkpoint loaded through the Bart classes by the `model_bart` fixture.
TEST_MODEL_BART = "trl-internal-testing/tiny-BartModel"


def test_transformers_instantiate_invalid():
    """`from_transformers` raises `ValueError` when given `int` as tokenizer.

    The checkpoint is loaded before entering `pytest.raises` so that only
    the `from_transformers` call is covered by the expectation; a
    `ValueError` raised while loading the model can no longer make the test
    pass by accident.
    """
    hf_model = transformers.AutoModelForCausalLM.from_pretrained(TEST_MODEL)
    with pytest.raises(ValueError):
        outlines.from_transformers(hf_model, int)


def test_transformers_instantiate_simple():
    """`from_transformers` wires up the model, tokenizer and type adapter."""
    hf_model = transformers.AutoModelForCausalLM.from_pretrained(TEST_MODEL)
    hf_tokenizer = transformers.AutoTokenizer.from_pretrained(TEST_MODEL)
    wrapped = outlines.from_transformers(hf_model, hf_tokenizer)

    assert isinstance(wrapped, Transformers)
    assert isinstance(wrapped.tokenizer, TransformerTokenizer)
    assert isinstance(wrapped.type_adapter, TransformersTypeAdapter)
    assert wrapped.tensor_library_name == "torch"


def test_transformers_instantiate_mamba():
    """A Mamba checkpoint with the `TEST_MODEL` tokenizer gives `Transformers`."""
    mamba = transformers.MambaForCausalLM.from_pretrained(TEST_MODEL_MAMBA)
    hf_tokenizer = transformers.AutoTokenizer.from_pretrained(TEST_MODEL)
    wrapped = outlines.from_transformers(mamba, hf_tokenizer)
    assert isinstance(wrapped, Transformers)
    assert isinstance(model, Transformers)


def test_transformers_instantiate_tokenizer_kwargs_dtype():
    """Extra special tokens and `device_dtype` are visible on the model."""
    hf_model = transformers.AutoModelForCausalLM.from_pretrained(TEST_MODEL)
    hf_tokenizer = transformers.AutoTokenizer.from_pretrained(
        TEST_MODEL, additional_special_tokens=["<t1>", "<t2>"]
    )
    wrapped = outlines.from_transformers(
        hf_model,
        hf_tokenizer,
        device_dtype=torch.bfloat16,
    )

    special_tokens = wrapped.tokenizer.special_tokens
    assert "<t1>" in special_tokens
    assert "<t2>" in special_tokens
    assert wrapped.device_dtype == torch.bfloat16


@pytest.fixture
def model():
    """Provide a `TEST_MODEL` outlines model with a chat template installed."""
    hf_model = transformers.AutoModelForCausalLM.from_pretrained(TEST_MODEL)
    hf_tokenizer = transformers.AutoTokenizer.from_pretrained(TEST_MODEL)
    wrapped = outlines.from_transformers(hf_model, hf_tokenizer)

    # Template that renders every message as "<role>: <content>".
    wrapped.type_adapter.tokenizer.chat_template = (
        '{% for message in messages %}'
        '{{ message.role }}: {{ message.content }}'
        '{% endfor %}'
    )

    return wrapped


@pytest.fixture
def model_bart():
    """Provide an outlines model built from the Bart test checkpoint."""
    bart = transformers.BartForConditionalGeneration.from_pretrained(
        TEST_MODEL_BART
    )
    bart_tokenizer = transformers.BartTokenizer.from_pretrained(TEST_MODEL_BART)
    return outlines.from_transformers(bart, bart_tokenizer)


def test_transformers_simple(model):
    """`generate` with no output type returns a string."""
    prompt = "Respond with one word. Not more."
    output = model.generate(prompt, None)
    assert isinstance(output, str)


def test_transformers_call(model, model_bart):
    """Calling the model returns a string, also with bfloat16 and with Bart."""
    prompt = "Respond with one word. Not more."

    default_output = model(prompt)
    assert isinstance(default_output, str)

    # Same call after switching the model's `device_dtype`.
    model.device_dtype = torch.bfloat16
    bfloat16_output = model(prompt)
    assert isinstance(bfloat16_output, str)

    bart_output = model_bart(prompt)
    assert isinstance(bart_output, str)


def test_transformers_chat(model):
    """A `Chat` with a system and a user turn produces a string."""
    conversation = Chat(messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
    ])
    output = model(conversation)
    assert isinstance(output, str)


def test_transformers_inference_kwargs(model):
    """The `max_new_tokens` inference keyword is accepted."""
    prompt = "Respond with one word. Not more."
    output = model(prompt, max_new_tokens=100)
    assert isinstance(output, str)


def test_transformers_invalid_inference_kwargs(model):
    """An unknown inference keyword results in `ValueError`."""
    prompt = "Respond with one word. Not more."
    with pytest.raises(ValueError):
        model(prompt, foo="bar")


def test_transformers_regex(model):
    """A single-digit `Regex` output type yields a string starting with a digit."""
    digit = r"[0-9]"
    output = model("Give a number between 0 and 9.", Regex(digit))
    assert isinstance(output, str)
    assert re.match(digit, output)


def test_transformers_json(model):
    """A pydantic model output type yields output mentioning its field."""
    class Character(BaseModel):
        name: str

    prompt = "Create a character with a name."
    output = model(prompt, Character)
    assert "name" in output


def test_transformers_choice(model):
    """An `Enum` output type restricts the answer to its member values."""
    class Foo(Enum):
        cat = "cat"
        dog = "dog"

    answer = model("Cat or dog?", Foo)
    assert answer in ["cat", "dog"]


def test_transformers_multiple_samples(model):
    """One sample gives a str; `num_return_sequences=2` gives a 2-item list."""
    prompt = "Respond with one word. Not more."

    single = model(prompt)
    assert isinstance(single, str)

    several = model(prompt, num_return_sequences=2, do_sample=True)
    assert isinstance(several, list)
    assert len(several) == 2


def test_transformers_batch(model):
    """`batch` handles text prompts, multiple samples and `Chat` inputs."""
    prompt = "Respond with one word. Not more."

    def build_chat():
        # A fresh `Chat` per call, so each batch entry is its own object.
        return Chat(messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "What is the capital of France?"},
        ])

    # Two plain text prompts.
    text_outputs = model.batch([prompt, prompt])
    assert isinstance(text_outputs, list)
    assert len(text_outputs) == 2

    # Two prompts with two sampled sequences each.
    sampled_outputs = model.batch(
        [prompt, prompt],
        num_return_sequences=2,
        do_sample=True,
    )
    assert isinstance(sampled_outputs, list)
    assert len(sampled_outputs) == 2
    for samples in sampled_outputs:
        assert isinstance(samples, list)
        assert len(samples) == 2

    # Two chat conversations.
    chat_outputs = model.batch([build_chat(), build_chat()])
    assert isinstance(chat_outputs, list)
    assert len(chat_outputs) == 2


def test_transformers_multiple_samples_constrained(model):
    """Two sampled sequences under an `Enum` constraint are both valid."""
    class Foo(Enum):
        cat = "cat"
        dog = "dog"

    samples = model("Cat or dog?", Foo, num_return_sequences=2, do_sample=True)
    assert isinstance(samples, list)
    assert len(samples) == 2
    first, second = samples
    assert first in ["cat", "dog"]
    assert second in ["cat", "dog"]


def test_transformers_batch_constrained(model):
    """Batched generation under an `Enum` constraint stays within the choices."""
    class Foo(Enum):
        cat = "cat"
        dog = "dog"

    allowed = ["cat", "dog"]

    # One answer per prompt.
    answers = model.batch(["Cat or dog?", "Cat or dog?"], Foo)
    assert isinstance(answers, list)
    assert len(answers) == 2
    first, second = answers
    assert first in allowed
    assert second in allowed

    # Two sampled answers per prompt.
    sampled = model.batch(
        ["Cat or dog?", "Cat or dog?"],
        Foo,
        num_return_sequences=2,
        do_sample=True,
    )
    assert isinstance(sampled, list)
    assert len(sampled) == 2
    for samples in sampled:
        assert isinstance(samples, list)
        assert len(samples) == 2
        assert samples[0] in allowed
        assert samples[1] in allowed


def test_transformers_streaming(model):
    """`stream` raises `NotImplementedError` for this model."""
    prompt = "Respond with one word. Not more."
    expected_message = "Streaming is not implemented"
    with pytest.raises(NotImplementedError, match=expected_message):
        model.stream(prompt)


@pytest.mark.parametrize(
    "model_name",
    [TEST_MODEL, "HuggingFaceTB/SmolLM2-135M"],
)
def test_transformers_parametrized_smoke(model_name):
    """
    Smoke-check regex-constrained generation for each parametrized
    checkpoint, i.e. across different tokenizers.
    """
    causal_lm = transformers.AutoModelForCausalLM.from_pretrained(model_name)
    causal_lm.eval()
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

    wrapped = outlines.from_transformers(causal_lm, tokenizer)

    # Optional leading whitespace followed by one of the two answers.
    yes_or_no = Regex(r"\s*(Yes|No)")
    answer = wrapped(
        "Is 1+1=2? Answer Yes or No:",
        yes_or_no,
        max_new_tokens=5,
        do_sample=False,
    )

    assert answer.strip() in {"Yes", "No"}


================================================
FILE: tests/models/test_transformers_multimodal.py
================================================
# we only test vision models here as audio models are too heavy to run on CI

import io
import re
import torch
from enum import Enum

import pytest
from PIL import Image as PILImage
from pydantic import BaseModel
from transformers import (
    LlavaForConditionalGeneration,
    AutoProcessor,
)

import outlines
from outlines.inputs import Chat, Image
from outlines.models.transformers import (
    TransformersMultiModal,
    TransformerTokenizer,
    TransformersMultiModalTypeAdapter,
)
from outlines.types import Regex

# Llava checkpoint loaded by the `model` fixture and the instantiation test.
TEST_MODEL = "trl-internal-testing/tiny-LlavaForConditionalGeneration"


@pytest.fixture
def image():
    """Provide a 256x256 blue RGB image re-opened from in-memory PNG bytes."""
    size = (256, 256)
    blue = (0, 0, 255)

    # Write the picture to a bytes buffer as PNG, then rewind and re-open it.
    png_buffer = io.BytesIO()
    PILImage.new("RGB", size, blue).save(png_buffer, format="PNG")
    png_buffer.seek(0)

    return PILImage.open(png_buffer)


@pytest.fixture
def model():
    """Provide an outlines model built from the Llava checkpoint and processor."""
    llava = LlavaForConditionalGeneration.from_pretrained(TEST_MODEL)
    processor = AutoProcessor.from_pretrained(TEST_MODEL)
    return outlines.from_transformers(llava, processor)


def test_transformers_multimodal_instantiate():
    """A model + processor pair yields a fully wired `TransformersMultiModal`."""
    llava = LlavaForConditionalGeneration.from_pretrained(TEST_MODEL)
    processor = AutoProcessor.from_pretrained(TEST_MODEL)
    wrapped = outlines.from_transformers(
        llava,
        processor,
        device_dtype=torch.bfloat16,
    )

    assert isinstance(wrapped, TransformersMultiModal)
    assert isinstance(wrapped.tokenizer, TransformerTokenizer)
    assert isinstance(wrapped.type_adapter, TransformersMultiModalTypeAdapter)
    assert wrapped.tensor_library_name == "torch"
    assert wrapped.device_dtype == torch.bfloat16


def test_transformers_multimodal_simple(model, image):
    """`generate` on a [text, Image] prompt with no output type gives a str."""
    prompt = ["<image>Describe this image in one sentence:", Image(image)]
    output = model.generate(prompt, None, max_new_tokens=2)
    assert isinstance(output, str)


def test_transformers_multimodal_call(model, image):
    """Calling the model returns a string, also after setting bfloat16."""
    text = "<image>Describe this image in one sentence:"

    default_output = model([text, Image(image)], max_new_tokens=2)
    assert isinstance(default_output, str)

    # Same call after switching the model's `device_dtype`.
    model.device_dtype = torch.bfloat16
    bfloat16_output = model([text, Image(image)], max_new_tokens=2)
    assert isinstance(bfloat16_output, str)


def test_transformers_multimodal_wrong_number_image(model, image):
    """One `<image>` tag with two images in the prompt raises `ValueError`."""
    text = "<image>Describe this image in one sentence:"
    with pytest.raises(ValueError):
        # The prompt is built inside the context, as the call is.
        model([text, Image(image), Image(image)])


def test_transformers_multimodal_wrong_input_type(model):
    """A bare string passed to `generate` raises `TypeError`."""
    bare_text = "invalid input"
    with pytest.raises(TypeError):
        model.generate(bare_text, None)


def test_transformers_multimodal_chat(model, image):
    """Both user-content layouts of a `Chat` produce a string."""
    # User content given as a bare [text, Image] list.
    bare_content = [
        "Describe this image in one sentence:",
        Image(image),
    ]
    bare_chat = Chat(messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": bare_content},
    ])
    bare_output = model(bare_chat, max_new_tokens=2)
    assert isinstance(bare_output, str)

    # User content given as typed dict parts.
    typed_content = [
        {"type": "text", "text": "Describe this image in one sentence:"},
        {"type": "image", "image": Image(image)},
    ]
    typed_chat = Chat(messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": typed_content},
    ])
    typed_output = model(typed_chat, max_new_tokens=2)
    assert isinstance(typed_output, str)


def test_transformers_inference_kwargs(model, image):
    """The `max_new_tokens` inference keyword is accepted."""
    prompt = ["<image>Describe this image in one sentence:", Image(image)]
    output = model(prompt, max_new_tokens=2)
    assert isinstance(output, str)


def test_transformers_invalid_inference_kwargs(model, image):
    """An unknown inference keyword results in `ValueError`."""
    text = "<image>Describe this image in one sentence:"
    with pytest.raises(ValueError):
        # The prompt is built inside the context, as the call is.
        model([text, Image(image)], foo="bar")


def test_transformers_several_image(model, image):
    """Two `<image>` tags with two images in the prompt produce a string."""
    text = "<image><image>Describe this image in one sentence:"
    prompt = [text, Image(image), Image(image)]
    output = model(prompt, max_new_tokens=2)
    assert isinstance(output, str)


def test_transformers_multimodal_json(model, image):
    """A pydantic model output type yields output mentioning its field."""
    class Foo(BaseModel):
        name: str

    prompt = ["<image>Give the name of the color.", Image(image)]
    output = model(prompt, Foo, max_new_tokens=10)
    assert "name" in output


def test_transformers_multimodal_regex(model, image):
    """A single-digit `Regex` output type yields a string starting with a digit.

    The prompt text previously read "How warn is the color"; the typo is
    corrected to "warm".
    """
    result = model(
        ["<image>How warm is the color from 0 to 9?", Image(image)],
        Regex(r"[0-9]")
    )

    assert isinstance(result, str)
    assert re.match(r"[0-9]", result)


def test_transformers_multimodal_choice(model, image):
    """An `Enum` output type restricts the answer to its member values."""
    class Foo(Enum):
        white = "white"
        blue = "blue"

    prompt = ["<image>Is it a white or a blue?", Image(image)]
    answer = model(prompt, Foo)

    assert isinstance(answer, str)
    assert answer in ["white", "blue"]


def test_transformers_multimodal_multiple_samples(model, image):
    """Two return sequences with two beams give a list of two results."""
    prompt = ["<image>Describe this image in one sentence.", Image(image)]
    samples = model(
        prompt,
        num_return_sequences=2,
        num_beams=2,
        max_new_tokens=2,
    )
    assert isinstance(samples, list)
    assert len(samples) == 2


def test_transformers_multimodal_batch(model, image):
    """`batch` handles list prompts, multiple samples and both chat layouts."""
    # Each builder returns a fresh prompt so batch entries never share objects.
    def one_image_prompt():
        return ["<image>Describe this image in one sentence.", Image(image)]

    def two_image_prompt():
        return [
            "<image>Describe this image in one sentence.<image>",
            Image(image),
            Image(image),
        ]

    def chat_with_bare_content():
        return Chat(messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": [
                    "Describe this image in one sentence:",
                    Image(image),
                ],
            },
        ])

    def chat_with_typed_content():
        return Chat(messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Describe this image in one sentence:"},
                    {"type": "image", "image": Image(image)},
                ],
            },
        ])

    # Two single-image prompts.
    single_image_outputs = model.batch(
        [one_image_prompt(), one_image_prompt()],
        max_new_tokens=2,
    )
    assert isinstance(single_image_outputs, list)
    assert len(single_image_outputs) == 2

    # Two two-image prompts, two beam-searched sequences each.
    beam_outputs = model.batch(
        [two_image_prompt(), two_image_prompt()],
        num_return_sequences=2,
        num_beams=2,
        max_new_tokens=2,
    )
    assert isinstance(beam_outputs, list)
    assert len(beam_outputs) == 2
    for samples in beam_outputs:
        assert isinstance(samples, list)
        assert len(samples) == 2

    # Two chats whose user content is a bare [text, Image] list.
    bare_chat_outputs = model.batch(
        [chat_with_bare_content(), chat_with_bare_content()],
        max_new_tokens=2,
    )
    assert isinstance(bare_chat_outputs, list)
    assert len(bare_chat_outputs) == 2

    # Two chats whose user content is made of typed dict parts.
    typed_chat_outputs = model.batch(
        [chat_with_typed_content(), chat_with_typed_content()],
        max_new_tokens=2,
    )
    assert isinstance(typed_chat_outputs, list)
    assert len(typed_chat_outputs) == 2


================================================
FILE: tests/models/test_transformers_multimodal_type_adapter.py
================================================
import pytest

from PIL import Image as PILImage
from outlines_core import Index, Vocabulary
from transformers import (
    AutoProcessor,
    LogitsProcessorList,
)

from outlines.inputs import Audio, Chat, Image, Video
from outlines.models.transformers import TransformersMultiModalTypeAdapter
from outlines.backends.outlines_core import OutlinesCoreLogitsProcessor


# Hub identifier of the checkpoint whose processor is loaded by the `adapter`
# fixture; only the processor's tokenizer is used.
MODEL_NAME = "trl-internal-testing/tiny-LlavaForConditionalGeneration"


@pytest.fixture
def adapter():
    """Build a multimodal type adapter around the test model's tokenizer."""
    tokenizer = AutoProcessor.from_pretrained(MODEL_NAME).tokenizer
    return TransformersMultiModalTypeAdapter(tokenizer=tokenizer)


@pytest.fixture
def logits_processor():
    """Return a torch logits processor built from a three-digit regex index."""
    gpt2_vocabulary = Vocabulary.from_pretrained("openai-community/gpt2")
    return OutlinesCoreLogitsProcessor(
        Index(r"[0-9]{3}", gpt2_vocabulary),
        "torch",
    )


@pytest.fixture
def image():
    """Return a 1x1 white RGB PIL image whose format is tagged as PNG."""
    white_pixel = PILImage.new("RGB", (1, 1), (255, 255, 255))
    # Tag the in-memory image explicitly as PNG.
    white_pixel.format = "PNG"
    return white_pixel


@pytest.fixture
def video():
    """Return a placeholder string standing in for video data."""
    mock_video = "mock_video_data"
    return mock_video


@pytest.fixture
def audio():
    """Return a placeholder string standing in for audio data."""
    mock_audio = "mock_audio_data"
    return mock_audio


def test_transformers_multimodal_type_adapter_format_input(adapter, image):
    """Cover rejected inputs, list inputs and chat inputs of `format_input`."""
    # A bare string and a dict are both rejected with a TypeError.
    for unsupported_input in ("hello", {"foo": "bar"}):
        with pytest.raises(TypeError):
            adapter.format_input(unsupported_input)

    # Mixing asset kinds in one list is rejected.
    with pytest.raises(ValueError, match="All assets must be of the same type"):
        adapter.format_input(["foo", Image(image), Video("")])

    class MockAsset:
        pass

    # An object that is not a known asset is rejected.
    with pytest.raises(ValueError, match="Unsupported asset type"):
        adapter.format_input(["foo", MockAsset()])

    image_asset = Image(image)
    expected_list_output = {
        "text": "foo",
        "images": [image_asset.image],
    }
    assert adapter.format_input(["foo", image_asset]) == expected_list_output

    # The user content may mix raw items or use explicit typed dictionaries;
    # both forms must yield one text string and one image.
    user_contents = (
        ["bar", image_asset],
        [{"type": "text", "text": "bar"}, {"type": "image", "image": image_asset}],
    )
    for user_content in user_contents:
        formatted = adapter.format_input(Chat(messages=[
            {"role": "system", "content": "foo"},
            {"role": "user", "content": user_content},
        ]))
        assert isinstance(formatted, dict)
        assert isinstance(formatted["text"], str)
        assert isinstance(formatted["images"], list)
        assert len(formatted["images"]) == 1
        assert formatted["images"][0] == image_asset.image


def test_transformers_multimodal_type_adapter_format_input_empty_assets(adapter):
    """A list holding only a text prompt is formatted to a text-only dict."""
    expected = {"text": "Just text prompt"}
    assert adapter.format_input(["Just text prompt"]) == expected


def test_transformers_multimodal_type_adapter_format_input_chat_invalid_asset_type(adapter, image):
    """An 'image' item carrying a non-asset object raises a ValueError."""
    class MockAsset:
        pass

    user_content = [
        {"type": "text", "text": "Hello"},
        # The value under 'image' is not an asset instance.
        {"type": "image", "image": MockAsset()},
    ]
    prompt = Chat(messages=[{"role": "user", "content": user_content}])

    with pytest.raises(ValueError, match="Assets must be of type"):
        adapter.format_input(prompt)


def test_transformers_multimodal_type_adapter_format_input_chat_unsupported_content_type(adapter):
    """A content item with an unknown 'type' value raises a ValueError."""
    user_content = [
        {"type": "text", "text": "Hello"},
        # 'unsupported' is not an accepted content type.
        {"type": "unsupported", "data": "some_data"},
    ]
    prompt = Chat(messages=[{"role": "user", "content": user_content}])

    with pytest.raises(ValueError, match="Content must be 'text'"):
        adapter.format_input(prompt)


def test_transformers_multimodal_type_adapter_format_output_type(
    adapter, logits_processor
):
    """A processor is wrapped in a one-item list; `None` passes through."""
    processors = adapter.format_output_type(logits_processor)
    assert isinstance(processors, LogitsProcessorList)
    assert len(processors) == 1

    wrapped = processors[0]
    assert wrapped.index == logits_processor.index
    assert wrapped.tensor_library_name == logits_processor.tensor_library_name

    assert adapter.format_output_type(None) is None


def test_transformers_multimodal_type_adapter_format_input_chat_missing_asset_key(adapter, image):
    """A typed item lacking the key named after its type raises a ValueError."""
    def assert_missing_key(question, broken_item, expected_message):
        prompt = Chat(messages=[
            {"role": "user", "content": [
                {"type": "text", "text": question},
                broken_item,
            ]}
        ])
        with pytest.raises(ValueError, match=expected_message):
            adapter.format_input(prompt)

    # Wrong key: 'txt' instead of 'image'
    assert_missing_key(
        "What's in this image?",
        {"type": "image", "txt": Image(image)},
        "Item with type 'image' must contain a 'image' key",
    )

    # Wrong key: 'vid' instead of 'video'
    assert_missing_key(
        "What's in this video?",
        {"type": "video", "vid": Video("dummy_video")},
        "Item with type 'video' must contain a 'video' key",
    )


def test_transformers_multimodal_type_adapter_format_input_chat_missing_type_key(adapter, image):
    """A content item without a 'type' key raises a ValueError."""
    user_content = [
        # This first item has no 'type' key.
        {"text": "What's in this image?"},
        {"type": "image", "image": Image(image)},
    ]
    prompt = Chat(messages=[{"role": "user", "content": user_content}])

    expected_message = (
        "Each item in the content list must be a dictionary with a 'type' key"
    )
    with pytest.raises(ValueError, match=expected_message):
        adapter.format_input(prompt)


def test_transformers_multimodal_type_adapter_format_input_invalid_content_type(adapter):
    """Message content that is an integer or a bare dict raises a ValueError."""
    for bad_content in (42, {"invalid": "dict"}):
        prompt = Chat(messages=[{"role": "user", "content": bad_content}])

        with pytest.raises(ValueError, match="Invalid content type"):
            adapter.format_input(prompt)


def test_transformers_multimodal_type_adapter_format_asset_for_template(adapter, image, video, audio):
    """Each asset kind is wrapped in a dict keyed by its type name."""
    assets_by_kind = [
        ("image", Image(image)),
        ("video", Video(video)),
        ("audio", Audio(audio)),
    ]

    for kind, asset in assets_by_kind:
        formatted = adapter._format_asset_for_template(asset)
        assert formatted == {"type": kind, kind: asset}


def test_transformers_multimodal_type_adapter_format_asset_for_template_invalid_type(adapter):
    """The private formatter raises a ValueError for an unknown asset class."""
    class MockUnsupportedAsset:
        pass

    # The private method is called directly because, per the original test
    # note, this error would normally be caught earlier in the validation chain.
    expected_message = "Assets must be of type `Image`, `Video` or `Audio`"
    with pytest.raises(ValueError, match=expected_message):
        adapter._format_asset_for_template(MockUnsupportedAsset())


def test_transformers_multimodal_type_adapter_multiple_assets_in_single_item(adapter, image):
    """One content item carrying two asset keys raises a ValueError."""
    overloaded_item = {
        "type": "image",
        "image": Image(image),
        "video": Video("dummy_video"),  # second asset key in the same item
    }
    prompt = Chat(messages=[
        {"role": "user", "content": [
            {"type": "text", "text": "What's in this?"},
            overloaded_item,
        ]}
    ])

    with pytest.raises(ValueError, match="Found item with multiple keys:"):
        adapter.format_input(prompt)


def test_transformers_multimodal_type_adapter_correct_multiple_assets_usage(adapter, image):
    """Two images given as separate content items are both collected."""
    first_image, second_image = Image(image), Image(image)

    # One dictionary item per asset.
    user_content = [
        {"type": "text", "text": "What's in these images?"},
        {"type": "image", "image": first_image},
        {"type": "image", "image": second_image},
    ]
    formatted = adapter.format_input(
        Chat(messages=[{"role": "user", "content": user_content}])
    )

    assert isinstance(formatted, dict)
    assert "text" in formatted
    assert "images" in formatted
    assert len(formatted["images"]) == 2


================================================
FILE: tests/models/test_transformers_tokenizer.py
================================================
import pytest

import transformers

from outlines.models.transformers import (
    get_llama_tokenizer_types,
    TransformerTokenizer,
)


# Hub identifiers of the checkpoints whose tokenizers are loaded by the
# `tokenizer` and `tokenizer_seq2seq` fixtures respectively.
TEST_MODEL = "erwanf/gpt2-mini"
TEST_MODEL_SEQ2SEQ = "hf-internal-testing/tiny-random-t5"


@pytest.fixture
def tokenizer():
    """Load the tokenizer of `TEST_MODEL`."""
    loaded = transformers.AutoTokenizer.from_pretrained(TEST_MODEL)
    return loaded


@pytest.fixture
def tokenizer_no_pad_token_id():
    """Return a `TEST_MODEL` tokenizer whose `pad_token_id` is cleared.

    A separate tokenizer is loaded rather than mutating the object provided by
    the `tokenizer` fixture. pytest hands the same fixture instance to every
    consumer within a test, so mutating it would make a test that requests
    both fixtures receive one single, already-modified tokenizer under two
    names.
    """
    fresh_tokenizer = transformers.AutoTokenizer.from_pretrained(TEST_MODEL)
    fresh_tokenizer.pad_token_id = None
    return fresh_tokenizer


@pytest.fixture
def tokenizer_seq2seq():
    """Load the tokenizer of `TEST_MODEL_SEQ2SEQ`."""
    loaded = transformers.AutoTokenizer.from_pretrained(TEST_MODEL_SEQ2SEQ)
    return loaded


@pytest.fixture
def transformer_tokenizer(tokenizer):
    """Wrap the `tokenizer` fixture in a `TransformerTokenizer`."""
    wrapped = TransformerTokenizer(tokenizer)
    return wrapped


@pytest.fixture
def another_transformer_tokenizer(tokenizer):
    """Build a second, distinct wrapper around the same `tokenizer` fixture."""
    second_wrapper = TransformerTokenizer(tokenizer)
    return second_wrapper


@pytest.fixture
def transformer_tokenizer_seq2seq(tokenizer_seq2seq):
    """Wrap the `tokenizer_seq2seq` fixture in a `TransformerTokenizer`."""
    wrapped = TransformerTokenizer(tokenizer_seq2seq)
    return wrapped


def test_get_llama_tokenizer_types():
    """The helper returns the four Llama tokenizer classes in a fixed order."""
    tokenizer_types = get_llama_tokenizer_types()

    llama = transformers.models.llama
    code_llama = transformers.models.code_llama
    expected_order = (
        llama.LlamaTokenizer,
        llama.LlamaTokenizerFast,
        code_llama.CodeLlamaTokenizer,
        code_llama.CodeLlamaTokenizerFast,
    )
    for position, expected_type in enumerate(expected_order):
        assert tokenizer_types[position] is expected_type


def test_transformer_tokenizer_init(
    tokenizer,
    tokenizer_no_pad_token_id
):
    """Check the attributes set by `TransformerTokenizer.__init__`."""
    def assert_initialized_from(raw_tokenizer, pad_source_attribute):
        wrapper = TransformerTokenizer(raw_tokenizer)
        assert wrapper.tokenizer == raw_tokenizer
        assert wrapper.eos_token_id == raw_tokenizer.eos_token_id
        # Read the reference value only after construction, as the
        # assertions compare against the tokenizer's current state.
        assert wrapper.pad_token_id == getattr(raw_tokenizer, pad_source_attribute)
        assert wrapper.special_tokens == set(raw_tokenizer.all_special_tokens)
        assert wrapper.vocabulary == raw_tokenizer.get_vocab()

    # tokenizer with a pad_token_id: the wrapper exposes that same id
    assert_initialized_from(tokenizer, "pad_token_id")

    # tokenizer with no pad_token_id: the wrapper exposes the eos id instead
    assert_initialized_from(tokenizer_no_pad_token_id, "eos_token_id")


def test_transformer_tokenizer_encode(transformer_tokenizer):
    """`encode` returns ids and a mask that share the same shape."""
    encoded = transformer_tokenizer.encode("Hello, world!")
    token_ids, mask = encoded

    assert token_ids is not None
    assert mask is not None
    assert token_ids.shape == mask.shape


def test_transformer_tokenizer_decode(transformer_tokenizer):
    """Decoding encoded ids yields a list whose first entry holds the text."""
    token_ids = transformer_tokenizer.encode("Hello, world!")[0]
    decoded = transformer_tokenizer.decode(token_ids)

    assert isinstance(decoded, list)
    assert "Hello, world!" in decoded[0]


def test_transformer_tokenizer_convert_token_to_string(transformer_tokenizer):
    """Cover the regular path and the llama `<0x20>` path of the conversion."""
    # regular
    transformer_tokenizer.is_llama = False
    first_token = transformer_tokenizer.tokenizer.tokenize("Hello")[0]
    regular_string = transformer_tokenizer.convert_token_to_string(first_token)
    assert isinstance(regular_string, str)
    assert "Hello" in regular_string

    # is_llama + <0x20>
    transformer_tokenizer.is_llama = True
    space_string = transformer_tokenizer.convert_token_to_string("<0x20>")
    assert isinstance(space_string, str)
    assert " " in space_string


def test_transformer_tokenizer_eq(
    transformer_tokenizer,
    another_transformer_tokenizer,
    transformer_tokenizer_seq2seq,
):
    """Check `TransformerTokenizer.__eq__` against other wrappers and types."""
    # different types of object: `NotImplemented` is a singleton, so it is
    # checked by identity rather than with `==`
    assert transformer_tokenizer.__eq__(1) is NotImplemented

    # regular case
    assert transformer_tokenizer == another_transformer_tokenizer
    assert transformer_tokenizer != transformer_tokenizer_seq2seq

    # with model name and kwargs attributes
    transformer_tokenizer.model_name = "foo"
    transformer_tokenizer.kwargs = {"foo": "bar"}
    another_transformer_tokenizer.model_name = "foo"
    another_transformer_tokenizer.kwargs = {"foo": "bar"}
    assert transformer_tokenizer == another_transformer_tokenizer


def test_transformer_tokenizer_hash(
    transformer_tokenizer,
    another_transformer_tokenizer,
    transformer_tokenizer_seq2seq,
):
    """Hashes are ints, equal for equal wrappers and different otherwise."""
    assert isinstance(hash(transformer_tokenizer), int)

    twin_hash = hash(another_transformer_tokenizer)
    assert hash(transformer_tokenizer) == twin_hash

    seq2seq_hash = hash(transformer_tokenizer_seq2seq)
    assert hash(transformer_tokenizer) != seq2seq_hash


def test_transformer_tokenizer_getstate_setstate(
    transformer_tokenizer,
    another_transformer_tokenizer,
):
    """State taken from one wrapper can be loaded into another one."""
    exported_state = transformer_tokenizer.__getstate__()
    assert "tokenizer" in exported_state

    another_transformer_tokenizer.__setstate__(exported_state)
    assert another_transformer_tokenizer == transformer_tokenizer


================================================
FILE: tests/models/test_transformers_type_adapter.py
================================================
import io
import pytest

import transformers
from transformers import LogitsProcessorList
from outlines_core import Index, Vocabulary
from PIL import Image as PILImage

from outlines.backends.outlines_core import OutlinesCoreLogitsProcessor
from outlines.inputs import Chat, Image
from outlines.models.transformers import TransformersTypeAdapter


# Hub identifier of the checkpoint whose tokenizer is loaded by the `adapter`
# fixture.
MODEL_NAME = "erwanf/gpt2-mini"


@pytest.fixture
def adapter():
    """Build a type adapter whose tokenizer uses a minimal chat template."""
    type_adapter = TransformersTypeAdapter(
        tokenizer=transformers.AutoTokenizer.from_pretrained(MODEL_NAME)
    )
    # Template rendering each message as "<role>: <content>".
    type_adapter.tokenizer.chat_template = '{% for message in messages %}{{ message.role }}: {{ message.content }}{% endfor %}'
    return type_adapter

@pytest.fixture
def logits_processor():
    """Return a torch logits processor built from a three-digit regex index."""
    gpt2_vocabulary = Vocabulary.from_pretrained("openai-community/gpt2")
    return OutlinesCoreLogitsProcessor(
        Index(r"[0-9]{3}", gpt2_vocabulary),
        "torch",
    )

@pytest.fixture
def image():
    """Return a 1x1 white image re-opened from in-memory PNG data."""
    png_buffer = io.BytesIO()
    PILImage.new("RGB", (1, 1), (255, 255, 255)).save(png_buffer, format="PNG")

    # Rewind so the PNG data can be read back from the start.
    png_buffer.seek(0)
    return PILImage.open(png_buffer)


def test_transformers_type_adapter_format_input(adapter, image):
    """Cover list, string and chat inputs of `format_input`."""
    # invalid input: a list containing an asset is rejected
    with pytest.raises(TypeError, match="is not available."):
        adapter.format_input(["prompt", Image(image)])

    # string input, first with then without the chat template applied
    for template_enabled, expected_text in (
        (True, "user: Hello, world!"),
        (False, "Hello, world!"),
    ):
        adapter.has_chat_template = template_enabled
        assert adapter.format_input("Hello, world!") == expected_text

    # chat input, with the chat template switched back on
    adapter.has_chat_template = True
    conversation = Chat(messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello, world!"},
        {"role": "assistant", "content": "Hello, world!"},
    ])
    assert isinstance(adapter.format_input(conversation), str)


def test_transformers_type_adapter_format_output_type(
    adapter, logits_processor
):
    """A processor is wrapped in a processor list; `None` passes through."""
    processors = adapter.format_output_type(logits_processor)
    assert isinstance(processors, LogitsProcessorList)

    wrapped = processors[0]
    assert wrapped.index == logits_processor.index
    assert wrapped.tensor_library_name == logits_processor.tensor_library_name

    assert adapter.format_output_type(None) is None


================================================
FILE: tests/models/test_utils.py
================================================
from outlines.models.utils import set_additional_properties_false_json_schema


def test_set_additional_properties_false_json_schema():
    """Check how `additionalProperties` is handled on an object schema.

    Every expected value is built independently of the dict handed to the
    function. Comparing the result with the input dict itself would be a
    tautology if the function modifies its argument in place and returns it,
    and could not detect an explicit value being overwritten.
    """
    def make_schema(**extra_keys):
        # Fresh dict on every call so inputs and expectations never alias.
        return {
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "age": {"type": "integer"},
            },
            "required": ["name"],
            **extra_keys,
        }

    # additionalProperties is not set: it is added with the value False
    modified_schema = set_additional_properties_false_json_schema(make_schema())
    assert modified_schema == make_schema(additionalProperties=False)

    # additionalProperties is set to False, then to True: the explicit value
    # is kept as it is
    for explicit_value in (False, True):
        modified_schema = set_additional_properties_false_json_schema(
            make_schema(additionalProperties=explicit_value)
        )
        assert modified_schema == make_schema(additionalProperties=explicit_value)
        assert modified_schema["additionalProperties"] is explicit_value


================================================
FILE: tests/models/test_vllm.py
================================================
import io
import os
import re
import warnings
import base64
from typing import AsyncGenerator, Generator

import pytest
from PIL import Image as PILImage
from openai import AsyncOpenAI, OpenAI

from outlines.inputs import Chat, Image
from outlines.models.vllm import VLLM, AsyncVLLM, from_vllm
from outlines.types.dsl import CFG, Regex, JsonSchema
from tests.test_utils.mock_openai_client import MockOpenAIClient, MockAsyncOpenAIClient


# Grammar whose start rule accepts exactly "yes" or "no"; it is wrapped in
# `CFG` by the cfg tests and appears verbatim as the `guided_grammar` value of
# a mock response.
YES_NO_GRAMMAR = """
?start: answer

answer: "yes" | "no"
"""

# Image for testing: a 1x1 white RGB image written to an in-memory buffer as
# PNG, re-opened from that buffer and wrapped in an `Image` input.
width, height = 1, 1
white_background = (255, 255, 255)
image = PILImage.new("RGB", (width, height), white_background)
buffer = io.BytesIO()
image.save(buffer, format="PNG")
# Rewind the buffer so the PNG data can be read back from the start.
buffer.seek(0)
image = PILImage.open(buffer)
image_input = Image(image)


# If the VLLM_SERVER_URL environment variable is set, use the real vLLM server
# Otherwise, use the mock server
vllm_server_url = os.environ.get("VLLM_SERVER_URL")
# The model name can be overridden through VLLM_MODEL_NAME; otherwise the
# default below is used.
vllm_model_name = os.environ.get(
    "VLLM_MODEL_NAME", "Qwen/Qwen2.5-VL-3B-Instruct"
)
if vllm_server_url:
    openai_client = OpenAI(base_url=vllm_server_url, api_key="foo")
    async_openai_client = AsyncOpenAI(base_url=vllm_server_url, api_key="foo")
else:
    # Emitted at import time so the test run shows that mocks are in use.
    warnings.warn("No VLLM server URL provided, using mock server")
    openai_client = MockOpenAIClient()
    async_openai_client = MockAsyncOpenAIClient()


# Pairs of (request keyword arguments, canned response) registered on the mock
# clients when no real server is configured. A response is either a string or
# a list of strings.
# NOTE(review): presumably the mock clients match an incoming call against the
# keyword arguments of an entry; confirm against the mock client implementation.
mock_responses = [
    # Plain text request
    (
        {
            'messages': [
                {'role': "user", 'content': 'Respond with a single word.'}
            ],
            'model': vllm_model_name,
        },
        "foo"
    ),
    # Same request with streaming enabled
    (
        {
            'messages': [
                {'role': "user", 'content': 'Respond with a single word.'}
            ],
            'model': vllm_model_name,
            'stream': True
        },
        ["foo", "bar"]
    ),
    # Same request asking for two samples
    (
        {
            'messages': [
                {'role': "user", 'content': 'Respond with a single word.'}
            ],
            'n': 2,
            'model': vllm_model_name,
        },
        ["foo", "bar"]
    ),
    # JSON schema passed through `extra_body`
    (
        {
            'messages': [{'role': "user", 'content': 'foo?'}],
            'model': vllm_model_name,
            'max_tokens': 10,
            'extra_body': {
            'guided_json': {
                'type': 'object',
                'properties': {
                    'bar': {'type': 'string'}
                }
            },
            }
        },
        '{"foo": "bar"}'
    ),
    # Regex passed through `extra_body`
    (
        {
            'messages': [{'role': "user", 'content': 'foo?'}],
            'model': vllm_model_name,
            'max_tokens': 10,
            'extra_body': {
                'guided_regex': '([0-9]{3})',
            },
        },
        "123"
    ),
    # Grammar passed through `extra_body`
    (
        {
            'messages': [{'role': "user", 'content': 'foo?'}],
            'model': vllm_model_name,
            'max_tokens': 10,
            'extra_body': {
                'guided_grammar': YES_NO_GRAMMAR,
            },
        },
        "yes"
    ),
    # Single user message made of text plus the test image as a data URL
    (
        {
            'messages': [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "hello"},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{image_input.image_str}"
                            },
                        },
                    ]
                }
            ],
            'model': vllm_model_name,
            'max_tokens': 10,
        },
        "foo"
    ),
    # Three-message chat whose user message holds text plus the test image
    (
        {
            'messages': [
                {"role": "system", "content": "prompt"},
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": "hello"},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{image_input.image_str}"
                            },
                        },
                    ],
                },
                {"role": "assistant", "content": "response"},
            ],
            'model': vllm_model_name,
            'max_tokens': 10,
        },
        "foo"
    )
]


# If the VLLM_SERVER_URL environment variable is not set, both clients are
# mock clients: register the same mock responses on each of them.
if not vllm_server_url:
    async_openai_client.add_mock_responses(mock_responses)
    openai_client.add_mock_responses(mock_responses)


@pytest.fixture
def sync_model():
    """Return a `VLLM` model bound to the sync client and the model name."""
    model = VLLM(openai_client, vllm_model_name)
    return model


@pytest.fixture
def sync_model_no_model_name():
    """Return a `VLLM` model bound to the sync client, without a model name."""
    model = VLLM(openai_client)
    return model


@pytest.fixture
def async_model():
    """Return an `AsyncVLLM` model bound to the async client and model name."""
    model = AsyncVLLM(async_openai_client, vllm_model_name)
    return model


@pytest.fixture
def async_model_no_model_name():
    """Return an `AsyncVLLM` model bound to the async client, without a name."""
    model = AsyncVLLM(async_openai_client)
    return model


def test_vllm_init():
    """`from_vllm` picks the model class from the client type."""
    # Real client objects are built here instead of relying on the mock
    # server, because `from_vllm` needs objects of type OpenAI and AsyncOpenAI.
    real_sync_client = OpenAI(base_url="http://localhost:11434", api_key="foo")
    real_async_client = AsyncOpenAI(base_url="http://localhost:11434", api_key="foo")

    # Sync first, then async; each with and then without a model name.
    for client, expected_class in (
        (real_sync_client, VLLM),
        (real_async_client, AsyncVLLM),
    ):
        named_model = from_vllm(client, vllm_model_name)
        assert isinstance(named_model, expected_class)
        assert named_model.client == client
        assert named_model.model_name == vllm_model_name

        unnamed_model = from_vllm(client)
        assert isinstance(unnamed_model, expected_class)
        assert unnamed_model.client == client
        assert unnamed_model.model_name is None

    with pytest.raises(ValueError, match="Unsupported client type"):
        from_vllm("foo")


def test_vllm_sync_simple_call(sync_model):
    """Calling the sync model with a text prompt returns a string."""
    answer = sync_model("Respond with a single word.")
    assert isinstance(answer, str)


def test_vllm_sync_streaming(sync_model_no_model_name):
    """Streaming returns a generator whose first item is a string."""
    stream_kwargs = {"model": vllm_model_name}
    chunks = sync_model_no_model_name.stream(
        "Respond with a single word.",
        **stream_kwargs,
    )

    assert isinstance(chunks, Generator)
    first_chunk = next(chunks)
    assert isinstance(first_chunk, str)


def test_vllm_sync_batch(sync_model):
    """Calling `batch` on the sync model raises NotImplementedError."""
    prompts = ["Respond with one word.", "Respond with one word."]

    with pytest.raises(NotImplementedError, match="does not support"):
        sync_model.batch(prompts)


def test_vllm_sync_vision(sync_model):
    """A text-plus-image list prompt returns a string."""
    prompt = ["hello", image_input]
    answer = sync_model(prompt, max_tokens=10)
    assert isinstance(answer, str)


def test_vllm_sync_vision_chat(sync_model):
    """A chat whose user message holds text and an image returns a string."""
    conversation = Chat(messages=[
        {"role": "system", "content": "prompt"},
        {"role": "user", "content": ["hello", image_input]},
        {"role": "assistant", "content": "response"},
    ])

    answer = sync_model(conversation, max_tokens=10)
    assert isinstance(answer, str)


def test_vllm_sync_multiple_samples(sync_model):
    """Requesting two samples returns a list of two strings."""
    samples = sync_model("Respond with a single word.", n=2)

    assert isinstance(samples, list)
    assert len(samples) == 2
    first_sample, second_sample = samples[0], samples[1]
    assert isinstance(first_sample, str)
    assert isinstance(second_sample, str)


def test_vllm_sync_json(sync_model):
    """A JSON schema output type yields a string mentioning the `bar` key."""
    schema = JsonSchema(
        '{"type": "object", "properties": {"bar": {"type": "string"}}}'
    )
    answer = sync_model("foo?", schema, max_tokens=10)

    assert isinstance(answer, str)
    assert "bar" in answer


def test_vllm_sync_regex(sync_model):
    """A regex output type yields a string made of exactly three digits."""
    result = sync_model("foo?", Regex(r"[0-9]{3}"), max_tokens=10)
    assert isinstance(result, str)
    # `fullmatch` is used because `match` only anchors the start of the
    # string and would accept trailing characters that violate the pattern.
    assert re.fullmatch(r"[0-9]{3}", result)


def test_vllm_sync_cfg(sync_model):
    """A grammar output type yields one of the two allowed answers."""
    grammar = CFG(YES_NO_GRAMMAR)
    answer = sync_model("foo?", grammar, max_tokens=10)

    assert isinstance(answer, str)
    assert answer in ["yes", "no"]


@pytest.mark.asyncio
async def test_vllm_async_simple_call(async_model):
    """Awaiting the async model with a text prompt returns a string."""
    answer = await async_model("Respond with a single word.")
    assert isinstance(answer, str)


@pytest.mark.asyncio
async def test_vllm_async_streaming(async_model_no_model_name):
    """Streaming returns an async generator whose first item is a string."""
    stream_kwargs = {"model": vllm_model_name}
    chunks = async_model_no_model_name.stream(
        "Respond with a single word.",
        **stream_kwargs,
    )

    assert isinstance(chunks, AsyncGenerator)
    async for first_chunk in chunks:
        assert isinstance(first_chunk, str)
        # Only the first chunk is inspected.
        break


@pytest.mark.asyncio
async def test_vllm_async_batch(async_model):
    """Awaiting `batch` on the async model raises NotImplementedError."""
    prompts = ["Respond with one word.", "Respond with one word."]

    with pytest.raises(NotImplementedError, match="does not support"):
        await async_model.batch(prompts)


@pytest.mark.asyncio
async def test_vllm_async_vision(async_model):
    """A text-plus-image list prompt returns a string."""
    prompt = ["hello", image_input]
    answer = await async_model(prompt, max_tokens=10)
    assert isinstance(answer, str)


@pytest.mark.asyncio
async def test_vllm_async_vision_chat(async_model):
    """A chat whose user message holds text and an image returns a string."""
    conversation = Chat(messages=[
        {"role": "system", "content": "prompt"},
        {"role": "user", "content": ["hello", image_input]},
        {"role": "assistant", "content": "response"},
    ])

    answer = await async_model(conversation, max_tokens=10)
    assert isinstance(answer, str)


@pytest.mark.asyncio
async def test_vllm_async_multiple_samples(async_model):
    """Requesting two samples returns a list of two strings."""
    samples = await async_model("Respond with a single word.", n=2)

    assert isinstance(samples, list)
    assert len(samples) == 2
    first_sample, second_sample = samples[0], samples[1]
    assert isinstance(first_sample, str)
    assert isinstance(second_sample, str)


@pytest.mark.asyncio
async def test_vllm_async_json(async_model):
    """A JSON schema output type yields a string mentioning the `bar` key."""
    schema = JsonSchema(
        '{"type": "object", "properties": {"bar": {"type": "string"}}}'
    )
    answer = await async_model("foo?", schema, max_tokens=10)

    assert isinstance(answer, str)
    assert "bar" in answer


@pytest.mark.asyncio
async def test_vllm_async_regex(async_model):
    """A regex output type yields a string made of exactly three digits."""
    result = await async_model("foo?", Regex(r"[0-9]{3}"), max_tokens=10)
    assert isinstance(result, str)
    # `fullmatch` is used because `match` only anchors the start of the
    # string and would accept trailing characters that violate the pattern.
    assert re.fullmatch(r"[0-9]{3}", result)


@pytest.mark.asyncio
async def test_vllm_async_cfg(async_model):
    """A grammar output type yields one of the two allowed answers."""
    grammar = CFG(YES_NO_GRAMMAR)
    answer = await async_model("foo?", grammar, max_tokens=10)

    assert isinstance(answer, str)
    assert answer in ["yes", "no"]


================================================
FILE: tests/models/test_vllm_offline.py
================================================
import io
import re
from enum import Enum

import pytest
from PIL import Image as PILImage
from pydantic import BaseModel

try:
    from vllm import LLM, SamplingParams
    HAS_VLLM = True
except ImportError:
    HAS_VLLM = False

import outlines
from outlines.inputs import Chat
from outlines.models.vllm_offline import (
    VLLMOffline,
    VLLMOfflineTypeAdapter,
    from_vllm_offline
)
from outlines.types import Regex


# Hub identifier of the model passed to `LLM(...)` in this module.
TEST_MODEL = "microsoft/Phi-3-mini-4k-instruct"

# Module-wide marker: every test is skipped when `vllm` could not be imported
# (HAS_VLLM is False).
pytestmark = pytest.mark.skipif(
    not HAS_VLLM,
    reason="vLLM models can only be run on GPU."
)

@pytest.fixture(scope="session")
def image():
    """Return a 1x1 white image re-opened from in-memory PNG data."""
    png_buffer = io.BytesIO()
    PILImage.new("RGB", (1, 1), (255, 255, 255)).save(png_buffer, format="PNG")

    # Rewind so the PNG data can be read back from the start.
    png_buffer.seek(0)
    return PILImage.open(png_buffer)


def test_vllm_model_initialization():
    """`from_vllm_offline` wraps an `LLM` in a `VLLMOffline` model."""
    wrapped = from_vllm_offline(LLM(TEST_MODEL))

    expected_types = (
        (wrapped, VLLMOffline),
        (wrapped.model, LLM),
        (wrapped.type_adapter, VLLMOfflineTypeAdapter),
    )
    for value, expected_type in expected_types:
        assert isinstance(value, expected_type)


@pytest.fixture(scope="session")
def model(tmp_path_factory):
    """Build the offline vLLM model once for the whole test session."""
    engine = LLM(TEST_MODEL)
    return outlines.from_vllm_offline(engine)


def test_vllm_simple(model):
    """`generate` with no output type returns a string."""
    prompt = "Respond with one word. Not more."
    answer = model.generate(prompt, None)
    assert isinstance(answer, str)


def test_vllm_call(model):
    """Calling the model directly with a prompt returns a string."""
    prompt = "Respond with one word. Not more."
    answer = model(prompt)
    assert isinstance(answer, str)


def test_vllm_inference_kwargs(model):
    """Inference keyword arguments are accepted and keep the output short."""
    inference_kwargs = {
        "sampling_params": SamplingParams(max_tokens=2),
        "use_tqdm": True,
    }
    answer = model("Write a short story about a cat.", **inference_kwargs)

    assert isinstance(answer, str)
    assert len(answer) <= 20


def test_vllm_chat(model):
    """A `Chat` prompt returns a string."""
    conversation = Chat(messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What is the capital of France?"},
        {"role": "assistant", "content": "Response: "},
    ])

    answer = model(conversation, sampling_params=SamplingParams(max_tokens=2))
    assert isinstance(answer, str)


def test_vllm_invalid_inference_kwargs(model):
    """An unknown inference keyword argument raises a TypeError."""
    unknown_kwargs = {"foo": "bar"}

    with pytest.raises(TypeError):
        model("Respond with one word. Not more.", **unknown_kwargs)


def test_vllm_regex(model):
    """A regex output type yields a string made of exactly one digit."""
    result = model("Give a number between 0 and 9.", Regex(r"[0-9]"))
    assert isinstance(result, str)
    # `fullmatch` is used because `match` only anchors the start of the
    # string and would accept trailing characters that violate the pattern.
    assert re.fullmatch(r"[0-9]", result)


def test_vllm_json(model):
    """A pydantic model output type yields output containing its field name."""
    class Character(BaseModel):
        name: str

    answer = model("Create a character with a name.", Character)
    assert "name" in answer


def test_vllm_choice(model):
    """An enum output type yields one of the enum's values."""
    class Foo(Enum):
        cat = "cat"
        dog = "dog"

    answer = model("Cat or dog?", Foo)
    allowed_answers = ["cat", "dog"]
    assert answer in allowed_answers


def test_vllm_multiple_samples(model):
    """Requesting `n=2` samples returns a list of two results."""
    two_samples = SamplingParams(n=2)
    outputs = model(
        "Respond with one word. Not more.",
        sampling_params=two_samples,
    )
    assert isinstance(outputs, list)
    assert len(outputs) == 2


def test_vllm_batch(model):
    """Batch generation handles prompt lists, multiple samples, and rejects chats."""
    prompt = "Respond with one word. Not more."

    # One completion per prompt by default
    single_outputs = model.batch([prompt, prompt])
    assert isinstance(single_outputs, list)
    assert len(single_outputs) == 2

    # With n=2, each prompt maps to a list of two completions
    multi_outputs = model.batch(
        [prompt, prompt],
        sampling_params=SamplingParams(n=2),
    )
    assert isinstance(multi_outputs, list)
    assert len(multi_outputs) == 2
    for completions in multi_outputs:
        assert isinstance(completions, list)
        assert len(completions) == 2

    # Chat inputs are rejected in batch mode
    with pytest.raises(TypeError, match="Batch generation is not available"):
        model.batch([
            Chat(messages=[
                {"role": "user", "content": "What is the capital of France?"},
            ]),
        ])

def test_vllm_streaming(model):
    """Streaming raises `NotImplementedError` for this model."""
    prompt = "Respond with one word. Not more."
    with pytest.raises(NotImplementedError, match="Streaming is not available"):
        model.stream(prompt)


================================================
FILE: tests/models/test_vllm_offline_type_adapter.py
================================================
import io
import json

import pytest
from PIL import Image as PILImage

from outlines.inputs import Chat, Image
from outlines.models.vllm_offline import VLLMOfflineTypeAdapter
from outlines.types import CFG, JsonSchema, Regex


# Small grammar string used to build the `CFG` fixture and to check that the
# adapter passes the grammar through unchanged.
CFG_STRING = """
?start: expr
?expr: NUMBER
"""

# JSON schema (kept as a raw string) used to build the `JsonSchema` fixtures;
# the tests compare against its `json.loads` form.
JSON_SCHEMA_STRING = """
{
    "type": "object",
    "properties": {
        "answer": {"type": "number"}
    }
}
"""


@pytest.fixture
def type_adapter():
    """A `VLLMOfflineTypeAdapter` built with default arguments."""
    adapter = VLLMOfflineTypeAdapter()
    return adapter

@pytest.fixture
def cfg_instance():
    """A `CFG` output type wrapping `CFG_STRING`."""
    grammar = CFG(CFG_STRING)
    return grammar

@pytest.fixture
def json_schema_instance():
    """A `JsonSchema` output type wrapping `JSON_SCHEMA_STRING`."""
    schema = JsonSchema(JSON_SCHEMA_STRING)
    return schema

@pytest.fixture
def json_schema_whitespace_instance():
    """A `JsonSchema` built from `JSON_SCHEMA_STRING` with a newline whitespace pattern."""
    schema = JsonSchema(JSON_SCHEMA_STRING, whitespace_pattern="\n")
    return schema

@pytest.fixture
def regex_instance():
    """A `Regex` output type for the pattern `[0-9]+`."""
    pattern = Regex(r"[0-9]+")
    return pattern

@pytest.fixture
def image():
    """A 1x1 white image, saved as PNG in memory and re-opened from the buffer."""
    size = (1, 1)
    white = (255, 255, 255)
    blank = PILImage.new("RGB", size, white)

    # Round-trip through an in-memory buffer so the image is read back as PNG
    png_buffer = io.BytesIO()
    blank.save(png_buffer, format="PNG")
    png_buffer.seek(0)
    return PILImage.open(png_buffer)


def test_vllm_offline_type_adapter_input_text(type_adapter):
    """A plain string prompt is returned unchanged by the default adapter."""
    prompt = "prompt"
    formatted = type_adapter.format_input(prompt)
    assert formatted == prompt


def test_vllm_offline_type_adapter_input_text_with_template():
    """With a chat template, a string prompt becomes a single user message."""
    templated_adapter = VLLMOfflineTypeAdapter(has_chat_template=True)
    formatted = templated_adapter.format_input("prompt")
    expected = [{"role": "user", "content": "prompt"}]
    assert formatted == expected


def test_vllm_offline_type_adapter_input_text_without_template():
    """Without a chat template, a string prompt is passed through as-is."""
    plain_adapter = VLLMOfflineTypeAdapter(has_chat_template=False)
    formatted = plain_adapter.format_input("prompt")
    assert formatted == "prompt"


def test_vllm_offline_type_adapter_input_chat(type_adapter):
    """A text-only `Chat` is formatted as its list of message dicts."""
    conversation = Chat(messages=[
        {"role": "system", "content": "prompt"},
        {"role": "user", "content": "hello"},
        {"role": "assistant", "content": "response"},
    ])
    expected_messages = [
        {"role": "system", "content": "prompt"},
        {"role": "user", "content": "hello"},
        {"role": "assistant", "content": "response"},
    ]
    assert type_adapter.format_input(conversation) == expected_messages


def test_vllm_offline_type_adapter_input_invalid(type_adapter, image):
    """List inputs and chat messages carrying assets are rejected."""
    # A bare list mixing text and an image is not a supported input type
    with pytest.raises(TypeError, match="is not available"):
        type_adapter.format_input(["Hello", Image(image)])

    # Assets embedded in a chat message are rejected as well
    with pytest.raises(ValueError, match="Assets are not supported"):
        type_adapter.format_input(
            Chat(messages=[
                {"role": "user", "content": ["Hello", Image(image)]},
            ])
        )


def test_vllm_offline_type_adapter_output_type(
    type_adapter,
    cfg_instance,
    json_schema_instance,
    json_schema_whitespace_instance,
    regex_instance,
):
    """Each supported output type maps to the expected keyword dictionary."""
    cases = [
        (None, {}),
        (cfg_instance, {"grammar": CFG_STRING}),
        (json_schema_instance, {"json": json.loads(JSON_SCHEMA_STRING)}),
        (
            json_schema_whitespace_instance,
            {
                "json": json.loads(JSON_SCHEMA_STRING),
                "whitespace_pattern": "\n",
            },
        ),
        (regex_instance, {"regex": "([0-9]+)"}),
    ]
    for output_type, expected in cases:
        assert type_adapter.format_output_type(output_type) == expected


================================================
FILE: tests/models/test_vllm_type_adapter.py
================================================
import io
import json
import pytest
from dataclasses import dataclass

from PIL import Image as PILImage

from outlines.inputs import Chat, Image
from outlines.models.vllm import VLLMTypeAdapter
from outlines.types import CFG, JsonSchema


# Small grammar string used to build the `CFG` fixture and to check that the
# adapter passes the grammar through unchanged.
CFG_STRING = """
?start: expr
?expr: NUMBER
"""

# JSON schema (kept as a raw string) used to build the `JsonSchema` fixtures;
# the tests compare against its `json.loads` form.
JSON_SCHEMA_STRING = """
{
    "type": "object",
    "properties": {
        "answer": {"type": "number"}
    }
}
"""


@pytest.fixture
def type_adapter():
    """A `VLLMTypeAdapter` built with default arguments."""
    adapter = VLLMTypeAdapter()
    return adapter

@pytest.fixture
def cfg_instance():
    """A `CFG` output type wrapping `CFG_STRING`."""
    grammar = CFG(CFG_STRING)
    return grammar

@pytest.fixture
def json_schema_instance():
    """A `JsonSchema` output type wrapping `JSON_SCHEMA_STRING`."""
    schema = JsonSchema(JSON_SCHEMA_STRING)
    return schema

@pytest.fixture
def json_schema_whitespace_instance():
    """A `JsonSchema` built from `JSON_SCHEMA_STRING` with a newline whitespace pattern."""
    schema = JsonSchema(JSON_SCHEMA_STRING, whitespace_pattern="\n")
    return schema

@pytest.fixture
def image():
    """A 1x1 white image, saved as PNG in memory and re-opened from the buffer."""
    size = (1, 1)
    white = (255, 255, 255)
    blank = PILImage.new("RGB", size, white)

    # Round-trip through an in-memory buffer so the image is read back as PNG
    png_buffer = io.BytesIO()
    blank.save(png_buffer, format="PNG")
    png_buffer.seek(0)
    return PILImage.open(png_buffer)


def test_vllm_type_adapter_input_text(type_adapter):
    """A string prompt is wrapped into a single user message."""
    prompt = "prompt"
    expected = [{"role": "user", "content": prompt}]
    assert type_adapter.format_input(prompt) == expected


def test_vllm_type_adapter_input_vision(type_adapter, image):
    """A `[text, Image]` list becomes one user message with text and image parts."""
    picture = Image(image)
    formatted = type_adapter.format_input(["hello", picture])

    text_part = {"type": "text", "text": "hello"}
    image_part = {
        "type": "image_url",
        "image_url": {
            "url": f"data:image/png;base64,{picture.image_str}"
        },
    }
    assert formatted == [
        {"role": "user", "content": [text_part, image_part]},
    ]


def test_vllm_type_adapter_input_chat(type_adapter, image):
    """A `Chat` mixing text and image content is expanded message by message."""
    picture = Image(image)
    conversation = Chat(messages=[
        {"role": "system", "content": "prompt"},
        {"role": "user", "content": ["hello", picture]},
        {"role": "assistant", "content": "response"},
    ])
    formatted = type_adapter.format_input(conversation)

    # Only the multi-part user message is rewritten into typed content parts
    expected_user_content = [
        {"type": "text", "text": "hello"},
        {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/png;base64,{picture.image_str}"
            },
        },
    ]
    assert formatted == [
        {"role": "system", "content": "prompt"},
        {"role": "user", "content": expected_user_content},
        {"role": "assistant", "content": "response"},
    ]


def test_vllm_type_adapter_input_invalid(type_adapter):
    """An input of an unrecognised type raises `TypeError`."""

    @dataclass
    class Audio:
        file: str

    unsupported_input = Audio("file")
    with pytest.raises(TypeError, match="The input type"):
        type_adapter.format_input(unsupported_input)


def test_vllm_type_adapter_output_type(
    type_adapter,
    cfg_instance,
    json_schema_instance,
    json_schema_whitespace_instance,
):
    """Each supported output type maps to the expected `guided_*` arguments."""
    cases = [
        (None, {}),
        (cfg_instance, {"guided_grammar": CFG_STRING}),
        (json_schema_instance, {"guided_json": json.loads(JSON_SCHEMA_STRING)}),
        (
            json_schema_whitespace_instance,
            {
                "guided_json": json.loads(JSON_SCHEMA_STRING),
                "whitespace_pattern": "\n",
            },
        ),
        (int, {"guided_regex": "([+-]?(0|[1-9][0-9]*))"}),
    ]
    for output_type, expected in cases:
        assert type_adapter.format_output_type(output_type) == expected


================================================
FILE: tests/processors/test_base_processor.py
================================================
from typing import List

import numpy as np
import pytest
import torch

from outlines.processors.base_logits_processor import OutlinesLogitsProcessor

# MLX is an optional dependency: record whether it can be imported so that the
# MLX cases are only added when the library is present.
try:
    import mlx.core as mx
    HAS_MLX = True
except ImportError:
    HAS_MLX = False


# Tensor libraries the parametrized tests run against.
libraries = ["numpy", "torch"]
if HAS_MLX:
    libraries.append("mlx")

# Test cases per library. Each entry is a tuple
# `(input_ids, logits, expected_error)`, where `expected_error` is `None`
# when the call is expected to succeed.
#
# We check the accepted shapes:
# - both 1D
# - both 2D
# - input_ids 1D and logits 2D with a single sequence
# We expect an error if the shapes are not accepted:
# - input_ids 2D and logits 1D
# - input_ids 1D and logits 2D, but with multiple sequences
# - both 3D
arrays = {
    "numpy": [
        (np.array([1, 2], dtype=np.float32), np.array([1, 2], dtype=np.int32), None),
        (np.array([[1, 2], [3, 4]], dtype=np.float32), np.array([[1, 2], [3, 4]], dtype=np.int32), None),
        (np.array([1, 2], dtype=np.float32), np.array([[1, 2]], dtype=np.int32), None),
        (np.array([[1, 2]], dtype=np.float32), np.array([1, 2], dtype=np.int32), AssertionError),
        (np.array([1, 2], dtype=np.float32), np.array([[1, 2], [3, 4]], dtype=np.int32), AssertionError),
        (np.array([[[1, 2]]], dtype=np.float32), np.array([[[1, 2]]], dtype=np.int32), ValueError),
    ],
    "torch": [
        (torch.tensor([1, 2], dtype=torch.float32), torch.tensor([1, 2], dtype=torch.int32), None),
        (torch.tensor([[1, 2], [3, 4]], dtype=torch.float32), torch.tensor([[1, 2], [3, 4]], dtype=torch.int32), None),
        (torch.tensor([1, 2], dtype=torch.float32), torch.tensor([[1, 2]], dtype=torch.int32), None),
        (torch.tensor([[1, 2]], dtype=torch.float32), torch.tensor([1, 2], dtype=torch.int32), AssertionError),
        (torch.tensor([1, 2], dtype=torch.float32), torch.tensor([[1, 2], [3, 4]], dtype=torch.int32), AssertionError),
        (torch.tensor([[[1, 2]]], dtype=torch.float32), torch.tensor([[[1, 2]]], dtype=torch.int32), ValueError),
    ],
}
# The MLX cases mirror the numpy/torch ones and are only registered when MLX
# could be imported.
if HAS_MLX:
    arrays["mlx"] = [
        (mx.array([1, 2], dtype=mx.float32), mx.array([1, 2], dtype=mx.int32), None),
        (mx.array([[1, 2], [3, 4]], dtype=mx.float32), mx.array([[1, 2], [3, 4]], dtype=mx.int32), None),
        (mx.array([1, 2], dtype=mx.float32), mx.array([[1, 2]], dtype=mx.int32), None),
        (mx.array([[1, 2]], dtype=mx.float32), mx.array([1, 2], dtype=mx.int32), AssertionError),
        (mx.array([1, 2], dtype=mx.float32), mx.array([[1, 2], [3, 4]], dtype=mx.int32), AssertionError),
        (mx.array([[[1, 2]]], dtype=mx.float32), mx.array([[[1, 2]]], dtype=mx.int32), ValueError),
    ]

class MockLogitsProcessor(OutlinesLogitsProcessor):
    """Pass-through processor asserting that it receives 2D tensors."""

    def process_logits(self, input_ids, logits):
        """Return `logits` unchanged after checking both inputs are 2D."""
        shape_of = self.tensor_adapter.shape
        for tensor in (input_ids, logits):
            assert len(shape_of(tensor)) == 2
        return logits


@pytest.mark.parametrize("library", libraries)
def test_base_logits_processor_init(library):
    """A supported library yields a tensor adapter; an unknown one is rejected.

    The unknown-library check only wraps the constructor call: construction
    itself raises `NotImplementedError`, so any statement placed after it
    inside the `pytest.raises` block would never run.
    """
    processor = MockLogitsProcessor(library)
    assert processor.tensor_adapter is not None
    with pytest.raises(NotImplementedError):
        MockLogitsProcessor("foo")


@pytest.mark.parametrize("library", libraries)
def test_base_logits_processor_call(library):
    """Accepted shapes are processed with shape preserved; others raise."""
    processor = MockLogitsProcessor(library)
    for input_ids, logits, expected_error in arrays[library]:
        if expected_error is None:
            shape_before = processor.tensor_adapter.shape(logits)
            output = processor(input_ids, logits)
            # The logits must come back with the shape they went in with
            assert processor.tensor_adapter.shape(output) == shape_before
            continue
        with pytest.raises(expected_error):
            processor(input_ids, logits)


@pytest.mark.parametrize("library", libraries)
def test_base_logits_processor_init_library_name(library):
    """Known library names build an adapter; an unknown name raises."""
    known = MockLogitsProcessor(library)
    assert known.tensor_adapter is not None
    with pytest.raises(NotImplementedError):
        MockLogitsProcessor("foo")


================================================
FILE: tests/processors/test_tensor_adapters.py
================================================
import pytest
from pytest import mark

import numpy as np
import torch

from outlines.processors.tensor_adapters import (
    NumpyTensorAdapter,
    TorchTensorAdapter,
    MLXTensorAdapter,
)

# MLX is optional: it is considered available only if both `mlx_lm` and
# `mlx.core` import and `mx.metal.is_available()` reports True.
try:
    import mlx_lm
    import mlx.core as mx

    HAS_MLX = mx.metal.is_available()
except ImportError:
    HAS_MLX = False


# One adapter instance per framework; the MLX adapter is registered only when
# MLX is available.
adapters = {
    "numpy": NumpyTensorAdapter(),
    "torch": TorchTensorAdapter(),
}
if HAS_MLX:
    adapters["mlx"] = MLXTensorAdapter()

# "mlx" is always parametrized; the helpers below call `pytest.skip` for it
# when MLX is not available.
frameworks = ["numpy", "torch", "mlx"]

def create_tensor(framework, shape, dtype=None):
    """Create a tensor of normally distributed values for `framework`.

    Skips the calling test when `framework` is "mlx" and MLX is unavailable.
    Returns `None` for an unrecognised framework. `dtype` is accepted but
    not used.
    """
    if framework == "mlx":
        if not HAS_MLX:
            pytest.skip("MLX not available")
        return mx.random.normal(shape)
    if framework == "numpy":
        return np.random.randn(*shape)
    if framework == "torch":
        return torch.randn(*shape)
    return None

def compare_tensors(framework, tensor1, tensor2):
    """Compare two tensors using the comparison function of `framework`.

    Uses `torch.allclose` for torch and `array_equal` for numpy and MLX.
    Skips the calling test when `framework` is "mlx" and MLX is unavailable.
    Returns `None` for an unrecognised framework.
    """
    if framework == "mlx":
        if not HAS_MLX:
            pytest.skip("MLX not available")
        return mx.array_equal(tensor1, tensor2)
    if framework == "numpy":
        return np.array_equal(tensor1, tensor2)
    if framework == "torch":
        return torch.allclose(tensor1, tensor2)
    return None


@pytest.mark.parametrize("framework", frameworks)
def test_tensor_adapter_shape(framework):
    """`shape` reports every dimension of 1D, 2D and 3D tensors."""
    for expected_shape in [(2,), (2, 3), (2, 2, 3)]:
        # Create the tensor first: it skips the test when MLX is unavailable
        tensor = create_tensor(framework, expected_shape)
        reported = adapters[framework].shape(tensor)
        assert len(reported) == len(expected_shape)
        for axis, size in enumerate(expected_shape):
            assert reported[axis] == size


@pytest.mark.parametrize("framework", frameworks)
def test_tensor_adapter_unsqueeze(framework):
    """`unsqueeze` prepends a dimension of size one."""
    for shape in [(2,), (2, 3)]:
        # Create the tensor first: it skips the test when MLX is unavailable
        tensor = create_tensor(framework, shape)
        expanded = adapters[framework].unsqueeze(tensor)
        assert expanded.shape == (1, *shape)


@pytest.mark.parametrize("framework", frameworks)
def test_tensor_adapter_squeeze(framework):
    """`squeeze` drops the leading dimension of size one."""
    # A squeezed single-element 1D tensor has no length
    single = create_tensor(framework, (1,))
    squeezed_single = adapters[framework].squeeze(single)
    with pytest.raises(TypeError):
        len(squeezed_single)

    # Higher-rank tensors lose only their leading axis
    for shape in [(1, 2), (1, 2, 3)]:
        tensor = create_tensor(framework, shape)
        squeezed = adapters[framework].squeeze(tensor)
        assert squeezed.shape == shape[1:]


@pytest.mark.parametrize("framework", frameworks)
def test_tensor_adapter_to_list(framework):
    """`to_list` produces nested Python lists mirroring the tensor shape."""
    # 1D: a flat list
    vector = create_tensor(framework, (2,))
    flat = adapters[framework].to_list(vector)
    assert isinstance(flat, list)
    assert len(flat) == 2

    # 2D: a list of rows
    matrix = create_tensor(framework, (2, 3))
    rows = adapters[framework].to_list(matrix)
    assert isinstance(rows, list)
    assert len(rows) == 2
    for row in rows:
        assert len(row) == 3

    # 3D: a list of lists of rows
    cube = create_tensor(framework, (2, 2, 3))
    planes = adapters[framework].to_list(cube)
    assert isinstance(planes, list)
    assert len(planes) == 2
    for plane in planes:
        assert len(plane) == 2
    for plane in planes:
        for row in plane:
            assert len(row) == 3


@pytest.mark.parametrize("framework", frameworks)
def test_tensor_adapter_to_scalar(framework):
    """`to_scalar` rejects multi-element tensors and returns a float otherwise."""
    # Torch signals the multi-element case with a different exception type
    many_values = create_tensor(framework, (2, 3))
    expected_error = RuntimeError if framework == "torch" else ValueError
    with pytest.raises(expected_error):
        adapters[framework].to_scalar(many_values)

    one_value = create_tensor(framework, (1, 1))
    assert isinstance(adapters[framework].to_scalar(one_value), float)


@pytest.mark.parametrize("framework", frameworks)
def test_tensor_adapter_full_like(framework):
    """`full_like` returns a same-shaped tensor filled with the given value."""
    template = create_tensor(framework, (2, 3))
    filled = adapters[framework].full_like(template, 0)
    assert filled.shape == (2, 3)
    positions = [(row, col) for row in range(2) for col in range(3)]
    for row, col in positions:
        assert filled[row, col] == 0


@pytest.mark.parametrize("framework", frameworks)
def test_tensor_adapter_concatenate(framework):
    """`concatenate` stacks tensors along the first axis, keeping their values."""
    # 1D: the second tensor's elements follow the first's
    first = create_tensor(framework, (2,))
    second = create_tensor(framework, (2,))
    joined = adapters[framework].concatenate([first, second])
    assert joined.shape == (4,)
    expected_values = [first[0], first[1], second[0], second[1]]
    for position, value in enumerate(expected_values):
        assert joined[position] == value

    # 2D: rows of the second tensor are appended below the first
    first = create_tensor(framework, (2, 3))
    second = create_tensor(framework, (2, 3))
    joined = adapters[framework].concatenate([first, second])
    assert joined.shape == (4, 3)
    for row in range(2):
        for col in range(3):
            assert joined[row, col] == first[row, col]
            assert joined[row + 2, col] == second[row, col]

    # 3D: same layout, one more trailing axis
    first = create_tensor(framework, (2, 2, 3))
    second = create_tensor(framework, (2, 2, 3))
    joined = adapters[framework].concatenate([first, second])
    assert joined.shape == (4, 2, 3)
    for block in range(2):
        for row in range(2):
            for col in range(3):
                assert joined[block, row, col] == first[block, row, col]
                assert joined[block + 2, row, col] == second[block, row, col]


@pytest.mark.parametrize("framework", frameworks)
def test_tensor_adapter_get_to_device(framework):
    """Moving a tensor to its own device leaves its values unchanged."""
    original = create_tensor(framework, (2, 3))
    adapter = adapters[framework]
    moved = adapter.to_device(original, adapter.get_device(original))

    # Torch tensors additionally expose the device type as a string
    if framework == "torch":
        assert isinstance(moved.device.type, str)
    assert compare_tensors(framework, moved, original)


@pytest.mark.parametrize("framework", frameworks)
def test_tensor_adapter_boolean_ones_like(framework):
    """`boolean_ones_like` returns a same-shaped tensor of truthy entries."""
    template = create_tensor(framework, (2, 3))
    all_true = adapters[framework].boolean_ones_like(template)

    assert all_true.shape == (2, 3)
    positions = [(row, col) for row in range(2) for col in range(3)]
    for row, col in positions:
        assert all_true[row, col]


@pytest.mark.parametrize("framework", frameworks)
def test_tensor_adapter_apply_mask(framework):
    """`apply_mask` writes the fill value where the mask is true, else keeps the input."""
    tensor = create_tensor(framework, (2, 3))

    # Build a random boolean mask with the same framework as the tensor
    if framework == "mlx":
        if not HAS_MLX:
            pytest.skip("MLX not available")
        mask = mx.random.normal((2, 3)) > 0
    elif framework == "numpy":
        mask = np.random.randn(2, 3) > 0
    elif framework == "torch":
        mask = torch.randn(2, 3) > 0

    fill_value = float("-inf")
    masked = adapters[framework].apply_mask(tensor, mask, fill_value)

    assert masked.shape == (2, 3)
    for row in range(2):
        for col in range(3):
            if mask[row, col]:
                assert masked[row, col] == float("-inf")
            else:
                assert masked[row, col] == tensor[row, col]


@pytest.mark.parametrize("framework", frameworks)
def test_tensor_adapter_argsort_descending(framework):
    """`argsort_descending` returns per-row indices ordering values high to low."""
    tensor = create_tensor(framework, (2, 3))
    order = adapters[framework].argsort_descending(tensor)

    assert order.shape == (2, 3)
    for row in range(2):
        ranked = [tensor[row][index] for index in order[row]]
        # Each value must be at least as large as the one that follows it
        for current, following in zip(ranked, ranked[1:]):
            assert current >= following


================================================
FILE: tests/test_applications.py
================================================
from typing import Any

import jinja2
import pytest
import transformers

from outlines import from_transformers
from outlines.applications import Application
from outlines.templates import Template


@pytest.fixture(scope="session")
def model():
    """Session-scoped transformers model built from the `gpt2` checkpoint."""
    hf_model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")
    hf_tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
    return from_transformers(hf_model, hf_tokenizer)


@pytest.fixture(scope="session")
def another_model():
    """A second, separately built `gpt2` model instance."""
    hf_model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")
    hf_tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
    return from_transformers(hf_model, hf_tokenizer)


def test_application_initialization():
    """A new `Application` stores its arguments and has no model or generator yet."""
    prompt_template = Template.from_string("Test {{ value }}")
    app = Application(prompt_template, None)

    assert app.template == prompt_template
    assert app.output_type == None  # noqa: E711 - mirrors the equality check
    assert app.model is None
    assert app.generator is None


def test_application_generator_no_model():
    """Calling an application with `None` as the model raises `ValueError`."""
    app = Application(Template.from_string("Test {{ value }}"), None)

    with pytest.raises(ValueError):
        app(None, {"value": "example"})


def test_application_template_call(model):
    """An application built from a `Template` returns a string when called."""
    app = Application(Template.from_string("Test {{ value }}"), None)
    output = app(model, {"value": "example"}, max_new_tokens=10)

    assert isinstance(output, str)


def test_application_callable_call(model):
    """A plain callable can be used in place of a `Template`."""

    def template(value):
        return f"Test {value}"

    app = Application(template, None)
    output = app(model, {"value": "example"}, max_new_tokens=10)

    assert isinstance(output, str)


def test_application_template_error(model):
    """Omitting a template variable raises jinja2's `UndefinedError`."""
    app = Application(Template.from_string("Test {{ value }}"), None)

    # The template expects `value`, which is not supplied here
    with pytest.raises(jinja2.exceptions.UndefinedError):
        app(model, {"foo": "bar"})


def test_application_generator_reuse(model, another_model):
    """The generator is kept for the same model and replaced for a new one."""
    app = Application(Template.from_string("Test {{ value }}"), None)
    template_args = {"value": "example"}

    app(model, template_args, max_new_tokens=10)
    initial_generator = app.generator
    initial_model = app.model

    # Same model: both the model and the generator stay the same
    app(model, template_args, max_new_tokens=10)
    assert app.model == initial_model
    assert app.generator == initial_generator

    # Different model: both are swapped out
    app(another_model, template_args, max_new_tokens=10)
    assert app.model == another_model
    assert app.model != initial_model
    assert app.generator != initial_generator


================================================
FILE: tests/test_cache.py
================================================
import os
import tempfile
import unittest
from importlib import reload

import diskcache
import pytest
from diskcache import Cache, UNKNOWN
from outlines.caching import CloudpickleDisk


@pytest.fixture
def temp_dir():
    """Yield the path of a temporary directory, removed after the test.

    `tempfile.TemporaryDirectory` is used instead of `tempfile.mkdtemp` so
    the directory and its contents are deleted on teardown rather than left
    behind on disk after every test run.
    """
    with tempfile.TemporaryDirectory() as directory:
        yield directory


@pytest.fixture
def refresh_environment():
    """Reset the state the cache tests depend on.

    Removes every entry of `sys.modules` whose name contains `outlines` and
    unsets the `OUTLINES_CACHE_DIR` environment variable if it is set. This
    is needed because the cache is held in a module variable.

    """
    import sys

    stale_modules = [name for name in sys.modules if "outlines" in name]
    for name in stale_modules:
        del sys.modules[name]

    os.environ.pop("OUTLINES_CACHE_DIR", None)


@pytest.fixture
def test_cache(refresh_environment):
    """Yield a cache decorator backed by a temporary directory.

    The cache is cleared once the test has run, and the directory is removed
    when the context manager exits.
    """
    with tempfile.TemporaryDirectory() as cache_dir:
        os.environ["OUTLINES_CACHE_DIR"] = cache_dir
        import outlines

        disk_cache = outlines.get_cache()
        assert disk_cache.directory == cache_dir

        yield outlines.caching.cache()

        disk_cache.clear()


def test_get_cache(test_cache):
    """The cache is a `diskcache.Cache` and memoizes calls by argument."""
    import outlines

    assert isinstance(outlines.get_cache(), diskcache.Cache)

    # `calls` grows only when the body of `f` really runs, so its length
    # tells us whether a call was served from the cache.
    calls = []

    @test_cache
    def f(x):
        calls.append(1)
        return x

    f(1)
    calls_after_first = len(calls)

    # Same argument: served from the cache
    f(1)
    assert len(calls) == calls_after_first

    # New argument: the function body runs again
    f(2)
    assert len(calls) == calls_after_first + 1


def test_disable_cache(test_cache):
    """Make sure that we can disable the cache."""
    import outlines

    outlines.disable_cache()

    # With the cache disabled, the body of `f` runs on every call, so
    # `calls` grows each time.
    calls = []

    @test_cache
    def f(x):
        calls.append(1)
        return x

    f(1)
    calls_after_first = len(calls)
    f(1)
    assert len(calls) == calls_after_first + 1


def test_clear_cache(test_cache):
    """Make sure that we can clear the cache."""
    import outlines

    calls = []

    @test_cache
    def f(x):
        calls.append(1)
        return x

    # The second call is served from the cache, so `calls` does not grow.
    f(1)
    calls_after_first = len(calls)
    f(1)
    assert len(calls) == calls_after_first

    # After clearing the cache the body of `f` has to run again.
    outlines.clear_cache()
    f(1)
    assert len(calls) == calls_after_first + 1


def test_version_upgrade_cache_invalidate(test_cache, mocker):
    """Ensure we can change the signature of a cached function if we upgrade the version

    The test patches `outlines._version.__version__` and redefines the cached
    function `foo` several times; the order of the steps matters because each
    step relies on what the previous one left in the cache.
    """

    import outlines.caching

    def simulate_restart_outlines():
        # clearing in-memory lru_cache which returns the diskcache in
        # order to simulate a reload, we're not clearing the diskcache itself
        outlines.caching.get_cache.cache_clear()

    # start from a known, patched version
    mocker.patch("outlines._version.__version__", new="0.0.0")
    simulate_restart_outlines()

    # initialize cache with signature of Tuple-of-3
    @test_cache
    def foo():
        return (1, 2, 3)

    a, b, c = foo()

    # "restart" outlines without upgrading version
    simulate_restart_outlines()

    # change signature to Tuple-of-2
    @test_cache
    def foo():
        return (1, 2)

    # assert without version upgrade, old, bad cache is used
    # (unpacking the stale 3-tuple into two names raises ValueError)
    with pytest.raises(ValueError):
        a, b = foo()

    # "restart" outlines WITH version upgrade
    mocker.patch("outlines._version.__version__", new="0.0.1")
    simulate_restart_outlines()

    # change signature to Tuple-of-2
    @test_cache
    def foo():
        return (1, 2)

    # assert with version upgrade, old cache is invalidated and new cache is used
    a, b = foo()


def test_cache_disabled_decorator(test_cache):
    """Ensure cache can be disabled in a local scope"""

    # `import unittest` alone does not guarantee that the `unittest.mock`
    # submodule is loaded, so import `MagicMock` explicitly instead of
    # relying on another package having imported it first.
    from unittest.mock import MagicMock

    from outlines.caching import cache_disabled

    mock = MagicMock()

    @test_cache
    def fn():
        mock()
        return 1

    # first call isn't cached
    fn()
    assert mock.call_count == 1

    # second call doesn't run fn, uses cache
    fn()
    assert mock.call_count == 1

    # cache_disabled context manager disables cache within scope
    with cache_disabled():
        fn()
    assert mock.call_count == 2  # called once in cache_disabled scope

    # scope has exited, cache is enabled again
    fn()
    assert mock.call_count == 2


@pytest.fixture
def temp_cache_dir():
    """Run a test with caching enabled and the cache in a temporary directory.

    `OUTLINES_CACHE_DIR` is pointed at a fresh temporary directory, the
    memoized `get_cache` is cleared, `outlines` is reloaded, and caching is
    forced on for the duration of the test.

    On teardown the previous caching flag and the previous value of
    `OUTLINES_CACHE_DIR` are restored, and `get_cache` is cleared again so
    that later tests are not handed a cache that lives in the directory
    being deleted.
    """
    import outlines.caching

    previous_cache_dir = os.environ.get("OUTLINES_CACHE_DIR")
    with tempfile.TemporaryDirectory() as tempdir:
        os.environ["OUTLINES_CACHE_DIR"] = tempdir
        outlines.caching.get_cache.cache_clear()
        reload(outlines)
        cache_status = outlines.caching._caching_enabled
        try:
            outlines.caching._caching_enabled = True
            yield
        finally:
            outlines.caching._caching_enabled = cache_status
            # Undo the environment change before the directory disappears
            if previous_cache_dir is None:
                os.environ.pop("OUTLINES_CACHE_DIR", None)
            else:
                os.environ["OUTLINES_CACHE_DIR"] = previous_cache_dir
            outlines.caching.get_cache.cache_clear()


================================================
FILE: tests/test_generator.py
================================================
import pytest
from typing import AsyncGenerator, Generator as TypingGenerator, Literal

import transformers
from outlines_core import Index, Vocabulary

import outlines
from outlines.backends.outlines_core import OutlinesCoreLogitsProcessor
from outlines.generator import (
    BlackBoxGenerator,
    SteerableGenerator,
    Generator,
    AsyncBlackBoxGenerator,
)
from outlines.models import AsyncVLLM, VLLM
from outlines.processors import (
    OutlinesLogitsProcessor,
)
from outlines.types import CFG
from tests.test_utils.mock_openai_client import (
    MockAsyncOpenAIClient,
    MockOpenAIClient,
)


# Model name passed to the mocked vLLM clients and matched in the expected
# request payloads below.
MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"


# We use a mocked vLLM model to test the black box generators.
# Each mock response is a pair `(expected request arguments, response)`;
# the second entry is the streaming variant of the first.
async_openai_client = MockAsyncOpenAIClient()
openai_client = MockOpenAIClient()
mock_responses = [
    (
        {
            'messages': [
                {'role': "user", 'content': 'Write a very short sentence'}
            ],
            'model': MODEL_NAME,
            'max_tokens': 10,
            'extra_body': {'guided_regex': '("[^"]*")'},
        },
        "Mock response"
    ),
    (
        {
            'messages': [
                {'role': "user", 'content': 'Write a very short sentence'}
            ],
            'model': MODEL_NAME,
            'max_tokens': 10,
            'extra_body': {'guided_regex': '("[^"]*")'},
            'stream': True,
        },
        ["Mock", "response"]
    ),
]
# Register the same responses on both the sync and the async client.
async_openai_client.add_mock_responses(mock_responses)
openai_client.add_mock_responses(mock_responses)


@pytest.fixture(scope="session")
def steerable_model():
    """Session-scoped transformers model built from `erwanf/gpt2-mini`."""
    hf_model = transformers.AutoModelForCausalLM.from_pretrained("erwanf/gpt2-mini")
    hf_tokenizer = transformers.AutoTokenizer.from_pretrained("erwanf/gpt2-mini")
    return outlines.from_transformers(hf_model, hf_tokenizer)


@pytest.fixture(scope="session")
def sample_processor():
    """A torch logits processor built from an index over the regex `[0-9]{3}`."""
    gpt2_vocabulary = Vocabulary.from_pretrained("openai-community/gpt2")
    three_digit_index = Index(r"[0-9]{3}", gpt2_vocabulary)
    return OutlinesCoreLogitsProcessor(three_digit_index, "torch")


@pytest.fixture(scope="module")
def black_box_sync_model():
    """A `VLLM` model backed by the mocked synchronous OpenAI client."""
    sync_model = VLLM(openai_client, MODEL_NAME)
    return sync_model


@pytest.fixture(scope="module")
def black_box_async_model():
    """An `AsyncVLLM` model backed by the mocked asynchronous OpenAI client."""
    async_model = AsyncVLLM(async_openai_client, MODEL_NAME)
    return async_model


# SteerableGenerator


def test_steerable_generator_init_valid_processor(steerable_model, sample_processor):
    """`from_processor` stores the given processor and model on the generator."""
    built = SteerableGenerator.from_processor(steerable_model, sample_processor)
    assert built.logits_processor == sample_processor
    assert built.model == steerable_model


def test_steerable_generator_init_cfg_output_type(steerable_model):
    """A `CFG` output type yields a generator holding an `OutlinesLogitsProcessor`."""
    grammar = CFG('start: "a"')
    built = SteerableGenerator(steerable_model, grammar)
    assert built.model == steerable_model
    assert isinstance(built.logits_processor, OutlinesLogitsProcessor)


def test_steerable_generator_init_other_output_type(steerable_model):
    """A `Literal` output type yields a generator holding an `OutlinesLogitsProcessor`."""
    choices = Literal["foo", "bar"]
    built = SteerableGenerator(steerable_model, choices)
    assert built.model == steerable_model
    assert isinstance(built.logits_processor, OutlinesLogitsProcessor)


def test_steerable_generator_init_invalid_output_type(steerable_model, sample_processor):
    """Passing a processor where an output type is expected raises `ValueError`."""
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        SteerableGenerator(steerable_model, sample_processor)


def test_steerable_generator_call(steerable_model):
    """Calling the steerable generator returns a string."""
    choices = Literal["foo", "bar"]
    output = SteerableGenerator(steerable_model, choices)("foo", max_new_tokens=10)
    assert isinstance(output, str)


def test_steerable_generator_stream(steerable_model):
    """Streaming with the steerable test model raises `NotImplementedError`."""
    # Build the generator outside of the `raises` block so that an error
    # raised during construction cannot satisfy the expectation by accident.
    generator = SteerableGenerator(steerable_model, Literal["foo", "bar"])
    with pytest.raises(NotImplementedError):
        result = generator.stream("foo", max_tokens=10)
        # Only reached if `stream` does not raise eagerly; in that case
        # consuming the first item is what has to raise.
        assert isinstance(result, TypingGenerator)
        assert isinstance(next(result), str)


# BlackBoxGenerator


def test_black_box_generator_init(black_box_sync_model):
    """The black box generator stores the model and output type it was given."""
    built = BlackBoxGenerator(black_box_sync_model, Literal["foo", "bar"])
    assert built.model == black_box_sync_model
    assert built.output_type == Literal["foo", "bar"]

def test_black_box_generator_call(black_box_sync_model):
    """Calling the black box generator with a mocked request returns a string."""
    ask = BlackBoxGenerator(black_box_sync_model, str)
    answer = ask("Write a very short sentence", max_tokens=10)
    assert isinstance(answer, str)


def test_black_box_generator_stream(black_box_sync_model):
    """Streaming returns a generator whose first item is a string."""
    streamer = BlackBoxGenerator(black_box_sync_model, str)
    chunks = streamer.stream("Write a very short sentence", max_tokens=10)
    assert isinstance(chunks, TypingGenerator)
    first_chunk = next(chunks)
    assert isinstance(first_chunk, str)


# AsyncBlackBoxGenerator


def test_async_black_box_generator_init(black_box_async_model):
    """The async black box generator stores the model and output type it was given."""
    built = AsyncBlackBoxGenerator(black_box_async_model, Literal["foo", "bar"])
    assert built.model == black_box_async_model
    assert built.output_type == Literal["foo", "bar"]


@pytest.mark.asyncio
async def test_async_black_box_generator_call(black_box_async_model):
    """Awaiting the async black box generator returns a string."""
    ask = AsyncBlackBoxGenerator(black_box_async_model, str)
    answer = await ask("Write a very short sentence", max_tokens=10)
    assert isinstance(answer, str)


@pytest.mark.asyncio
async def test_async_black_box_generator_stream(black_box_async_model):
    """Async streaming returns an async generator that yields strings."""
    streamer = AsyncBlackBoxGenerator(black_box_async_model, str)
    chunks = streamer.stream("Write a very short sentence", max_tokens=10)
    assert isinstance(chunks, AsyncGenerator)
    async for first_chunk in chunks:
        assert isinstance(first_chunk, str)
        # Only the first chunk is checked.
        break


# Generator


def test_generator_init_no_model():
    """`Generator` raises `ValueError` when the model is `None`."""
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        Generator(None, Literal["foo", "bar"])


def test_generator_init_multiple_output_type(steerable_model, sample_processor):
    """Giving both an output type and a processor raises `ValueError`."""
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        Generator(
            steerable_model,
            Literal["foo", "bar"],
            processor=sample_processor,
        )


def test_generator_steerable_output_type(steerable_model):
    """With a steerable model and an output type, `Generator` builds a `SteerableGenerator`."""
    built = Generator(steerable_model, Literal["foo", "bar"])
    assert isinstance(built, SteerableGenerator)
    assert built.model == steerable_model
    assert isinstance(built.logits_processor, OutlinesLogitsProcessor)


def test_generator_steerable_processor(steerable_model, sample_processor):
    """With a steerable model and a processor, `Generator` builds a `SteerableGenerator`."""
    built = Generator(steerable_model, processor=sample_processor)
    assert isinstance(built, SteerableGenerator)
    assert built.model == steerable_model
    assert isinstance(built.logits_processor, OutlinesLogitsProcessor)


def test_generator_black_box_sync_output_type(black_box_sync_model):
    """With a sync black box model, `Generator` builds a `BlackBoxGenerator`."""
    built = Generator(black_box_sync_model, Literal["foo", "bar"])
    assert isinstance(built, BlackBoxGenerator)
    assert built.model == black_box_sync_model
    assert built.output_type == Literal["foo", "bar"]


def test_generator_black_box_sync_processor(black_box_sync_model, sample_processor):
    """Giving a processor with a sync black box model raises `NotImplementedError`."""
    expect_not_implemented = pytest.raises(NotImplementedError)
    with expect_not_implemented:
        Generator(black_box_sync_model, processor=sample_processor)


def test_generator_black_box_async_output_type(black_box_async_model):
    """With an async black box model, `Generator` builds an `AsyncBlackBoxGenerator`."""
    built = Generator(black_box_async_model, Literal["foo", "bar"])
    assert isinstance(built, AsyncBlackBoxGenerator)
    assert built.model == black_box_async_model
    assert built.output_type == Literal["foo", "bar"]


def test_generator_black_box_async_processor(black_box_async_model, sample_processor):
    """Giving a processor with an async black box model raises `NotImplementedError`."""
    expect_not_implemented = pytest.raises(NotImplementedError)
    with expect_not_implemented:
        Generator(black_box_async_model, processor=sample_processor)


================================================
FILE: tests/test_inputs.py
================================================
"""Unit tests for the inputs module."""

import base64
import tempfile
from io import BytesIO
from typing import Dict, List, Any

import pytest
from PIL import Image as PILImage

from outlines.inputs import Image, Video, Audio, Chat


@pytest.fixture
def image_input():
    """Return an `Image` input wrapping a 100x100 red PIL image tagged as PNG."""
    image = PILImage.new("RGB", (100, 100), color="red")
    # `Image` rejects images without a format (see
    # `test_image_initialization_invalid`), so one is set explicitly.
    image.format = "PNG"
    # The previous version also saved the image into a throwaway buffer that
    # was never read; that dead work has been dropped.
    return Image(image=image)


def test_image_initialization():
    """`Image` exposes the PIL image, its MIME type and its base64 encoding."""
    # (PIL format, fill color, expected MIME type)
    cases = [
        ("PNG", "red", "image/png"),
        ("JPEG", "blue", "image/jpeg"),
    ]
    for pil_format, color, mime_type in cases:
        pil_image = PILImage.new("RGB", (100, 100), color=color)
        pil_image.format = pil_format
        encoded = BytesIO()
        pil_image.save(encoded, format=pil_format)
        wrapped = Image(image=pil_image)

        assert wrapped.image == pil_image
        assert wrapped.image_format == mime_type
        assert wrapped.image_str == base64.b64encode(encoded.getvalue()).decode("utf-8")


def test_image_initialization_invalid():
    """`Image` raises `TypeError` when the PIL image has no usable format."""
    expected_error = "Could not read the format of the image"

    # Format never set
    formatless = PILImage.new("RGB", (100, 100), color="yellow")
    with pytest.raises(TypeError, match=expected_error):
        Image(image=formatless)

    # Format set to an empty string
    empty_format = PILImage.new("RGB", (100, 100), color="orange")
    empty_format.format = ""
    with pytest.raises(TypeError, match=expected_error):
        Image(image=empty_format)


def test_video_initialization():
    """`Video` stores the value it was given in its `video` attribute."""
    payload = "foo"
    wrapped = Video(video=payload)
    assert wrapped.video == payload


def test_audio_initialization():
    """`Audio` stores the value it was given in its `audio` attribute."""
    payload = "foo"
    wrapped = Audio(audio=payload)
    assert wrapped.audio == payload


def test_chat_initialization():
    """Check `messages`, `str` and `repr` of an empty and of a populated `Chat`."""
    # Without any message
    empty_chat = Chat()
    assert empty_chat.messages == []
    assert len(empty_chat.messages) == 0
    assert str(empty_chat) == ""
    assert repr(empty_chat) == "Chat(messages=[])"

    # With an initial history
    history = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Hello!"},
        {"role": "assistant", "content": "Hi there!"}
    ]
    expected_str = (
        "{'role': 'system', 'content': 'You are a helpful assistant.'}\n"
        "{'role': 'user', 'content': 'Hello!'}\n"
        "{'role': 'assistant', 'content': 'Hi there!'}"
    )
    expected_repr = (
        "Chat(messages=["
        "{'role': 'system', 'content': 'You are a helpful assistant.'}, "
        "{'role': 'user', 'content': 'Hello!'}, "
        "{'role': 'assistant', 'content': 'Hi there!'}"
        "])"
    )
    populated_chat = Chat(messages=history)
    assert populated_chat.messages == history
    assert len(populated_chat.messages) == 3
    assert str(populated_chat) == expected_str
    assert repr(populated_chat) == expected_repr


def test_chat_append():
    """`append` adds the given message at the end of an empty chat."""
    conversation = Chat(messages=[])
    greeting = {"role": "user", "content": "Hello"}
    conversation.append(greeting)
    assert len(conversation.messages) == 1
    assert conversation.messages[0] == greeting

def test_chat_extend():
    """`extend` adds all the given messages to an empty chat, in order."""
    conversation = Chat(messages=[])
    new_messages = [
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": "Hi"},
    ]
    conversation.extend(new_messages)
    assert len(conversation.messages) == 2
    assert conversation.messages == new_messages

def test_chat_pop():
    """`pop` removes and returns the last message; it raises on an empty chat."""
    # Non-empty chat: the last message comes out, the first one stays
    history = [
        {"role": "user", "content": "Hello"},
        {"role": "assistant", "content": "Hi"},
    ]
    conversation = Chat(messages=list(history))
    last_message = conversation.pop()
    assert last_message == {"role": "assistant", "content": "Hi"}
    assert len(conversation.messages) == 1
    assert conversation.messages[0] == {"role": "user", "content": "Hello"}

    # Empty chat: nothing to pop
    conversation = Chat(messages=[])
    with pytest.raises(IndexError):
        conversation.pop()


def test_chat_add_system_message(image_input):
    """`add_system_message` stores a string, a list or a list of typed dicts as-is."""
    # Each factory builds a fresh content value so that the value passed in
    # and the expected value are distinct objects.
    content_factories = (
        lambda: "You are a helpful assistant.",
        lambda: ["prompt", image_input],
        lambda: [{"type": "text", "text": "prompt"}, {"type": "image", "image": image_input}],
    )
    for make_content in content_factories:
        conversation = Chat(messages=[])
        conversation.add_system_message(make_content())
        assert len(conversation.messages) == 1
        assert conversation.messages[0]["role"] == "system"
        assert conversation.messages[0]["content"] == make_content()


def test_add_user_message_string(image_input):
    """`add_user_message` stores a string, a list or a list of typed dicts as-is."""
    # Each factory builds a fresh content value so that the value passed in
    # and the expected value are distinct objects.
    content_factories = (
        lambda: "Hello, how are you?",
        lambda: ["prompt", image_input],
        lambda: [{"type": "text", "text": "prompt"}, {"type": "image", "image": image_input}],
    )
    for make_content in content_factories:
        conversation = Chat(messages=[])
        conversation.add_user_message(make_content())
        assert len(conversation.messages) == 1
        assert conversation.messages[0]["role"] == "user"
        assert conversation.messages[0]["content"] == make_content()


def test_add_assistant_message_string(image_input):
    """`add_assistant_message` stores a string, a list or a list of typed dicts as-is."""
    # Each factory builds a fresh content value so that the value passed in
    # and the expected value are distinct objects.
    content_factories = (
        lambda: "I'm doing well, thank you!",
        lambda: ["prompt", image_input],
        lambda: [{"type": "text", "text": "prompt"}, {"type": "image", "image": image_input}],
    )
    for make_content in content_factories:
        conversation = Chat(messages=[])
        conversation.add_assistant_message(make_content())
        assert len(conversation.messages) == 1
        assert conversation.messages[0]["role"] == "assistant"
        assert conversation.messages[0]["content"] == make_content()


================================================
FILE: tests/test_templates.py
================================================
import base64
import os
import tempfile
from typing import Optional

import pytest
from PIL import Image as PILImage
from io import BytesIO
from pydantic import BaseModel, Field

from outlines.inputs import Image
from outlines.templates import (
    Template,
    build_template_from_string,
    Vision,
    get_fn_name,
    get_fn_args,
    get_fn_description,
    get_fn_source,
    get_fn_signature,
    get_schema,
)


# NOTE: the tests below assert the exact source text, docstring and signature
# of this function; do not edit the lines of the definition itself.
def sample_function(x, y=2):
    """This is a sample function."""
    return x + y

# NOTE: the tests below assert the docstring and the annotated signature of
# this function; keep both unchanged.
def function_with_annotations(x: int, y: str) -> str:
    """Function with annotations."""
    return f"{x} {y}"

# NOTE: intentionally left without a docstring; `test_get_fn_description`
# expects an empty description for this function.
def function_with_no_docstring(x, y):
    return x * y

# Callable object used by `test_get_fn_name`, which expects the class name
# `CallableClass` to be returned for an instance.
class CallableClass:
    def __call__(self):
        pass

# Minimal pydantic model used by `test_get_schema`; its single field has no
# description on purpose (the expected schema output is `"<foo>"`).
class PydanticClass(BaseModel):
    foo: str


def test_vision_initialization():
    """The deprecated `Vision` returns `[prompt, Image]` with the image encoded."""
    # Small in-memory image tagged as PNG
    pil_image = PILImage.new("RGB", (10, 10), color="red")
    pil_image.format = "PNG"

    # Building the object must emit a deprecation warning
    with pytest.deprecated_call():
        vision = Vision(prompt="Test prompt", image=pil_image)

    # The result is a two-item list: the prompt, then the wrapped image
    assert isinstance(vision, list)
    assert len(vision) == 2
    assert vision[0] == "Test prompt"
    image_part = vision[1]
    assert isinstance(image_part, Image)

    # The image string is the base64 encoding of the saved image
    encoded = BytesIO()
    pil_image.save(encoded, format=pil_image.format)
    assert image_part.image_str == base64.b64encode(encoded.getvalue()).decode("utf-8")

    # The image format is reported as a MIME type
    assert image_part.image_format == "image/png"


def test_vision_invalid_image_format():
    """`Vision` raises `TypeError` for an image whose format was never set."""
    formatless = PILImage.new("RGB", (10, 10), color="blue")

    # The deprecation warning and the error are both expected
    with pytest.deprecated_call(), pytest.raises(TypeError, match="Could not read the format"):
        Vision(prompt="Test prompt", image=formatless)


def render(content: str, filters: Optional[dict] = None, **kwargs):
    """Build a template from `content` with `filters` and render it with `kwargs`.

    A missing (or empty) `filters` argument is replaced by an empty dict.
    """
    active_filters = filters if filters else {}
    return build_template_from_string(content, active_filters).render(kwargs)


def test_render():
    """Check how leading newlines, indentation and trailing blank lines are rendered."""
    # Leading newline and indentation are removed
    tpl = """
    A test string"""
    assert render(tpl) == "A test string"

    # A single trailing line break is removed as well
    tpl = """
    A test string
    """
    assert render(tpl) == "A test string"

    # The common indentation of several lines is removed
    tpl = """
        A test
        Another test
    """
    assert render(tpl) == "A test\nAnother test"

    # Same when the first line starts right after the opening quotes
    tpl = """A test
        Another test
    """
    assert render(tpl) == "A test\nAnother test"

    # Indentation relative to the first line is preserved
    tpl = """
        A test line
            An indented line
    """
    assert render(tpl) == "A test line\n    An indented line"

    # An extra blank line at the end results in one trailing line break
    tpl = """
        A test line
            An indented line

    """
    assert render(tpl) == "A test line\n    An indented line\n"


def test_render_escaped_linebreak():
    """Check that lines joined with a trailing backslash render as a single line."""
    # The backslashes below are line continuations inside regular (non-raw)
    # string literals, so the joined lines contain no newline character.
    tpl = """
        A long test \
        that we break \
        in several lines
    """
    assert render(tpl) == "A long test that we break in several lines"

    # Real line breaks, and the indentation that follows them, are kept
    tpl = """
        Break in \
        several lines \
        But respect the indentation
            on line breaks.
        And after everything \
        Goes back to normal
    """
    assert (
        render(tpl)
        == "Break in several lines But respect the indentation\n    on line breaks.\nAnd after everything Goes back to normal"
    )


def test_render_jinja():
    """Make sure that we can use basic Jinja2 syntax, and give examples
    of how we can use it for basic use cases.
    """

    # Notice the newline after the end of the loop
    examples = ["one", "two"]
    prompt = render(
        """
        {% for e in examples %}
        Example: {{e}}
        {% endfor -%}""",
        examples=examples,
    )
    assert prompt == "Example: one\nExample: two\n"

    # We can remove the newline by closing with -%}
    examples = ["one", "two"]
    prompt = render(
        """
        {% for e in examples %}
        Example: {{e}}
        {% endfor -%}

        Final""",
        examples=examples,
    )
    assert prompt == "Example: one\nExample: two\nFinal"

    # Same for conditionals
    tpl = """
        {% if is_true %}
        true
        {% endif -%}

        final
        """
    assert render(tpl, is_true=True) == "true\nfinal"
    assert render(tpl, is_true=False) == "final"


def test_render_filters():
    """Check the template filters used below and a user-supplied custom filter."""
    # NOTE: the exact source text of `foo` is asserted by the `source` filter
    # check below, so its definition must stay unchanged.
    def foo(bar: str) -> str:
        """This is a sample function."""
        return bar

    class PydanticClass(BaseModel):
        foo: str = Field(description="bar")

    def custom_filter(x: str) -> str:
        return x.upper()

    # name filter
    tpl = """
    {{ func | name }}
    """
    assert render(tpl, func=foo) == "foo"

    # description filter
    tpl = """
    {{ func | description }}
    """
    assert render(tpl, func=foo) == "This is a sample function."

    # source filter
    tpl = """
    {{ func | source }}
    """
    assert render(tpl, func=foo) == 'def foo(bar: str) -> str:\n    """This is a sample function."""\n    return bar\n'

    # signature filter
    tpl = """
    {{ func | signature }}
    """
    assert render(tpl, func=foo) == "bar: str"

    # args filter
    tpl = """
    {{ func | args }}
    """
    assert render(tpl, func=foo) == "bar: str"

    # schema filter
    tpl = """
    {{ schema | schema }}
    """
    assert render(tpl, schema=PydanticClass) == '{\n  "foo": "bar"\n}'

    # custom filters
    tpl = """
    {{ name | custom_filter }}
    """
    assert render(tpl, {"custom_filter": custom_filter}, name="John") == "JOHN"


@pytest.fixture
def temp_prompt_file():
    """Yield the path of a prompt template stored in a temporary directory.

    Three files are written: a base template, an included snippet that lists
    examples, and the prompt itself, which extends the former and includes
    the latter. The directory and its content are removed once the test
    using the fixture is done (the previous `mkdtemp` version never cleaned
    it up).
    """
    with tempfile.TemporaryDirectory() as test_dir:
        base_template_path = os.path.join(test_dir, "base_template.txt")
        with open(base_template_path, "w") as f:
            f.write(
                """{% block content %}{% endblock %}
"""
            )

        include_file_path = os.path.join(test_dir, "include.txt")
        with open(include_file_path, "w") as f:
            f.write(
                """{% for example in examples %}
- Q: {{ example.question }}
- A: {{ example.answer }}
{% endfor %}
"""
            )

        prompt_file_path = os.path.join(test_dir, "prompt.txt")
        with open(prompt_file_path, "w") as f:
            f.write(
                """{% extends "base_template.txt" %}

{% block content %}
Here is a prompt with examples:

{% include "include.txt" %}

Now please answer the following question:

Q: {{ question }}
A:
{% endblock %}
"""
            )
        # Yield inside the `with` block so that the files still exist while
        # the test runs; cleanup happens when the fixture is torn down.
        yield prompt_file_path


def test_prompt_from_file(temp_prompt_file):
    """A template loaded from a file resolves its `extends` and `include` directives."""
    examples = [
        {"question": "What is the capital of France?", "answer": "Paris"},
        {"question": "What is 2 + 2?", "answer": "4"},
    ]
    question = "What is the Earth's diameter?"
    expected = """Here is a prompt with examples:

- Q: What is the capital of France?
- A: Paris
- Q: What is 2 + 2?
- A: 4

Now please answer the following question:

Q: What is the Earth's diameter?
A:
"""
    template = Template.from_file(temp_prompt_file)
    output = template(examples=examples, question=question)
    # Surrounding whitespace is ignored in the comparison
    assert output.strip() == expected.strip()


def test_prompt_from_str():
    """A template built from a string renders its variables when called."""
    template = Template.from_string(
        """
    Hello, {{ name }}!
    """
    )
    greeting = template(name="World")
    assert greeting == "Hello, World!"


def test_template_from_str_with_extra_linebreaks():
    """Extra blank lines at the end of the content render as one trailing line break."""
    content = """
    Hello, {{ name }}!


    """
    template = build_template_from_string(content)
    assert template.render(name="World") == "Hello, World!\n"


def test_get_fn_name():
    """`get_fn_name` handles functions, lambdas and callable instances; it rejects non-callables."""
    expect_type_error = pytest.raises(TypeError)
    with expect_type_error:
        get_fn_name(1)
    assert get_fn_name(sample_function) == "sample_function"
    assert get_fn_name(function_with_annotations) == "function_with_annotations"
    # Lambdas are reported with their generic name
    assert get_fn_name(lambda x: x) == "<lambda>"
    # Callable instances are reported with the name of their class
    assert get_fn_name(CallableClass()) == "CallableClass"


def test_get_fn_args():
    """`get_fn_args` returns the argument list as text; it rejects non-callables."""
    expect_type_error = pytest.raises(TypeError)
    with expect_type_error:
        get_fn_args(1)
    plain_args = get_fn_args(sample_function)
    annotated_args = get_fn_args(function_with_annotations)
    assert plain_args == "x, y=2"
    assert annotated_args == "x: int, y: str"


def test_get_fn_description():
    """`get_fn_description` returns the docstring, or an empty string when there is none."""
    expect_type_error = pytest.raises(TypeError)
    with expect_type_error:
        get_fn_description(1)
    assert get_fn_description(sample_function) == "This is a sample function."
    assert get_fn_description(function_with_annotations) == "Function with annotations."
    # No docstring at all
    assert get_fn_description(function_with_no_docstring) == ""


def test_get_fn_source():
    """`get_fn_source` returns the source text of a function; it rejects non-callables."""
    expected_error = "The `source` filter only applies to callables."
    with pytest.raises(TypeError, match=expected_error):
        get_fn_source(1)
    expected_source = "\n".join(
        [
            'def sample_function(x, y=2):',
            '    """This is a sample function."""',
            '    return x + y',
        ]
    )
    # Surrounding whitespace of the returned source is ignored
    assert get_fn_source(sample_function).strip() == expected_source


def test_get_fn_signature():
    """`get_fn_signature` returns the parameters as text; it rejects non-callables."""
    # The expected message mentions the `source` filter, as asserted here.
    expected_error = "The `source` filter only applies to callables."
    with pytest.raises(TypeError, match=expected_error):
        get_fn_signature(1)
    assert get_fn_signature(sample_function) == "x, y=2"
    assert get_fn_signature(function_with_annotations) == "x: int, y: str"


def test_get_schema():
    """`get_schema` serializes dicts and pydantic models; other inputs are not implemented."""
    expect_not_implemented = pytest.raises(NotImplementedError)
    with expect_not_implemented:
        get_schema(1)

    # A dict is dumped as indented JSON
    assert get_schema({"foo": "bar"}) == '{\n  "foo": "bar"\n}'

    # A pydantic field without description is rendered as `<field name>`
    assert get_schema(PydanticClass) == '{\n  "foo": "<foo>"\n}'


================================================
FILE: tests/test_utils/mock_lmstudio_client.py
================================================
import json
from typing import Any, Dict, List, Optional, Tuple

from tests.test_utils.utils import hash_dict


def normalize_for_hash(obj):
    """Return a canonical form of `obj` suitable for consistent hashing.

    An object whose string representation starts with `Chat.from_history(`
    is replaced by a dict holding its normalized messages (the original
    note explains that lms.Chat objects carry identifiers that change
    between instances). Dicts and lists are normalized recursively; any
    other value is returned unchanged.
    """
    chat_prefix = "Chat.from_history("
    text = str(obj)
    if text.startswith(chat_prefix):
        # The JSON payload sits between the prefix and the closing parenthesis
        payload = json.loads(text[len(chat_prefix):-1])
        return {
            "type": "lms.Chat",
            "messages": normalize_lmstudio_messages(payload.get("messages", [])),
        }
    if isinstance(obj, dict):
        return {key: normalize_for_hash(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [normalize_for_hash(element) for element in obj]
    return obj


def normalize_lmstudio_messages(messages):
    """Reduce each message to its `role` and normalized `content` for hashing.

    Missing `role` or `content` keys default to an empty string.
    """
    return [
        {
            "role": message.get("role", ""),
            "content": normalize_lmstudio_content(message.get("content", "")),
        }
        for message in messages
    ]


def normalize_lmstudio_content(content):
    """Return a canonical form of a message content for hashing.

    Strings are returned unchanged and non-list values are converted with
    `str`. In a list, `text` items keep only their text, `file` items keep
    only their `sizeBytes`, other dicts are kept as they are and non-dict
    items are converted with `str`.
    """
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        return str(content)

    normalized = []
    for part in content:
        if not isinstance(part, dict):
            normalized.append(str(part))
            continue
        kind = part.get("type")
        if kind == "text":
            normalized.append({"type": "text", "text": part.get("text", "")})
        elif kind == "file":
            normalized.append({"type": "file", "sizeBytes": part.get("sizeBytes", 0)})
        else:
            normalized.append(part)
    return normalized


def hash_lmstudio_request(data: dict) -> str:
    """Normalize a request dict (including lms.Chat objects) and return its hash."""
    return hash_dict(normalize_for_hash(data))


class MockLMStudioResponse:
    """Stand-in for an LMStudio response; it only carries a `content` attribute."""

    def __init__(self, content: str):
        # Stored as given, without any conversion
        self.content: str = content


class MockLMStudioModel:
    """Mock for LMStudio model object returned by client.llm.model()

    Responses are looked up in the mapping given at construction time, using
    the hash of the request (`messages` plus keyword arguments) as key.
    """

    def __init__(self, mock_responses: Dict[str, Any]):
        self._mock_responses = mock_responses

    def _lookup(self, messages, kwargs):
        """Return the response registered for the request or raise `ValueError`."""
        request_key = hash_lmstudio_request({"messages": messages, **kwargs})
        # Test for the key itself rather than the truthiness of the value so
        # that registered empty responses ("" or []) are not reported missing.
        if request_key not in self._mock_responses:
            raise ValueError(f"No response found for {{'messages': {messages}, **{kwargs}}}")
        return self._mock_responses[request_key]

    def respond(self, messages, **kwargs):
        """Return the registered response wrapped in a `MockLMStudioResponse`."""
        return MockLMStudioResponse(self._lookup(messages, kwargs))

    def respond_stream(self, messages, **kwargs):
        """Yield each chunk of the registered response as a `MockLMStudioResponse`.

        This is a generator: the lookup (and a possible `ValueError`) only
        happens once iteration starts.
        """
        for chunk in self._lookup(messages, kwargs):
            yield MockLMStudioResponse(chunk)


class MockLMStudioLLM:
    """Stand-in for the `llm` attribute of the mocked sync client."""

    def __init__(self, mock_responses: Dict[str, Any]):
        # Kept by reference and handed over to every model created below
        self._mock_responses = mock_responses

    def model(self, model_key=None):
        """Return a new mock model sharing the registered responses; `model_key` is ignored."""
        mock_model = MockLMStudioModel(self._mock_responses)
        return mock_model


class MockLMStudioClient:
    """Stand-in for the LMStudio `Client`, used to test the LMStudio model.

    `llm` stays `None` until mock responses have been registered.
    """

    def __init__(self):
        self.llm: Optional[MockLMStudioLLM] = None
        self._mock_responses: Dict[str, Any] = {}

    def add_mock_responses(self, mocks: List[Tuple[dict, Any]]):
        """Register `(request kwargs, response)` pairs and (re)create `llm`."""
        for request, reply in mocks:
            self._mock_responses[hash_lmstudio_request(request)] = reply
        self.llm = MockLMStudioLLM(self._mock_responses)


class MockAsyncLMStudioModel:
    """Mock for async LMStudio model object returned by client.llm.model()

    Responses are looked up in the mapping given at construction time, using
    the hash of the request (`messages` plus keyword arguments) as key.
    """

    def __init__(self, mock_responses: Dict[str, Any]):
        self._mock_responses = mock_responses

    def _lookup(self, messages, kwargs):
        """Return the response registered for the request or raise `ValueError`."""
        request_key = hash_lmstudio_request({"messages": messages, **kwargs})
        # Test for the key itself rather than the truthiness of the value so
        # that registered empty responses ("" or []) are not reported missing.
        if request_key not in self._mock_responses:
            raise ValueError(f"No response found for {{'messages': {messages}, **{kwargs}}}")
        return self._mock_responses[request_key]

    async def respond(self, messages, **kwargs):
        """Return the registered response wrapped in a `MockLMStudioResponse`."""
        return MockLMStudioResponse(self._lookup(messages, kwargs))

    async def respond_stream(self, messages, **kwargs):
        """Return an async iterator (must be awaited first, then iterated)."""
        # The lookup happens when the coroutine is awaited, before iteration.
        response = self._lookup(messages, kwargs)

        async def _stream():
            for chunk in response:
                yield MockLMStudioResponse(chunk)

        return _stream()


class MockAsyncLMStudioLLM:
    """Stand-in for the `llm` attribute of the mocked async client."""

    def __init__(self, mock_responses: Dict[str, Any]):
        # Kept by reference and handed over to every model created below
        self._mock_responses = mock_responses

    async def model(self, model_key=None):
        """Return a new async mock model sharing the registered responses; `model_key` is ignored."""
        mock_model = MockAsyncLMStudioModel(self._mock_responses)
        return mock_model


class MockAsyncLMStudioClient:
    """Stand-in for the LMStudio `AsyncClient`, used to test the AsyncLMStudio model.

    `llm` stays `None` until mock responses have been registered. The client
    can be used as an async context manager; `_context_entered` records
    whether it is currently entered.
    """

    def __init__(self):
        self._context_entered = False
        self.llm: Optional[MockAsyncLMStudioLLM] = None
        self._mock_responses: Dict[str, Any] = {}

    def add_mock_responses(self, mocks: List[Tuple[dict, Any]]):
        """Register `(request kwargs, response)` pairs and (re)create `llm`."""
        for request, reply in mocks:
            self._mock_responses[hash_lmstudio_request(request)] = reply
        self.llm = MockAsyncLMStudioLLM(self._mock_responses)

    async def __aenter__(self):
        """Mark the context as entered and return the client itself."""
        self._context_entered = True
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Mark the context as left; returning `False` lets exceptions propagate."""
        self._context_entered = False
        return False


================================================
FILE: tests/test_utils/mock_openai_client.py
================================================
from typing import List, Dict, Any, Optional
from unittest.mock import MagicMock

from tests.test_utils.utils import hash_dict


class MockChoice:
    """Stand-in for a completion choice.

    The same `content` is exposed through `message.content` and
    `delta.content`; `message.refusal` and `finish_reason` are stored as
    given.
    """

    def __init__(
        self,
        content: str,
        finish_reason: str = "stop",
        refusal: Optional[str] = None
    ):
        message = MagicMock()
        message.content = content
        message.refusal = refusal

        delta = MagicMock()
        delta.content = content

        self.message = message
        self.finish_reason = finish_reason
        self.delta = delta


class MockCompletionResponse:
    """Stand-in for a non-streaming completion response; it only carries `choices`."""

    def __init__(self, choices: List[MockChoice]):
        # Stored as given, without copying
        self.choices: List[MockChoice] = choices


class MockStreamingChunk:
    """Stand-in for a streaming chunk.

    Without content, `choices` is empty; otherwise it holds a single choice
    whose `delta.content` is the given content.
    """

    def __init__(self, content: Optional[str] = None):
        if content is None:
            self.choices = []
            return
        delta = MagicMock()
        delta.content = content
        choice = MagicMock()
        choice.delta = delta
        self.choices = [choice]


class MockOpenAIClient:
    """Mock for OpenAI client that can be used to test vLLM integration

    Calls to `chat.completions.create` are answered from the responses
    registered with `add_mock_responses`, keyed by the hash of the keyword
    arguments of the call.
    """

    def __init__(self):
        self._mock_responses: Dict[str, Any] = {}
        self.chat = MagicMock()
        self.chat.completions = MagicMock()
        self.chat.completions.create = MagicMock()

        # The method that will be called by the model when it makes a request
        def _create(**kwargs):
            # Hash the arguments to create a unique key
            request_key = hash_dict(kwargs)
            # Test for the key itself rather than the truthiness of the value
            # so that registered empty responses are not reported missing.
            if request_key not in self._mock_responses:
                raise ValueError(f"No response found for {kwargs}")
            response = self._mock_responses[request_key]
            if kwargs.get("stream", False):
                return self._create_streaming_response(response)
            return self._create_standard_response(response)

        self.chat.completions.create.side_effect = _create

    def add_mock_responses(self, mocks: list):
        """Register `(request kwargs, response)` pairs."""
        for kwargs, response in mocks:
            request_key = hash_dict(kwargs)
            self._mock_responses[request_key] = response

    def _create_standard_response(self, response):
        """Wrap a response (a string or a list of strings) into a completion response."""
        if isinstance(response, str):
            response = [response]
        choices = [MockChoice(content=chunk) for chunk in response]
        return MockCompletionResponse(choices=choices)

    def _create_streaming_response(self, response):
        """Return an iterator over one streaming chunk per item of `response`."""
        chunks = [MockStreamingChunk(content=chunk) for chunk in response]
        return iter(chunks)


class MockAsyncOpenAIClient:
    """Mock for AsyncOpenAI client that can be used to test AsyncVLLM integration

    Calls to `chat.completions.create` are answered from the responses
    registered with `add_mock_responses`, keyed by the hash of the keyword
    arguments of the call.
    """

    def __init__(self):
        self._mock_responses: Dict[str, Any] = {}
        self.chat = MagicMock()
        self.chat.completions = MagicMock()
        self.chat.completions.create = MagicMock()

        # The method that will be called by the model when it makes a request
        async def _async_create(**kwargs):
            # Hash the arguments to create a unique key
            request_key = hash_dict(kwargs)
            # Test for the key itself rather than the truthiness of the value
            # so that registered empty responses are not reported missing.
            if request_key not in self._mock_responses:
                raise ValueError(f"No response found for {kwargs}")
            response = self._mock_responses[request_key]
            if kwargs.get("stream", False):
                # The async generator is returned as is, to be iterated by the caller
                return self._create_async_streaming_response(response)
            return await self._create_async_standard_response(response)

        self.chat.completions.create.side_effect = _async_create

    def add_mock_responses(self, mocks: list):
        """Register `(request kwargs, response)` pairs."""
        for kwargs, response in mocks:
            request_key = hash_dict(kwargs)
            self._mock_responses[request_key] = response

    async def _create_async_standard_response(self, response):
        """Create an async standard (non-streaming) response"""
        if isinstance(response, str):
            response = [response]
        choices = [MockChoice(content=chunk) for chunk in response]
        return MockCompletionResponse(choices=choices)

    async def _create_async_streaming_response(self, response):
        """Create an async streaming response generator"""
        chunks = [MockStreamingChunk(content=chunk) for chunk in response]

        for chunk in chunks:
            yield chunk


================================================
FILE: tests/test_utils/mock_tgi_client.py
================================================
from typing import Any, Dict
from unittest.mock import MagicMock

from tests.test_utils.utils import hash_dict


class MockTGIInferenceClient:
    """Mock for TGI `InferenceClient` that can be used to test the TGI model

    Responses are registered with `add_mock_responses` and looked up through
    the hash of the keyword arguments given to `text_generation`.
    """

    def __init__(self):
        self.text_generation = MagicMock()

        # The method that will be called by the model when it makes a request
        def _create(**kwargs):
            # Hash the arguments to create a unique key
            request_key = hash_dict(kwargs)
            response = self._mock_responses.get(request_key)
            # Only an absent entry means "not registered"; an empty string or
            # an empty list is a legitimate mocked response.
            if response is None:
                raise ValueError(f"No response found for {kwargs}")
            if kwargs.get("stream", False):
                return iter(response)
            return response

        self.text_generation.side_effect = _create
        self._mock_responses: Dict[str, Any] = {}

    def add_mock_responses(self, mocks: list):
        """Register `(kwargs, response)` pairs, keyed by the hash of `kwargs`."""
        for kwargs, response in mocks:
            request_key = hash_dict(kwargs)
            self._mock_responses[request_key] = response


class MockAsyncTGIInferenceClient:
    """Async mock for the TGI inference client, used to test the async TGI model

    Responses are registered with `add_mock_responses` and looked up through
    the hash of the keyword arguments given to `text_generation`.
    """

    def __init__(self):
        self.text_generation = MagicMock()

        # The method that will be called by the model when it makes a request
        async def _async_create(**kwargs):
            # Hash the arguments to create a unique key
            request_key = hash_dict(kwargs)
            response = self._mock_responses.get(request_key)
            # Only an absent entry means "not registered"; an empty string or
            # an empty list is a legitimate mocked response.
            if response is None:
                raise ValueError(f"No response found for {kwargs}")
            if kwargs.get("stream", False):
                # Async generator: returned as-is, the caller iterates over it
                return self._create_async_streaming_response(response)
            return response

        self.text_generation.side_effect = _async_create
        self._mock_responses: Dict[str, Any] = {}

    def add_mock_responses(self, mocks: list):
        """Register `(kwargs, response)` pairs, keyed by the hash of `kwargs`."""
        for kwargs, response in mocks:
            request_key = hash_dict(kwargs)
            self._mock_responses[request_key] = response

    async def _create_async_streaming_response(self, response):
        """Create an async streaming response generator"""
        for chunk in response:
            yield chunk


================================================
FILE: tests/test_utils/utils.py
================================================
import hashlib
import pickle
import sys


def hash_dict(d) -> str:
    """Return a SHA-256 hex digest computed from the content of `d`.

    The input is first converted into a nested structure of tuples and
    primitives: dictionaries become tuples of `(key, value)` pairs sorted by
    key, lists and tuples become tuples, and any other object is replaced by
    its `str()` form. That structure is pickled and the bytes are hashed.

    Parameters
    ----------
    d
        The object to hash, typically the keyword arguments of a request.

    Returns
    -------
    str
        The hexadecimal SHA-256 digest (64 characters).

    """
    def canonical_str(text):
        # `sys.intern` only accepts exact `str` instances and raises a
        # `TypeError` for subclasses, so take the plain-`str` content first.
        # Interning makes equal strings share a single object, so pickle's
        # identity-based memo handles them the same way on every call.
        return sys.intern(str.__str__(text))

    def make_hashable(obj):
        if isinstance(obj, str):
            return canonical_str(obj)
        if isinstance(obj, (bool, int, float, type(None))):
            return obj
        if isinstance(obj, dict):
            # Sort the items so that insertion order does not affect the hash
            return tuple(sorted(
                (canonical_str(k) if isinstance(k, str) else k, make_hashable(v))
                for k, v in obj.items()
            ))
        if isinstance(obj, (list, tuple)):
            return tuple(make_hashable(e) for e in obj)
        # Fallback for any other object: rely on its string form
        return canonical_str(str(obj))

    hashable_obj = make_hashable(d)
    pickled_obj = pickle.dumps(hashable_obj, protocol=4)
    return hashlib.sha256(pickled_obj).hexdigest()


================================================
FILE: tests/types/test_custom_types.py
================================================
import re

import pytest
from pydantic import BaseModel

from outlines import types
from outlines.types.dsl import to_regex


@pytest.mark.parametrize(
    "custom_type,test_string,should_match",
    [
        (types.locale.us.phone_number, "12", False),
        (types.locale.us.phone_number, "(123) 123-1234", True),
        (types.locale.us.phone_number, "123-123-1234", True),
        (types.locale.us.zip_code, "12", False),
        (types.locale.us.zip_code, "12345", True),
        (types.locale.us.zip_code, "12345-1234", True),
        (types.isbn, "ISBN 0-1-2-3-4-5", False),
        (types.isbn, "ISBN 978-0-596-52068-7", True),
        (types.isbn, "ISBN-13: 978-0-596-52068-7", True),
        (types.isbn, "978 0 596 52068 7", True),
        (types.isbn, "9780596520687", True),
        (types.isbn, "ISBN-10: 0-596-52068-9", True),
        (types.isbn, "0-596-52068-9", True),
        (types.email, "eitan@gmail.com", True),
        (types.email, "99@yahoo.com", True),
        (types.email, "eitan@.gmail.com", False),
        (types.email, "myemail", False),
        (types.email, "eitan@gmail", False),
        (types.email, "eitan@my.custom.domain", True),
        (types.integer, "-19", True),
        (types.integer, "19", True),
        (types.integer, "019", False),
        (types.integer, "1.9", False),
        (types.integer, "a", False),
        (types.boolean, "True", True),
        (types.boolean, "False", True),
        (types.boolean, "true", False),
        (types.number, "10", True),
        (types.number, "10.9", True),
        (types.number, "10.9e+3", True),
        (types.number, "10.9e-3", True),
        (types.number, "a", False),
        (types.date, "2022-03-23", True),
        (types.date, "2022-03-32", False),
        (types.date, "2022-13-23", False),
        (types.date, "32-03-2022", False),
        (types.time, "01:23:59", True),
        (types.time, "01:23:61", False),
        (types.time, "01:61:59", False),
        (types.time, "24:23:59", False),
        (types.sentence, "The temperature is 23.5 degrees !", True),
        (types.sentence, "Did you earn $1,234.56 last month  ?", True),
        (types.sentence, "The #1 player scored 100 points .", True),
        (types.sentence, "Hello @world, this is a test!", True),
        (types.sentence, "invalid sentence.", False),
        (types.sentence, "Invalid sentence", False),
        (types.paragraph, "This is a paragraph!\n", True),
        (types.paragraph, "Line1\nLine2", False),
        (types.paragraph, "One sentence. Two sentences.\n\n", True),
        (types.paragraph, "One sentence. invalid sentence.", False),
        (types.paragraph, "One sentence. Invalid sentence\n", False),
        (types.hex_str, "0x123", True),
        (types.hex_str, "0xABC", True),
        (types.hex_str, "0xabc", True),
        (types.hex_str, "0x123ABC", True),
        (types.hex_str, "123", True),
        (types.hex_str, "ABC", True),
        (types.hex_str, "abc", True),
        (types.hex_str, "123ABC", True),
        (types.hex_str, "0xg123", False),
        (types.hex_str, "0x", False),
        (types.hex_str, "0x123G", False),
        (types.uuid4, "123e4567-e89b-42d3-a456-426614174000", True),
        (types.uuid4, "00000000-0000-4000-8000-000000000000", True),
        (types.uuid4, "123e4567-e89b-12d3-a456-426614174000", False),
        (types.uuid4, "123e4567-e89b-12d3-a456-42661417400", False),
        (types.uuid4, "123e4567-e89b-12d3-a456-4266141740000", False),
        (types.uuid4, "123e4567-e89b-12d3-x456-426614174000", False),
        (types.uuid4, "123e4567-e89b-12d3-a456-42661417400g", False),
        (types.ipv4, "192.168.1.1", True),
        (types.ipv4, "10.0.0.1", True),
        (types.ipv4, "172.16.0.1", True),
        (types.ipv4, "255.255.255.255", True),
        (types.ipv4, "0.0.0.0", True),
        (types.ipv4, "256.1.2.3", False),
        (types.ipv4, "1.256.2.3", False),
        (types.ipv4, "1.2.256.3", False),
        (types.ipv4, "1.2.3.256", False),
        (types.ipv4, "1.2.3", False),
        (types.ipv4, "1.2.3.4.5", False),
        (types.ipv4, "1.2.3.4.", False),
        (types.ipv4, ".1.2.3.4", False),
        (types.ipv4, "1..2.3.4", False),
    ],
)
def test_type_regex(custom_type, test_string, should_match):
    """Check a custom type's pattern via the Pydantic schema and via `to_regex`."""
    class Model(BaseModel):
        attr: custom_type

    attr_schema = Model.model_json_schema()["properties"]["attr"]
    assert attr_schema["type"] == "string"

    # pattern exposed through the Pydantic JSON schema
    schema_match = re.fullmatch(attr_schema["pattern"], test_string)
    assert (schema_match is not None) is should_match

    # pattern produced directly by the DSL
    dsl_match = re.fullmatch(to_regex(custom_type), test_string)
    assert (dsl_match is not None) is should_match


@pytest.mark.parametrize(
    "custom_type,test_string,should_match",
    [
        (types.airports.IATA, "CDG", True),
        (types.airports.IATA, "XXX", False),
        (types.countries.Alpha2, "FR", True),
        (types.countries.Alpha2, "XX", False),
        (types.countries.Alpha3, "UKR", True),
        (types.countries.Alpha3, "XXX", False),
        (types.countries.Numeric, "004", True),
        (types.countries.Numeric, "900", False),
        (types.countries.Name, "Ukraine", True),
        (types.countries.Name, "Wonderland", False),
        (types.countries.Flag, "🇿🇼", True),
        (types.countries.Flag, "🤗", False),
    ],
)
def test_type_enum(custom_type, test_string, should_match):
    """Check enum-based types via the Pydantic schema and via `__members__`."""
    class Model(BaseModel):
        attr: custom_type

    # values listed in the JSON schema definition of the enum
    enum_values = Model.model_json_schema()["$defs"][custom_type.__name__]["enum"]
    assert isinstance(enum_values, list)
    assert (test_string in enum_values) is should_match

    # member names of the enum itself
    assert (test_string in custom_type.__members__) is should_match


================================================
FILE: tests/types/test_dsl.py
================================================
import datetime
import json
import re as _re
import sys
import tempfile
from dataclasses import dataclass
from enum import Enum
from typing import (
    Literal,
    Tuple,
    Union,
    get_args,
    Optional as PyOptional
)

import jsonschema
import pytest
from genson import SchemaBuilder
from pydantic import BaseModel

from outlines import grammars, types
from outlines.types.dsl import (
    Alternatives,
    JsonSchema,
    KleenePlus,
    KleeneStar,
    Optional,
    QuantifyBetween,
    QuantifyExact,
    QuantifyMaximum,
    QuantifyMinimum,
    Choice,
    Regex,
    Sequence,
    String,
    Term,
    either,
    CFG,
    _handle_dict,
    _handle_list,
    _handle_literal,
    _handle_tuple,
    _handle_union,
    _ensure_json_quoted,
    json_schema,
    one_or_more,
    zero_or_more,
    optional,
    between,
    at_most,
    at_least,
    exactly,
    regex,
    python_types_to_terms,
    to_regex,
)
from outlines.types.utils import (
    is_pydantic_model,
    is_typed_dict,
    is_dataclass,
)

if sys.version_info >= (3, 12):
    from typing import TypedDict
else:
    from typing_extensions import TypedDict


def test_dsl_init():
    """Check attributes, `repr` and ASCII-tree output of each DSL term class."""
    # rendering of the single `String('test')` child shared by the wrappers
    child = "    └── String('test')\n"

    literal = String("test")
    assert literal.value == "test"
    assert repr(literal) == "String(value='test')"
    assert literal.display_ascii_tree() == "└── String('test')\n"

    choice_term = Choice(["a", "b"])
    assert choice_term.items == ["a", "b"]
    assert repr(choice_term) == "Choice(items=['a', 'b'])"
    assert choice_term.display_ascii_tree() == "└── Choice(['a', 'b'])\n"

    digit = Regex("[0-9]")
    assert digit.pattern == "[0-9]"
    assert repr(digit) == "Regex(pattern='[0-9]')"
    assert digit.display_ascii_tree() == "└── Regex('[0-9]')\n"

    schema_term = JsonSchema('{ "type": "string" }')
    assert schema_term.schema == '{ "type": "string" }'
    assert repr(schema_term) == 'JsonSchema(schema=\'{ "type": "string" }\')'
    assert schema_term.display_ascii_tree() == "└── JsonSchema('{ \"type\": \"string\" }')\n"

    star = KleeneStar(literal)
    assert star.term == literal
    assert repr(star) == "KleeneStar(term=String(value='test'))"
    assert star.display_ascii_tree() == "└── KleeneStar(*)\n" + child

    plus = KleenePlus(literal)
    assert plus.term == literal
    assert repr(plus) == "KleenePlus(term=String(value='test'))"
    assert plus.display_ascii_tree() == "└── KleenePlus(+)\n" + child

    maybe = Optional(literal)
    assert maybe.term == literal
    assert repr(maybe) == "Optional(term=String(value='test'))"
    assert maybe.display_ascii_tree() == "└── Optional(?)\n" + child

    # rendering of the two children shared by the n-ary terms
    pair = "    ├── String('test')\n    └── Regex('[0-9]')\n"

    alt = Alternatives([literal, digit])
    assert alt.terms[0] == literal
    assert alt.terms[1] == digit
    assert (
        repr(alt)
        == "Alternatives(terms=[String(value='test'), Regex(pattern='[0-9]')])"
    )
    assert alt.display_ascii_tree() == "└── Alternatives(|)\n" + pair

    seq = Sequence([literal, digit])
    assert seq.terms[0] == literal
    assert seq.terms[1] == digit
    assert (
        repr(seq)
        == "Sequence(terms=[String(value='test'), Regex(pattern='[0-9]')])"
    )
    assert seq.display_ascii_tree() == "└── Sequence\n" + pair

    exact = QuantifyExact(literal, 3)
    assert exact.term == literal
    assert exact.count == 3
    assert repr(exact) == "QuantifyExact(term=String(value='test'), count=3)"
    assert exact.display_ascii_tree() == "└── Quantify({3})\n" + child

    at_least_three = QuantifyMinimum(literal, 3)
    assert at_least_three.term == literal
    assert at_least_three.min_count == 3
    assert repr(at_least_three) == "QuantifyMinimum(term=String(value='test'), min_count=3)"
    assert at_least_three.display_ascii_tree() == "└── Quantify({3,})\n" + child

    at_most_three = QuantifyMaximum(literal, 3)
    assert at_most_three.term == literal
    assert at_most_three.max_count == 3
    assert repr(at_most_three) == "QuantifyMaximum(term=String(value='test'), max_count=3)"
    assert at_most_three.display_ascii_tree() == "└── Quantify({,3})\n" + child

    ranged = QuantifyBetween(literal, 1, 3)
    assert ranged.term == literal
    assert ranged.min_count == 1
    assert ranged.max_count == 3
    assert (
        repr(ranged)
        == "QuantifyBetween(term=String(value='test'), min_count=1, max_count=3)"
    )
    assert ranged.display_ascii_tree() == "└── Quantify({1,3})\n" + child

    # inverted bounds are rejected
    with pytest.raises(
        ValueError, match="`max_count` must be greater than `min_count`"
    ):
        QuantifyBetween(literal, 3, 1)


def test_dsl_term_methods():
    """Check the operators, Pydantic hooks, matching and display helpers of `Term`."""
    lit = String("a")
    digit = Regex("[0-9]")
    raw = "c"

    # concatenation
    assert lit + digit == Sequence([lit, digit])
    assert lit + raw == Sequence([lit, String(raw)])
    assert lit.__radd__(digit) == Sequence([digit, lit])
    assert lit.__radd__(raw) == Sequence([String(raw), lit])

    # alternation
    assert lit | digit == Alternatives([lit, digit])
    assert lit | raw == Alternatives([lit, String(raw)])
    assert lit.__ror__(digit) == Alternatives([digit, lit])
    assert lit.__ror__(raw) == Alternatives([String(raw), lit])

    # Pydantic validation
    validate = lit.__get_validator__(lit.__get_pydantic_core_schema__("", ""))
    assert validate("a") == "a"
    with pytest.raises(
        ValueError,
        match="Input should be in the language of the regular expression",
    ):
        validate("b")

    assert lit.__get_pydantic_json_schema__("", "") == {"type": "string", "pattern": "a"}

    # matching
    assert lit.matches("a")
    assert not lit.matches("b")

    # display
    assert lit.display_ascii_tree() == "└── String('a')\n"

    with pytest.raises(NotImplementedError):
        Term()._display_node()

    assert lit._display_children("") == ""

    assert lit.__str__() == "└── String('a')\n"


def test_dsl_sequence():
    """`+` builds a `Sequence`, wrapping plain strings on either side in `String`."""
    left = String("a")
    right = String("b")

    # term + term
    joined = left + right
    assert isinstance(joined, Sequence)
    assert joined.terms[0] == left
    assert joined.terms[1] == right

    # str + term
    joined = "a" + right
    assert isinstance(joined, Sequence)
    assert isinstance(joined.terms[0], String)
    assert joined.terms[0].value == "a"
    assert joined.terms[1].value == "b"

    # term + str
    joined = left + "b"
    assert isinstance(joined, Sequence)
    assert isinstance(joined.terms[1], String)
    assert joined.terms[0].value == "a"
    assert joined.terms[1].value == "b"


def test_dsl_alternatives():
    """`either` accepts terms, plain strings or a mix, and wraps strings in `String`."""
    a = String("a")
    b = String("b")

    for first, second in ((a, b), ("a", "b"), ("a", b)):
        alt = either(first, second)
        assert isinstance(alt, Alternatives)
        assert isinstance(alt.terms[0], String)
        assert isinstance(alt.terms[1], String)


def test_dsl_optional():
    """Both the `optional` method and the `optional` function build an `Optional`."""
    literal = String("a")

    from_method = literal.optional()
    assert isinstance(from_method, Optional)

    from_function = optional("a")
    assert isinstance(from_function, Optional)
    assert isinstance(from_function.term, String)

    from_method_again = literal.optional()
    assert isinstance(from_method_again, Optional)


def test_dsl_exactly():
    """Both the `exactly` method and the `exactly` function build a `QuantifyExact`."""
    literal = String("a")

    from_method = literal.exactly(2)
    assert isinstance(from_method, QuantifyExact)
    assert from_method.count == 2

    from_function = exactly(2, "a")
    assert isinstance(from_function, QuantifyExact)
    assert isinstance(from_function.term, String)

    from_method_again = literal.exactly(2)
    assert isinstance(from_method_again, QuantifyExact)


def test_dsl_at_least():
    """Both the `at_least` method and function build a `QuantifyMinimum`."""
    literal = String("a")

    from_method = literal.at_least(2)
    assert isinstance(from_method, QuantifyMinimum)
    assert from_method.min_count == 2

    from_function = at_least(2, "a")
    assert isinstance(from_function, QuantifyMinimum)
    assert isinstance(from_function.term, String)

    from_method_again = literal.at_least(2)
    assert isinstance(from_method_again, QuantifyMinimum)


def test_dsl_at_most():
    """Both the `at_most` method and function build a `QuantifyMaximum`."""
    literal = String("a")

    from_method = literal.at_most(2)
    assert isinstance(from_method, QuantifyMaximum)
    assert from_method.max_count == 2

    from_function = at_most(2, "a")
    assert isinstance(from_function, QuantifyMaximum)
    assert isinstance(from_function.term, String)

    from_method_again = literal.at_most(2)
    assert isinstance(from_method_again, QuantifyMaximum)


def test_between():
    """Both the `between` method and function build a `QuantifyBetween`."""
    literal = String("a")

    from_method = literal.between(1, 2)
    assert isinstance(from_method, QuantifyBetween)
    assert from_method.min_count == 1
    assert from_method.max_count == 2

    from_function = between(1, 2, "a")
    assert isinstance(from_function, QuantifyBetween)
    assert isinstance(from_function.term, String)

    from_method_again = literal.between(1, 2)
    assert isinstance(from_method_again, QuantifyBetween)


def test_dsl_zero_or_more():
    """Both the `zero_or_more` method and function build a `KleeneStar`."""
    literal = String("a")

    from_method = literal.zero_or_more()
    assert isinstance(from_method, KleeneStar)

    from_function = zero_or_more("a")
    assert isinstance(from_function, KleeneStar)
    assert isinstance(from_function.term, String)

    from_method_again = literal.zero_or_more()
    assert isinstance(from_method_again, KleeneStar)


def test_dsl_one_or_more():
    """Both the `one_or_more` method and function build a `KleenePlus`."""
    a = String("a")

    # method form
    rep = a.one_or_more()
    assert isinstance(rep, KleenePlus)

    # function form, with a plain string wrapped in `String`
    rep = one_or_more("a")
    assert isinstance(rep, KleenePlus)
    assert isinstance(rep.term, String)

    # This last check used to call `zero_or_more` and expect a `KleeneStar`
    # (copy-paste from the zero-or-more test); it now exercises the
    # quantifier this test is about.
    rep = a.one_or_more()
    assert isinstance(rep, KleenePlus)


def test_dsl_aliases():
    """The lowercase `regex` and `json_schema` helpers return the matching term classes."""
    assert isinstance(regex("[0-9]"), Regex)
    assert isinstance(json_schema('{"type": "string"}'), JsonSchema)


def test_dsl_term_pydantic_simple():
    """A single term used as a field annotation yields a string schema with its pattern."""
    literal = String("a")

    class Model(BaseModel):
        field: literal

    expected = {
        "properties": {"field": {"pattern": "a", "title": "Field", "type": "string"}},
        "required": ["field"],
        "title": "Model",
        "type": "object",
    }
    assert Model.model_json_schema() == expected


def test_dsl_term_pydantic_combination():
    """A combined term used as a field annotation yields the combined pattern."""
    first = String("a")
    second = String("b")
    third = String("c")

    class Model(BaseModel):
        field: either((first + second), third)

    expected = {
        "properties": {
            "field": {"pattern": "(ab|c)", "title": "Field", "type": "string"}
        },
        "required": ["field"],
        "title": "Model",
        "type": "object",
    }
    assert Model.model_json_schema() == expected


def test_dsl_display():
    """`str()` of a nested term renders the whole tree with box-drawing characters."""
    letters = Alternatives([String("a"), String("b")])
    nested = Sequence([KleeneStar(letters), Regex("[0-9]")])

    expected_tree = (
        "└── Sequence\n"
        "    ├── KleeneStar(*)\n"
        "    │   └── Alternatives(|)\n"
        "    │       ├── String('a')\n"
        "    │       └── String('b')\n"
        "    └── Regex('[0-9]')\n"
    )
    assert str(nested) == expected_tree


def test_cfg():
    """Check construction, display, `repr` and equality of a `CFG` term."""
    grammar_text = """
?start: expr
?expr: NUMBER
"""
    grammar = types.cfg(grammar_text)
    assert isinstance(grammar, CFG)

    # the definition is stored as given, surrounding newlines included
    assert grammar.definition.strip() == "?start: expr\n?expr: NUMBER"
    assert grammar._display_node() == "CFG('\n?start: expr\n?expr: NUMBER\n')"
    assert grammar.__repr__() == "CFG(definition='\n?start: expr\n?expr: NUMBER\n')"

    # equality
    assert grammar == types.cfg(grammar_text)
    assert not grammar == "a"


def test_json_schema():
    """Exercise `JsonSchema` construction, detection, conversion and display helpers."""
    # objects reused by the checks below
    reference = types.json_schema('{"type": "object", "properties": {"foo": {"type": "string"}, "bar": {"type": "integer"}}, "required": ["foo"]}')
    genson_builder = SchemaBuilder()
    genson_builder.add_schema({"type": "object", "properties": {"foo": {"type": "string"}, "bar": {"type": "integer"}}, "required": ["foo"]})

    class MyPydanticModel(BaseModel):
        foo: str
        bar: PyOptional[int] = None

    class MyTypedDict(TypedDict):
        foo: str
        bar: int

    @dataclass
    class MyDataClass:
        foo: str
        bar: PyOptional[int] = None

    # init dict
    built = types.json_schema({"type": "string"})
    assert built.schema == '{"type": "string"}'

    # init str
    built = types.json_schema('{"type": "string"}')
    assert built.schema == '{"type": "string"}'

    # init Pydantic model
    built = types.json_schema(MyPydanticModel)
    assert built.schema == '{"properties": {"foo": {"title": "Foo", "type": "string"}, "bar": {"anyOf": [{"type": "integer"}, {"type": "null"}], "default": null, "title": "Bar"}}, "required": ["foo"], "title": "MyPydanticModel", "type": "object"}'

    # init TypedDict
    built = types.json_schema(MyTypedDict)
    assert built.schema == '{"properties": {"foo": {"title": "Foo", "type": "string"}, "bar": {"title": "Bar", "type": "integer"}}, "required": ["foo", "bar"], "title": "MyTypedDict", "type": "object"}'

    # init dataclass
    built = types.json_schema(MyDataClass)
    assert built.schema == '{"properties": {"foo": {"title": "Foo", "type": "string"}, "bar": {"anyOf": [{"type": "integer"}, {"type": "null"}], "default": null, "title": "Bar"}}, "required": ["foo"], "title": "MyDataClass", "type": "object"}'

    # init SchemaBuilder
    built = types.json_schema(genson_builder)
    assert built.schema == '{"$schema": "http://json-schema.org/schema#", "type": "object", "properties": {"foo": {"type": "string"}, "bar": {"type": "integer"}}, "required": ["foo"]}'

    # init unsupported type
    with pytest.raises(ValueError, match="Cannot parse schema"):
        types.json_schema(1)

    # init invalid JSON schema
    with pytest.raises(jsonschema.exceptions.SchemaError):
        types.json_schema({"type": "strin"})

    # is_json_schema: rejected inputs
    assert not JsonSchema.is_json_schema(None)
    assert not JsonSchema.is_json_schema('{"type": "string"}')
    assert not JsonSchema.is_json_schema({"type": "string"})
    # is_json_schema: accepted inputs
    assert JsonSchema.is_json_schema(reference)
    assert JsonSchema.is_json_schema(genson_builder)
    assert JsonSchema.is_json_schema(MyPydanticModel)
    assert JsonSchema.is_json_schema(MyTypedDict)
    assert JsonSchema.is_json_schema(MyDataClass)

    # convert_to
    assert JsonSchema.convert_to(reference, ["str"]) == reference.schema
    assert JsonSchema.convert_to(reference, ["dict"]) == json.loads(reference.schema)
    assert JsonSchema.convert_to(MyPydanticModel, ["pydantic"]) == MyPydanticModel
    assert JsonSchema.convert_to(MyTypedDict, ["typeddict"]) == MyTypedDict
    assert JsonSchema.convert_to(MyDataClass, ["dataclass"]) == MyDataClass
    assert JsonSchema.convert_to(genson_builder, ["genson"]) == genson_builder
    assert JsonSchema.convert_to(MyPydanticModel, ["str"]) == JsonSchema(MyPydanticModel).schema
    assert JsonSchema.convert_to(MyPydanticModel, ["dict"]) == json.loads(JsonSchema(MyPydanticModel).schema)
    assert is_pydantic_model(JsonSchema.convert_to(reference, ["pydantic"]))
    assert is_typed_dict(JsonSchema.convert_to(reference, ["typeddict"]))
    assert is_dataclass(JsonSchema.convert_to(reference, ["dataclass"]))
    with pytest.raises(ValueError, match="Cannot convert schema type"):
        JsonSchema.convert_to(reference, ["genson"])

    # other methods
    string_schema = types.json_schema('{"type": "string"}')
    assert string_schema._display_node() == "JsonSchema('{\"type\": \"string\"}')"
    assert string_schema.__repr__() == "JsonSchema(schema='{\"type\": \"string\"}')"
    assert string_schema == types.json_schema('{"type": "string"}')
    assert not string_schema == "a"


def test_dsl_cfg_from_file():
    """`CFG.from_file` builds the same term as `CFG` given the file's content."""
    import os

    grammar_content = """
    ?start: expression
    ?expression: term (("+" | "-") term)*
    ?term: factor (("*" | "/") factor)*
    ?factor: NUMBER
    """
    # Write the grammar into a temporary directory and close the file before
    # reading it back: opening a still-open `NamedTemporaryFile` a second time
    # by name is not possible on every platform (it fails on Windows).
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_file_path = os.path.join(temp_dir, "grammar.txt")
        with open(temp_file_path, mode='w') as temp_file:
            temp_file.write(grammar_content)
        cfg = CFG.from_file(temp_file_path)
        assert cfg == CFG(grammar_content)


def test_dsl_json_schema_from_file():
    """`JsonSchema.from_file` builds the same term as `JsonSchema` given the file's content."""
    import os

    schema_content = """
    {
        "type": "object",
        "properties": {
            "name": {
                "type": "string"
            }
        }
    }
    """
    # Write the schema into a temporary directory and close the file before
    # reading it back: opening a still-open `NamedTemporaryFile` a second time
    # by name is not possible on every platform (it fails on Windows).
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_file_path = os.path.join(temp_dir, "schema.json")
        with open(temp_file_path, mode='w') as temp_file:
            temp_file.write(schema_content)
        schema = JsonSchema.from_file(temp_file_path)
        assert schema == JsonSchema(schema_content)


def test_dsl_python_types_to_terms():
    """Check how `python_types_to_terms` maps Python types and values to DSL terms."""
    # a recursion depth of 11 is rejected
    with pytest.raises(RecursionError):
        python_types_to_terms(None, 11)

    # a term is returned unchanged
    bare_term = Term()
    assert python_types_to_terms(bare_term) == bare_term

    # builtin and datetime types
    assert python_types_to_terms(int) == types.integer
    assert python_types_to_terms(float) == types.number
    assert python_types_to_terms(bool) == types.boolean
    assert python_types_to_terms(str) == types.string
    assert python_types_to_terms(datetime.time) == types.time
    assert python_types_to_terms(datetime.date) == types.date
    assert python_types_to_terms(datetime.datetime) == types.datetime
    assert python_types_to_terms(dict) == types.CFG(grammars.json)

    # instances of basic types
    assert python_types_to_terms("a") == String("a")
    assert python_types_to_terms(1) == Regex(r"1")
    assert python_types_to_terms(1.0) == Regex(r"1.0")

    # structured types all map to the JSON schema of their two fields
    @dataclass
    class DataClass:
        a: int
        b: str

    class SomeTypedDict(TypedDict):
        a: int
        b: str

    class PydanticModel(BaseModel):
        a: int
        b: str

    for structured_type in (DataClass, SomeTypedDict, PydanticModel):
        assert python_types_to_terms(structured_type) == JsonSchema(
            {
                "properties": {"a": {"title": "A", "type": "integer"}, "b": {"title": "B", "type": "string"}},
                "required": ["a", "b"],
                "title": structured_type.__name__,
                "type": "object",
            }
        )

    # genson schema builder
    genson_builder = SchemaBuilder()
    genson_builder.add_schema({"type": "object", "properties": {}})
    genson_builder.add_object({"hi": "there"})
    genson_builder.add_object({"hi": 5})
    assert python_types_to_terms(genson_builder) == JsonSchema(
        {
            "$schema": "http://json-schema.org/schema#",
            "type": "object",
            "properties": {"hi": {"type": ["integer", "string"]}},
            "required": ["hi"]
        }
    )

    # annotated function
    def func(a: int, b: str):
        return (a, b)

    assert python_types_to_terms(func) == JsonSchema(
        {
            "type": "object",
            "properties": {
                "a": {"title": "A", "type": "integer"},
                "b": {"title": "B", "type": "string"},
            },
            "required": ["a", "b"],
            "title": "func",
        }
    )

    # enum mixing a string, a type and a function
    class SomeEnum(Enum):
        a = "a"
        b = int
        c = func

    enum_term = python_types_to_terms(SomeEnum)
    assert isinstance(enum_term, Alternatives)
    assert len(enum_term.terms) == 3
    string_alt, integer_alt, function_alt = enum_term.terms
    assert string_alt == String("a")
    assert integer_alt == types.integer
    assert isinstance(function_alt, JsonSchema)
    assert json.loads(function_alt.schema) == {
        "properties": {
            "a": {"title": "A", "type": "integer"},
            "b": {"title": "B", "type": "string"},
        },
        "required": ["a", "b"],
        "title": "func",
        "type": "object",
    }

    # for generic types we only test the dispatch as the functions that
    # convert to terms are tested in distinct tests below
    assert python_types_to_terms(Literal["a", "b"]) == _handle_literal(("a", "b"))
    assert python_types_to_terms(Union[int, str]) == _handle_union((int, str), recursion_depth=0)
    assert python_types_to_terms(list[int]) == _handle_list((int,), recursion_depth=0)
    assert python_types_to_terms(tuple[int, str]) == _handle_tuple((int, str), recursion_depth=0)
    assert python_types_to_terms(dict[int, str]) == _handle_dict((int, str), recursion_depth=0)

    # type not supported
    with pytest.raises(TypeError, match="is currently not supported"):
        python_types_to_terms(bytes)


def test_dsl_handle_literal():
    """`_handle_literal` maps the literal's values to an `Alternatives` of terms."""
    alternatives = _handle_literal(get_args(Literal["a", 1]))
    assert isinstance(alternatives, Alternatives)
    assert len(alternatives.terms) == 2
    string_alt, integer_alt = alternatives.terms
    assert string_alt == String("a")
    assert integer_alt == Regex(r"1")


def test_dsl_handle_union():
    """`_handle_union` maps every member of a union to one alternative."""
    # test simple Union
    union_term = _handle_union(get_args(Union[int, str]), recursion_depth=0)
    assert isinstance(union_term, Alternatives)
    assert len(union_term.terms) == 2
    assert union_term.terms[0] == types.integer
    assert union_term.terms[1] == types.string

    # test with Optional[T]
    union_term = _handle_union(get_args(PyOptional[int]), recursion_depth=0)
    assert isinstance(union_term, Alternatives)
    assert len(union_term.terms) == 2
    assert union_term.terms[0] == types.integer
    assert union_term.terms[1] == String("None")

    # test with more complex types
    class TestModel(BaseModel):
        field: str

    class TestEnum(Enum):
        a = "a"
        b = "b"

    union_term = _handle_union(get_args(Union[TestModel, TestEnum]), recursion_depth=0)
    assert isinstance(union_term, Alternatives)
    assert len(union_term.terms) == 2
    assert isinstance(union_term.terms[0], JsonSchema)
    enum_term = union_term.terms[1]
    assert isinstance(enum_term, Alternatives)
    assert len(enum_term.terms) == 2
    assert enum_term.terms[0] == String("a")
    assert enum_term.terms[1] == String("b")


def test_dsl_handle_list():
    """`_handle_list` rejects malformed args and builds a bracketed `Sequence`."""
    # missing, empty and too many type arguments
    for bad_args in (None, (), (int, str)):
        with pytest.raises(TypeError):
            _handle_list(bad_args, recursion_depth=0)

    # simple type
    list_term = _handle_list(get_args(list[int]), recursion_depth=0)
    assert isinstance(list_term, Sequence)
    assert len(list_term.terms) == 4
    assert list_term.terms[0] == String("[")
    assert list_term.terms[1] == types.integer
    assert isinstance(list_term.terms[2], KleeneStar)
    assert list_term.terms[2].term == Sequence([String(", "), types.integer])
    assert list_term.terms[3] == String("]")

    # more complex type
    list_term = _handle_list(get_args(list[Union[int, str]]), recursion_depth=0)
    assert isinstance(list_term, Sequence)
    assert len(list_term.terms) == 4
    assert list_term.terms[0] == String("[")
    assert list_term.terms[1] == _handle_union(get_args(Union[int, str]), recursion_depth=0)
    assert isinstance(list_term.terms[2], KleeneStar)
    assert list_term.terms[2].term == Sequence([String(", "), _handle_union(get_args(Union[int, str]), recursion_depth=0)])
    assert list_term.terms[3] == String("]")


def test_dsl_handle_tuple():
    """`_handle_tuple` handles empty, variadic and fixed-length tuple annotations."""
    # Tuple[()] yields the literal "()"
    empty = _handle_tuple(get_args(Tuple[()]), recursion_depth=0)
    assert isinstance(empty, String)
    assert empty.value == "()"

    # tuple[int, ...] yields "(" item (", " item)* ")"
    variadic = _handle_tuple(get_args(tuple[int, ...]), recursion_depth=0)
    assert isinstance(variadic, Sequence)
    assert len(variadic.terms) == 4
    opening, first_item, repeated, closing = variadic.terms
    assert opening == String("(")
    assert first_item == types.integer
    assert isinstance(repeated, KleeneStar)
    assert repeated.term == Sequence([String(", "), types.integer])
    assert closing == String(")")

    # tuple[int, str] yields one term per position, separated by ", "
    pair = _handle_tuple(get_args(tuple[int, str]), recursion_depth=0)
    assert isinstance(pair, Sequence)
    assert len(pair.terms) == 5
    opening, left, separator, right, closing = pair.terms
    assert opening == String("(")
    assert left == types.integer
    assert separator == String(", ")
    assert right == types.string
    assert closing == String(")")

    # a union in second position is delegated to `_handle_union`
    mixed = _handle_tuple(get_args(tuple[int, Union[str, int]]), recursion_depth=0)
    assert isinstance(mixed, Sequence)
    assert len(mixed.terms) == 5
    opening, left, separator, right, closing = mixed.terms
    assert opening == String("(")
    assert left == types.integer
    assert separator == String(", ")
    assert right == _handle_union(get_args(Union[str, int]), recursion_depth=0)
    assert closing == String(")")


def test_dsl_handle_dict():
    """`_handle_dict` rejects three type args and builds `{` (k`:`v (`, ` k`:`v)*)? `}`."""
    # three type arguments instead of two
    with pytest.raises(TypeError):
        bad_alias = dict[int, str, int]
        _handle_dict(get_args(bad_alias), recursion_depth=0)

    # well-formed key/value annotation
    term = _handle_dict(get_args(dict[int, str]), recursion_depth=0)
    assert isinstance(term, Sequence)
    assert len(term.terms) == 3
    opening, body, closing = term.terms
    assert opening == String("{")

    # the whole key/value listing is optional so that "{}" is allowed
    assert isinstance(body, Optional)
    assert isinstance(body.term, Sequence)
    entries = body.term.terms
    assert len(entries) == 4
    assert entries[0] == types.integer
    assert entries[1] == String(":")
    assert entries[2] == types.string
    following_pairs = Sequence(
        [String(", "), types.integer, String(":"), types.string]
    )
    assert entries[3] == KleeneStar(following_pairs)

    assert closing == String("}")


def test_ensure_json_quoted_string():
    """A bare String term comes back as a String wrapped in double quotes."""
    quoted = _ensure_json_quoted(String("hello"))
    assert isinstance(quoted, String)
    assert quoted == String('"hello"')


def test_ensure_json_quoted_alternatives():
    """Quoting an Alternatives term quotes each of its String branches."""
    quoted = _ensure_json_quoted(Alternatives([String("a"), String("b")]))
    assert isinstance(quoted, Alternatives)
    assert len(quoted.terms) == 2
    for alternative in quoted.terms:
        assert isinstance(alternative, String)
        text = alternative.value
        assert text.startswith('"') and text.endswith('"')


def test_ensure_json_quoted_passthrough():
    """Terms that are neither String nor Alternatives come back as the same object."""
    untouched_terms = (
        types.integer,
        Sequence([String("a"), String("b")]),
    )
    for original in untouched_terms:
        assert _ensure_json_quoted(original) is original


def test_list_of_literals_quoted():
    """String literals used as list items carry JSON double quotes."""
    term = _handle_list(get_args(list[Literal["cat", "dog"]]), recursion_depth=0)
    assert isinstance(term, Sequence)
    assert term.terms[0] == String("[")
    choices = term.terms[1]
    assert isinstance(choices, Alternatives)
    for choice in choices.terms:
        assert isinstance(choice, String)
        text = choice.value
        assert text.startswith('"') and text.endswith('"')


def test_tuple_of_literals_quoted():
    """The first element of a fixed tuple of literals starts with a double quote."""
    term = _handle_tuple(
        get_args(Tuple[Literal["x"], Literal["y"]]), recursion_depth=0
    )
    assert isinstance(term, Sequence)
    assert term.terms[0] == String("(")
    leading = term.terms[1]
    assert isinstance(leading, Alternatives)
    only_branch = leading.terms[0]
    assert isinstance(only_branch, String)
    assert only_branch.value.startswith('"')


def test_dict_literal_key_quoted():
    """String literals used as dict keys carry JSON double quotes."""
    term = _handle_dict(
        get_args(dict[Literal["k1", "k2"], int]), recursion_depth=0
    )
    assert isinstance(term, Sequence)
    body = term.terms[1]
    assert isinstance(body, Optional)
    # the key term is the first entry of the optional key/value listing
    keys = body.term.terms[0]
    assert isinstance(keys, Alternatives)
    for key in keys.terms:
        assert isinstance(key, String)
        text = key.value
        assert text.startswith('"') and text.endswith('"')


def test_list_of_int_unchanged():
    """An int item type stays `types.integer`; no quoting is applied."""
    term = _handle_list(get_args(list[int]), recursion_depth=0)
    item = term.terms[1]
    assert item == types.integer


def test_ensure_json_quoted_sequence_passthrough():
    """A Sequence term is handed back as the very same object."""
    original = Sequence([String("a"), String("b")])
    returned = _ensure_json_quoted(original)
    assert returned is original


def test_ensure_json_quoted_regex_passthrough():
    """The predefined string, integer and boolean terms are returned as-is."""
    for predefined in (types.string, types.integer, types.boolean):
        assert _ensure_json_quoted(predefined) is predefined


def test_list_single_literal():
    """A one-value Literal item is still an Alternatives holding a quoted String."""
    term = _handle_list(get_args(list[Literal["only"]]), recursion_depth=0)
    choices = term.terms[1]
    assert isinstance(choices, Alternatives)
    sole_choice = choices.terms[0]
    assert isinstance(sole_choice, String)
    assert sole_choice == String('"only"')


def test_dict_literal_value_quoted():
    """String literals used as dict values carry JSON double quotes."""
    term = _handle_dict(
        get_args(dict[str, Literal["yes", "no"]]), recursion_depth=0
    )
    body = term.terms[1]
    assert isinstance(body, Optional)
    # entries are laid out as key, ":", value, ... so the value sits at index 2
    values = body.term.terms[2]
    assert isinstance(values, Alternatives)
    for value in values.terms:
        assert isinstance(value, String)
        text = value.value
        assert text.startswith('"') and text.endswith('"')


def test_tuple_ellipsis_literal_quoted():
    """String literals in a variadic tuple carry JSON double quotes."""
    term = _handle_tuple(
        get_args(Tuple[Literal["a", "b"], ...]), recursion_depth=0
    )
    assert isinstance(term, Sequence)
    choices = term.terms[1]
    assert isinstance(choices, Alternatives)
    for choice in choices.terms:
        assert isinstance(choice, String)
        text = choice.value
        assert text.startswith('"') and text.endswith('"')


def test_list_of_bool_unchanged():
    """A bool item type stays `types.boolean`; no quoting is applied."""
    term = _handle_list(get_args(list[bool]), recursion_depth=0)
    item = term.terms[1]
    assert item == types.boolean


def test_dict_int_value_unchanged():
    """An int value type in a dict stays `types.integer`; no quoting is applied."""
    term = _handle_dict(get_args(dict[str, int]), recursion_depth=0)
    body = term.terms[1]
    assert isinstance(body, Optional)
    # entries are laid out as key, ":", value, ... so the value sits at index 2
    assert body.term.terms[2] == types.integer


def test_ensure_json_quoted_nested_alternatives():
    """Quoting reaches String branches nested inside an inner Alternatives."""
    nested = Alternatives(
        [Alternatives([String("x"), String("y")]), String("z")]
    )
    quoted = _ensure_json_quoted(nested)
    assert isinstance(quoted, Alternatives)

    # first branch: the inner Alternatives, each leaf quoted
    inner = quoted.terms[0]
    assert isinstance(inner, Alternatives)
    for leaf in inner.terms:
        assert isinstance(leaf, String)
        text = leaf.value
        assert text.startswith('"') and text.endswith('"')

    # second branch: the sibling String, quoted directly
    sibling = quoted.terms[1]
    assert isinstance(sibling, String)
    assert sibling == String('"z"')


def test_literal_with_special_characters():
    """Literals containing a space or a hyphen are still wrapped in double quotes."""
    term = _handle_list(
        get_args(list[Literal["hello world", "foo-bar"]]), recursion_depth=0
    )
    choices = term.terms[1]
    assert isinstance(choices, Alternatives)
    assert len(choices.terms) == 2
    for choice in choices.terms:
        assert isinstance(choice, String)
        text = choice.value
        assert text.startswith('"') and text.endswith('"')


# ---------------------------------------------------------------------------
# End-to-end regex tests for JSON quoting in containers
# These verify the full pipeline: python_types_to_terms → to_regex → re.fullmatch
# ---------------------------------------------------------------------------


def test_e2e_list_literal_matches_quoted_json():
    """The regex for a list of string literals accepts double-quoted items only."""
    regex = to_regex(python_types_to_terms(list[Literal["Paris", "London"]]))

    accepted = ('["Paris"]', '["Paris", "London"]', '["London", "Paris", "London"]')
    for text in accepted:
        assert _re.fullmatch(regex, text)

    # bare words and single-quoted words are both refused
    for text in ("[Paris]", "['Paris']"):
        assert not _re.fullmatch(regex, text)


def test_e2e_standalone_literal_no_quotes():
    """A Literal outside any container matches its bare values, not quoted ones."""
    regex = to_regex(python_types_to_terms(Literal["cat", "dog"]))
    for text in ("cat", "dog"):
        assert _re.fullmatch(regex, text)
    assert not _re.fullmatch(regex, '"cat"')


def test_e2e_list_literal_empty_string():
    """An empty-string literal in a list matches `""` items; `[]` is refused."""
    regex = to_regex(python_types_to_terms(list[Literal[""]]))
    for text in ('[""]', '["", ""]'):
        assert _re.fullmatch(regex, text)
    assert not _re.fullmatch(regex, "[]")


def test_e2e_list_mixed_literal_string_and_int():
    """In a Literal mixing a string and an int, only the string value is quoted."""
    regex = to_regex(python_types_to_terms(list[Literal["a", 1]]))
    for text in ('["a"]', "[1]", '["a", 1]', '[1, "a"]'):
        assert _re.fullmatch(regex, text)
    assert not _re.fullmatch(regex, "[a]")


def test_e2e_dict_literal_keys_quoted():
    """Literal dict keys must appear double-quoted in matching text."""
    regex = to_regex(python_types_to_terms(dict[Literal["k1", "k2"], int]))
    for text in ('{"k1":0}', '{"k1":42, "k2":-7}'):
        assert _re.fullmatch(regex, text)
    assert not _re.fullmatch(regex, "{k1:0}")


def test_e2e_dict_literal_values_quoted():
    """Literal string dict values match when double-quoted."""
    regex = to_regex(python_types_to_terms(dict[str, Literal["yes", "no"]]))
    for text in ('{"answer":"yes"}', '{"a":"yes", "b":"no"}'):
        assert _re.fullmatch(regex, text)


def test_e2e_tuple_fixed_literal_quoted():
    """A fixed tuple of string literals matches quoted elements, not bare ones."""
    terms = python_types_to_terms(Tuple[Literal["x"], Literal["y"]])
    regex = to_regex(terms)
    assert _re.fullmatch(regex, '("x", "y")')
    assert not _re.fullmatch(regex, "(x, y)")


def test_e2e_tuple_variadic_literal_quoted():
    """A variadic tuple of string literals matches quoted elements, not bare ones."""
    regex = to_regex(python_types_to_terms(Tuple[Literal["a", "b"], ...]))
    for text in ('("a")', '("a", "b", "a")'):
        assert _re.fullmatch(regex, text)
    assert not _re.fullmatch(regex, "(a)")


def test_e2e_list_enum_string_values_quoted():
    """String-valued Enum members used as list items must appear double-quoted."""

    class Color(Enum):
        RED = "red"
        BLUE = "blue"

    regex = to_regex(python_types_to_terms(list[Color]))
    for text in ('["red"]', '["red", "blue"]'):
        assert _re.fullmatch(regex, text)
    assert not _re.fullmatch(regex, "[red]")


def test_e2e_list_int_not_quoted():
    """A list of ints matches bare integers and refuses a quoted one."""
    regex = to_regex(python_types_to_terms(list[int]))
    for text in ("[42]", "[1, 2, 3]"):
        assert _re.fullmatch(regex, text)
    assert not _re.fullmatch(regex, '["1"]')


def test_e2e_list_literal_special_characters():
    """Literals containing a space or a hyphen match in quoted form only."""
    annotation = list[Literal["hello world", "foo-bar"]]
    regex = to_regex(python_types_to_terms(annotation))
    for text in ('["hello world"]', '["hello world", "foo-bar"]'):
        assert _re.fullmatch(regex, text)
    assert not _re.fullmatch(regex, "[hello world]")


def test_e2e_dict_literal_key_and_enum_value():
    """With a Literal key and an Enum value, both sides must be double-quoted."""

    class Status(Enum):
        ON = "on"
        OFF = "off"

    regex = to_regex(python_types_to_terms(dict[Literal["switch"], Status]))
    for text in ('{"switch":"on"}', '{"switch":"off"}'):
        assert _re.fullmatch(regex, text)
    assert not _re.fullmatch(regex, "{switch:on}")


def test_to_regex():
    """`to_regex` renders every DSL term kind to its expected regex source."""
    expectations = [
        (String("hello"), r"hello"),
        (Regex("[0-9]+"), r"([0-9]+)"),
        (JsonSchema({"type": "integer"}), r"((-)?(0|[1-9][0-9]*))"),
        (Choice(["a", "b", "c"]), r"(a|b|c)"),
        (KleeneStar(String("a")), r"(a)*"),
        (KleenePlus(String("a")), r"(a)+"),
        (Optional(String("a")), r"(a)?"),
        (Alternatives([String("a"), String("b")]), r"(a|b)"),
        (Sequence([String("a"), String("b")]), r"ab"),
        (QuantifyExact(String("a"), 3), r"(a){3}"),
        (QuantifyMinimum(String("a"), 2), r"(a){2,}"),
        (QuantifyMaximum(String("a"), 5), r"(a){,5}"),
        (QuantifyBetween(String("a"), 1, 3), r"(a){1,3}"),
    ]
    for term, expected in expectations:
        assert to_regex(term) == expected

    # a plain `Term` has no regex rendering
    with pytest.raises(TypeError):
        to_regex(Term())


================================================
FILE: tests/types/test_json_schema_utils.py
================================================
import sys
from dataclasses import is_dataclass
from typing import Any, List, Literal, Optional

from pydantic import BaseModel, TypeAdapter
from pydantic_core import PydanticUndefined

from outlines.types.json_schema_utils import (
    schema_type_to_python,
    json_schema_dict_to_typeddict,
    json_schema_dict_to_pydantic,
    json_schema_dict_to_dataclass
)

if sys.version_info >= (3, 12):
    from typing import _TypedDictMeta  # type: ignore
else:
    from typing_extensions import _TypedDictMeta  # type: ignore


def test_schema_type_to_python_simple_types():
    """Primitive JSON types map to Python builtins; the other cases give `Any`."""
    expectations = [
        ({"type": "string"}, "pydantic", str),
        ({"type": "integer"}, "pydantic", int),
        ({"type": "number"}, "pydantic", float),
        ({"type": "boolean"}, "pydantic", bool),
        # an object schema with an unrecognised caller name
        ({"type": "object"}, "foo", Any),
        # an empty schema
        ({}, "pydantic", Any),
    ]
    for schema, caller, expected in expectations:
        assert schema_type_to_python(schema, caller) is expected


def test_schema_type_to_python_enum():
    """An `enum` schema becomes a `Literal` over the listed values."""
    python_type = schema_type_to_python(
        {"enum": ["red", "green", "blue"]}, "pydantic"
    )
    assert python_type == Literal[("red", "green", "blue")]


def test_schema_type_to_python_array():
    """Array schemas become `List[...]`, using `Any` when `items` is absent."""
    expectations = [
        ({"type": "array", "items": {"type": "string"}}, List[str]),
        ({"type": "array", "items": {"type": "integer"}}, List[int]),
        ({"type": "array"}, List[Any]),
    ]
    for schema, expected in expectations:
        assert schema_type_to_python(schema, "pydantic") == expected


def test_schema_type_to_python_object():
    """An object schema is converted according to the `caller` argument.

    The same titled schema is converted three times: to a Pydantic model, a
    TypedDict and a dataclass. In each case the required property keeps its
    plain type; the optional one is `Optional[int]` for Pydantic/TypedDict
    and a plain `int` annotation with a `None` class-level default for the
    dataclass.
    """
    schema = {
        "type": "object",
        "title": "TestObject",
        "properties": {
            "name": {"type": "string"},
            "age": {"type": "integer"}
        },
        "required": ["name"]
    }

    # Pydantic caller
    pydantic_result = schema_type_to_python(schema, "pydantic")
    assert issubclass(pydantic_result, BaseModel)
    assert pydantic_result.__name__ == "TestObject"
    assert pydantic_result.model_fields["name"].annotation is str
    assert pydantic_result.model_fields["age"].annotation == Optional[int]

    # Typeddict caller
    typeddict_result = schema_type_to_python(schema, "typeddict")
    assert isinstance(typeddict_result, _TypedDictMeta)
    assert typeddict_result.__name__ == "TestObject"
    assert typeddict_result.__annotations__["name"] is str
    assert typeddict_result.__annotations__["age"] == Optional[int]

    # Dataclass caller
    dataclass_result = schema_type_to_python(schema, "dataclass")
    # Generating a JSON schema from the dataclass must not raise; the schema
    # itself is not inspected (it used to be printed for debugging).
    TypeAdapter(dataclass_result).json_schema()
    assert hasattr(dataclass_result, "__dataclass_fields__")
    assert dataclass_result.__annotations__["name"] is str
    # required field: no class-level default
    assert not hasattr(dataclass_result, "name")
    assert dataclass_result.__annotations__["age"] is int
    # optional field: class-level default of None
    assert dataclass_result.age is None


def test_schema_type_to_python_unknown_type():
    """An unrecognised `type` value or a missing `type` key falls back to `Any`."""
    for schema in ({"type": "unknown"}, {}):
        python_type = schema_type_to_python(schema, "pydantic")
        assert python_type == Any


def test_json_schema_dict_to_typeddict_basic():
    """A flat object schema becomes a named TypedDict; optional keys are `Optional`."""
    properties = {
        "name": {"type": "string"},
        "age": {"type": "integer"},
    }
    schema = {"type": "object", "properties": properties, "required": ["name"]}

    person = json_schema_dict_to_typeddict(schema, "Person")

    assert isinstance(person, _TypedDictMeta)
    assert person.__name__ == "Person"
    hints = person.__annotations__
    assert hints["name"] is str
    assert hints["age"] == Optional[int]


def test_json_schema_dict_to_typeddict_array_enum():
    """Array and enum properties map to `List[...]` and `Literal[...]` in a TypedDict."""
    properties = {
        "tags": {"type": "array", "items": {"type": "string"}},
        "preferences": {"enum": ["light", "dark"]},
    }
    schema = {"type": "object", "properties": properties, "required": ["tags"]}

    # no name given: the default anonymous name is used
    typed_dict = json_schema_dict_to_typeddict(schema)

    assert isinstance(typed_dict, _TypedDictMeta)
    assert typed_dict.__name__ == "AnonymousTypedDict"
    hints = typed_dict.__annotations__
    assert hints["tags"] == List[str]
    assert hints["preferences"] == Optional[Literal[("light", "dark")]]


def test_json_schema_dict_to_typeddict_nested_object():
    """A nested object property becomes a nested anonymous TypedDict."""
    inner_schema = {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "age": {"type": "integer"},
        },
        "required": ["name"],
    }
    schema = {
        "type": "object",
        "properties": {"field": inner_schema},
        "required": ["field"],
    }

    outer = json_schema_dict_to_typeddict(schema)
    assert isinstance(outer, _TypedDictMeta)
    assert outer.__name__ == "AnonymousTypedDict"

    nested = outer.__annotations__["field"]
    assert isinstance(nested, _TypedDictMeta)
    assert nested.__name__ == "AnonymousTypedDict"
    assert nested.__annotations__["name"] is str
    assert nested.__annotations__["age"] == Optional[int]


def test_json_schema_dict_to_pydantic_basic():
    """A flat object schema becomes a named Pydantic model.

    The required property keeps its plain type and has no default; the
    optional one is annotated `Optional[...]` and defaults to `None`.
    """
    schema = {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "age": {"type": "integer"}
        },
        "required": ["name"]
    }

    result = json_schema_dict_to_pydantic(schema, "Person")
    assert issubclass(result, BaseModel)
    assert result.__name__ == "Person"

    assert result.model_fields["name"].annotation is str
    assert result.model_fields["age"].annotation == Optional[int]
    assert result.model_fields["name"].default == PydanticUndefined
    # Must be an assertion: a bare comparison expression checks nothing.
    assert result.model_fields["age"].default is None


def test_json_schema_dict_to_pydantic_array_enum():
    """Array and enum properties map to `List[...]` and `Literal[...]` model fields."""
    properties = {
        "tags": {"type": "array", "items": {"type": "string"}},
        "status": {"enum": ["active", "inactive", "pending"]},
    }
    schema = {"type": "object", "properties": properties, "required": ["status"]}

    # no name given: the default anonymous name is used
    model = json_schema_dict_to_pydantic(schema)
    assert issubclass(model, BaseModel)
    assert model.__name__ == "AnonymousPydanticModel"

    tags_field = model.model_fields["tags"]
    status_field = model.model_fields["status"]
    assert tags_field.annotation == Optional[List[str]]
    assert status_field.annotation == Literal[("active", "inactive", "pending")]
    assert tags_field.default is None
    assert status_field.default == PydanticUndefined


def test_json_schema_dict_to_pydantic_nested_object():
    """A nested object property becomes a nested anonymous Pydantic model."""
    inner_schema = {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "age": {"type": "integer"},
        },
        "required": ["name"],
    }
    schema = {
        "type": "object",
        "properties": {"field": inner_schema},
        "required": ["field"],
    }

    outer = json_schema_dict_to_pydantic(schema)
    assert issubclass(outer, BaseModel)
    assert outer.__name__ == "AnonymousPydanticModel"

    assert issubclass(outer.model_fields["field"].annotation, BaseModel)
    assert outer.model_fields["field"].annotation.__name__ == "AnonymousPydanticModel"

    nested = outer.model_fields["field"].annotation
    name_field = nested.model_fields["name"]
    age_field = nested.model_fields["age"]
    assert name_field.annotation is str
    assert age_field.annotation == Optional[int]
    assert name_field.default == PydanticUndefined
    assert age_field.default is None


def test_json_schema_dict_to_dataclass_basic():
    """A flat object schema becomes a named dataclass; optional fields default to None."""
    properties = {
        "name": {"type": "string"},
        "age": {"type": "integer"},
    }
    schema = {"type": "object", "properties": properties, "required": ["name"]}

    person = json_schema_dict_to_dataclass(schema, "Person")
    assert is_dataclass(person)
    assert person.__name__ == "Person"

    hints = person.__annotations__
    assert hints["name"] is str
    assert hints["age"] is int
    # required field: no class-level default
    assert not hasattr(person, "name")
    # optional field: class-level default of None
    assert person.age is None


def test_json_schema_dict_to_dataclass_array_enum():
    """Array and enum properties map to `List[...]` and `Literal[...]` dataclass fields."""
    properties = {
        "status": {"enum": ["active", "inactive", "pending"]},
        "tags": {"type": "array", "items": {"type": "string"}},
    }
    schema = {"type": "object", "properties": properties, "required": ["status"]}

    # no name given: the default anonymous name is used
    generated = json_schema_dict_to_dataclass(schema)
    assert is_dataclass(generated)
    assert generated.__name__ == "AnonymousDataclass"

    hints = generated.__annotations__
    assert hints["tags"] == List[str]
    assert hints["status"] == Literal[("active", "inactive", "pending")]
    # required field: no class-level default
    assert not hasattr(generated, "status")
    # optional field: class-level default of None
    assert generated.tags is None


def test_json_schema_dict_to_dataclass_nested_object():
    """A nested object property becomes a nested anonymous dataclass."""
    inner_schema = {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "age": {"type": "integer"},
        },
        "required": ["name"],
    }
    schema = {
        "type": "object",
        "properties": {"field": inner_schema},
        "required": ["field"],
    }

    outer = json_schema_dict_to_dataclass(schema)
    assert is_dataclass(outer)
    assert outer.__name__ == "AnonymousDataclass"

    nested = outer.__annotations__["field"]
    assert is_dataclass(nested)
    assert nested.__name__ == "AnonymousDataclass"

    assert nested.__annotations__["name"] is str
    assert nested.__annotations__["age"] is int
    # required field: no class-level default
    assert not hasattr(nested, "name")
    # optional field: class-level default of None
    assert nested.age is None


================================================
FILE: tests/types/test_to_regex.py
================================================
import pytest


from outlines.types.dsl import (
    Choice,
    String,
    Regex,
    JsonSchema,
    KleeneStar,
    KleenePlus,
    QuantifyBetween,
    QuantifyExact,
    QuantifyMaximum,
    QuantifyMinimum,
    Sequence,
    Alternatives,
    Optional,
    Term,
    to_regex,
)


def test_to_regex_simple():
    """Each basic term renders to the expected regex and `matches` agrees with it."""
    # (term, expected regex, inputs that must match, inputs that must not)
    cases = [
        (String("a"), "a", ["a"], []),
        (Regex("[0-9]"), "([0-9])", [0], [10, "a"]),
        (
            JsonSchema({"type": "integer"}),
            r"((-)?(0|[1-9][0-9]*))",
            [1, "1"],
            ["a"],
        ),
        (Choice(["a", "b"]), "(a|b)", ["a", "b"], ["c"]),
        (Optional(String("a")), "(a)?", ["", "a"], []),
        (KleeneStar(String("a")), "(a)*", ["", "a", "aaaaa"], []),
        (KleenePlus(String("a")), "(a)+", ["a", "aaaaa"], [""]),
        (QuantifyExact(String("a"), 2), "(a){2}", ["aa"], ["a", "aaa"]),
        (QuantifyMinimum(String("a"), 2), "(a){2,}", ["aa", "aaa"], ["a"]),
        (QuantifyMaximum(String("a"), 2), "(a){,2}", ["aa"], ["aaa"]),
        (QuantifyBetween(String("a"), 1, 2), "(a){1,2}", ["a", "aa"], ["", "aaa"]),
    ]
    for term, expected_regex, accepted, rejected in cases:
        assert to_regex(term) == expected_regex
        for value in accepted:
            assert term.matches(value) is True
        for value in rejected:
            assert term.matches(value) is False

    # a plain `Term` has no regex rendering
    with pytest.raises(TypeError, match="Cannot convert"):
        to_regex(Term())


def test_to_regex_combinations():
    """A Sequence of a Regex and a String renders as the grouped regex then the text."""
    combined = Sequence([Regex("dog|cat"), String("fish")])
    rendered = to_regex(combined)
    assert rendered == "(dog|cat)fish"


================================================
FILE: tests/types/test_types_utils.py
================================================
import datetime
import pytest
import sys
from dataclasses import dataclass
from enum import Enum
if sys.version_info >= (3, 11):
    from enum import member
else:
    # Python < 3.11 doesn't have enum.member, but also doesn't warn about partial in enums
    def member(x):  # type: ignore[no-redef]
        """Return `x` unchanged; identity stand-in for `enum.member`."""
        return x
from functools import partial
from typing import (
    Annotated,
    Any,
    Dict,
    List,
    Literal,
    NewType,
    Optional,
    Tuple,
    Union
)

from genson import SchemaBuilder
from pydantic import BaseModel

from outlines.types.dsl import Choice, JsonSchema
from outlines.types.utils import (
    get_enum_from_choice,
    get_enum_from_literal,
    get_schema_from_enum,
    get_schema_from_signature,
    is_bool,
    is_callable,
    is_date,
    is_dataclass,
    is_datetime,
    is_enum,
    is_float,
    is_float_instance,
    is_genson_schema_builder,
    is_int,
    is_int_instance,
    is_literal,
    is_native_dict,
    is_pydantic_model,
    is_str,
    is_str_instance,
    is_time,
    is_typed_dict,
    is_typing_dict,
    is_typing_list,
    is_typing_tuple,
    is_union
)

if sys.version_info >= (3, 12):
    from typing import TypedDict
else:
    from typing_extensions import TypedDict


# Type identification


@pytest.fixture
def sample_enum():
    """Provide an Enum class with two integer-valued members, `A` and `B`."""
    class SampleEnum(Enum):
        A = 1
        B = 2

    return SampleEnum

@pytest.fixture
def sample_complex_enum():
    """Provide an Enum mixing a wrapped callable with a string and an integer value."""
    def add_func(a: float, b: float) -> float:
        return a + b

    class SampleComplexEnum(Enum):
        # the partial is wrapped in `member(...)`: `enum.member` on
        # Python >= 3.11, an identity function on older versions
        add = member(partial(add_func))
        a = "a"
        b = 2

    return SampleComplexEnum

@pytest.fixture
def sample_empty_enum():
    """Provide an Enum class whose only attribute is a plain function."""
    def add_func(a: float, b: float) -> float:
        return a + b

    # the enum is empty because the function is not registered as callable:
    # a plain function assigned in an Enum body is treated as a method of the
    # class rather than as a member
    class SampleEmptyEnum(Enum):
        add = add_func

    return SampleEmptyEnum

@pytest.fixture
def sample_class():
    """Provide a plain, empty class with no special base or decorator."""
    class SampleClass:
        pass

    return SampleClass

@pytest.fixture
def sample_dataclass():
    """Provide a dataclass with a `str` field and an `int` field."""
    @dataclass
    class SampleDataclass:
        field1: str
        field2: int

    return SampleDataclass

@pytest.fixture
def sample_typed_dict():
    """Provide a TypedDict with a `str` key `name` and an `int` key `age`."""
    class SampleTypedDict(TypedDict):
        name: str
        age: int

    return SampleTypedDict

@pytest.fixture
def sample_pydantic_model():
    """Provide a Pydantic model with a `str` field `name` and an `int` field `age`."""
    class SamplePydanticModel(BaseModel):
        name: str
        age: int

    return SamplePydanticModel

@pytest.fixture
def sample_schema_builder():
    """Provide a genson `SchemaBuilder` seeded with an object schema and two samples."""
    schema_builder = SchemaBuilder()
    schema_builder.add_schema({"type": "object", "properties": {}})
    # the two samples give the "hi" key a string value and an integer value
    for sample in ({"hi": "there"}, {"hi": 5}):
        schema_builder.add_object(sample)
    return schema_builder

@pytest.fixture
def sample_function():
    """Provide a no-op function whose two parameters are both type-annotated."""
    def sample_function(foo: str, bar: List[int]):
        pass

    return sample_function

@pytest.fixture
def sample_function_missing_type():
    """Provide a no-op function whose first parameter has no type annotation."""
    def sample_function(foo, bar: List[int]):
        pass

    return sample_function


def test_is_int():
    """`is_int` accepts `int` (also via Annotated/NewType) and rejects the other inputs."""
    accepted = (
        int,
        Annotated[int, "some metadata"],
        NewType("UserId", int),
    )
    for candidate in accepted:
        assert is_int(candidate)

    rejected = (
        float,
        1,
        List[int],
        Dict[int, int],
        Annotated[str, "some metadata"],
        NewType("UserId", str),
    )
    for candidate in rejected:
        assert not is_int(candidate)


def test_is_int_instance():
    """`is_int_instance` accepts an int value but not a bool, float, str or the type."""
    assert is_int_instance(1)
    for candidate in (True, 1.0, "1", int):
        assert not is_int_instance(candidate)


def test_is_float():
    """`is_float` accepts `float` (also via Annotated/NewType) and rejects the other inputs."""
    accepted = (
        float,
        Annotated[float, "some metadata"],
        NewType("UserId", float),
    )
    for candidate in accepted:
        assert is_float(candidate)

    rejected = (
        int,
        1.0,
        List[float],
        Dict[float, float],
        Annotated[int, "some metadata"],
        NewType("UserId", int),
    )
    for candidate in rejected:
        assert not is_float(candidate)


def test_is_float_instance():
    """`is_float_instance` accepts a float value but not an int, str or the type."""
    assert is_float_instance(1.0)
    for candidate in (1, "1.0", float):
        assert not is_float_instance(candidate)


def test_is_str():
    """`is_str` accepts `str` (also via Annotated/NewType) and rejects the other inputs."""
    accepted = (
        str,
        Annotated[str, "some metadata"],
        NewType("UserId", str),
    )
    for candidate in accepted:
        assert is_str(candidate)

    rejected = (
        int,
        "hello",
        List[str],
        Dict[str, str],
        Annotated[int, "some metadata"],
        NewType("UserId", int),
    )
    for candidate in rejected:
        assert not is_str(candidate)


def test_is_str_instance():
    """`is_str_instance` accepts any str value (even empty); ints and the `str` type are rejected."""
    for accepted in ("hello", "", "123"):
        assert is_str_instance(accepted)
    for rejected in (123, str):
        assert not is_str_instance(rejected)


def test_is_bool():
    """`is_bool` accepts `bool` (bare, `Annotated` or `NewType`) and nothing else."""
    cases = [
        (bool, True),
        (int, False),
        (True, False),  # a bool value, not the type
        (Annotated[bool, "some metadata"], True),
        (Annotated[int, "some metadata"], False),
        (NewType("UserId", bool), True),
        (NewType("UserId", int), False),
    ]
    for candidate, expected in cases:
        assert bool(is_bool(candidate)) is expected


def test_is_datetime():
    """`is_datetime` accepts only the `datetime.datetime` type, not its instances or siblings."""
    assert is_datetime(datetime.datetime)
    rejected_inputs = (
        datetime.date,
        datetime.time,
        datetime.datetime.now(),  # an instance, not the type
    )
    for rejected in rejected_inputs:
        assert not is_datetime(rejected)


def test_is_date():
    """`is_date` accepts only the `datetime.date` type, not its instances or siblings."""
    assert is_date(datetime.date)
    rejected_inputs = (
        datetime.datetime,
        datetime.time,
        datetime.date.today(),  # an instance, not the type
    )
    for rejected in rejected_inputs:
        assert not is_date(rejected)


def test_is_time():
    """`is_time` accepts only the `datetime.time` type, not its instances or siblings."""
    assert is_time(datetime.time)
    rejected_inputs = (
        datetime.datetime,
        datetime.date,
        datetime.time(12, 30),  # an instance, not the type
    )
    for rejected in rejected_inputs:
        assert not is_time(rejected)


def test_is_native_dict():
    """`is_native_dict` accepts only the bare `dict` type."""
    assert is_native_dict(dict)
    # Dict values, other builtin types and parametrized dicts are all rejected.
    for rejected in ({}, {"key": "value"}, list, dict[str, int]):
        assert not is_native_dict(rejected)


def test_is_typing_dict():
    """`is_typing_dict` accepts parametrized dict types but not bare `dict` or dict values."""
    for accepted in (dict[str, int], Dict[int, str]):
        assert is_typing_dict(accepted)
    for rejected in (dict, {}):
        assert not is_typing_dict(rejected)


def test_is_typing_list():
    """`is_typing_list` accepts parametrized list types but not bare `list`, list values or `dict`."""
    for accepted in (list[int], List[int]):
        assert is_typing_list(accepted)
    for rejected in (list, [], dict):
        assert not is_typing_list(rejected)


def test_is_typing_tuple():
    """`is_typing_tuple` accepts parametrized tuple types but not bare `tuple`, tuple values or `list`."""
    for accepted in (tuple[int, str], Tuple[int, str]):
        assert is_typing_tuple(accepted)
    for rejected in (tuple, (), list):
        assert not is_typing_tuple(rejected)


def test_is_union():
    """`is_union` accepts `Union`/`Optional` types and rejects lists and `Literal`."""
    for accepted in (Union[int, str], Optional[int]):
        assert is_union(accepted)
    for rejected in (list, ["a", "b"], Literal[int, str]):
        assert not is_union(rejected)


def test_is_literal():
    """`is_literal` accepts `Literal[...]` types and rejects plain types, values and `Union`."""
    assert is_literal(Literal["a", "b"])
    for rejected in (str, "a", ["a", "b"], Union[str, int]):
        assert not is_literal(rejected)


def test_is_dataclass(
    sample_dataclass,
    sample_class,
    sample_typed_dict,
    sample_pydantic_model,
):
    """`is_dataclass` accepts a dataclass type and rejects instances and other class kinds."""
    assert is_dataclass(sample_dataclass)

    not_dataclasses = (
        sample_dataclass(field1="test", field2=123),  # an instance, not the class
        dict,
        sample_class,
        sample_typed_dict,
        sample_pydantic_model,
    )
    for candidate in not_dataclasses:
        assert not is_dataclass(candidate)


def test_is_typed_dict(
    sample_typed_dict,
    sample_class,
    sample_dataclass,
    sample_pydantic_model,
):
    """`is_typed_dict` accepts a TypedDict type and rejects instances and other class kinds."""
    assert is_typed_dict(sample_typed_dict)

    not_typed_dicts = (
        sample_typed_dict(name="test", age=30),  # an instance, not the class
        dict,
        sample_class,
        sample_dataclass,
        sample_pydantic_model,
    )
    for candidate in not_typed_dicts:
        assert not is_typed_dict(candidate)


def test_is_pydantic_model(
    sample_pydantic_model,
    sample_class,
    sample_dataclass,
    sample_typed_dict,
):
    """`is_pydantic_model` accepts a model class and rejects instances and other class kinds."""
    assert is_pydantic_model(sample_pydantic_model)

    not_pydantic_models = (
        sample_pydantic_model(name="test", age=30),  # an instance, not the class
        dict,
        sample_class,
        sample_dataclass,
        sample_typed_dict,
    )
    for candidate in not_pydantic_models:
        assert not is_pydantic_model(candidate)


def test_is_genson_schema_builder(
    sample_schema_builder,
    sample_class,
    sample_dataclass,
    sample_typed_dict,
    sample_pydantic_model,
):
    """`is_genson_schema_builder` accepts a builder instance and rejects everything else tried here."""
    assert is_genson_schema_builder(sample_schema_builder)

    not_builders = (
        dict,
        str,
        {"type": 'object', "properties": {}},  # a schema given as a dict
        '{"type": "object", "properties": {}}',  # a schema given as a JSON string
        sample_class,
        sample_dataclass,
        sample_typed_dict,
        sample_pydantic_model,
    )
    for candidate in not_builders:
        assert not is_genson_schema_builder(candidate)


def test_is_enum(sample_enum):
    """`is_enum` accepts an enum class and rejects members, `dict`, `Literal` and lists."""
    assert is_enum(sample_enum)
    for rejected in (sample_enum.A, dict, Literal["a", "b"], ["a", "b"]):
        assert not is_enum(rejected)


def test_is_callable(
    sample_function,
    sample_class,
    sample_dataclass,
    sample_typed_dict,
    sample_pydantic_model,
):
    """`is_callable` accepts functions and lambdas but rejects the class-like inputs tried here."""
    for accepted in (sample_function, lambda x: x):
        assert is_callable(accepted)

    not_callables = (
        dict,
        sample_class,
        sample_dataclass,
        sample_typed_dict,
        sample_pydantic_model,
    )
    for candidate in not_callables:
        assert not is_callable(candidate)


# Type conversion


def test_get_enum_from_choice(sample_enum):
    """`get_enum_from_choice` builds an enum with one member per choice item."""
    generated = get_enum_from_choice(Choice(["a", "b", sample_enum.A]))
    assert is_enum(generated)

    # Member names as looked up on the generated enum, paired with their values.
    expected_members = (
        ("a", "a"),
        ("b", "b"),
        ("SampleEnum.A", sample_enum.A),
    )
    for member_name, expected_value in expected_members:
        assert getattr(generated, member_name).value == expected_value


def test_get_enum_from_literal(sample_enum):
    """`get_enum_from_literal` builds an enum from string-only and mixed-type literals."""
    # Literal made of strings only.
    simple = get_enum_from_literal(Literal["a", "b"])
    assert is_enum(simple)
    for member_name in ("a", "b"):
        assert getattr(simple, member_name).value == member_name

    # Literal mixing a string, an int, a bool, None and an enum member.
    mixed = get_enum_from_literal(Literal["a", 1, True, None, sample_enum.A])
    assert is_enum(mixed)

    def value_of(member_name):
        return getattr(mixed, member_name).value

    assert value_of("a") == "a"
    assert value_of("1") == 1
    # Only truthiness is asserted for this member, not identity with `True`.
    assert value_of("True")
    assert value_of("None") is None
    assert value_of("SampleEnum.A") == sample_enum.A


def test_get_schema_from_signature(sample_function, sample_function_missing_type):
    """`get_schema_from_signature` maps annotated parameters to an object schema."""
    schema = get_schema_from_signature(sample_function)
    assert schema["type"] == "object"

    properties = schema["properties"]
    assert list(properties.keys()) == ["foo", "bar"]

    foo_schema, bar_schema = properties["foo"], properties["bar"]
    assert foo_schema["type"] == "string"
    assert bar_schema["type"] == "array"
    assert bar_schema["items"]["type"] == "integer"

    # A parameter without a type annotation must be reported as a ValueError.
    with pytest.raises(ValueError):
        get_schema_from_signature(sample_function_missing_type)


def test_get_schema_from_enum(sample_complex_enum, sample_empty_enum):
    """`get_schema_from_enum` emits one `oneOf` entry per member and rejects an empty enum."""
    enum_schema = get_schema_from_enum(sample_complex_enum)
    assert JsonSchema(enum_schema)
    assert enum_schema["title"] == sample_complex_enum.__name__

    alternatives = enum_schema["oneOf"]
    assert len(alternatives) == len(sample_complex_enum)

    # Each alternative must be one of these exact types (no subclasses).
    allowed_types = (int, float, bool, type(None), str, dict)
    for alternative in alternatives:
        assert type(alternative) in allowed_types

    # in case of an empty enum because the function member is not registered as callable
    with pytest.raises(ValueError):
        get_schema_from_enum(sample_empty_enum)